def dot_prod_attention(self, h_t: torch.Tensor, src_encoding: torch.Tensor,
                       src_encoding_att_linear: torch.Tensor,
                       mask: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
    # (batch_size, src_sent_len)
    att_weight = torch.bmm(src_encoding_att_linear, h_t.unsqueeze(2)).squeeze(2)
    if mask is not None:
        att_weight.data.masked_fill_(mask.bool(), -float('inf'))
    softmaxed_att_weight = F.softmax(att_weight, dim=-1)

    att_view = (att_weight.size(0), 1, att_weight.size(1))
    # (batch_size, hidden_size)
    ctx_vec = torch.bmm(softmaxed_att_weight.view(*att_view), src_encoding).squeeze(1)

    return ctx_vec, softmaxed_att_weight
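# --- Usage sketch (illustrative, not from the original source) ---
# A minimal shape check for dot_prod_attention. The method never touches
# `self`, so we pass None for it here, assuming the function is reachable at
# module level; all names and sizes below are made up.
import torch
import torch.nn.functional as F  # dot_prod_attention refers to F.softmax

batch_size, src_len, hidden = 4, 7, 16
h_t = torch.randn(batch_size, hidden)
src_encoding = torch.randn(batch_size, src_len, hidden)             # values to attend over
src_encoding_att_linear = torch.randn(batch_size, src_len, hidden)  # projected keys
mask = torch.zeros(batch_size, src_len)
mask[:, 5:] = 1  # pretend positions 5.. are padding

ctx_vec, att = dot_prod_attention(None, h_t, src_encoding, src_encoding_att_linear, mask)
assert ctx_vec.shape == (batch_size, hidden)
assert att.shape == (batch_size, src_len)
assert torch.allclose(att.sum(dim=-1), torch.ones(batch_size))  # rows sum to 1
assert (att[:, 5:] == 0).all()                                  # masked positions get no weight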
def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h).
        The input for the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where
        b = batch size, h = hidden size. First tensor is the decoder's prev hidden state,
        second tensor is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2),
        where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h,
        with shape (b, src_len, h), where b = batch size, src_len = maximum source length,
        h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks with shape (b, src_len), where
        b = batch size, src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where
        b = batch size, h = hidden size. First tensor is the decoder's new hidden state,
        second tensor is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, with shape (b, h),
        where b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    ### YOUR CODE HERE (~3 Lines)
    ### TODO:
    ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
    ###     2. Split dec_state into its two parts (dec_hidden, dec_cell).
    ###     3. Compute the attention scores e_t, a Tensor of shape (b, src_len).
    ###        Note: b = batch size, src_len = maximum source length, h = hidden size.
    ###
    ### Hints:
    ###     - dec_hidden has shape (b, h) and corresponds to h^dec_t in the PDF (batched).
    ###     - enc_hiddens_proj has shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
    ###     - Use batched matrix multiplication (torch.bmm) to compute e_t.
    ###     - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
    ###     - When using the squeeze() function, make sure to specify the dimension you want to squeeze over.
    ###       Otherwise, you will accidentally remove the batch dimension when batch_size = 1.
    ###
    ### Use the following docs to implement this functionality:
    ###     Batch Multiplication:
    ###         https://pytorch.org/docs/stable/torch.html#torch.bmm
    ###     Tensor Unsqueeze:
    ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
    ###     Tensor Squeeze:
    ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze

    # Advance the decoder LSTM cell one step, then score each source position
    # against the new hidden state: (b, src_len, h) @ (b, h, 1) -> (b, src_len).
    dec_state = self.decoder(Ybar_t, dec_state)
    (dec_hidden, dec_cell) = dec_state
    e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

    ### END YOUR CODE

    # Set e_t to -inf where enc_masks has 1
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

    ### YOUR CODE HERE (~6 Lines)
    ### TODO:
    ###     1. Apply softmax to e_t to yield alpha_t.
    ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
    ###        attention output vector, a_t.
    ###
    ### Hints:
    ###     - alpha_t has shape (b, src_len).
    ###     - enc_hiddens has shape (b, src_len, 2h).
    ###     - a_t should have shape (b, 2h).
    ###     - You will need to do some squeezing and unsqueezing.
    ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
    ###
    ###     3. Concatenate dec_hidden with a_t to compute tensor U_t.
    ###     4. Apply the combined output projection layer to U_t to compute tensor V_t.
    ###     5. Compute tensor O_t by first applying the tanh function and then the dropout layer.
    ###
    ### Use the following docs to implement this functionality:
    ###     Softmax:
    ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
    ###     Batch Multiplication:
    ###         https://pytorch.org/docs/stable/torch.html#torch.bmm
    ###     Tensor View:
    ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
    ###     Tensor Concatenation:
    ###         https://pytorch.org/docs/stable/torch.html#torch.cat
    ###     Tanh:
    ###         https://pytorch.org/docs/stable/torch.html#torch.tanh

    # Normalize the scores into a distribution over source positions, take the
    # attention-weighted sum of the encoder states, then combine with dec_hidden.
    alpha_t = F.softmax(e_t, dim=1)
    a_t = alpha_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)
    U_t = torch.cat((a_t, dec_hidden), dim=1)
    V_t = self.combined_output_projection(U_t)
    O_t = self.dropout(torch.tanh(V_t))

    ### END YOUR CODE

    combined_output = O_t
    return dec_state, combined_output, e_t
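# --- Usage sketch (illustrative, not from the original source) ---
# A shape sanity check for step(), assuming it is reachable as a module-level
# function. _StepStub is a hypothetical stand-in exposing only the attributes
# step() touches (self.decoder, self.combined_output_projection, self.dropout).
import torch
import torch.nn as nn

class _StepStub(nn.Module):
    def __init__(self, e: int = 8, h: int = 16):
        super().__init__()
        self.decoder = nn.LSTMCell(e + h, h)
        self.combined_output_projection = nn.Linear(3 * h, h, bias=False)
        self.dropout = nn.Dropout(0.3)

_StepStub.step = step  # bind the function above as a method

b, e, h, src_len = 4, 8, 16, 10
model = _StepStub(e, h)
dec_state = (torch.zeros(b, h), torch.zeros(b, h))
enc_hiddens = torch.randn(b, src_len, 2 * h)
enc_hiddens_proj = torch.randn(b, src_len, h)
enc_masks = torch.zeros(b, src_len)
enc_masks[:, 8:] = 1  # 1 marks padded source positions

(dec_hidden, dec_cell), o_t, e_t = model.step(
    torch.randn(b, e + h), dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
assert dec_hidden.shape == (b, h) and o_t.shape == (b, h) and e_t.shape == (b, src_len)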
def draw_segmentation_masks(
    image: torch.Tensor,
    masks: torch.Tensor,
    alpha: float = 0.2,
    colors: Optional[List[Union[str, Tuple[int, int, int]]]] = None,
) -> torch.Tensor:
    """
    Draws segmentation masks on a given RGB image.
    The values of the input image should be uint8 between 0 and 255.

    Args:
        image (Tensor): Tensor of shape (3 x H x W) and dtype uint8.
        masks (Tensor): Tensor of shape (num_masks, H, W), each entry containing the
            probability of the predicted class.
        alpha (float): Float number between 0 and 1 denoting the transparency factor of the masks.
        colors (List[Union[str, Tuple[int, int, int]]]): List containing the colors of the masks.
            The colors can be represented as `str` or `Tuple[int, int, int]`.

    Returns:
        img (Tensor[C, H, W]): Image Tensor of dtype uint8 with segmentation masks plotted.

    Example:
        See the notebook `attached <https://github.com/pytorch/vision/blob/master/examples/python/visualization_utils.ipynb>`_.
    """

    if not isinstance(image, torch.Tensor):
        raise TypeError(f"Tensor expected, got {type(image)}")
    elif image.dtype != torch.uint8:
        raise ValueError(f"Tensor uint8 expected, got {image.dtype}")
    elif image.dim() != 3:
        raise ValueError("Pass individual images, not batches")
    elif image.size()[0] != 3:
        raise ValueError("Pass an RGB image. Other image formats are not supported")

    num_masks = masks.size()[0]
    masks = masks.argmax(0)

    if colors is None:
        palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
        colors_t = torch.as_tensor([i for i in range(num_masks)])[:, None] * palette
        color_arr = (colors_t % 255).numpy().astype("uint8")
    else:
        color_list = []
        for color in colors:
            if isinstance(color, str):
                # This will automatically raise an error if the color string cannot be parsed.
                fill_color = ImageColor.getrgb(color)
                color_list.append(fill_color)
            elif isinstance(color, tuple):
                color_list.append(color)
        color_arr = np.array(color_list).astype("uint8")

    _, h, w = image.size()
    img_to_draw = Image.fromarray(masks.byte().cpu().numpy()).resize((w, h))
    img_to_draw.putpalette(color_arr)

    img_to_draw = torch.from_numpy(np.array(img_to_draw.convert('RGB')))
    img_to_draw = img_to_draw.permute((2, 0, 1))

    return (image.float() * alpha + img_to_draw.float() * (1.0 - alpha)).to(dtype=torch.uint8)
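# --- Usage sketch (illustrative, not from the original source) ---
# Overlay two per-class probability maps on a random uint8 image. The imports
# below match what draw_segmentation_masks itself relies on; the sizes and
# colors are made up.
import numpy as np
import torch
from PIL import Image, ImageColor
from typing import List, Optional, Tuple, Union

image = torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8)
class_probs = torch.softmax(torch.randn(2, 64, 64), dim=0)  # (num_masks, H, W)

overlaid = draw_segmentation_masks(image, class_probs, alpha=0.4, colors=["blue", "red"])
assert overlaid.dtype == torch.uint8 and overlaid.shape == (3, 64, 64)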
def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h).
        The input for the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where
        b = batch size, h = hidden size. First tensor is the decoder's prev hidden state,
        second tensor is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2),
        where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h,
        with shape (b, src_len, h), where b = batch size, src_len = maximum source length,
        h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks with shape (b, src_len), where
        b = batch size, src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where
        b = batch size, h = hidden size. First tensor is the decoder's new hidden state,
        second tensor is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, with shape (b, h),
        where b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    # New dec_state
    dec_state = self.decoder(Ybar_t, dec_state)
    (dec_hidden, dec_cell) = dec_state
    dec_hidden = torch.unsqueeze(dec_hidden, dim=2)

    # Attention scores: (b, src_len, h) @ (b, h, 1) -> (b, src_len)
    e_t = torch.bmm(enc_hiddens_proj, dec_hidden)
    e_t = torch.squeeze(e_t, dim=2)

    # Set e_t to -inf where enc_masks has 1
    # http://juditacs.github.io/2018/12/27/masked-attention.html
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

    # Attention distribution
    alpha_t = F.softmax(e_t, dim=1)
    alpha_t = torch.unsqueeze(alpha_t, dim=1)
    a_t = torch.bmm(alpha_t, enc_hiddens)
    a_t = torch.squeeze(a_t, dim=1)

    dec_hidden = torch.squeeze(dec_hidden, dim=2)
    U_t = torch.cat((dec_hidden, a_t), dim=1)
    V_t = self.combined_output_projection(U_t)
    O_t = torch.tanh(V_t)
    O_t = self.dropout(O_t)

    combined_output = O_t
    return dec_state, combined_output, e_t
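# --- Companion sketch (illustrative, not from the original source) ---
# The masking step above assumes enc_masks marks padded source positions
# with 1. A hypothetical helper that builds such a mask from per-sentence
# lengths; the name and signature are assumptions, not from the source.
import torch

def make_enc_masks(batch_size: int, src_len: int, source_lengths) -> torch.Tensor:
    """Return a (batch_size, src_len) mask: 1.0 at padding, 0.0 at real tokens."""
    enc_masks = torch.zeros(batch_size, src_len)
    for i, length in enumerate(source_lengths):
        enc_masks[i, length:] = 1.0
    return enc_masks

print(make_enc_masks(3, 5, [5, 3, 2]))
# tensor([[0., 0., 0., 0., 0.],
#         [0., 0., 0., 1., 1.],
#         [0., 0., 1., 1., 1.]])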
def accuracy_thresh(y_pred: Tensor, y_true: Tensor, thresh: float = 0.5, sigmoid: bool = True):
    "Compute accuracy when `y_pred` and `y_true` are the same size."
    if sigmoid:
        y_pred = y_pred.sigmoid()
    # return ((y_pred > thresh) == y_true.byte()).float().mean().item()
    return np.mean(((y_pred > thresh) == y_true.byte()).float().cpu().numpy(), axis=1).sum()
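# --- Usage sketch (illustrative, not from the original source) ---
# Note the return value: the per-sample mean accuracy along axis 1 is *summed*
# over the batch (unlike the commented-out variant, which averages everything),
# presumably so the caller can divide by the dataset size afterwards.
import torch

y_pred = torch.tensor([[2.0, -1.0, 0.5],
                       [-2.0, 3.0, -0.5]])  # raw logits, 2 samples x 3 labels
y_true = torch.tensor([[1.0, 0.0, 1.0],
                       [0.0, 1.0, 0.0]])    # multi-hot targets

# After the sigmoid, every thresholded prediction matches its target,
# so each row scores 1.0 and the summed result is 2.0.
print(accuracy_thresh(y_pred, y_true))  # 2.0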
def _vocab_loss(self,
                generate_scores: torch.Tensor,
                copy_scores: torch.Tensor,
                target_tokens: torch.Tensor,
                target_alias_indices: torch.Tensor,
                mask: torch.Tensor,
                alias_indices: torch.Tensor,
                alias_tokens: torch.Tensor,
                mention_mask: torch.Tensor):
    batch_size, sequence_length, vocab_size = generate_scores.shape
    copy_sequence_length = copy_scores.shape[-1]

    # Flat sequences make life **much** easier.
    flattened_targets = target_tokens.view(batch_size * sequence_length, 1)
    flattened_mask = mask.view(-1, 1).byte()

    # In order to obtain proper log probabilities we create a mask to omit padding alias tokens
    # from the calculation.
    alias_mask = alias_indices.view(batch_size, sequence_length, -1).gt(0)
    score_mask = mask.new_ones(batch_size, sequence_length, vocab_size + copy_sequence_length)
    score_mask[:, :, vocab_size:] = alias_mask

    # The log-probability distribution is then given by taking the masked log softmax.
    concatenated_scores = torch.cat((generate_scores, copy_scores), dim=-1)
    log_probs = masked_log_softmax(concatenated_scores, score_mask)

    ### GENERATE LOSS ###
    # The generated token loss is a simple cross-entropy calculation; we can just gather
    # the log probabilities...
    flattened_log_probs = log_probs.view(batch_size * sequence_length, -1)
    generate_log_probs_source_vocab = flattened_log_probs.gather(1, flattened_targets)

    # ...except we need to ignore the contribution of UNK tokens that are copied (only when
    # computing the loss). To do that we create a mask which is 1 only if the token is not a
    # copied UNK (or padding).
    unks = target_tokens.eq(self._unk_index).view(-1, 1)
    copied = target_alias_indices.gt(0).view(-1, 1)
    generate_mask = ~(unks & copied) & flattened_mask

    # Since we are in log-space we apply the mask by addition.
    generate_log_probs_extended_vocab = generate_log_probs_source_vocab + (
        generate_mask.float() + 1e-45).log()

    ### COPY LOSS ###
    copy_log_probs = flattened_log_probs[:, vocab_size:]

    # When computing the loss we need to get the log probability of **only** the copied tokens.
    alias_indices = alias_indices.view(batch_size * sequence_length, -1)
    target_alias_indices = target_alias_indices.view(-1, 1)
    copy_mask = alias_indices.eq(target_alias_indices) & flattened_mask & target_alias_indices.gt(0)
    copy_log_probs = copy_log_probs + (copy_mask.float() + 1e-45).log()

    ### COMBINED LOSS ###
    # The final loss term is computed using the log probs computed w.r.t. the entire vocabulary.
    combined_log_probs_extended_vocab = torch.cat(
        (generate_log_probs_extended_vocab, copy_log_probs), dim=1)
    combined_log_probs_extended_vocab = torch.logsumexp(combined_log_probs_extended_vocab, dim=1)
    vocab_loss = -combined_log_probs_extended_vocab.sum() / (mask.sum() + 1e-13)

    ### PERPLEXITY ###
    # The perplexity terms are computed using the log probs computed w.r.t. the source vocabulary.
    combined_log_probs_source_vocab = torch.cat(
        (generate_log_probs_source_vocab, copy_log_probs), dim=1)
    combined_log_probs_source_vocab = torch.logsumexp(combined_log_probs_source_vocab, dim=1)

    # For UPP we penalize **only** p(UNK); not the copy probabilities!
    penalized_log_probs_source_vocab = (
        generate_log_probs_source_vocab - self._unk_penalty * unks.float())
    penalized_log_probs_source_vocab = torch.cat(
        (penalized_log_probs_source_vocab, copy_log_probs), dim=1)
    penalized_log_probs_source_vocab = torch.logsumexp(penalized_log_probs_source_vocab, dim=1)

    kg_mask = (mention_mask * mask.byte()).view(-1)
    bg_mask = ((1 - mention_mask) * mask.byte()).view(-1)
    mask = (kg_mask | bg_mask)

    self._ppl(-combined_log_probs_source_vocab[mask].sum(), mask.float().sum() + 1e-13)
    self._upp(-penalized_log_probs_source_vocab[mask].sum(), mask.float().sum() + 1e-13)
    if kg_mask.any():
        self._kg_ppl(-combined_log_probs_source_vocab[kg_mask].sum(), kg_mask.float().sum() + 1e-13)
    if bg_mask.any():
        self._bg_ppl(-combined_log_probs_source_vocab[bg_mask].sum(), bg_mask.float().sum() + 1e-13)

    return vocab_loss
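# --- Companion sketch (illustrative, not from the original source) ---
# The `(mask.float() + 1e-45).log()` trick used twice above: adding the log of
# a {0, 1} mask zeroes out entries inside a logsumexp, while the 1e-45 offset
# keeps masked terms finite (log(1e-45) is roughly -103 in float32) rather than
# the -inf that log(0) would produce.
import torch

log_probs = torch.tensor([0.2, 0.3, 0.5]).log()
keep = torch.tensor([1.0, 0.0, 1.0])  # drop the middle entry

masked = log_probs + (keep + 1e-45).log()
print(torch.logsumexp(masked, dim=0).exp())  # ~0.7 == 0.2 + 0.5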
def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h).
        The input for the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where
        b = batch size, h = hidden size. First tensor is the decoder's prev hidden state,
        second tensor is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2),
        where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h,
        with shape (b, src_len, h), where b = batch size, src_len = maximum source length,
        h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks with shape (b, src_len), where
        b = batch size, src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where
        b = batch size, h = hidden size. First tensor is the decoder's new hidden state,
        second tensor is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, with shape (b, h),
        where b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4

    # Step 1: apply the decoder layer.
    dec_state = self.decoder(Ybar_t, dec_state)

    # Step 2
    dec_hidden, dec_cell = dec_state

    # Step 3: (b, src_len, h) @ (b, h, 1) -> (b, src_len, 1)
    e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(dim=2))
    e_t = torch.squeeze(e_t, dim=2)

    ### END YOUR CODE FROM ASSIGNMENT 4

    # Set e_t to -inf where enc_masks has 1
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4

    # Step 1
    alpha_t = nn.Softmax(dim=1)(e_t)

    # Step 2: (b, 1, src_len) @ (b, src_len, 2h) -> (b, 1, 2h)
    a_t = torch.bmm(alpha_t.unsqueeze(dim=1), enc_hiddens)
    a_t = torch.squeeze(a_t, dim=1)

    # Step 3
    U_t = torch.cat((dec_hidden, a_t), dim=1)

    # Step 4: apply the linear layer.
    V_t = self.combined_output_projection(U_t)

    # Step 5
    O_t = self.dropout(torch.tanh(V_t))

    ### END YOUR CODE FROM ASSIGNMENT 4

    combined_output = O_t
    return dec_state, combined_output, e_t
def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h).
        The input for the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where
        b = batch size, h = hidden size. First tensor is the decoder's prev hidden state,
        second tensor is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2),
        where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h,
        with shape (b, src_len, h), where b = batch size, src_len = maximum source length,
        h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks with shape (b, src_len), where
        b = batch size, src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where
        b = batch size, h = hidden size. First tensor is the decoder's new hidden state,
        second tensor is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, with shape (b, h),
        where b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    ### YOUR CODE HERE (~3 Lines)
    ### TODO:
    ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
    ###     2. Split dec_state into its two parts (dec_hidden, dec_cell).
    ###     3. Compute the attention scores e_t, a Tensor of shape (b, src_len).
    ###        Note: b = batch size, src_len = maximum source length, h = hidden size.
    ###
    ### Hints:
    ###     - dec_hidden has shape (b, h) and corresponds to h^dec_t in the PDF (batched).
    ###     - enc_hiddens_proj has shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
    ###     - Use batched matrix multiplication (torch.bmm) to compute e_t.
    ###     - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
    ###     - When using the squeeze() function, make sure to specify the dimension you want to squeeze over.
    ###       Otherwise, you will accidentally remove the batch dimension when batch_size = 1.
    ###
    ### Use the following docs to implement this functionality:
    ###     Batch Multiplication:
    ###         https://pytorch.org/docs/stable/torch.html#torch.bmm
    ###     Tensor Unsqueeze:
    ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
    ###     Tensor Squeeze:
    ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
    ##################################################################################

    ############
    ### Step 1: Feed the input (the concatenation of the word embedding at the current
    ### time-step and the combined output from the previous time-step) into the decoder
    ### LSTMCell to get the new state for the current time-step.
    ############
    # LSTMCell:
    #   Inputs:  input, (h_0, c_0)
    #   Outputs: (h_1, c_1)
    # Ybar_t: concatenated LSTM input for the current time-step, shape (b, e + h)
    # dec_state as input holds the hidden state and the cell state, each of shape (b, h)
    # dec_state as output is likewise a tuple of two (b, h) tensors
    dec_state = self.decoder(Ybar_t, dec_state)

    ############
    ### Step 2: Split dec_state into its two parts (dec_hidden, dec_cell).
    ############
    # dec_hidden, dec_cell: shape (b, h)
    (dec_hidden, dec_cell) = dec_state

    ############
    ### Step 3: Compute the attention score vector for the current time-step.
    ############
    # We multiply the projected encoder hidden states by the decoder hidden state of the
    # current time-step to get the attention scores for that time-step.
    # enc_hiddens_proj: shape (b, src_len, h)
    # dec_hidden: shape (b, h); dec_hidden.unsqueeze(2): shape (b, h, 1)
    # torch.bmm(input, mat2): if input is (b x n x m) and mat2 is (b x m x p),
    # the output is (b x n x p), so the product here has shape (b, src_len, 1).
    # e_t: shape (b, src_len); it holds the attention score of each encoder time-step
    # with respect to the current decoder time-step.
    e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

    ##################################################################################
    ### END YOUR CODE

    # Set e_t to -inf where enc_masks has 1
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

    ### YOUR CODE HERE (~6 Lines)
    ### TODO:
    ###     1. Apply softmax to e_t to yield alpha_t.
    ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
    ###        attention output vector, a_t.
    ###
    ### Hints:
    ###     - alpha_t has shape (b, src_len).
    ###     - enc_hiddens has shape (b, src_len, 2h).
    ###     - a_t should have shape (b, 2h).
    ###     - You will need to do some squeezing and unsqueezing.
    ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
    ###
    ###     3. Concatenate dec_hidden with a_t to compute tensor U_t.
    ###     4. Apply the combined output projection layer to U_t to compute tensor V_t.
    ###     5. Compute tensor O_t by first applying the tanh function and then the dropout layer.
    ###
    ### Use the following docs to implement this functionality:
    ###     Softmax:
    ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
    ###     Batch Multiplication:
    ###         https://pytorch.org/docs/stable/torch.html#torch.bmm
    ###     Tensor View:
    ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
    ###     Tensor Concatenation:
    ###         https://pytorch.org/docs/stable/torch.html#torch.cat
    ###     Tanh:
    ###         https://pytorch.org/docs/stable/torch.html#torch.tanh
    ##################################################################################

    ############
    ### Step 1: Compute the attention distribution alpha_t for the current time-step.
    ############
    # Softmax maps the attention scores to values in [0, 1] that sum to 1.
    # e_t: shape (b, src_len); alpha_t: shape (b, src_len)
    alpha_t = F.softmax(e_t, dim=1)

    ############
    ### Step 2: Compute the attention output a_t for the current time-step.
    ############
    # We multiply the attention distribution by the encoder hidden states to get the
    # attention output for the current decoder time-step.
    # alpha_t.unsqueeze(1): shape (b, 1, src_len); enc_hiddens: shape (b, src_len, 2h)
    # alpha_t.unsqueeze(1).bmm(enc_hiddens): shape (b, 1, 2h); a_t: shape (b, 2h)
    a_t = alpha_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)

    ############
    ### Step 3: Concatenate the attention output a_t with the hidden state of the
    ### current decoder time-step.
    ############
    # U_t combines the current decoder hidden state with the attention over the encoder.
    # dec_hidden: shape (b, h); U_t: shape (b, 3h)
    U_t = torch.cat((a_t, dec_hidden), dim=1)

    ############
    ### Step 4: Pass the concatenated result through a linear layer.
    ############
    V_t = self.combined_output_projection(U_t)

    ############
    ### Step 5: Apply the tanh activation and then dropout to obtain the combined
    ### output vector O_t.
    ############
    O_t = self.dropout(torch.tanh(V_t))

    ##################################################################################
    ### END YOUR CODE

    combined_output = O_t
    return dec_state, combined_output, e_t
def forward(self,
            input_ids: torch.Tensor,
            attention_mask: torch.Tensor,
            token_type_ids: torch.Tensor,
            intent_label_ids: torch.Tensor,
            slot_labels_ids: torch.Tensor,
            pos_label_ids: torch.Tensor = None,
            np_label_ids: torch.Tensor = None,
            vp_label_ids: torch.Tensor = None,
            entity_label_ids: torch.Tensor = None,
            acronym_label_ids: torch.Tensor = None):
    outputs = self.roberta(
        input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
    )  # sequence_output, pooled_output, (hidden_states), (attentions)
    sequence_output = outputs[0]
    pooled_output = outputs[1]  # [CLS]

    if self.args.use_pos:
        # torch.cat([word_emb, pos_emb], dim=2)
        pos_output = self.pos_emb(pos_label_ids)
        sequence_output = torch.cat([sequence_output, pos_output], dim=2)
    if self.args.use_np:
        sequence_output = torch.cat([sequence_output, np_label_ids.unsqueeze(2)], dim=2)
    if self.args.use_vp:
        sequence_output = torch.cat([sequence_output, vp_label_ids.unsqueeze(2)], dim=2)
    if self.args.use_entity:
        sequence_output = torch.cat([sequence_output, entity_label_ids.unsqueeze(2)], dim=2)
    if self.args.use_acronym:
        sequence_output = torch.cat([sequence_output, acronym_label_ids.unsqueeze(2)], dim=2)

    if (self.args.use_pos or self.args.use_np or self.args.use_vp
            or self.args.use_entity or self.args.use_acronym):
        pooled_output = self.custom_pooler(sequence_output)

    intent_logits = self.intent_classifier(pooled_output)
    slot_logits = self.slot_classifier(sequence_output)

    total_loss = 0
    # 1. Intent Softmax
    if intent_label_ids is not None:
        if self.num_intent_labels == 1:
            intent_loss_fct = nn.MSELoss()
            intent_loss = intent_loss_fct(intent_logits.view(-1), intent_label_ids.view(-1))
        else:
            intent_loss_fct = nn.CrossEntropyLoss()
            intent_loss = intent_loss_fct(
                intent_logits.view(-1, self.num_intent_labels),
                intent_label_ids.view(-1),
            )
        total_loss += intent_loss

    # 2. Slot Softmax
    if slot_labels_ids is not None:
        if self.args.use_crf:
            slot_loss = self.crf(
                slot_logits,
                slot_labels_ids,
                mask=attention_mask.byte(),
                reduction="mean",
            )
            slot_loss = -1 * slot_loss  # negative log-likelihood
        else:
            slot_loss_fct = nn.CrossEntropyLoss(ignore_index=self.args.ignore_index)
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = slot_logits.view(-1, self.num_slot_labels)[active_loss]
                active_labels = slot_labels_ids.view(-1)[active_loss]
                slot_loss = slot_loss_fct(active_logits, active_labels)
            else:
                slot_loss = slot_loss_fct(
                    slot_logits.view(-1, self.num_slot_labels),
                    slot_labels_ids.view(-1),
                )
        total_loss += self.args.slot_loss_coef * slot_loss

    outputs = ((intent_logits, slot_logits),) + outputs[2:]  # add hidden states and attentions if they are here
    outputs = (total_loss,) + outputs

    return outputs  # (loss), logits, (hidden_states), (attentions); logits is a tuple of intent and slot logits
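# --- Companion sketch (illustrative, not from the original source) ---
# The `active_loss` branch above in isolation: cross-entropy is computed only
# over non-padding tokens selected by the attention mask. Sizes are made up.
import torch
import torch.nn as nn

b, seq_len, num_slot_labels = 2, 4, 5
slot_logits = torch.randn(b, seq_len, num_slot_labels)
slot_labels_ids = torch.randint(0, num_slot_labels, (b, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])  # 0 marks padding

active_loss = attention_mask.view(-1) == 1
active_logits = slot_logits.view(-1, num_slot_labels)[active_loss]
active_labels = slot_labels_ids.view(-1)[active_loss]
slot_loss = nn.CrossEntropyLoss()(active_logits, active_labels)
print(active_logits.shape)  # torch.Size([5, 5]): only the 5 real tokens survive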