Example #1
    def dot_prod_attention(self, h_t: torch.Tensor, src_encoding: torch.Tensor, src_encoding_att_linear: torch.Tensor,
                           mask: torch.Tensor=None) -> Tuple[torch.Tensor, torch.Tensor]:
        # (batch_size, src_sent_len)
        att_weight = torch.bmm(src_encoding_att_linear, h_t.unsqueeze(2)).squeeze(2)

        if mask is not None:
            att_weight.data.masked_fill_(mask.byte(), -float('inf'))

        softmaxed_att_weight = F.softmax(att_weight, dim=-1)

        att_view = (att_weight.size(0), 1, att_weight.size(1))
        # (batch_size, hidden_size)
        ctx_vec = torch.bmm(softmaxed_att_weight.view(*att_view), src_encoding).squeeze(1)

        return ctx_vec, softmaxed_att_weight
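A minimal standalone sketch (not part of the example above) of the same bmm-based dot-product attention on toy tensors, to make the shape flow explicit; the sizes below are chosen purely for illustration:

import torch
import torch.nn.functional as F

batch_size, src_len, hidden = 2, 5, 4
h_t = torch.randn(batch_size, hidden)                    # decoder state, (b, h)
src_encoding = torch.randn(batch_size, src_len, hidden)  # encoder states, (b, src_len, h)
src_encoding_att_linear = torch.randn(batch_size, src_len, hidden)  # projected encoder states

# (b, src_len, h) x (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
att_weight = torch.bmm(src_encoding_att_linear, h_t.unsqueeze(2)).squeeze(2)
softmaxed_att_weight = F.softmax(att_weight, dim=-1)
# (b, 1, src_len) x (b, src_len, h) -> (b, 1, h) -> (b, h)
ctx_vec = torch.bmm(softmaxed_att_weight.unsqueeze(1), src_encoding).squeeze(1)
assert ctx_vec.shape == (batch_size, hidden)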
Example #2
    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t.
        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor Unsqueeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
        ###     Tensor Squeeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        dec_state = self.decoder(Ybar_t, dec_state)
        (dec_hidden, dec_cell) = dec_state
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)
        ### END YOUR CODE

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

        ### YOUR CODE HERE (~6 Lines)
        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.
        ###     Hints:
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###
        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        ###
        ### Use the following docs to implement this functionality:
        ###     Softmax:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor View:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tanh:
        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh
        alpha_t = F.softmax(e_t, dim=1)
        a_t = alpha_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)
        U_t = torch.cat((a_t, dec_hidden), dim=1)
        V_t = self.combined_output_projection(U_t)
        O_t = self.dropout(torch.tanh(V_t))
        ### END YOUR CODE

        combined_output = O_t
        return dec_state, combined_output, e_t
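A small standalone illustration (toy values, not from the assignment code) of the masking step used above: positions where enc_masks is 1 are set to -inf before the softmax, so they receive zero attention weight.

import torch
import torch.nn.functional as F

e_t = torch.tensor([[1.0, 2.0, 3.0]])
enc_masks = torch.tensor([[0, 0, 1]])  # 1 marks a padding position
e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))
alpha_t = F.softmax(e_t, dim=1)
print(alpha_t)  # the masked (last) position gets probability 0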
Example #3
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from PIL import Image, ImageColor


def draw_segmentation_masks(
    image: torch.Tensor,
    masks: torch.Tensor,
    alpha: float = 0.2,
    colors: Optional[List[Union[str, Tuple[int, int, int]]]] = None,
) -> torch.Tensor:

    """
    Draws segmentation masks on the given RGB image.
    The values of the input image should be uint8 between 0 and 255.

    Args:
        image (Tensor): Tensor of shape (3 x H x W) and dtype uint8.
        masks (Tensor): Tensor of shape (num_masks, H, W). Each mask contains the probability of a predicted class.
        alpha (float): Float number between 0 and 1 denoting the transparency of the masks.
        colors (List[Union[str, Tuple[int, int, int]]]): List containing the colors of masks. The colors can
            be represented as `str` or `Tuple[int, int, int]`.

    Returns:
        img (Tensor[C, H, W]): Image Tensor of dtype uint8 with segmentation masks plotted.

    Example:
        See this notebook
        `attached <https://github.com/pytorch/vision/blob/master/examples/python/visualization_utils.ipynb>`_
    """

    if not isinstance(image, torch.Tensor):
        raise TypeError(f"Tensor expected, got {type(image)}")
    elif image.dtype != torch.uint8:
        raise ValueError(f"Tensor uint8 expected, got {image.dtype}")
    elif image.dim() != 3:
        raise ValueError("Pass individual images, not batches")
    elif image.size()[0] != 3:
        raise ValueError("Pass an RGB image. Other Image formats are not supported")

    num_masks = masks.size()[0]
    masks = masks.argmax(0)

    if colors is None:
        palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
        colors_t = torch.as_tensor([i for i in range(num_masks)])[:, None] * palette
        color_arr = (colors_t % 255).numpy().astype("uint8")
    else:
        color_list = []
        for color in colors:
            if isinstance(color, str):
                # This will automatically raise a ValueError if the color string cannot be parsed.
                fill_color = ImageColor.getrgb(color)
                color_list.append(fill_color)
            elif isinstance(color, tuple):
                color_list.append(color)

        color_arr = np.array(color_list).astype("uint8")

    _, h, w = image.size()
    img_to_draw = Image.fromarray(masks.byte().cpu().numpy()).resize((w, h))
    img_to_draw.putpalette(color_arr)

    img_to_draw = torch.from_numpy(np.array(img_to_draw.convert('RGB')))
    img_to_draw = img_to_draw.permute((2, 0, 1))

    return (image.float() * alpha + img_to_draw.float() * (1.0 - alpha)).to(dtype=torch.uint8)
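A hypothetical usage sketch for the function above; the image, masks, and color names are made up for illustration, and any 3 x H x W uint8 image with per-class mask scores would do:

import torch

image = torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8)  # random RGB image
masks = torch.rand(2, 64, 64)                                  # two soft masks
result = draw_segmentation_masks(image, masks, alpha=0.4, colors=["blue", "yellow"])
print(result.shape, result.dtype)  # torch.Size([3, 64, 64]) torch.uint8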
Example #4
    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        # new dec_state
        dec_state = self.decoder(Ybar_t, dec_state)
        (dec_hidden, dec_cell) = dec_state

        dec_hidden = torch.unsqueeze(dec_hidden, dim=2)
        # Attention scores
        e_t = torch.bmm(enc_hiddens_proj, dec_hidden)
        e_t = torch.squeeze(e_t, dim=2)

        # Set e_t to -inf where enc_masks has 1
        # http://juditacs.github.io/2018/12/27/masked-attention.html
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

        # Attention distribution
        alpha_t = F.softmax(e_t, dim=1)

        alpha_t = torch.unsqueeze(alpha_t, dim=1)
        a_t = torch.bmm(alpha_t, enc_hiddens)
        a_t = torch.squeeze(a_t, dim=1)

        dec_hidden = torch.squeeze(dec_hidden, dim=2)
        U_t = torch.cat((dec_hidden, a_t), dim=1)

        V_t = self.combined_output_projection(U_t)

        O_t = torch.tanh(V_t)
        O_t = self.dropout(O_t)

        combined_output = O_t
        return dec_state, combined_output, e_t
Example #5
import numpy as np
from torch import Tensor


def accuracy_thresh(y_pred:Tensor, y_true:Tensor, thresh:float=0.5, sigmoid:bool=True):
    "Compute accuracy when `y_pred` and `y_true` are the same size."
    if sigmoid: y_pred = y_pred.sigmoid()
#     return ((y_pred>thresh)==y_true.byte()).float().mean().item()
    return np.mean(((y_pred>thresh)==y_true.byte()).float().cpu().numpy(), axis=1).sum()
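An illustrative call with toy multi-label scores (shapes assumed to be (batch, n_labels)); the values are arbitrary:

import torch

y_pred = torch.tensor([[2.0, -1.0], [0.5, 0.3]])  # raw scores; sigmoid is applied inside
y_true = torch.tensor([[1.0, 0.0], [1.0, 1.0]])
print(accuracy_thresh(y_pred, y_true))  # per-sample accuracy summed over the batch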
Example #6
    def _vocab_loss(self, generate_scores: torch.Tensor,
                    copy_scores: torch.Tensor, target_tokens: torch.Tensor,
                    target_alias_indices: torch.Tensor, mask: torch.Tensor,
                    alias_indices: torch.Tensor, alias_tokens: torch.Tensor,
                    mention_mask: torch.Tensor):
        batch_size, sequence_length, vocab_size = generate_scores.shape
        copy_sequence_length = copy_scores.shape[-1]

        # Flat sequences make life **much** easier.
        flattened_targets = target_tokens.view(batch_size * sequence_length, 1)
        flattened_mask = mask.view(-1, 1).byte()

        # In order to obtain proper log probabilities we create a mask to omit padding alias tokens
        # from the calculation.
        alias_mask = alias_indices.view(batch_size, sequence_length, -1).gt(0)
        score_mask = mask.new_ones(batch_size, sequence_length,
                                   vocab_size + copy_sequence_length)
        score_mask[:, :, vocab_size:] = alias_mask

        # The log-probability distribution is then given by taking the masked log softmax.
        concatenated_scores = torch.cat((generate_scores, copy_scores), dim=-1)
        log_probs = masked_log_softmax(concatenated_scores, score_mask)

        # GENERATE LOSS ###
        # The generated token loss is a simple cross-entropy calculation, we can just gather
        # the log probabilities...
        flattened_log_probs = log_probs.view(batch_size * sequence_length, -1)
        generate_log_probs_source_vocab = flattened_log_probs.gather(
            1, flattened_targets)
        # ...except we need to ignore the contribution of UNK tokens that are copied (only when
        # computing the loss). To do that we create a mask which is 1 only if the token is not a
        # copied UNK (or padding).
        unks = target_tokens.eq(self._unk_index).view(-1, 1)
        copied = target_alias_indices.gt(0).view(-1, 1)
        generate_mask = ~(unks & copied) & flattened_mask
        # Since we are in log-space we apply the mask by addition.
        generate_log_probs_extended_vocab = generate_log_probs_source_vocab + (
            generate_mask.float() + 1e-45).log()

        # COPY LOSS ###
        copy_log_probs = flattened_log_probs[:, vocab_size:]
        # When computing the loss we need to get the log probability of **only** the copied tokens.
        alias_indices = alias_indices.view(batch_size * sequence_length, -1)
        target_alias_indices = target_alias_indices.view(-1, 1)
        copy_mask = alias_indices.eq(
            target_alias_indices) & flattened_mask & target_alias_indices.gt(0)
        copy_log_probs = copy_log_probs + (copy_mask.float() + 1e-45).log()

        # COMBINED LOSS ###
        # The final loss term is computed using our log probs computed w.r.t. the entire
        # vocabulary.
        combined_log_probs_extended_vocab = torch.cat(
            (generate_log_probs_extended_vocab, copy_log_probs), dim=1)
        combined_log_probs_extended_vocab = torch.logsumexp(
            combined_log_probs_extended_vocab, dim=1)
        vocab_loss = -combined_log_probs_extended_vocab.sum() / (mask.sum() +
                                                                 1e-13)

        # PERPLEXITY ###
        # Our perplexity terms are computed using the log probs computed w.r.t. the source
        # vocabulary.
        combined_log_probs_source_vocab = torch.cat(
            (generate_log_probs_source_vocab, copy_log_probs), dim=1)
        combined_log_probs_source_vocab = torch.logsumexp(
            combined_log_probs_source_vocab, dim=1)

        # For UPP we penalize **only** p(UNK); not the copy probabilities!
        penalized_log_probs_source_vocab = generate_log_probs_source_vocab - self._unk_penalty * unks.float(
        )
        penalized_log_probs_source_vocab = torch.cat(
            (penalized_log_probs_source_vocab, copy_log_probs), dim=1)
        penalized_log_probs_source_vocab = torch.logsumexp(
            penalized_log_probs_source_vocab, dim=1)

        kg_mask = (mention_mask * mask.byte()).view(-1)
        bg_mask = ((1 - mention_mask) * mask.byte()).view(-1)
        mask = (kg_mask | bg_mask)

        self._ppl(-combined_log_probs_source_vocab[mask].sum(),
                  mask.float().sum() + 1e-13)
        self._upp(-penalized_log_probs_source_vocab[mask].sum(),
                  mask.float().sum() + 1e-13)
        if kg_mask.any():
            self._kg_ppl(-combined_log_probs_source_vocab[kg_mask].sum(),
                         kg_mask.float().sum() + 1e-13)
        if bg_mask.any():
            self._bg_ppl(-combined_log_probs_source_vocab[bg_mask].sum(),
                         bg_mask.float().sum() + 1e-13)

        return vocab_loss
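A small standalone illustration of the log-space masking trick used above: adding log(mask + 1e-45) leaves kept entries unchanged and pushes masked entries to a very large negative value, so they effectively vanish under logsumexp. Toy values only:

import torch

log_probs = torch.log(torch.tensor([[0.6, 0.3, 0.1]]))
mask = torch.tensor([[1.0, 0.0, 1.0]])
masked = log_probs + (mask + 1e-45).log()
print(masked)                       # middle entry is roughly -104, i.e. effectively -inf
print(torch.logsumexp(masked, 1))   # approximately log(0.6 + 0.1)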
Example #7
    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.
        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length.
        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### COPY OVER YOUR CODE FROM ASSIGNMENT 4

        # step 1
        # applying decoder layer
        dec_state = self.decoder(Ybar_t, dec_state)

        # step 2
        dec_hidden, dec_cell = dec_state

        # step 3
        # (b, src_len, h) * (b, h, 1) -> (b, src_len, 1)
        e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(dim=2))
        e_t = torch.squeeze(e_t, dim=2)

        ### END YOUR CODE FROM ASSIGNMENT 4

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

        ### COPY OVER YOUR CODE FROM ASSIGNMENT 4

        # step 1
        alpha_t = nn.Softmax(dim=1)(e_t)

        # step 2
        # (b, 1, src_len) * (b, src_len, 2h) -> (b, 1, 2h)
        a_t = torch.bmm(alpha_t.unsqueeze(dim=1), enc_hiddens)
        a_t = torch.squeeze(a_t, dim=1)

        # step 3
        U_t = torch.cat((dec_hidden, a_t), dim=1)

        # step 4
        # applying linear layer
        V_t = self.combined_output_projection(U_t)

        # step 5
        O_t = self.dropout(torch.tanh(V_t))

        ### END YOUR CODE FROM ASSIGNMENT 4

        combined_output = O_t
        return dec_state, combined_output, e_t
Example #8
    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t.
        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor Unsqueeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
        ###     Tensor Squeeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        ###################################################################################################################################################################
        ############
        ### Step 1: Feed the input (the concatenation of the word embedding at the current time-step and the output of the previous time-step) into the decoder LSTMCell to get the new state for the current time-step ###
        ############

        # LSTMCell
        # Inputs: input, (h_0, c_0)
        # Outputs: (h_1, c_1)

        # Ybar_t: concatenated LSTM input of current time-step, shape (b, e+h)
        # dec_state as input contains both hidden state and cell state, hidden state and cell state both are shape (b, h)
        # dec_state as output: tuple (dec_hidden, dec_cell), each of shape (b, h)

        dec_state = self.decoder(Ybar_t, dec_state)

        ############
        ### Step 2: Split dec_state into its two parts (dec_hidden, dec_cell) ###
        ############

        # dec_hidden, dec_cell: shape (b, h)
        (dec_hidden, dec_cell) = dec_state

        ############
        ### Step 3: Compute attention score vector for the current time-step ###
        ############

        # We multiply the projected hidden states of the entire encoder by the decoder hidden state at the current time-step to get the attention score vector for the current time-step

        # enc_hiddens_proj: shape (b, src_len, h)
        # dec_hidden: shape (b, h)
        # torch.unsqueeze(dec_hidden, 2): shape (b, h, 1)
        # torch.bmm(input, mat2, out=None) → Tensor
        # If input is a (b×n×m) tensor, mat2 is a (b×m×p) tensor, out will be a (b×n×p) tensor.
        # enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)): shape (b, src_len, 1)
        # e_t: shape (b, src_len)
        # e_t contains the attention score of each encoder time-step with respect to the current decoder time-step
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

        ###################################################################################################################################################################

        ### END YOUR CODE

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

        ### YOUR CODE HERE (~6 Lines)
        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.
        ###     Hints:
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###
        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        ###
        ### Use the following docs to implement this functionality:
        ###     Softmax:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor View:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tanh:
        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh
        ###################################################################################################################################################################
        ############
        ### Step 1: Compute attention distribution alpha_t for the current time-step ###
        ############

        # Softmax converts the attention scores into values in [0, 1] that add up to 1

        # e_t: shape (b, src_len)
        # alpha_t: shape (b, src_len)
        alpha_t = F.softmax(e_t, dim=1)

        ############
        ### Step 2: Compute attention output a_t for the current time-step ###
        ############

        # We multiply the attention distribution by the hidden states of the entire encoder to get the attention output for the current decoder time-step

        # torch.bmm(input, mat2, out=None) → Tensor
        # If input is a (b×n×m) tensor, mat2 is a (b×m×p) tensor, out will be a (b×n×p) tensor.
        # alpha_t: shape (b, src_len)
        # alpha_t.unsqueeze(1): shape (b, 1, src_len)
        # enc_hiddens: shape (b, src_len, 2h)
        # alpha_t.unsqueeze(1).bmm(enc_hiddens): shape (b, 1, 2h)
        # a_t: shape (b, 2h)
        a_t = alpha_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)

        ############
        ### Step 3: Concatenate attention output a_t with the hidden state of current Decoder time-step ###
        ############

        # U_t contains information from both the hidden state of the current decoder time-step and the attention over the encoder states

        # dec_hidden: shape (b, h)
        # U_t: shape (b, 3h)
        U_t = torch.cat((a_t, dec_hidden), dim=1)

        ############
        ### Step 4: We pass the concatenated result through a linear layer ###
        ############

        V_t = self.combined_output_projection(U_t)

        ############
        ### Step 5: We apply the tanh activation to the linear layer output and then dropout to obtain the combined output vector O_t ###
        ############

        O_t = self.dropout(torch.tanh(V_t))

        ###################################################################################################################################################################

        ### END YOUR CODE

        combined_output = O_t
        return dec_state, combined_output, e_t
Example #9
    def forward(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor,
                token_type_ids: torch.Tensor,
                intent_label_ids: torch.Tensor,
                slot_labels_ids: torch.Tensor,
                pos_label_ids: torch.Tensor = None,
                np_label_ids: torch.Tensor = None,
                vp_label_ids: torch.Tensor = None,
                entity_label_ids: torch.Tensor = None,
                acronym_label_ids: torch.Tensor = None):
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        sequence_output = outputs[0]
        pooled_output = outputs[1]  # [CLS]

        if self.args.use_pos:
            # torch.cat([word_emb,pos_emb], dim=2)
            pos_output = self.pos_emb(pos_label_ids)
            sequence_output = torch.cat([sequence_output, pos_output], dim=2)
        if self.args.use_np:
            sequence_output = torch.cat(
                [sequence_output, np_label_ids.unsqueeze(2)], dim=2)
        if self.args.use_vp:
            sequence_output = torch.cat(
                [sequence_output, vp_label_ids.unsqueeze(2)], dim=2)
        if self.args.use_entity:
            sequence_output = torch.cat(
                [sequence_output,
                 entity_label_ids.unsqueeze(2)], dim=2)
        if self.args.use_acronym:
            sequence_output = torch.cat(
                [sequence_output,
                 acronym_label_ids.unsqueeze(2)], dim=2)

        if (self.args.use_pos or self.args.use_np or self.args.use_vp
                or self.args.use_entity or self.args.use_acronym):
            pooled_output = self.custom_pooler(sequence_output)

        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)

        total_loss = 0
        # 1. Intent Softmax
        if intent_label_ids is not None:
            if self.num_intent_labels == 1:
                intent_loss_fct = nn.MSELoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1),
                                              intent_label_ids.view(-1))
            else:
                intent_loss_fct = nn.CrossEntropyLoss()
                intent_loss = intent_loss_fct(
                    intent_logits.view(-1, self.num_intent_labels),
                    intent_label_ids.view(-1),
                )
            total_loss += intent_loss

        # 2. Slot Softmax
        if slot_labels_ids is not None:
            if self.args.use_crf:
                slot_loss = self.crf(
                    slot_logits,
                    slot_labels_ids,
                    mask=attention_mask.byte(),
                    reduction="mean",
                )
                slot_loss = -1 * slot_loss  # negative log-likelihood
            else:
                slot_loss_fct = nn.CrossEntropyLoss(
                    ignore_index=self.args.ignore_index)
                # Only keep active parts of the loss
                if attention_mask is not None:
                    active_loss = attention_mask.view(-1) == 1
                    active_logits = slot_logits.view(
                        -1, self.num_slot_labels)[active_loss]
                    active_labels = slot_labels_ids.view(-1)[active_loss]
                    slot_loss = slot_loss_fct(active_logits, active_labels)
                else:
                    slot_loss = slot_loss_fct(
                        slot_logits.view(-1, self.num_slot_labels),
                        slot_labels_ids.view(-1),
                    )
            total_loss += self.args.slot_loss_coef * slot_loss

        outputs = ((intent_logits, slot_logits), ) + outputs[
            2:]  # add hidden states and attention if they are here

        outputs = (total_loss, ) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions) # Logits is a tuple of intent and slot logits
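A standalone sketch (toy shapes, illustrative only) of the "active parts" masking pattern used for the slot loss above: flatten the logits and labels, and keep only the positions where the attention mask is 1 before computing the cross entropy.

import torch
import torch.nn as nn

batch, seq_len, num_slot_labels = 2, 4, 3
slot_logits = torch.randn(batch, seq_len, num_slot_labels)
slot_labels_ids = torch.randint(0, num_slot_labels, (batch, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

loss_fct = nn.CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1
active_logits = slot_logits.view(-1, num_slot_labels)[active_loss]
active_labels = slot_labels_ids.view(-1)[active_loss]
slot_loss = loss_fct(active_logits, active_labels)
print(slot_loss)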