Example #1
    # Assumed imports for this snippet: from typing import Tuple; import torch;
    # from allennlp.nn import util; from allennlp.nn.util import masked_softmax
    def _compute_attention(
        self,
        decoder_hidden_state: torch.FloatTensor = None,
        encoder_outputs: torch.FloatTensor = None,
        encoder_outputs_mask: torch.LongTensor = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply attention over encoder outputs and decoder state.
        Parameters
        ----------
        decoder_hidden_state : ``torch.FloatTensor``
            A tensor of shape ``(batch_size, decoder_output_dim)`` containing the decoder hidden state from
            the last time step, used as the 'query' in the attention computation.
        encoder_outputs : ``torch.FloatTensor``
            A tensor of shape ``(batch_size, max_input_sequence_length, encoder_output_dim)`` containing all the
            encoder hidden states of the source tokens, i.e., the 'keys' in the attention computation.
        encoder_outputs_mask : ``torch.LongTensor``
            A tensor of shape ``(batch_size, max_input_sequence_length)`` containing the mask of the encoded input.
            We must avoid computing attention scores for the padded positions of the source, since not all
            input sentences have the same length.

        Returns
        -------
        (torch.Tensor, torch.Tensor)
            A tensor of shape ``(batch_size, encoder_output_dim)`` containing the attended encoder outputs
            (a.k.a. the context vector), i.e., the attention scores ``applied`` to the encoder hidden states,
            and a tensor of shape ``(batch_size, max_input_sequence_length)`` containing the attention
            probabilities over the source positions.

        Notes
        -----
            Don't forget to apply the final softmax over the **masked** encoder outputs!
        """

        # Ensure the mask is also a FloatTensor; otherwise the multiplication
        # inside the attention computation will complain.
        # shape: (batch_size, max_input_sequence_length)
        encoder_outputs_mask = encoder_outputs_mask.float()

        # Dot-product attention: a batched dot product between the decoder
        # hidden state (the 'query') and every encoder output (the 'keys').
        #   decoder_hidden_state: (batch_size, decoder_output_dim)          e.g. (1, 400)
        #   encoder_outputs:      (batch_size, max_seq_len, output_dim)     e.g. (1, 14, 400)
        #   encoder_outputs_mask: (batch_size, max_seq_len)                 e.g. (1, 14)
        # shape: (batch_size, max_input_sequence_length)
        attention_weights = encoder_outputs.bmm(
            decoder_hidden_state.unsqueeze(-1)).squeeze(-1)

        # Normalize with a softmax over the *masked* positions only.
        # shape: (batch_size, max_input_sequence_length), e.g. (1, 14)
        attention_probs = masked_softmax(attention_weights,
                                         encoder_outputs_mask)

        # Probability-weighted sum of the encoder outputs (the context vector).
        # shape: (batch_size, encoder_output_dim)
        context_vector = util.weighted_sum(encoder_outputs, attention_probs)

        return context_vector, attention_probs
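
The method above is cut out of its (AllenNLP-based) class, so below is a minimal standalone sketch of the same dot-product attention in plain PyTorch. The shapes (batch 1, 14 source tokens, 400-dim states) are the hypothetical ones from the comments above; masked_fill plus torch.softmax stands in for AllenNLP's masked_softmax, and a second bmm stands in for util.weighted_sum.

import torch

batch_size, seq_len, hidden = 1, 14, 400
decoder_hidden_state = torch.randn(batch_size, hidden)
encoder_outputs = torch.randn(batch_size, seq_len, hidden)
# Hypothetical mask: the first 10 positions are real tokens, the last 4 are padding.
encoder_outputs_mask = torch.cat(
    [torch.ones(batch_size, 10), torch.zeros(batch_size, 4)], dim=-1)

# Raw scores: batched dot product between the query and each key.
# shape: (batch_size, seq_len)
attention_weights = encoder_outputs.bmm(
    decoder_hidden_state.unsqueeze(-1)).squeeze(-1)

# Masked softmax: push padded positions to -inf before normalizing,
# so they receive exactly zero attention probability.
attention_weights = attention_weights.masked_fill(
    encoder_outputs_mask == 0, float("-inf"))
attention_probs = torch.softmax(attention_weights, dim=-1)

# Context vector: probability-weighted sum of the encoder outputs.
# shape: (batch_size, hidden)
context_vector = attention_probs.unsqueeze(1).bmm(encoder_outputs).squeeze(1)

print(context_vector.shape)   # torch.Size([1, 400])
print(attention_probs.shape)  # torch.Size([1, 14])

Note that plain dot-product attention only works when decoder_output_dim equals encoder_output_dim (both 400 here); otherwise a bilinear or additive scoring function is needed.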