Python masked_softmax Examples, allennlp.nn.util.masked_softmax Python Examples

Example #1

0

Show file

File: util_test.py Project: ziaridoy20/allennlp

    def test_masked_softmax_no_mask(self):
        """With ``mask=None``, ``masked_softmax`` must match a plain softmax."""

        def softmax_without_mask(rows):
            # Wrap the nested list in a FloatTensor, apply the unmasked softmax
            # and hand the result back as a numpy array for comparison.
            logits = torch.FloatTensor(rows)
            return util.masked_softmax(logits, None).data.numpy()

        # General case, single row.
        single_row = softmax_without_mask([[1.0, 2.0, 3.0]])
        assert_array_almost_equal(
            single_row, numpy.array([[0.090031, 0.244728, 0.665241]]))
        # The resulting distribution has to sum to one.
        assert_almost_equal(1.0, numpy.sum(single_row), decimal=6)

        peaked_row = softmax_without_mask([[1.0, 2.0, 5.0]])
        assert_array_almost_equal(
            peaked_row, numpy.array([[0.017148, 0.046613, 0.93624]]))

        # All-zero logits give a uniform distribution.
        zero_row = softmax_without_mask([[0.0, 0.0, 0.0]])
        assert_array_almost_equal(
            zero_row, numpy.array([[0.33333334, 0.33333334, 0.33333334]]))

        # General batched case: each row is normalised independently.
        batched = softmax_without_mask([[1.0, 2.0, 5.0], [1.0, 2.0, 3.0]])
        assert_array_almost_equal(
            batched,
            numpy.array([[0.01714783, 0.04661262, 0.93623955],
                         [0.09003057, 0.24472847, 0.66524096]]))

        # Batched case where one of the rows is all zeros.
        batched_with_zeros = softmax_without_mask([[1.0, 2.0, 5.0],
                                                   [0.0, 0.0, 0.0]])
        assert_array_almost_equal(
            batched_with_zeros,
            numpy.array([[0.01714783, 0.04661262, 0.93623955],
                         [0.33333334, 0.33333334, 0.33333334]]))

Example #2

0

Show file

File: attention.py Project: apmoore1/allennlp

 def forward(self,  # pylint: disable=arguments-differ
             vector: torch.Tensor,
             matrix: torch.Tensor,
             matrix_mask: torch.Tensor = None) -> torch.Tensor:
     """
     Score ``vector`` against ``matrix`` using ``self._forward_internal``.

     When ``self._normalize`` is truthy, the scores are passed through
     ``masked_softmax`` together with ``matrix_mask``; otherwise the raw
     scores are returned unchanged.
     """
     scores = self._forward_internal(vector, matrix)
     if not self._normalize:
         return scores
     return masked_softmax(scores, matrix_mask)

Example #3

0

Show file

File: attention.py Project: Jordan-Sauchuk/allennlp

 def forward(self,  # pylint: disable=arguments-differ
             vector: torch.Tensor,
             matrix: torch.Tensor,
             matrix_mask: torch.Tensor = None) -> torch.Tensor:
     """
     Compare ``vector`` with every row of ``matrix``.

     ``vector`` is expanded along a new axis 1 so that it has one copy per
     entry of ``matrix``'s axis 1, then both are handed to
     ``self._similarity_function``.  The similarities are normalised with
     ``masked_softmax`` (using ``matrix_mask``) only when
     ``self._normalize`` is truthy.
     """
     batch_size = vector.size()[0]
     num_rows = matrix.size()[1]
     vector_dim = vector.size()[1]
     tiled_vector = vector.unsqueeze(1).expand(batch_size, num_rows, vector_dim)

     similarities = self._similarity_function(tiled_vector, matrix)
     if not self._normalize:
         return similarities
     return masked_softmax(similarities, matrix_mask)

Example #4

0

Show file

    def forward(self, tokens: torch.Tensor, mask: torch.Tensor):  # pylint: disable=arguments-differ
        """
        Pool a three-dimensional ``tokens`` tensor over its second axis using
        learned attention weights.

        ``mask`` is mandatory and is forwarded to ``masked_softmax``.
        """
        assert mask is not None
        batch_size, sequence_length, embedding_dim = tokens.size()

        # Collapse batch and sequence so the scoring layers see one row per token.
        flat_tokens = tokens.view(batch_size * sequence_length, embedding_dim)
        hidden = torch.tanh(self._mlp(flat_tokens))
        scores = self._context_dot_product(hidden)
        # batch_size x seq_len
        scores = scores.view(batch_size, -1)

        weights = masked_softmax(scores, mask)
        # Repeat each token's weight across the embedding axis.
        tiled_weights = weights.unsqueeze(2).expand(batch_size,
                                                    sequence_length,
                                                    embedding_dim)

        return torch.sum(tokens * tiled_weights, 1)

Example #5

0

Show file

    def multihead_attention(self, memory, memory_mask):
        """
        Perform multi-head attention from 'Attention is All You Need'.
        Implementation of the attention mechanism from
        https://arxiv.org/abs/1706.03762.
        Args:
          memory: Memory tensor to perform attention on.
          memory_mask: Mask handed to ``util.masked_softmax`` together with
            the [B, H, N, N] attention logits.
        Returns:
          new_memory: New memory tensor.  When
            ``self.save_step_wise_attentions`` is truthy the return value is
            the tuple ``(weights, new_memory)`` instead.
        """

        # First, a simple linear projection is used to construct queries
        qkv = self.qkv_projector(memory)
        # apply layernorm for every dim except the batch dim
        qkv = self.qkv_layernorm(qkv)

        # mem_slots needs to be dynamically computed since mem_slots got concatenated with inputs
        # example: self.mem_slots=10 and seq_length is 3, and then mem_slots is 10 + 1 = 11 for each 3 step forward pass
        # this is the same as self.mem_slots_plus_input, but defined to keep the sonnet implementation code style
        mem_slots = memory.shape[1]  # denoted as N

        # split the qkv to multiple heads H
        # [B, N, F] => [B, N, H, F/H]
        qkv_reshape = qkv.view(qkv.shape[0], mem_slots, self.num_heads,
                               self.qkv_size)

        # [B, N, H, F/H] => [B, H, N, F/H]
        qkv_transpose = qkv_reshape.permute(0, 2, 1, 3)

        # [B, H, N, key_size], [B, H, N, key_size], [B, H, N, value_size]
        q, k, v = torch.split(qkv_transpose,
                              [self.key_size, self.key_size, self.value_size],
                              -1)

        # scale q with d_k, the dimensionality of the key vectors.
        # This is done out of place on purpose: ``q`` is a view produced by
        # ``torch.split``, and modifying such a view in place is rejected by
        # autograd and would also write through into ``qkv``.
        q = q * (self.key_size**-0.5)

        # make it [B, H, N, N]
        dot_product = torch.matmul(q, k.permute(0, 1, 3, 2))
        weights = util.masked_softmax(dot_product, memory_mask, dim=-1)
        # output is [B, H, N, V]
        output = torch.matmul(weights, v)

        # [B, H, N, V] => [B, N, H, V] => [B, N, H*V]
        output_transpose = output.permute(0, 2, 1, 3).contiguous()
        new_memory = output_transpose.view(
            (output_transpose.shape[0], output_transpose.shape[1], -1))
        if self.save_step_wise_attentions:
            return weights, new_memory
        return new_memory

Example #6

0

Show file

    def forward(
            self,  # pylint: disable=arguments-differ
            matrix: torch.Tensor,
            matrix_mask: torch.Tensor = None) -> torch.Tensor:
        """
        Attend over ``matrix`` with a query derived from ``matrix`` itself.

        ``self.query_vector`` and ``self.matrix`` each receive
        ``(matrix, matrix_mask)``; their outputs are scored by
        ``self._forward_internal``.  The scores go through ``masked_softmax``
        only when ``self._normalize`` is truthy.
        """
        query = self.query_vector(matrix, matrix_mask)
        keys = self.matrix(matrix, matrix_mask)
        similarities = self._forward_internal(query, keys)

        if not self._normalize:
            return similarities
        return masked_softmax(similarities, matrix_mask)

Example #7

0

Show file

File: model.py Project: shaharlinial/metaphor_detection

    def forward(self, inputs, lengths):
        """
        Encode ``inputs`` with the RNN, pool the states with attention and
        return log-probabilities over the output classes.

        Shapes noted below follow the conventions used throughout this method:
        ``inputs`` is (batch_size, sequence_length, embedding_dim).
        """
        # ---- 1. run LSTM -------------------------------------------------
        dropped_inputs = self.dropout_on_input_to_LSTM(inputs)
        # pack_padded_sequence is given the batch sorted by decreasing
        # length; keep the indices needed to undo the sort afterwards.
        sorted_input, sorted_lengths, unsort_indices, _ = sort_batch_by_length(
            dropped_inputs, lengths)
        packed_input = pack_padded_sequence(sorted_input,
                                            sorted_lengths.data.tolist(),
                                            batch_first=True)
        packed_output, _ = self.rnn(packed_input)
        # (batch_size, sequence_length, hidden_size), still in sorted order
        padded_output, _ = pad_packed_sequence(packed_output, batch_first=True)
        # Restore the original batch ordering.
        output = padded_output[unsort_indices]

        # ---- 2. use attention --------------------------------------------
        # (batch_size, sequence_length, 1) -> (batch_size, sequence_length)
        attention_logits = self.attention_weights(output).squeeze(dim=-1)
        float_type = torch.cuda.FloatTensor if inputs.is_cuda else torch.FloatTensor
        # Positions whose logit is exactly zero are masked out.
        attention_mask = (attention_logits != 0).type(float_type)
        # (batch_size, sequence_length) -> (batch_size, 1, sequence_length)
        attention = masked_softmax(attention_logits,
                                   attention_mask).unsqueeze(dim=1)
        # (batch_size, 1, sequence_length) x (batch_size, sequence_length, hidden_size)
        #   -> (batch_size, 1, hidden_size) -> (batch_size, hidden_size)
        input_encoding = torch.bmm(attention, output).squeeze(dim=1)

        # ---- 3. run linear layer -----------------------------------------
        input_encoding = self.dropout_on_input_to_linear_layer(input_encoding)
        # Project the pooled encoding to one score per class.
        unnormalized_output = self.output_projection(input_encoding)
        # Normalize with log softmax
        return F.log_softmax(unnormalized_output, dim=-1)

Example #8

0

Show file

File: gat.py Project: AromaR/RelEx

    def masked_self_attention(self, inputs, mask, adjacency):
        """
        Multi-head self-attention restricted to positions that are both
        unmasked and connected in ``adjacency``.

        Returns a tensor viewed as (batch_size, seq_len, self._hidden_dim).
        """
        batch_size, seq_len, _ = inputs.size()
        num_heads = self._num_heads
        attention_dim = self._attention_dim
        flat_batch = batch_size * num_heads

        # (batch, seq, heads, dim) -> (batch, heads, seq, dim)
        #   -> (heads * batch, seq, dim)
        head_inputs = inputs.view(batch_size, seq_len, num_heads, attention_dim)
        head_inputs = head_inputs.transpose(1, 2).contiguous()
        head_inputs = head_inputs.view(flat_batch, seq_len, attention_dim)

        # One copy of the adjacency matrix per head:
        # (heads * batch, seq, seq)
        adjacency_per_head = adjacency.unsqueeze(1).repeat(1, num_heads, 1, 1)
        adjacency_per_head = adjacency_per_head.view(flat_batch, seq_len,
                                                     seq_len).byte()

        # Outer product of the per-head mask with itself:
        # (heads * batch, seq, 1) x (heads * batch, 1, seq) -> (heads * batch, seq, seq)
        mask_column = mask.repeat(1, num_heads).view(flat_batch, seq_len)
        mask_column = mask_column.float().unsqueeze(2)
        mask_per_head = mask_column.bmm(mask_column.transpose(1, 2)).byte()

        # Only attend on nodes visible in the adjacency matrix
        attention_mask = self.att_dropout(adjacency_per_head & mask_per_head)

        similarities = self.matrix_attention(head_inputs, head_inputs)

        # Normalise the distributions: (heads * batch, seq, seq)
        attention = masked_softmax(similarities,
                                   attention_mask,
                                   memory_efficient=True)

        # Weighted sum of the values under each attention distribution:
        # (heads * batch, seq, dim)
        outputs = weighted_sum(head_inputs, attention)

        # Undo the head flattening:
        # (batch, heads, seq, dim) -> (batch, seq, heads, dim) -> (batch, seq, hidden)
        outputs = outputs.view(batch_size, num_heads, seq_len, attention_dim)
        outputs = outputs.transpose(1, 2).contiguous()
        return outputs.view(batch_size, seq_len, self._hidden_dim)

Example #9

0

Show file

    def forward(self, s1, s2, s1_mask, s2_mask):  # pylint: disable=arguments-differ
        """
        Cross-attend two sequences and model each one together with its
        attended counterpart.

        Returns ``(modeled_s1, modeled_s2)``.
        """
        # (batch_size, s2_length, s1_length)
        similarity_mat = self._matrix_attention(s2, s1)

        # --- s2 attends over s1 ---
        # (batch_size, s2_length, s1_length)
        attn_over_s1 = util.masked_softmax(similarity_mat, s1_mask)
        # (batch_size, s2_length, encoding_dim)
        s1_summary_for_s2 = util.weighted_sum(s1, attn_over_s1)
        s2_w_context = torch.cat([s2, s1_summary_for_s2], 2)

        # --- s1 attends over s2, same method on the transposed similarities ---
        transposed_similarity = similarity_mat.transpose(1, 2).contiguous()
        attn_over_s2 = util.masked_softmax(transposed_similarity, s2_mask)
        # (batch_size, s1_length, encoding_dim)
        s2_summary_for_s1 = util.weighted_sum(s2, attn_over_s2)
        s1_w_context = torch.cat([s1, s2_summary_for_s1], 2)

        # s1 is modeled before s2, as in the original ordering.
        modeled_s1 = self._modeling_layer(s1_w_context, s1_mask)
        modeled_s1 = self._dropout(modeled_s1)
        modeled_s2 = self._modeling_layer(s2_w_context, s2_mask)
        modeled_s2 = self._dropout(modeled_s2)
        return modeled_s1, modeled_s2

Example #10

0

Show file

File: util_test.py Project: xumx/allennlp

    def test_masked_softmax_no_mask(self):
        """Check ``masked_softmax`` against known values when no mask is given."""
        # (input rows, expected softmax) pairs, in the order they are checked:
        #   0. general unmasked single-row case
        #   1. single row with one dominant logit
        #   2. single row of zeros -> uniform distribution
        #   3. general unmasked batched case
        #   4. batched case where one row is all zeros
        cases = [
            ([[1.0, 2.0, 3.0]],
             [[0.090031, 0.244728, 0.665241]]),
            ([[1.0, 2.0, 5.0]],
             [[0.017148, 0.046613, 0.93624]]),
            ([[0.0, 0.0, 0.0]],
             [[0.33333334, 0.33333334, 0.33333334]]),
            ([[1.0, 2.0, 5.0], [1.0, 2.0, 3.0]],
             [[0.01714783, 0.04661262, 0.93623955],
              [0.09003057, 0.24472847, 0.66524096]]),
            ([[1.0, 2.0, 5.0], [0.0, 0.0, 0.0]],
             [[0.01714783, 0.04661262, 0.93623955],
              [0.33333334, 0.33333334, 0.33333334]]),
        ]

        for case_index, (rows, expected) in enumerate(cases):
            logits = Variable(torch.FloatTensor(rows))
            softmaxed = masked_softmax(logits, None).data.numpy()
            assert_array_almost_equal(softmaxed, numpy.array(expected))
            if case_index == 0:
                # Only the first case also verifies that the row sums to one.
                assert_almost_equal(1.0, numpy.sum(softmaxed), decimal=6)

Example #11

0

Show file

    def forward(
            self,  # pylint: disable=arguments-differ
            inputs: torch.Tensor,
            mask: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """
        Parameters
        ----------
        inputs : ``torch.FloatTensor``, required.
            A tensor of shape (batch_size, timesteps, input_dim)
        mask : ``torch.FloatTensor``, optional (default = None).
            A tensor of shape (batch_size, timesteps).

        Returns
        -------
        A dictionary of outputs containing the following items:
        attention : ``torch.FloatTensor``
            The attention weights of the different attention heads, normalised
            over the timesteps axis, with shape
            (batch_size, timesteps, num_attention_heads).
        regularization_loss : ``torch.FloatTensor``
            The frobenius norm based regularization penalty divided by the
            batch size.  Only present when ``self._regularization_coeffecient``
            is truthy.
        representation : ``torch.FloatTensor``
            The attended representations of all heads flattened into one
            vector per instance, with shape
            (batch_size, num_attention_heads * input_dim).
        """
        # Shape (batch_size, timesteps, attention_size)
        attention_matrix = torch.tanh(self._linear_inner(inputs))
        # Shape (batch_size, timesteps, num_attention_heads)
        attention_vector = self._linear_outer(attention_matrix)
        if mask is not None:
            # Add a trailing axis so the (batch_size, timesteps) mask has three
            # dimensions like the attention logits.
            mask = mask.unsqueeze(2)

        # Shape (batch_size, timesteps, num_attention_heads); softmax over dim 1.
        attention = masked_softmax(attention_vector, mask, dim=1)

        batch_size = inputs.shape[0]
        outputs = {"attention": attention}

        if self._regularization_coeffecient:
            penalty = self.frobenius_regularization_penalty(attention)
            outputs["regularization_loss"] = penalty / batch_size

        # Shape (batch_size, num_attention_heads, input_dim)
        attended_representation = attention.transpose(1, 2) @ inputs
        # Shape (batch_size, input_dim*num_attention_heads)
        outputs["representation"] = attended_representation.view(
            batch_size, -1)

        return outputs

Example #12

0

Show file

    def forward(self, input_: Tuple[torch.Tensor, torch.Tensor]):
        """
        Encode the characters of every word and pool them with attention.

        ``input_`` is a ``(chars, lengths)`` pair; ``chars`` is unpacked as
        (batch_size, seq_len, max_chars).  The projected result is viewed as
        (batch_size, seq_len, -1).
        """
        chars, lengths = input_
        batch_size, seq_len, max_chars = chars.size()
        num_words = batch_size * seq_len

        # Treat every word as its own sequence of characters.
        flat_chars = chars.view(num_words, -1)
        flat_lengths = lengths.view(num_words)
        char_mask = get_mask_from_sequence_lengths(flat_lengths, max_chars)
        flat_chars = torch.autograd.Variable(flat_chars, requires_grad=False)

        embedded = self.embeddings(flat_chars)
        encoded, _ = self.encoder_(embedded)

        scores = self.attention(encoded).squeeze(-1)
        attentions = masked_softmax(scores, char_mask, dim=-1)
        # Attention-weighted sum of the encoder states via a batched matmul.
        pooled = torch.bmm(encoded.permute(0, 2, 1), attentions.unsqueeze(-1))

        return self.projection(pooled.view(batch_size, seq_len, -1))

Example #13

0

Show file

File: loss_utils.py Project: orperel/ALUQANet

def calc_entropy_loss_unmasked(select_probs_logits, mask):
    """
    Return the summed binary entropy of the masked-softmax probabilities.

    ``select_probs_logits`` and ``mask`` are passed to
    ``util.masked_softmax``.  Each resulting probability ``p`` contributes
    ``-(p * log(p) + (1 - p) * log(1 - p))``; the contributions are summed
    over the last axis, divided by the size of that axis, and then summed
    over everything that remains.
    """
    epsilon = 1e-7
    select_probs = util.masked_softmax(select_probs_logits, mask)
    # Keep probabilities strictly inside (0, 1) so both logarithms stay finite.
    # ``torch.clamp`` returns a new tensor; writing into the softmax output in
    # place can invalidate values autograd needs for the backward pass.
    select_probs = torch.clamp(select_probs, min=epsilon, max=1 - epsilon)

    entropies = -(select_probs * torch.log(select_probs) +
                  (1 - select_probs) * torch.log(1 - select_probs))

    seq_length = select_probs.shape[-1]
    # Sum over the same (last) axis whose length is used for the mean.
    mean_entropy_per_sentence = entropies.sum(dim=-1) / seq_length
    return mean_entropy_per_sentence.sum()

Example #14

0

Show file

File: san.py Project: titsuki/allennlp-book

    def _compute_memory(
            self, encoded_premise: torch.Tensor,
            encoded_hypothesis: torch.Tensor, premise_mask: torch.Tensor,
            hypothesis_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Cross-attend premise and hypothesis and encode each one, concatenated
        with its attended counterpart, through ``self._memory_encoder``.

        Returns ``(premise_memory, hypothesis_memory)``.
        """
        projected_premise = self._attention_feedforward(encoded_premise)
        projected_hypothesis = self._attention_feedforward(encoded_hypothesis)
        # Shape: (batch_size, premise_length, hypothesis_length)
        attention_matrix = self._matrix_attention(projected_premise,
                                                  projected_hypothesis)

        if self._dropout:
            attention_matrix = self._dropout(attention_matrix)

        # Premise -> hypothesis.
        # Shape: (batch_size, premise_length, hypothesis_length)
        p2h_attention = util.masked_softmax(attention_matrix, hypothesis_mask)
        # Shape: (batch_size, premise_length, embedding_dim)
        attended_hypothesis = util.weighted_sum(encoded_hypothesis,
                                                p2h_attention)

        # Hypothesis -> premise, on the transposed attention matrix.
        # Shape: (batch_size, hypothesis_length, premise_length)
        transposed_attention = attention_matrix.transpose(1, 2).contiguous()
        h2p_attention = util.masked_softmax(transposed_attention, premise_mask)
        # Shape: (batch_size, hypothesis_length, embedding_dim)
        attended_premise = util.weighted_sum(encoded_premise, h2p_attention)

        premise_input = torch.cat([encoded_premise, attended_hypothesis],
                                  dim=-1)
        premise_memory = self._memory_encoder(premise_input, premise_mask)

        hypothesis_input = torch.cat([encoded_hypothesis, attended_premise],
                                     dim=-1)
        hypothesis_memory = self._memory_encoder(hypothesis_input,
                                                 hypothesis_mask)

        return premise_memory, hypothesis_memory

Example #15

0

Show file

 def forward(self, x, mask=None):
     """
     Self-attentive pooling:
         gamma = softmax(w^T X)
         r = sum(gamma_i * X_i)
     :param x: b * m * h
     :param mask: b * m
     :return: r: b * h
     """
     # linear(x) is b * m * 1; drop the trailing axis to get b * m scores
     scores = self.linear(x).squeeze(2)
     gamma = util.masked_softmax(scores, mask)
     # [b * 1 * m] * [b * m * h] = [b * 1 * h]
     pooled = gamma.unsqueeze(1).bmm(x)
     # b * h
     return pooled.squeeze(1)

Example #16

0

Show file

File: bert_hierarchical_hard.py Project: hzyang95/c3-master

 def hard_sample(self, logits, use_gumbel, dim=-1, hard=True, mask=None):
     """
     Pick entries of ``logits`` along ``dim``.

     With ``use_gumbel`` set, training mode delegates to
     ``rep_layers.gumbel_softmax``; otherwise a one-hot tensor marking the
     arg-max of the masked softmax is returned.  Without ``use_gumbel``
     nothing is computed and ``None`` is returned.
     """
     if not use_gumbel:
         # No alternative sampling strategy is implemented for this case.
         return None

     if self.training:
         return rep_layers.gumbel_softmax(logits,
                                          mask=mask,
                                          hard=hard,
                                          dim=dim)

     probs = masked_softmax(logits, mask, dim=dim)
     # ``max`` returns (values, indices); only the indices are needed.
     index = probs.max(dim, keepdim=True)[1]
     return torch.zeros_like(logits).scatter_(dim, index, 1.0)

Example #17

0

Show file

    def forward(self,
                inputs: Tensor,
                memory: Tensor = None,
                memory_mask: Tensor = None,
                state: Tensor = None):
        """
        Run one step of the wrapped cell on ``inputs`` concatenated with an
        attention summary of ``memory``.

        :param inputs:  B x H
        :param memory: T x B x H if not batch_first
        :param memory_mask: T x B if not batch_first
        :param state: B x H
        :return: whatever ``self.cell`` returns for the augmented input
        """
        if self.batch_first:
            # Bring memory and its mask into time-major layout.
            memory, memory_mask = memory.transpose(0, 1), memory_mask.transpose(0, 1)

        assert inputs.size(0) == memory.size(1) == memory_mask.size(1), \
            "inputs batch size does not match memory batch size"

        num_steps = memory.size(0)

        if state is None:
            state = inputs.new_zeros(inputs.size(0),
                                     self.cell.hidden_size,
                                     requires_grad=False)

        query = inputs
        if self.use_state:
            # For tuple states only the first element takes part in attention.
            hidden = state[0] if isinstance(state, tuple) else state
            query = torch.cat([inputs, hidden], dim=-1)
        # Repeat the query for every memory step: T B H
        attention_input = query.unsqueeze(0).expand(num_steps, -1, -1)

        scoring_input = torch.cat([attention_input, memory], dim=-1)
        attention_logits = self.attention_w(scoring_input).squeeze(-1)

        # Normalise over the time axis.
        attention_scores = masked_softmax(attention_logits, memory_mask, dim=0)

        weighted_memory = attention_scores.unsqueeze(-1) * memory
        attention_vector = torch.sum(weighted_memory, dim=0)

        new_input = torch.cat([inputs, attention_vector], dim=-1)

        return self.cell(new_input, state)

Example #18

0

Show file

File: pointer_generator_bak.py Project: wlhgtc/abstractive_summarization_allennlp

 def _decode_step_output(self,
                         decoder_hidden_state: torch.LongTensor = None,
                         encoder_outputs: torch.LongTensor = None,
                         encoder_outputs_mask: torch.LongTensor = None
                         ) -> "Tuple[torch.Tensor, torch.Tensor, torch.Tensor]":
     """
     Attend over the encoder outputs for a single decoding step.

     Returns a 3-tuple:

     * the raw output of ``self._decoder_attention``,
     * those values normalised with ``masked_softmax`` under the encoder mask,
     * ``decoder_hidden_state`` concatenated on the last axis with the
       attention-weighted sum of ``encoder_outputs``.
     """
     # encoder_outputs : (batch_size, input_sequence_length, encoder_output_dim)
     # Ensuring mask is also a FloatTensor. Or else the multiplication within attention will
     # complain.
     encoder_outputs_mask = encoder_outputs_mask.float()
     # (batch_size, input_sequence_length)
     input_weights_e = self._decoder_attention(decoder_hidden_state,
                                               encoder_outputs,
                                               encoder_outputs_mask)
     input_weights_a = masked_softmax(input_weights_e, encoder_outputs_mask)
     # H*_t = sum(h_i * at_i)
     # (batch_size, encoder_output_dim)
     attended_input = weighted_sum(encoder_outputs, input_weights_a)
     # (batch_size, encoder_output_dim + decoder_hidden_dim)
     decoder_input = torch.cat((decoder_hidden_state, attended_input), -1)
     return input_weights_e, input_weights_a, decoder_input

Example #19

0

Show file

File: aluqa_experimental_model.py Project: orperel/ALUQANet

    def _count_module_per_sentence(self, passage_vector, sentences_vectors):
        """
        Predict a count distribution for every sentence and sum the expected
        counts per question.

        Returns ``(total_count_per_question, count_per_sentence_logits,
        valid_sentences_mask)``.
        """
        sentences_count = sentences_vectors.shape[1]

        # A sentence whose vector sums to exactly zero is treated as invalid.
        sums_to_zero = torch.sum(sentences_vectors, dim=2) == 0
        valid_sentences_mask = (1 - sums_to_zero.long()).unsqueeze(-1)

        # Pair every sentence vector with a copy of the passage vector.
        tiled_passage_vector = passage_vector.unsqueeze(1).repeat(
            1, sentences_count, 1)
        sentence_keys = torch.cat([tiled_passage_vector, sentences_vectors], -1)
        count_per_sentence_logits = self._count_number_predictor(sentence_keys)

        count_probabilities = util.masked_softmax(count_per_sentence_logits,
                                                  mask=valid_sentences_mask)
        count_classes = self.count_classes.to(count_probabilities.device)
        # Expected count of each sentence under its predicted distribution.
        expected_count_per_sentence = torch.sum(
            count_probabilities * count_classes, dim=2)
        # Shape: (batch_size,)
        total_count_per_question = torch.sum(expected_count_per_sentence, dim=1)

        return total_count_per_question, count_per_sentence_logits, valid_sentences_mask

Example #20

0

Show file

File: editor.py Project: isomap/factedit

    def _make_prob(self, state: Dict[str, torch.Tensor]) -> torch.Tensor:
        """
        Combine generation and copy probabilities into a single tensor.

        The result is the concatenation, along the last axis, of the
        (copy-augmented) generation probabilities followed by one left-over
        copy column per position of the triple.

        Reads the ``state`` keys ``"triple_token_ids"``, ``"action_mask"``,
        ``"encoded_triple"``, ``"triple_mask"`` and ``"triple_tokens"``.
        """

        triple_token_ids = state["triple_token_ids"]
        # NOTE(review): batch_size is unpacked but not used below.
        batch_size, triple_length = triple_token_ids.size()

        hidden = self.P(self._get_query(state))

        # gate_prob weights the generation part, (1 - gate_prob) the copy part.
        gate_prob = self.G(hidden)
        gen_prob = util.masked_softmax(self.W(hidden),
                                       state["action_mask"],
                                       memory_efficient=True) * gate_prob

        copy_prob = self.COPY_ATTN(hidden, state["encoded_triple"],
                                   state["triple_mask"]) * (-gate_prob + 1)
        modified_prob_list: List[torch.Tensor] = []
        for i in range(triple_length):
            copy_prob_slice = copy_prob[:, i]
            token_slice = state["triple_tokens"][:, i]
            # True where the token at this position is not the OOV id.
            copy_to_add_mask = token_slice != self.OOV
            # Element-wise minimum with a 0/1 mask: keeps the copy probability
            # for non-OOV tokens and zeroes it otherwise (assumes the copy
            # probabilities are non-negative and at most 1 -- TODO confirm).
            copy_to_add = copy_prob_slice.min(
                copy_to_add_mask.float()).unsqueeze(-1)
            # Add the copy mass onto the entry indexed by the token itself.
            gen_prob = gen_prob.scatter_add(-1, token_slice.unsqueeze(1),
                                            copy_to_add)

            if i < (triple_length - 1):
                # Later positions holding the same token id as position i.
                future_occurrences = (
                    (triple_token_ids[:, i + 1:]
                     ) == triple_token_ids[:, i].unsqueeze(-1)).float()
                future_copy_prob = copy_prob[:, i + 1:].min(future_occurrences)
                # NOTE(review): copy_prob_slice was obtained by indexing
                # copy_prob, so this in-place add presumably also writes into
                # copy_prob -- confirm that this is intended.
                copy_prob_slice += future_copy_prob.sum(-1)

            if i > 0:
                # Earlier positions holding the same token id as position i.
                prev_occurrences = triple_token_ids[:, :
                                                    i] == triple_token_ids[:,
                                                                           i].unsqueeze(
                                                                               -1
                                                                           )
                # 1.0 where the token has not appeared before, 0.0 otherwise;
                # the minimum below zeroes the slice for repeated tokens.
                duplicate_mask = (prev_occurrences.sum(-1) == 0).float()
                copy_prob_slice = copy_prob_slice.min(duplicate_mask)

            # Whatever copy mass remains for OOV tokens gets its own column.
            left_over_copy_prob = copy_prob_slice.min(
                (~copy_to_add_mask).float())
            modified_prob_list.append(left_over_copy_prob.unsqueeze(-1))

        # Generation probabilities come first, then one column per position.
        modified_prob_list.insert(0, gen_prob)
        modified_prob = torch.cat(modified_prob_list, dim=-1)
        return modified_prob

Example #21

0

Show file

    def forward(self, tokens: torch.Tensor, mask: torch.Tensor):
        """
        Intra-sentence attention: every token attends over all tokens of the
        same sequence, and the attended vectors are combined with the original
        tokens before the final projection.
        """
        batch_size, sequence_length, _ = tokens.size()
        multi_head = self._num_attention_heads > 1

        # Shape: (batch_size, sequence_length, sequence_length)
        similarities = self._matrix_attention(tokens, tokens)
        if multi_head:
            # With several heads the similarities come back as
            # (batch_size, sequence_length, sequence_length, num_heads); move
            # the head axis so the attended sequence axis is last:
            # (batch_size, sequence_length, num_heads, sequence_length).
            similarities = similarities.permute(0, 1, 3, 2)

        # Shape: (batch_size, sequence_length, [num_heads,] sequence_length)
        attention = util.masked_softmax(similarities.contiguous(), mask)

        # Shape: (batch_size, sequence_length, projection_dim)
        projected = self._projection(tokens)
        if multi_head:
            # Split the projection across heads and move the head axis forward
            # so the weighted sum below pairs each head with its own slice:
            # (batch_size, sequence_length, num_heads, projection_dim / num_heads)
            #   -> (batch_size, num_heads, sequence_length, projection_dim / num_heads)
            head_shape = list(projected.size())[:-1] + [self._num_attention_heads, -1]
            projected = projected.view(*head_shape).permute(0, 2, 1, 3)

        # Shape: (batch_size, sequence_length, [num_heads,] projection_dim [/ num_heads])
        attended = util.weighted_sum(projected, attention)
        if multi_head:
            # Concatenating the per-head results is just a reshape.
            # Shape: (batch_size, sequence_length, projection_dim)
            attended = attended.view(batch_size, sequence_length, -1)

        # Shape: (batch_size, sequence_length, combination_dim)
        combined = util.combine_tensors(self._combination, [tokens, attended])
        return self._output_projection(combined)

Example #22

0

Show file

File: nlvr_decoder_step.py Project: ziyaoh/allennlp

 def _get_next_state_info_with_agenda(
     state: NlvrDecoderState, considered_actions: List[List[int]],
     action_logits: torch.Tensor, action_mask: torch.Tensor
 ) -> Tuple[List[List[Tuple[int, torch.LongTensor]]],
            List[List[torch.LongTensor]]]:
     """
     Build, for every group element, the log probabilities and updated
     checklists of all next actions that are not padding.  This is used when
     there are no target action sequences and training relies on agendas.

     Returns ``(all_action_logprobs, all_new_checklists)``; both are lists
     with one inner list per group element, and the first holds
     ``(action_index, logprob)`` pairs.
     """
     considered_action_probs = nn_util.masked_softmax(action_logits,
                                                      action_mask)
     all_action_logprobs: List[List[Tuple[int, torch.LongTensor]]] = []
     all_new_checklists: List[List[torch.LongTensor]] = []

     per_instance = zip(state.score, considered_action_probs, state.checklist)
     for group_index, (instance_score, instance_probs,
                       instance_checklist) in enumerate(per_instance):
         # (num_terminals, 1)
         terminal_actions = state.terminal_actions[group_index]
         # Local action index -> log probability, and the matching checklists.
         instance_action_logprobs: List[Tuple[int, torch.Tensor]] = []
         instance_new_checklists: List[torch.LongTensor] = []

         for action_index, action_prob in enumerate(instance_probs):
             # Map the local index back to the action's id in the original list.
             action = considered_actions[group_index][action_index]
             if action == -1:
                 # Ignoring padding.
                 continue
             # 1.0 exactly at the terminal matching this action, 0.0 elsewhere,
             # so adding it ticks off that entry of the checklist.
             checklist_addition = (terminal_actions == action).float()
             instance_new_checklists.append(instance_checklist +
                                            checklist_addition)
             # The small constant keeps the log finite for zero probabilities.
             logprob = instance_score + torch.log(action_prob + 1e-13)
             instance_action_logprobs.append((action_index, logprob))

         all_action_logprobs.append(instance_action_logprobs)
         all_new_checklists.append(instance_new_checklists)
     return all_action_logprobs, all_new_checklists

Example #23

0

Show file

    def _compute_attention(
            self,
            decoder_hidden_state: torch.LongTensor = None,
            encoder_outputs: torch.LongTensor = None,
            encoder_outputs_mask: torch.LongTensor = None) -> torch.Tensor:
        """Attend over the encoder outputs using the decoder state as the query.

        Parameters
        ----------
        decoder_hidden_state : ``torch.LongTensor``
            Documented by the original author as ``(batch_size, decoder_output_dim)``;
            the current decoder hidden state, used as the attention 'query'.
        encoder_outputs : ``torch.LongTensor``
            Documented as ``(batch_size, max_input_sequence_length, encoder_output_dim)``;
            the encoder hidden states of the source tokens, used as the attention 'keys'.
        encoder_outputs_mask : ``torch.LongTensor``
            Documented as ``(batch_size, max_input_sequence_length)``; the mask of the
            encoded input, so that padded source positions receive no attention.

        Returns
        -------
        attended_output : torch.Tensor
            The attention-weighted sum of ``encoder_outputs`` (the context vector).
        masked_attention_scores : torch.Tensor
            The masked-softmax attention weights used for that sum.

        Notes
        -----
            NOTE(review): the function returns a 2-tuple although the signature is
            annotated ``-> torch.Tensor``; likewise the ``LongTensor`` hints on the
            first two arguments look wrong for values fed to ``bmm`` - confirm and
            correct the annotations. Every argument defaults to ``None`` but none
            of them may actually be ``None``.
        """
        # The mask is cast to float so the multiplication inside the masked softmax
        # does not complain about mixed types.
        float_mask = encoder_outputs_mask.float()

        # Dot product of every encoder state with the decoder state.
        query = decoder_hidden_state.unsqueeze(-1)
        raw_scores = encoder_outputs.bmm(query).squeeze(-1)

        attention_weights = masked_softmax(raw_scores, float_mask)
        context = util.weighted_sum(encoder_outputs, attention_weights)
        return context, attention_weights

Example #24

0

Show file

File: attentive_text_field_embedder.py Project: asiddhant/taskonomy-nlp

    def forward(self, tokens, num_wrapping_dims: int = 0) -> torch.Tensor:
        """
        Embed ``tokens`` with every non-separate token embedder, combine those
        embeddings with attention weights derived from ``self.rnn_encoder``, and
        optionally concatenate GloVe / ELMo / character embeddings.

        Parameters
        ----------
        tokens :
            Indexed by the string keys ``'tokens'``, ``'elmo'`` and
            ``'token_characters'`` below, and also passed whole to the
            per-key embedders, ``util.get_text_field_mask`` and ``self.rnn_encoder``.
        num_wrapping_dims : ``int``
            How many times each embedder is wrapped in ``TimeDistributed``.

        Returns
        -------
        The attention-combined embedding, with the optional embeddings
        concatenated on the last dimension.
        """
        embedded_representations = []
        # Sorted so the stacking order of embedders is deterministic.
        keys = sorted(self._token_embedders.keys())
        for key in keys:
            # Note: need to use getattr here so that the pytorch voodoo
            # with submodules works with multiple GPUs.
            if key in self.separate_embedder_keys:
                # These embedders do not take part in the attention combination.
                continue
            embedder = getattr(self, 'token_embedder_{}'.format(key))
            for _ in range(num_wrapping_dims):
                embedder = TimeDistributed(embedder)
            # NOTE(review): the whole `tokens` object is passed here, whereas the
            # optional branches below pass `tokens[<key>]` - confirm this is intended.
            token_vectors = self.linear_layers[key](embedder(tokens))
            embedded_representations.append(token_vectors)

        mask = util.get_text_field_mask(tokens)
        # Stack the per-embedder vectors on a new second-to-last axis.
        embedded_representations = torch.stack(embedded_representations,
                                               dim=-2)
        query_emb = self.rnn_encoder(tokens, mask)
        # Score each embedder's vector against the query, then normalise the scores.
        similarities = torch.matmul(embedded_representations,
                                    query_emb.unsqueeze(-1)).squeeze(-1)
        similarities = util.masked_softmax(similarities, mask, dim=-1)
        # Weighted sum over the embedder axis (transpose(2, 3) implies a 4-D tensor here).
        combined_emb = torch.matmul(embedded_representations.transpose(2, 3),
                                    similarities.unsqueeze(-1)).squeeze(-1)

        # NOTE(review): in the three branches below the loop re-wraps the *base*
        # embedder on every iteration, so the result is wrapped at most once however
        # large `num_wrapping_dims` is, and the `getattr` result is discarded as soon
        # as `num_wrapping_dims > 0`. The first loop above nests the wrappers instead
        # (`TimeDistributed(embedder)`) - confirm which behaviour is intended.
        if self.use_glove:
            embedder = getattr(self, 'token_embedder_tokens')
            for _ in range(num_wrapping_dims):
                embedder = TimeDistributed(self.glove_embedder)
            glove_emb = embedder(tokens['tokens'])
            combined_emb = torch.cat([combined_emb, glove_emb], dim=-1)

        if self.use_elmo:
            embedder = getattr(self, 'token_embedder_elmo')
            for _ in range(num_wrapping_dims):
                embedder = TimeDistributed(self.elmo_embedder)
            elmo_emb = embedder(tokens['elmo'])
            combined_emb = torch.cat([combined_emb, elmo_emb], dim=-1)

        if self.use_char:
            embedder = getattr(self, 'token_embedder_token_characters')
            for _ in range(num_wrapping_dims):
                # NOTE(review): `char_embeddder` (three d's) - verify the attribute name.
                embedder = TimeDistributed(self.char_embeddder)
            token_vectors = embedder(tokens['token_characters'])
            combined_emb = torch.cat([combined_emb, token_vectors], dim=-1)

        return combined_emb

Example #25

0

Show file

    def _compute_attention(self, encoder_outputs: torch.Tensor,
                           encoder_mask: torch.Tensor,
                           decoder_outputs: torch.Tensor) -> torch.Tensor:
        """
        Build the attention-based decoder representation: score the decoder states
        against the encoder states, take the attention-weighted sum of the encoder
        states as context, concatenate that context with the decoder states, and
        project the result through ``self.attention_layer``.

        Parameters
        ----------
        encoder_outputs: ``torch.Tensor``, ``(batch_size, num_document_tokens, encoder_hidden_size)``
            The output from the encoder.
        encoder_mask: ``torch.Tensor``, ``(batch_size, num_document_tokens)``
            The document token mask.
        decoder_outputs: ``torch.Tensor``, ``(batch_size, num_summary_tokens, decoder_hidden_size)``
            The output from the decoder.

        Returns
        -------
        hidden: ``torch.Tensor``, ``(batch_size, num_summary_tokens, decoder_hidden_size)``
            The new decoder hidden state representation.
        attention_probabilities: ``torch.Tensor``, ``(batch_size, num_summary_tokens, num_document_tokens)``
            The attention probabilities over the document tokens for each summary token.

        NOTE(review): two values are returned although the annotation says
        ``-> torch.Tensor`` - confirm and correct the annotation.
        """
        # shape: (group_size, num_summary_tokens, num_document_tokens)
        scores = self.attention(decoder_outputs, encoder_outputs)
        attention_probabilities = masked_softmax(scores, encoder_mask)

        # shape: (group_size, num_summary_tokens, encoder_hidden_size)
        context = weighted_sum(encoder_outputs, attention_probabilities)

        # Context and decoder state side by side on the feature axis, then projected
        # back down by the attention layer.
        context_and_state = torch.cat([context, decoder_outputs], dim=2)
        return self.attention_layer(context_and_state), attention_probabilities

Example #26

0

Show file

    def knowledge_self_attention(self, k, k_mask, do_sum=False):
        """
        Apply ``self.knowledge_attn`` of ``k`` against itself, normalise the result
        with a masked softmax, and multiply the attention weights with ``k``.

        :param k: tensor to attend over; when it has more than 3 dims it is unpacked
            as ``(B, T, W, D)`` and flattened to ``(B, T * W, D)``.
        :param k_mask: mask for ``k``; flattened to ``(B, T * W)`` in the 4-D case.
        :param do_sum: when true, sum the result over its second-to-last dim.
        :return: the attended tensor with all size-1 dims squeezed out.

        NOTE(review): this method looks unfinished - see the inline notes.
        """
        if k.dim() > 3:
            B, T, W, D = k.size()
            k = k.view(B, T * W, D)
            k_mask = k_mask.view(B, T * W)
        attn = self.knowledge_attn(k, k)
        attn = util.masked_softmax(attn, k_mask, memory_efficient=True)

        # NOTE(review): `k` is 3-D here whenever the branch above ran, so this
        # condition is true in that case; but it is also true for an input that
        # was 3-D to begin with, where B/T/W/D were never assigned and the lines
        # below raise UnboundLocalError.
        if k.dim() >= 3:
            k = k.contiguous().view(B * T, W, D)
            # NOTE(review): if `attn` is (B, T * W, T * W), this view only has a
            # matching element count when T == 2 - confirm the intended shape.
            attn = attn.view(B * T, W, W * 2)

        # NOTE(review): looks like leftover debug output; `warn` is also the
        # deprecated alias of `warning` if `logger` is a stdlib logger.
        logger.warn(k.shape)
        # NOTE(review): torch.bmm needs attn's last dim to equal k's middle dim;
        # (B * T, W, W * 2) against (B * T, W, D) does not satisfy that - confirm.
        k = torch.bmm(attn, k)

        if do_sum:
            k = k.sum(dim=-2)
        # A bare squeeze() drops every size-1 dim, including a batch dim of 1.
        return k.squeeze()

Example #27

0

Show file

File: layers.py Project: shyamalschandra/PathNet

    def forward(self, xinit: torch.FloatTensor,
                xmask: torch.LongTensor) -> torch.FloatTensor:
        """
        Pool ``xinit`` over its time axis with learned, masked attention weights.

        The weights are scored from ``xinit`` itself, or from its masked
        intermediate projection when ``self._int_proj`` is set; the pooled
        values are always taken from the unprojected ``xinit``.

        :param xinit: B * T * H
        :param xmask: B * T
        :return: B * H
        """
        scored_input = xinit
        if self._int_proj is not None:
            # Zero the projected vectors at masked positions before scoring.
            scored_input = self._int_proj(xinit) * xmask.unsqueeze(-1)

        # B * T * 1 -> B * T
        logits = self._projector(scored_input).squeeze(-1)
        weights = masked_softmax(logits, xmask, dim=-1)

        # (B * 1 * T) x (B * T * H) -> B * 1 * H -> B * H
        return weights.unsqueeze(1).bmm(xinit).squeeze(1)

Example #28

0

Show file

File: events.py Project: zxlzr/dygiepp

    def _compute_argument_scores(self,
                                 pairwise_embeddings,
                                 top_trig_scores,
                                 top_arg_scores,
                                 top_arg_mask,
                                 prepend_zeros=True):
        """
        Score every (trigger, argument) pair.

        The pairwise embeddings are flattened to ``(-1, input_dim)``, run through
        ``self._argument_feedforward`` and ``self._argument_scorer``, reshaped to
        ``(batch, num_trigs, num_args, -1)``, and the trigger and argument mention
        scores are added. When ``self._softmax_correction`` is set, a scaled
        masked softmax over the argument axis (dim 2) is added on top.

        :param pairwise_embeddings: indexed here as (batch, num_trigs, num_args, ...).
        :param top_trig_scores: trigger mention scores; a trailing axis is appended
            before the addition.
        :param top_arg_scores: argument mention scores; dims 1 and 2 are swapped and
            a trailing axis is appended before the addition.
        :param top_arg_mask: mask given to the masked softmax (softmax-correction only).
        :param prepend_zeros: when true, a zero score is prepended on the last axis.
        :return: the argument scores, with the zero column first if requested.
        """
        num_batches = pairwise_embeddings.size(0)
        num_trigs = pairwise_embeddings.size(1)
        num_args = pairwise_embeddings.size(2)

        flat_embeddings = pairwise_embeddings.view(
            -1, self._argument_feedforward.input_dim)
        flat_scores = self._argument_scorer(
            self._argument_feedforward(flat_embeddings))
        argument_scores = flat_scores.view(num_batches, num_trigs, num_args, -1)

        # Add the mention scores for each of the candidates (kept in-place, so the
        # sum must broadcast to the shape of `argument_scores`).
        mention_scores = (top_trig_scores.unsqueeze(-1) +
                          top_arg_scores.transpose(1, 2).unsqueeze(-1))
        argument_scores += mention_scores

        if self._softmax_correction:
            # Softmax correction to compare arguments; temperature and multiplier
            # are stored as logs and exponentiated here.
            temperature = torch.exp(self._softmax_log_temp)
            multiplier = torch.exp(self._softmax_log_multiplier)
            corrected = util.masked_softmax(argument_scores / temperature,
                                            mask=top_arg_mask,
                                            dim=2)
            argument_scores = argument_scores + multiplier * corrected

        if not prepend_zeros:
            return argument_scores

        # One zero score per (batch, trigger, argument) position, placed first.
        zero_column = argument_scores.new_zeros(*argument_scores.size()[:3], 1)
        return torch.cat([zero_column, argument_scores], -1)

Example #29

0

Show file

    def _compute_attention(self, sentence_encodings: torch.Tensor,
                           context_encodings: torch.Tensor,
                           context_mask: torch.Tensor) -> torch.Tensor:
        """
        Re-encode each sentence by attending from the sentence encodings over a
        set of context encodings. "Context" is any collection of vectors to
        attend over, not necessarily the cloze-task context.

        Parameters
        ----------
        sentence_encodings: (batch_size, num_sents, hidden_dim)
            The original sentence encodings
        context_encodings: (batch_size, num_contexts, hidden_dim)
            The representation of each context item
        context_mask: (batch_size, num_contexts)
            The context item mask

        Returns
        -------
        The new sentence encodings: (batch_size, num_sents, hidden_dim)

        Raises
        ------
        Exception
            If ``self.attention`` or ``self.attention_layer`` is ``None``.
        """
        attention_ready = (self.attention is not None
                           and self.attention_layer is not None)
        if not attention_ready:
            raise Exception(
                '`attention` and `attention_layer` must not be `None` to use attention'
            )

        # shape: (batch_size, num_sents, num_context_tokens)
        scores = self.attention(sentence_encodings, context_encodings)
        probabilities = masked_softmax(scores, context_mask)

        # shape: (batch_size, num_sents, hidden_size)
        context = weighted_sum(context_encodings, probabilities)

        # Context and sentence encodings side by side (hidden_size * 2), projected
        # back to the sentence encoder hidden size.
        return self.attention_layer(
            torch.cat([context, sentence_encodings], dim=2))

Example #30

0

Show file

File: editor.py Project: isomap/factedit

    def _get_log_likelihood(self, state: Dict[str, torch.Tensor],
                            target_actions: torch.Tensor,
                            target_to_source: torch.Tensor) -> torch.Tensor:
        """
        Log-likelihood of the target action at this step, mixing a generation
        distribution and a copy distribution through a gate.

        :param state: decoder state; ``"action_mask"``, ``"encoded_triple"`` and
            ``"triple_mask"`` are read here, and the whole dict goes to ``self._get_query``.
        :param target_actions: gold action ids, used to pick the generation probability.
        :param target_to_source: boolean tensor (it is inverted with ``~``) marking the
            source positions whose copy probability counts towards the target.
        :return: ``log(gate * gen_prob + (1 - gate) * copy_prob)``, with the mixture
            clamped from below at ``1e-30`` before the log.
        """
        hidden = self.P(self._get_query(state))
        gate_prob = self.G(hidden).squeeze(1)

        # Probability the generator assigns to the target action.
        action_probs = util.masked_softmax(self.W(hidden), state["action_mask"], memory_efficient=True)
        gen_prob = action_probs.gather(1, target_actions.unsqueeze(1)).squeeze(1)
        # Generation counts only if the target is not OOV, or if nothing in the
        # source can be copied for it; the elementwise min with the 0/1 mask zeroes
        # the rest (assuming the probabilities are at most 1).
        is_known_action = target_actions != self.OOV
        has_no_copy_source = target_to_source.sum(dim=-1) == 0
        gen_prob = gen_prob.min((is_known_action | has_no_copy_source).float())

        # Copy mass summed over the source positions aligned with the target.
        copy_attention = self.COPY_ATTN(hidden, state["encoded_triple"], state["triple_mask"])
        copy_prob = copy_attention.masked_fill(~target_to_source, 0.).sum(dim=-1)

        mixture = gen_prob * gate_prob + copy_prob * (-gate_prob + 1)
        return mixture.clamp(1e-30).log()

Example #31

0

Show file

    def forward(self, x, x_mask):
        """
            Match a sequence against itself: every position attends over all
            positions of the same sequence.

            Args:
                x: batch * len1 * dim1
                x_mask: batch * len1 (1 for padding, 0 for true)
            Output:
                matched_seq: batch * len1 * dim1

            NOTE(review): the mask polarity above is the original author's
            description - confirm it matches what ``util.masked_softmax`` expects.
        """
        # Pairwise dot products between all positions: batch * len1 * len1.
        scores = x.bmm(x.transpose(2, 1))
        # Overwrite each position's score with itself with 0 (in place, through a
        # view of the diagonal).
        scores.diagonal(dim1=1, dim2=2).zero_()

        # Repeat the per-token mask for every query row.
        row_mask = x_mask.unsqueeze(1).expand(scores.size())
        weights = util.masked_softmax(scores, row_mask)

        return weights.bmm(x)

Example #32

0

Show file

    def forward(
        self,
        sequence_tensor: torch.FloatTensor,
        span_indices: torch.LongTensor,
        span_indices_mask: torch.BoolTensor = None,
    ) -> torch.FloatTensor:
        """
        Embed each span as an attention-weighted sum of its token embeddings, using
        all-zero attention logits.

        Parameters
        ----------
        sequence_tensor : ``torch.FloatTensor``
            Token embeddings, ``(batch_size, sequence_length, embedding_dim)`` with
            ``embedding_dim == self.input_dim`` (required by the matmul below).
        span_indices : ``torch.LongTensor``
            Span indices forwarded to ``util.batched_span_select``.
        span_indices_mask : ``torch.BoolTensor``, optional
            Mask over the spans themselves; spans that are padding are zeroed in
            the output.

        Returns
        -------
        ``torch.FloatTensor`` of shape ``(batch_size, num_spans, embedding_dim)``.
        """
        # The zero projection is allocated with `new_zeros` so it shares the dtype
        # and device of `sequence_tensor`. Every logit is therefore exactly zero,
        # which presumably makes the masked softmax below uniform over the unmasked
        # tokens of each span.
        # shape (batch_size, sequence_length, 1)
        global_attention_logits = torch.matmul(
            sequence_tensor,
            sequence_tensor.new_zeros(self.input_dim, 1))

        # Carry the logit along as an extra feature so one span selection fetches
        # both the embeddings and their logits.
        # shape (batch_size, sequence_length, embedding_dim + 1)
        concat_tensor = torch.cat([sequence_tensor, global_attention_logits],
                                  -1)

        concat_output, span_mask = util.batched_span_select(
            concat_tensor, span_indices)

        # Shape: (batch_size, num_spans, max_batch_span_width, embedding_dim)
        span_embeddings = concat_output[:, :, :, :-1]
        # Shape: (batch_size, num_spans, max_batch_span_width)
        span_attention_logits = concat_output[:, :, :, -1]

        # Shape: (batch_size, num_spans, max_batch_span_width)
        span_attention_weights = util.masked_softmax(span_attention_logits,
                                                     span_mask)

        # Do a weighted sum of the embedded spans with
        # respect to the normalised attention distributions.
        # Shape: (batch_size, num_spans, embedding_dim)
        attended_text_embeddings = util.weighted_sum(span_embeddings,
                                                     span_attention_weights)

        if span_indices_mask is not None:
            # Above we were masking the widths of spans with respect to the max
            # span width in the batch. Here we are masking the spans which were
            # originally passed in as padding.
            return attended_text_embeddings * span_indices_mask.unsqueeze(-1)

        return attended_text_embeddings

Example #33

0

Show file

File: model_kv.py Project: viswanathgs/r2c

    def attended_omcs_embeddings(self, vcr_embs):
        """
        For each input embedding, look up its ``self.k`` nearest OMCS embeddings in
        ``self.omcs_index`` and return their similarity-weighted combination.

        :param vcr_embs: input embeddings; every leading dim is preserved and the
            last dim is treated as the feature dim.
        :return: attended OMCS embeddings, reshaped to the leading dims of
            ``vcr_embs`` with the feature dim inferred.
        """
        # Project, normalise and flatten to (n, d).
        projected_embs = self.normalize_embedding(
            self.omcs_mlp(vcr_embs).view(-1, vcr_embs.shape[-1]))
        n, d = projected_embs.size()
        device = projected_embs.get_device()

        # The two helpers below expose a tensor's raw storage pointer to faiss; the
        # asserts guard the layout and dtype the pointer cast relies on.
        def swig_ptr_from_FloatTensor(x):
            assert x.is_contiguous()
            assert x.dtype == torch.float32
            return faiss.cast_integer_to_float_ptr(x.storage().data_ptr())

        def swig_ptr_from_LongTensor(x):
            assert x.is_contiguous()
            assert x.dtype == torch.int64, 'dtype=%s' % x.dtype
            return faiss.cast_integer_to_long_ptr(x.storage().data_ptr())

        # Uninitialised output buffers; presumably `search_c` writes the similarity
        # scores into D and the neighbour indices into I - verify against faiss.
        D = torch.empty((n, self.k), dtype=torch.float32, device=device)
        I = torch.empty((n, self.k), dtype=torch.int64, device=device)
        # NOTE(review): the synchronize calls presumably make sure pending torch
        # kernels have finished before faiss touches the raw pointers, and that
        # faiss has finished before torch reads D and I - confirm.
        torch.cuda.synchronize()
        self.omcs_index.at(device).search_c(
            n,
            swig_ptr_from_FloatTensor(projected_embs),
            self.k,
            swig_ptr_from_FloatTensor(D),
            swig_ptr_from_LongTensor(I),
        )
        torch.cuda.synchronize()

        # Compute softmax of similarity scores.
        # Only use those with cosine similarity scores above thresh.
        # TODO (viswanath): Use scaled-dot-product attn w/o normalization?
        mask = (D >= self.similarity_thresh)
        attention_wts = masked_softmax(D, mask)

        # Fetch the nearest found embeddings and then apply attention
        # using the computed weights.
        nearest_omcs_embs = self.omcs_embs[I]  # (n, k, d)
        # Weighted sum over the k neighbours of each query.
        attended_omcs_embs = torch.einsum('nk,nkd->nd',
                                          (attention_wts, nearest_omcs_embs))

        # Reshape to match original vcr_embs
        return attended_omcs_embs.view(*vcr_embs.shape[:-1], -1)

Example #34

0

Show file

File: intra_sentence_attention.py Project: apmoore1/allennlp

    def forward(self, tokens: torch.Tensor, mask: torch.Tensor):  # pylint: disable=arguments-differ
        """
        Let every token attend over the whole sentence, then combine each token
        with its attended representation and project the result.

        :param tokens: token representations, (batch_size, sequence_length, dim).
        :param mask: mask handed to the masked softmax over the attended positions.
        :return: ``self._output_projection`` applied to the combination of
            ``tokens`` and the attended sentence.
        """
        batch_size, sequence_length, _ = tokens.size()
        multi_head = self._num_attention_heads > 1

        # Shape: (batch_size, sequence_length, sequence_length), or with a trailing
        # num_heads axis in the multi-head case.
        similarities = self._matrix_attention(tokens, tokens)
        if multi_head:
            # Put the heads before the attended positions so the softmax still
            # normalises over the last axis:
            # (batch_size, sequence_length, num_heads, sequence_length).
            similarities = similarities.permute(0, 1, 3, 2)

        # Shape: (batch_size, sequence_length, [num_heads,] sequence_length)
        attention = util.masked_softmax(similarities.contiguous(), mask)

        # Shape: (batch_size, sequence_length, projection_dim)
        values = self._projection(tokens)
        if multi_head:
            # Split the projection into per-head chunks and bring the head axis
            # forward, so the weighted sum pairs each head with its own weights:
            # (batch_size, num_heads, sequence_length, projection_dim / num_heads).
            per_head_shape = list(values.size())[:-1] + [self._num_attention_heads, -1]
            values = values.view(*per_head_shape).permute(0, 2, 1, 3)

        # Shape: (batch_size, sequence_length, [num_heads,] projection_dim [/ num_heads])
        attended = util.weighted_sum(values, attention)
        if multi_head:
            # Concatenating the heads is just a reshape:
            # (batch_size, sequence_length, projection_dim).
            attended = attended.view(batch_size, sequence_length, -1)

        # Shape: (batch_size, sequence_length, combination_dim)
        combined = util.combine_tensors(self._combination, [tokens, attended])
        return self._output_projection(combined)

Example #35

0

Show file

File: nlvr_decoder_step.py Project: pyknife/allennlp

 def _get_next_state_info_with_agenda(
         state: NlvrDecoderState,
         considered_actions: List[List[int]],
         action_logits: torch.Tensor,
         action_mask: torch.Tensor) -> Tuple[List[List[Tuple[int, torch.LongTensor]]],
                                             List[List[ChecklistState]]]:
     """
     Collect, for every group element, the log probability and the updated checklist state of
     each candidate next action that is not padding. This variant is used when no target
     action sequences are available and training relies on agendas.

     Parameters
     ----------
     state : ``NlvrDecoderState``
         Provides ``score`` and ``checklist_state``, iterated in lockstep with the action
         probabilities.
     considered_actions : ``List[List[int]]``
         Per group element, the original action ids under consideration; ``-1`` marks padding.
     action_logits : ``torch.Tensor``
         Unnormalised action scores, normalised here with ``nn_util.masked_softmax``.
     action_mask : ``torch.Tensor``
         Mask handed to ``nn_util.masked_softmax`` together with ``action_logits``.

     Returns
     -------
     ``(all_action_logprobs, all_new_checklist_states)`` : one inner list per group element.
     The first holds ``(local_action_index, logprob)`` pairs, the second the checklist states
     produced by ``ChecklistState.update`` for the same actions, in the same order.
     """
     action_probabilities = nn_util.masked_softmax(action_logits, action_mask)
     logprobs_by_instance: List[List[Tuple[int, torch.LongTensor]]] = []
     checklist_states_by_instance: List[List[ChecklistState]] = []
     grouped = zip(state.score, action_probabilities, state.checklist_state)
     for group_index, (score, probabilities, checklist_state) in enumerate(grouped):
         logprobs: List[Tuple[int, torch.Tensor]] = []
         checklist_states: List[ChecklistState] = []
         for action_index, probability in enumerate(probabilities):
             # Map the local index back to the action id from the original action list.
             action = considered_actions[group_index][action_index]
             if action != -1:
                 checklist_states.append(checklist_state.update(action))
                 # The small epsilon keeps the log finite for zero (masked) probabilities.
                 logprobs.append((action_index,
                                  score + torch.log(probability + 1e-13)))
         logprobs_by_instance.append(logprobs)
         checklist_states_by_instance.append(checklist_states)
     return logprobs_by_instance, checklist_states_by_instance

Example #36

0

Show file

File: expected_risk_minimization.py Project: pyknife/allennlp

    def decode(self,
               initial_state: DecoderState,
               decode_step: DecoderStep,
               supervision: Callable[[StateType], torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Compute the expected-cost loss over the finished states reached from
        ``initial_state``.

        For each batch index, the model scores of its finished states are
        renormalised with an unmasked softmax and used to weight the costs that
        ``supervision`` assigns to those states; the per-batch expected costs are
        summed and divided by the number of batch indices with finished states.

        :param initial_state: state to start decoding from; ``score[0]`` also
            supplies the dtype/device of the loss accumulator.
        :param decode_step: step function handed to ``self._get_finished_states``.
        :param supervision: the cost function, mapping a state to its cost.
        :return: dict with ``'loss'`` (the mean expected cost) and
            ``'best_action_sequences'``.
        """
        finished_states = self._get_finished_states(initial_state, decode_step)
        loss = initial_state.score[0].new_zeros(1)
        scores_by_batch = self._get_model_scores_by_batch(finished_states)
        costs_by_batch = self._get_costs_by_batch(finished_states, supervision)
        for batch_index in scores_by_batch:
            flat_costs = torch.cat([cost.view(-1)
                                    for cost in costs_by_batch[batch_index]])
            flat_logprobs = torch.cat([score.view(-1)
                                       for score in scores_by_batch[batch_index]])
            # The model scores are log-probabilities of the predicted sequences; an
            # unmasked softmax turns them into probabilities renormalised over the
            # finished states, so the dot product is the expected cost.
            loss += nn_util.masked_softmax(flat_logprobs, None).dot(flat_costs)
        mean_loss = loss / len(scores_by_batch)
        return {'loss': mean_loss,
                'best_action_sequences': self._get_best_action_sequences(finished_states)}

Example #37

0

Show file

File: bidaf.py Project: apmoore1/allennlp

    def forward(self,  # type: ignore
                question: Dict[str, torch.LongTensor],
                passage: Dict[str, torch.LongTensor],
                span_start: torch.IntTensor = None,
                span_end: torch.IntTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        question : Dict[str, torch.LongTensor]
            From a ``TextField``.
        passage : Dict[str, torch.LongTensor]
            From a ``TextField``.  The model assumes that this passage contains the answer to the
            question, and predicts the beginning and ending positions of the answer within the
            passage.
        span_start : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            beginning position of the answer with the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        span_end : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            ending position of the answer with the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        metadata : ``List[Dict[str, Any]]``, optional
            If present, this should contain the question ID, original passage text, and token
            offsets into the passage for each instance in the batch.  We use this for computing
            official metrics using the official SQuAD evaluation script.  The length of this list
            should be the batch size, and each dictionary should have the keys ``id``,
            ``original_passage``, and ``token_offsets``.  If you only want the best span string and
            don't care about official metrics, you can omit the ``id`` key.

        Returns
        -------
        An output dictionary consisting of:
        span_start_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, passage_length)`` representing unnormalized log
            probabilities of the span start position.
        span_start_probs : torch.FloatTensor
            The result of ``softmax(span_start_logits)``.
        span_end_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, passage_length)`` representing unnormalized log
            probabilities of the span end position (inclusive).
        span_end_probs : torch.FloatTensor
            The result of ``softmax(span_end_logits)``.
        best_span : torch.IntTensor
            The result of a constrained inference over ``span_start_logits`` and
            ``span_end_logits`` to find the most probable span.  Shape is ``(batch_size, 2)``
            and each offset is a token index.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        best_span_str : List[str]
            If sufficient metadata was provided for the instances in the batch, we also return the
            string from the original passage that the model thinks is the best answer to the
            question.
        """
        embedded_question = self._highway_layer(self._text_field_embedder(question))
        embedded_passage = self._highway_layer(self._text_field_embedder(passage))
        batch_size = embedded_question.size(0)
        passage_length = embedded_passage.size(1)
        question_mask = util.get_text_field_mask(question).float()
        passage_mask = util.get_text_field_mask(passage).float()
        question_lstm_mask = question_mask if self._mask_lstms else None
        passage_lstm_mask = passage_mask if self._mask_lstms else None

        encoded_question = self._dropout(self._phrase_layer(embedded_question, question_lstm_mask))
        encoded_passage = self._dropout(self._phrase_layer(embedded_passage, passage_lstm_mask))
        encoding_dim = encoded_question.size(-1)

        # Shape: (batch_size, passage_length, question_length)
        passage_question_similarity = self._matrix_attention(encoded_passage, encoded_question)
        # Shape: (batch_size, passage_length, question_length)
        passage_question_attention = util.masked_softmax(passage_question_similarity, question_mask)
        # Shape: (batch_size, passage_length, encoding_dim)
        passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention)

        # We replace masked values with something really negative here, so they don't affect the
        # max below.
        masked_similarity = util.replace_masked_values(passage_question_similarity,
                                                       question_mask.unsqueeze(1),
                                                       -1e7)
        # Shape: (batch_size, passage_length)
        question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1)
        # Shape: (batch_size, passage_length)
        question_passage_attention = util.masked_softmax(question_passage_similarity, passage_mask)
        # Shape: (batch_size, encoding_dim)
        question_passage_vector = util.weighted_sum(encoded_passage, question_passage_attention)
        # Shape: (batch_size, passage_length, encoding_dim)
        tiled_question_passage_vector = question_passage_vector.unsqueeze(1).expand(batch_size,
                                                                                    passage_length,
                                                                                    encoding_dim)

        # Shape: (batch_size, passage_length, encoding_dim * 4)
        final_merged_passage = torch.cat([encoded_passage,
                                          passage_question_vectors,
                                          encoded_passage * passage_question_vectors,
                                          encoded_passage * tiled_question_passage_vector],
                                         dim=-1)

        modeled_passage = self._dropout(self._modeling_layer(final_merged_passage, passage_lstm_mask))
        modeling_dim = modeled_passage.size(-1)

        # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim))
        span_start_input = self._dropout(torch.cat([final_merged_passage, modeled_passage], dim=-1))
        # Shape: (batch_size, passage_length)
        span_start_logits = self._span_start_predictor(span_start_input).squeeze(-1)
        # Shape: (batch_size, passage_length)
        span_start_probs = util.masked_softmax(span_start_logits, passage_mask)

        # Shape: (batch_size, modeling_dim)
        span_start_representation = util.weighted_sum(modeled_passage, span_start_probs)
        # Shape: (batch_size, passage_length, modeling_dim)
        tiled_start_representation = span_start_representation.unsqueeze(1).expand(batch_size,
                                                                                   passage_length,
                                                                                   modeling_dim)

        # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim * 3)
        span_end_representation = torch.cat([final_merged_passage,
                                             modeled_passage,
                                             tiled_start_representation,
                                             modeled_passage * tiled_start_representation],
                                            dim=-1)
        # Shape: (batch_size, passage_length, encoding_dim)
        encoded_span_end = self._dropout(self._span_end_encoder(span_end_representation,
                                                                passage_lstm_mask))
        # Shape: (batch_size, passage_length, encoding_dim * 4 + span_end_encoding_dim)
        span_end_input = self._dropout(torch.cat([final_merged_passage, encoded_span_end], dim=-1))
        span_end_logits = self._span_end_predictor(span_end_input).squeeze(-1)
        span_end_probs = util.masked_softmax(span_end_logits, passage_mask)
        span_start_logits = util.replace_masked_values(span_start_logits, passage_mask, -1e7)
        span_end_logits = util.replace_masked_values(span_end_logits, passage_mask, -1e7)
        best_span = self.get_best_span(span_start_logits, span_end_logits)

        output_dict = {
                "passage_question_attention": passage_question_attention,
                "span_start_logits": span_start_logits,
                "span_start_probs": span_start_probs,
                "span_end_logits": span_end_logits,
                "span_end_probs": span_end_probs,
                "best_span": best_span,
                }

        # Compute the loss for training.
        if span_start is not None:
            loss = nll_loss(util.masked_log_softmax(span_start_logits, passage_mask), span_start.squeeze(-1))
            self._span_start_accuracy(span_start_logits, span_start.squeeze(-1))
            loss += nll_loss(util.masked_log_softmax(span_end_logits, passage_mask), span_end.squeeze(-1))
            self._span_end_accuracy(span_end_logits, span_end.squeeze(-1))
            self._span_accuracy(best_span, torch.stack([span_start, span_end], -1))
            output_dict["loss"] = loss

        # Compute the EM and F1 on SQuAD and add the tokenized input to the output.
        if metadata is not None:
            output_dict['best_span_str'] = []
            question_tokens = []
            passage_tokens = []
            for i in range(batch_size):
                question_tokens.append(metadata[i]['question_tokens'])
                passage_tokens.append(metadata[i]['passage_tokens'])
                passage_str = metadata[i]['original_passage']
                offsets = metadata[i]['token_offsets']
                predicted_span = tuple(best_span[i].detach().cpu().numpy())
                start_offset = offsets[predicted_span[0]][0]
                end_offset = offsets[predicted_span[1]][1]
                best_span_string = passage_str[start_offset:end_offset]
                output_dict['best_span_str'].append(best_span_string)
                answer_texts = metadata[i].get('answer_texts', [])
                if answer_texts:
                    self._squad_metrics(best_span_string, answer_texts)
            output_dict['question_tokens'] = question_tokens
            output_dict['passage_tokens'] = passage_tokens
        return output_dict

Example #38

0

Show file

File: esim.py Project: apmoore1/allennlp

    def forward(self,  # type: ignore
                premise: Dict[str, torch.LongTensor],
                hypothesis: Dict[str, torch.LongTensor],
                label: torch.IntTensor = None,
                metadata: List[Dict[str, Any]] = None  # pylint:disable=unused-argument
               ) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Embed and encode both sentences, let each attend over the other, re-encode the
        attention-enhanced sequences, pool them and classify the pooled vector.

        Parameters
        ----------
        premise : Dict[str, torch.LongTensor]
            From a ``TextField``
        hypothesis : Dict[str, torch.LongTensor]
            From a ``TextField``
        label : torch.IntTensor, optional (default = None)
            From a ``LabelField``
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            Metadata containing the original tokenization of the premise and
            hypothesis with 'premise_tokens' and 'hypothesis_tokens' keys respectively.
            It is accepted but not read by this method.

        Returns
        -------
        An output dictionary consisting of:

        label_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_labels)`` representing unnormalised log
            probabilities of the entailment label.
        label_probs : torch.FloatTensor
            A tensor of shape ``(batch_size, num_labels)`` representing probabilities of the
            entailment label.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.  Only present when ``label`` is given.
        """
        premise_embedding = self._text_field_embedder(premise)
        hypothesis_embedding = self._text_field_embedder(hypothesis)
        premise_mask = get_text_field_mask(premise).float()
        hypothesis_mask = get_text_field_mask(hypothesis).float()

        # The same (optional) dropout module is used in front of both recurrent encoders.
        input_dropout = self.rnn_input_dropout
        if input_dropout:
            premise_embedding = input_dropout(premise_embedding)
            hypothesis_embedding = input_dropout(hypothesis_embedding)

        # Contextual encoding of each sentence.
        premise_encoding = self._encoder(premise_embedding, premise_mask)
        hypothesis_encoding = self._encoder(hypothesis_embedding, hypothesis_mask)

        # Shape: (batch_size, premise_length, hypothesis_length)
        similarity = self._matrix_attention(premise_encoding, hypothesis_encoding)

        # Premise -> hypothesis attention.
        # Shape: (batch_size, premise_length, hypothesis_length)
        p2h_weights = masked_softmax(similarity, hypothesis_mask)
        # Shape: (batch_size, premise_length, embedding_dim)
        hypothesis_summary = weighted_sum(hypothesis_encoding, p2h_weights)

        # Hypothesis -> premise attention, computed on the transposed similarities.
        # Shape: (batch_size, hypothesis_length, premise_length)
        transposed_similarity = similarity.transpose(1, 2).contiguous()
        h2p_weights = masked_softmax(transposed_similarity, premise_mask)
        # Shape: (batch_size, hypothesis_length, embedding_dim)
        premise_summary = weighted_sum(premise_encoding, h2p_weights)

        # "Enhancement": each encoding next to what it attended to, their difference
        # and their element-wise product.
        premise_pieces = [premise_encoding,
                          hypothesis_summary,
                          premise_encoding - hypothesis_summary,
                          premise_encoding * hypothesis_summary]
        hypothesis_pieces = [hypothesis_encoding,
                             premise_summary,
                             hypothesis_encoding - premise_summary,
                             hypothesis_encoding * premise_summary]
        premise_enhanced = torch.cat(premise_pieces, dim=-1)
        hypothesis_enhanced = torch.cat(hypothesis_pieces, dim=-1)

        # Project back down to the model dimension; no dropout is applied before this step.
        premise_projected = self._projection_feedforward(premise_enhanced)
        hypothesis_projected = self._projection_feedforward(hypothesis_enhanced)

        # Inference composition layer.
        if input_dropout:
            premise_projected = input_dropout(premise_projected)
            hypothesis_projected = input_dropout(hypothesis_projected)
        premise_composed = self._inference_encoder(premise_projected, premise_mask)
        hypothesis_composed = self._inference_encoder(hypothesis_projected, hypothesis_mask)

        # Masks with a trailing singleton axis so they broadcast over the feature dimension.
        premise_mask_3d = premise_mask.unsqueeze(-1)
        hypothesis_mask_3d = hypothesis_mask.unsqueeze(-1)

        # Max pooling over time; padded positions are pushed to -1e7 so they never win.
        # Shape: (batch_size, model_dim)
        premise_max = replace_masked_values(premise_composed, premise_mask_3d, -1e7).max(dim=1)[0]
        hypothesis_max = replace_masked_values(hypothesis_composed, hypothesis_mask_3d, -1e7).max(dim=1)[0]

        # Average pooling over the unmasked positions only.
        premise_length = torch.sum(premise_mask, 1, keepdim=True)
        premise_avg = torch.sum(premise_composed * premise_mask_3d, dim=1) / premise_length
        hypothesis_length = torch.sum(hypothesis_mask, 1, keepdim=True)
        hypothesis_avg = torch.sum(hypothesis_composed * hypothesis_mask_3d, dim=1) / hypothesis_length

        # Shape: (batch_size, model_dim * 2 * 4)
        pooled = torch.cat([premise_avg, premise_max, hypothesis_avg, hypothesis_max], dim=1)

        # Final MLP: dropout on its input, the MLP itself handles hidden/output dropout.
        if self.dropout:
            pooled = self.dropout(pooled)

        hidden = self._output_feedforward(pooled)
        label_logits = self._output_logit(hidden)
        label_probs = torch.nn.functional.softmax(label_logits, dim=-1)

        output_dict = {"label_logits": label_logits, "label_probs": label_probs}
        if label is None:
            return output_dict

        # With gold labels available, compute the loss and update the accuracy metric.
        loss = self._loss(label_logits, label.long().view(-1))
        self._accuracy(label_logits, label)
        output_dict["loss"] = loss
        return output_dict

Example #39

0

Show file

File: decomposable_attention.py Project: apmoore1/allennlp

    def forward(self,  # type: ignore
                premise: Dict[str, torch.LongTensor],
                hypothesis: Dict[str, torch.LongTensor],
                label: torch.IntTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Attend between premise and hypothesis, compare every token with what it attended
        to, sum the comparisons per sentence and classify the aggregated pair.

        Parameters
        ----------
        premise : Dict[str, torch.LongTensor]
            From a ``TextField``
        hypothesis : Dict[str, torch.LongTensor]
            From a ``TextField``
        label : torch.IntTensor, optional, (default = None)
            From a ``LabelField``
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            Metadata containing the original tokenization of the premise and
            hypothesis with 'premise_tokens' and 'hypothesis_tokens' keys respectively.

        Returns
        -------
        An output dictionary consisting of:

        label_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_labels)`` representing unnormalised log
            probabilities of the entailment label.
        label_probs : torch.FloatTensor
            A tensor of shape ``(batch_size, num_labels)`` representing probabilities of the
            entailment label.
        h2p_attention, p2h_attention : torch.FloatTensor
            The hypothesis-to-premise and premise-to-hypothesis attention weights.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.  Only present when ``label`` is given.
        premise_tokens, hypothesis_tokens : list, optional
            Copied from ``metadata`` when it is given.
        """
        premise_repr = self._text_field_embedder(premise)
        hypothesis_repr = self._text_field_embedder(hypothesis)
        premise_mask = get_text_field_mask(premise).float()
        hypothesis_mask = get_text_field_mask(hypothesis).float()

        # Optional per-sentence encoders replace the raw embeddings when configured.
        if self._premise_encoder:
            premise_repr = self._premise_encoder(premise_repr, premise_mask)
        if self._hypothesis_encoder:
            hypothesis_repr = self._hypothesis_encoder(hypothesis_repr, hypothesis_mask)

        # Attend step.
        # Shape: (batch_size, premise_length, hypothesis_length)
        similarity = self._matrix_attention(self._attend_feedforward(premise_repr),
                                            self._attend_feedforward(hypothesis_repr))

        # Shape: (batch_size, premise_length, hypothesis_length)
        p2h_attention = masked_softmax(similarity, hypothesis_mask)
        # Shape: (batch_size, premise_length, embedding_dim)
        hypothesis_summary = weighted_sum(hypothesis_repr, p2h_attention)

        # Shape: (batch_size, hypothesis_length, premise_length)
        transposed_similarity = similarity.transpose(1, 2).contiguous()
        h2p_attention = masked_softmax(transposed_similarity, premise_mask)
        # Shape: (batch_size, hypothesis_length, embedding_dim)
        premise_summary = weighted_sum(premise_repr, h2p_attention)

        # Compare step: each token is paired with the summary of what it attended to.
        premise_pairs = torch.cat([premise_repr, hypothesis_summary], dim=-1)
        hypothesis_pairs = torch.cat([hypothesis_repr, premise_summary], dim=-1)

        # Padded positions are zeroed before summing over the sequence axis.
        # Shape: (batch_size, compare_dim)
        premise_total = (self._compare_feedforward(premise_pairs)
                         * premise_mask.unsqueeze(-1)).sum(dim=1)
        # Shape: (batch_size, compare_dim)
        hypothesis_total = (self._compare_feedforward(hypothesis_pairs)
                            * hypothesis_mask.unsqueeze(-1)).sum(dim=1)

        # Aggregate step.
        label_logits = self._aggregate_feedforward(
                torch.cat([premise_total, hypothesis_total], dim=-1))
        label_probs = torch.nn.functional.softmax(label_logits, dim=-1)

        output_dict = {"label_logits": label_logits,
                       "label_probs": label_probs,
                       "h2p_attention": h2p_attention,
                       "p2h_attention": p2h_attention}

        if label is not None:
            loss = self._loss(label_logits, label.long().view(-1))
            self._accuracy(label_logits, label)
            output_dict["loss"] = loss

        if metadata is not None:
            # Pass the original tokens through so callers can line them up with the attention.
            for key in ("premise_tokens", "hypothesis_tokens"):
                output_dict[key] = [instance[key] for instance in metadata]

        return output_dict

Example #40

0

Show file

File: 3.main_BiDAF_Experiments.py Project: manuwhs/Trapyng

# Passage/question attention stage of the experiment script.  It builds a
# passage-question similarity matrix, turns it into passage-to-question attention
# vectors, and derives a single question-to-passage summary vector.
# `encoded_passage`, `encoded_question`, `question_mask` and `passage_mask` are not
# defined in this excerpt and must already exist at this point in the script.
print ("-------------- SIMILARITY LAYER ---------------")

# Similarity function handed to the matrix attention below, configured with the
# "x,y,x*y" combination and 200-dimensional inputs on both sides.
# NOTE(review): 200 presumably equals the last dimension of encoded_passage and
# encoded_question -- confirm against the encoder set up earlier in the script.
similarity_function = LinearSimilarity(
      combination = "x,y,x*y",
      tensor_1_dim =  200,
      tensor_2_dim = 200)

matrix_attention = LegacyMatrixAttention(similarity_function)

passage_question_similarity = matrix_attention(encoded_passage, encoded_question)
# Shape: (batch_size, passage_length, question_length)
print ("passage question similarity: ", passage_question_similarity.shape)


# Normalise the similarities with the question mask, then use them to take a
# weighted sum of the question encodings.
# Shape: (batch_size, passage_length, question_length)
passage_question_attention = util.masked_softmax(passage_question_similarity, question_mask)
# Shape: (batch_size, passage_length, encoding_dim)
passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention)

# We replace masked values with something really negative here, so they don't affect the
# max below.
masked_similarity = util.replace_masked_values(passage_question_similarity,
                                               question_mask.unsqueeze(1),
                                               -1e7)
# Largest similarity along the last (question) axis for every passage position.
# Shape: (batch_size, passage_length)
question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1)
# Shape: (batch_size, passage_length)
question_passage_attention = util.masked_softmax(question_passage_similarity, passage_mask)
# Shape: (batch_size, encoding_dim)
question_passage_vector = util.weighted_sum(encoded_passage, question_passage_attention)
# Shape: (batch_size, passage_length, encoding_dim)

Example #41

0

Show file

File: multi_head_self_attention.py Project: apmoore1/allennlp

    def forward(self,  # pylint: disable=arguments-differ
                inputs: torch.Tensor,
                mask: torch.LongTensor = None) -> torch.FloatTensor:
        """
        Apply multi-head self attention to ``inputs`` and project the result.

        Parameters
        ----------
        inputs : ``torch.FloatTensor``, required.
            A tensor of shape (batch_size, timesteps, input_dim)
        mask : ``torch.FloatTensor``, optional (default = None).
            A tensor of shape (batch_size, timesteps).  When omitted, an all-ones
            mask is created from ``inputs``, i.e. nothing is masked.

        Returns
        -------
        A tensor of shape (batch_size, timesteps, output_projection_dim),
        where output_projection_dim = input_dim by default.
        """
        num_heads = self._num_heads

        batch_size, timesteps, _ = inputs.size()
        if mask is None:
            mask = inputs.new_ones(batch_size, timesteps)

        # Heads are folded into the batch axis for the batched matrix products below.
        heads_times_batch = batch_size * num_heads

        def split_into_heads(tensor, head_dim):
            # (batch_size, timesteps, num_heads * head_dim)
            #   -> (num_heads * batch_size, timesteps, head_dim)
            per_head = tensor.view(batch_size, timesteps, num_heads, head_dim)
            per_head = per_head.transpose(1, 2).contiguous()
            return per_head.view(heads_times_batch, timesteps, head_dim)

        # Shape (batch_size, timesteps, 2 * attention_dim + values_dim)
        combined_projection = self._combined_projection(inputs)
        # Splitting in chunks of attention_dim yields queries, keys and then one or
        # more chunks that together make up the values (more than one when
        # values_dim > attention_dim); those are concatenated back together.
        queries, keys, *value_chunks = combined_projection.split(self._attention_dim, -1)
        queries = queries.contiguous()
        keys = keys.contiguous()
        values = torch.cat(value_chunks, -1).contiguous()

        values_head_dim = int(self._values_dim/num_heads)
        attention_head_dim = int(self._attention_dim/num_heads)

        # Shape (num_heads * batch_size, timesteps, values_dim / num_heads)
        values_per_head = split_into_heads(values, values_head_dim)
        # Shape (num_heads * batch_size, timesteps, attention_dim / num_heads)
        queries_per_head = split_into_heads(queries, attention_head_dim)
        # Shape (num_heads * batch_size, timesteps, attention_dim / num_heads)
        keys_per_head = split_into_heads(keys, attention_head_dim)

        # shape (num_heads * batch_size, timesteps, timesteps)
        scaled_similarities = torch.bmm(queries_per_head, keys_per_head.transpose(1, 2)) / self._scale

        # Every head of a given batch element shares that element's mask.
        # shape (num_heads * batch_size, timesteps)
        mask_per_head = mask.repeat(1, num_heads).view(heads_times_batch, timesteps)
        # shape (num_heads * batch_size, timesteps, timesteps)
        attention = masked_softmax(scaled_similarities, mask_per_head)
        attention = self._attention_dropout(attention)

        # Attention-weighted sum of the values, independently for each head.
        # shape (num_heads * batch_size, timesteps, values_dim/num_heads)
        outputs = weighted_sum(values_per_head, attention)

        # Undo the head split:
        # (batch_size, num_heads, timesteps, values_dim/num_heads)
        #   -> (batch_size, timesteps, num_heads, values_dim/num_heads)
        #   -> (batch_size, timesteps, values_dim)
        outputs = outputs.view(batch_size, num_heads, timesteps, values_head_dim)
        outputs = outputs.transpose(1, 2).contiguous()
        outputs = outputs.view(batch_size, timesteps, self._values_dim)

        # Project back to original input size.
        # shape (batch_size, timesteps, input_size)
        return self._output_projection(outputs)

Example #42

0

Show file

File: util_test.py Project: ziaridoy20/allennlp

    def test_masked_softmax_masked(self):
        """Check ``util.masked_softmax`` against hand-computed values when a mask is given."""
        # Each entry is (scores, mask, expected probabilities); cases run in order.
        cases = [
                # General masked 1D case.
                ([[1.0, 2.0, 5.0]],
                 [[1.0, 0.0, 1.0]],
                 [[0.01798621, 0.0, 0.98201382]]),
                ([[0.0, 2.0, 3.0, 4.0]],
                 [[1.0, 0.0, 1.0, 1.0]],
                 [[0.01321289, 0.0, 0.26538793, 0.72139918]]),
                # 1D case where the input is all 0s and the mask is not all 0s.
                ([[0.0, 0.0, 0.0, 0.0]],
                 [[0.0, 0.0, 0.0, 1.0]],
                 [[0, 0, 0, 1]]),
                # 1D case where the input is not all 0s and the mask is all 0s.
                ([[0.0, 2.0, 3.0, 4.0]],
                 [[0.0, 0.0, 0.0, 0.0]],
                 [[0.0, 0.0, 0.0, 0.0]]),
                # 1D case where both the input and the mask are all 0s.
                ([[0.0, 0.0, 0.0, 0.0]],
                 [[0.0, 0.0, 0.0, 0.0]],
                 [[0.0, 0.0, 0.0, 0.0]]),
                # 1D case where there are large elements in the padding.
                ([[1.0, 1.0, 1e5]],
                 [[1.0, 1.0, 0.0]],
                 [[0.5, 0.5, 0]]),
                # General masked batched case.
                ([[1.0, 2.0, 5.0], [1.0, 2.0, 3.0]],
                 [[1.0, 0.0, 1.0], [1.0, 1.0, 1.0]],
                 [[0.01798621, 0.0, 0.98201382],
                  [0.090031, 0.244728, 0.665241]]),
                # Batched case where one of the inputs is all 0s but none of the
                # masks are all 0.
                ([[0.0, 0.0, 0.0], [1.0, 2.0, 3.0]],
                 [[1.0, 0.0, 1.0], [1.0, 1.0, 1.0]],
                 [[0.5, 0.0, 0.5],
                  [0.090031, 0.244728, 0.665241]]),
                # Batched cases where one of the inputs is all 0s and one of the
                # masks is all 0.
                ([[0.0, 0.0, 0.0], [1.0, 2.0, 3.0]],
                 [[1.0, 0.0, 1.0], [0.0, 0.0, 0.0]],
                 [[0.5, 0.0, 0.5],
                  [0.0, 0.0, 0.0]]),
                ([[0.0, 0.0, 0.0], [1.0, 2.0, 3.0]],
                 [[0.0, 0.0, 0.0], [1.0, 0.0, 1.0]],
                 [[0.0, 0.0, 0.0],
                  [0.11920292, 0.0, 0.88079708]]),
        ]

        for scores, mask_values, expected in cases:
            scores_tensor = torch.FloatTensor(scores)
            mask_tensor = torch.FloatTensor(mask_values)
            softmaxed = util.masked_softmax(scores_tensor, mask_tensor).data.numpy()
            assert_array_almost_equal(softmaxed, numpy.array(expected))

Example #43

0

Show file

File: bimpm_matching.py Project: apmoore1/allennlp

    def forward(self,
                context_1: torch.Tensor,
                mask_1: torch.Tensor,
                context_2: torch.Tensor,
                mask_2: torch.Tensor) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
        # pylint: disable=arguments-differ
        """
        Given the forward (or backward) representations of sentence1 and sentence2, apply four bilateral
        matching functions between them in one direction.

        Parameters
        ----------
        context_1 : ``torch.Tensor``
            Tensor of shape (batch_size, seq_len1, hidden_dim) representing the encoding of the first sentence.
        mask_1 : ``torch.Tensor``
            Binary Tensor of shape (batch_size, seq_len1), indicating which
            positions in the first sentence are padding (0) and which are not (1).
        context_2 : ``torch.Tensor``
            Tensor of shape (batch_size, seq_len2, hidden_dim) representing the encoding of the second sentence.
        mask_2 : ``torch.Tensor``
            Binary Tensor of shape (batch_size, seq_len2), indicating which
            positions in the second sentence are padding (0) and which are not (1).

        Returns
        -------
        A tuple of matching vectors for the two sentences. Each of which is a list of
        matching vectors of shape (batch, seq_len, num_perspectives or 1)
        """
        assert (not mask_2.requires_grad) and (not mask_1.requires_grad)
        assert context_1.size(-1) == context_2.size(-1) == self.hidden_dim

        # (batch,)
        len_1 = get_lengths_from_binary_sequence_mask(mask_1)
        len_2 = get_lengths_from_binary_sequence_mask(mask_2)

        # (batch, seq_len*)
        mask_1, mask_2 = mask_1.float(), mask_2.float()

        # explicitly set masked weights to zero
        # (batch_size, seq_len*, hidden_dim)
        context_1 = context_1 * mask_1.unsqueeze(-1)
        context_2 = context_2 * mask_2.unsqueeze(-1)

        # array to keep the matching vectors for the two sentences
        matching_vector_1: List[torch.Tensor] = []
        matching_vector_2: List[torch.Tensor] = []

        # Step 0. unweighted cosine
        # First calculate the cosine similarities between each forward
        # (or backward) contextual embedding and every forward (or backward)
        # contextual embedding of the other sentence.

        # (batch, seq_len1, seq_len2)
        cosine_sim = F.cosine_similarity(context_1.unsqueeze(-2), context_2.unsqueeze(-3), dim=3)

        # (batch, seq_len*, 1)
        cosine_max_1 = masked_max(cosine_sim, mask_2.unsqueeze(-2), dim=2, keepdim=True)
        cosine_mean_1 = masked_mean(cosine_sim, mask_2.unsqueeze(-2), dim=2, keepdim=True)
        cosine_max_2 = masked_max(cosine_sim.permute(0, 2, 1), mask_1.unsqueeze(-2), dim=2, keepdim=True)
        cosine_mean_2 = masked_mean(cosine_sim.permute(0, 2, 1), mask_1.unsqueeze(-2), dim=2, keepdim=True)

        matching_vector_1.extend([cosine_max_1, cosine_mean_1])
        matching_vector_2.extend([cosine_max_2, cosine_mean_2])

        # Step 1. Full-Matching
        # Each time step of forward (or backward) contextual embedding of one sentence
        # is compared with the last time step of the forward (or backward)
        # contextual embedding of the other sentence
        if self.with_full_match:

            # (batch, 1, hidden_dim)
            if self.is_forward:
                # (batch, 1, hidden_dim)
                last_position_1 = (len_1 - 1).clamp(min=0)
                last_position_1 = last_position_1.view(-1, 1, 1).expand(-1, 1, self.hidden_dim)
                last_position_2 = (len_2 - 1).clamp(min=0)
                last_position_2 = last_position_2.view(-1, 1, 1).expand(-1, 1, self.hidden_dim)

                context_1_last = context_1.gather(1, last_position_1)
                context_2_last = context_2.gather(1, last_position_2)
            else:
                context_1_last = context_1[:, 0:1, :]
                context_2_last = context_2[:, 0:1, :]

            # (batch, seq_len*, num_perspectives)
            matching_vector_1_full = multi_perspective_match(context_1,
                                                             context_2_last,
                                                             self.full_match_weights)
            matching_vector_2_full = multi_perspective_match(context_2,
                                                             context_1_last,
                                                             self.full_match_weights_reversed)

            matching_vector_1.extend(matching_vector_1_full)
            matching_vector_2.extend(matching_vector_2_full)

        # Step 2. Maxpooling-Matching
        # Each time step of forward (or backward) contextual embedding of one sentence
        # is compared with every time step of the forward (or backward)
        # contextual embedding of the other sentence, and only the max value of each
        # dimension is retained.
        if self.with_maxpool_match:
            # (batch, seq_len1, seq_len2, num_perspectives)
            matching_vector_max = multi_perspective_match_pairwise(context_1,
                                                                   context_2,
                                                                   self.maxpool_match_weights)

            # (batch, seq_len*, num_perspectives)
            matching_vector_1_max = masked_max(matching_vector_max,
                                               mask_2.unsqueeze(-2).unsqueeze(-1),
                                               dim=2)
            matching_vector_1_mean = masked_mean(matching_vector_max,
                                                 mask_2.unsqueeze(-2).unsqueeze(-1),
                                                 dim=2)
            matching_vector_2_max = masked_max(matching_vector_max.permute(0, 2, 1, 3),
                                               mask_1.unsqueeze(-2).unsqueeze(-1),
                                               dim=2)
            matching_vector_2_mean = masked_mean(matching_vector_max.permute(0, 2, 1, 3),
                                                 mask_1.unsqueeze(-2).unsqueeze(-1),
                                                 dim=2)

            matching_vector_1.extend([matching_vector_1_max, matching_vector_1_mean])
            matching_vector_2.extend([matching_vector_2_max, matching_vector_2_mean])


        # Step 3. Attentive-Matching
        # Each forward (or backward) similarity is taken as the weight
        # of the forward (or backward) contextual embedding, and calculate an
        # attentive vector for the sentence by weighted summing all its
        # contextual embeddings.
        # Finally match each forward (or backward) contextual embedding
        # with its corresponding attentive vector.

        # (batch, seq_len1, seq_len2, hidden_dim)
        att_2 = context_2.unsqueeze(-3) * cosine_sim.unsqueeze(-1)

        # (batch, seq_len1, seq_len2, hidden_dim)
        att_1 = context_1.unsqueeze(-2) * cosine_sim.unsqueeze(-1)

        if self.with_attentive_match:
            # (batch, seq_len*, hidden_dim)
            att_mean_2 = masked_softmax(att_2.sum(dim=2), mask_1.unsqueeze(-1))
            att_mean_1 = masked_softmax(att_1.sum(dim=1), mask_2.unsqueeze(-1))

            # (batch, seq_len*, num_perspectives)
            matching_vector_1_att_mean = multi_perspective_match(context_1,
                                                                 att_mean_2,
                                                                 self.attentive_match_weights)
            matching_vector_2_att_mean = multi_perspective_match(context_2,
                                                                 att_mean_1,
                                                                 self.attentive_match_weights_reversed)
            matching_vector_1.extend(matching_vector_1_att_mean)
            matching_vector_2.extend(matching_vector_2_att_mean)

        # Step 4. Max-Attentive-Matching
        # Pick the contextual embeddings with the highest cosine similarity as the attentive
        # vector, and match each forward (or backward) contextual embedding with its
        # corresponding attentive vector.
        if self.with_max_attentive_match:
            # (batch, seq_len*, hidden_dim)
            att_max_2 = masked_max(att_2, mask_2.unsqueeze(-2).unsqueeze(-1), dim=2)
            att_max_1 = masked_max(att_1.permute(0, 2, 1, 3), mask_1.unsqueeze(-2).unsqueeze(-1), dim=2)

            # (batch, seq_len*, num_perspectives)
            matching_vector_1_att_max = multi_perspective_match(context_1,
                                                                att_max_2,
                                                                self.max_attentive_match_weights)
            matching_vector_2_att_max = multi_perspective_match(context_2,
                                                                att_max_1,
                                                                self.max_attentive_match_weights_reversed)

            matching_vector_1.extend(matching_vector_1_att_max)
            matching_vector_2.extend(matching_vector_2_att_max)

        return matching_vector_1, matching_vector_2

Example #44

0

Show file

File: dialog_qa.py Project: apmoore1/allennlp

    def forward(self,  # type: ignore
                question: Dict[str, torch.LongTensor],
                passage: Dict[str, torch.LongTensor],
                span_start: torch.IntTensor = None,
                span_end: torch.IntTensor = None,
                p1_answer_marker: torch.IntTensor = None,
                p2_answer_marker: torch.IntTensor = None,
                p3_answer_marker: torch.IntTensor = None,
                yesno_list: torch.IntTensor = None,
                followup_list: torch.IntTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Run the dialog QA model over a batch of dialogs: predict an answer span plus a
        yes/no label and a follow-up label for every question of every dialog, and
        (when gold spans are supplied) compute the training loss and update metrics.

        Parameters
        ----------
        question : Dict[str, torch.LongTensor]
            From a ``TextField``.  The code reads ``question['token_characters']`` and unpacks
            its size into four values, the first three being
            ``(batch_size, max_qa_count, max_q_len)``.
        passage : Dict[str, torch.LongTensor]
            From a ``TextField``.  The model assumes that this passage contains the answer to the
            question, and predicts the beginning and ending positions of the answer within the
            passage.
        span_start : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            beginning position of the answer within the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        span_end : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            ending position of the answer within the passage.  This is an `inclusive` token index.
            It is only used when ``span_start`` is given.
        p1_answer_marker : ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 0.
            This is a tensor that has a shape [batch_size, max_qa_count, max_passage_length].
            Most passage tokens are assigned 'O', except the passage tokens that belong to the
            previous answer in the dialog, which are assigned labels such as <1_start>, <1_in>, <1_end>.
            For more details, look into dataset_readers/util/make_reading_comprehension_instance_quac
        p2_answer_marker :  ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 1.
            It is similar to p1_answer_marker, but marks the answer two turns back in the passage.
        p3_answer_marker :  ``torch.IntTensor``, optional
            This is one of the inputs, but only when num_context_answers > 2.
            It is similar to p1_answer_marker, but marks the answer three turns back in the passage.
        yesno_list :  ``torch.IntTensor``, optional
            This is one of the outputs that we are trying to predict.
            Three way classification (the yes/no/not a yes no question).
            Only read when ``span_start`` is given.
        followup_list :  ``torch.IntTensor``, optional
            This is one of the outputs that we are trying to predict.
            Three way classification (followup / maybe followup / don't followup).
            NOTE: despite the ``None`` default, this tensor is read unconditionally to build
            the per-question mask, so callers must always supply it.
        metadata : ``List[Dict[str, Any]]``, optional
            One dictionary per dialog in the batch.  NOTE: despite the ``None`` default, this list
            is indexed unconditionally when the output dictionary is built.  The keys read by
            this method are ``original_passage``, ``token_offsets``, ``instance_id`` and
            ``answer_texts_list``.

        Returns
        -------
        An output dictionary consisting of the following.
        Each entry (except ``loss``) is a nested list because we first iterate over dialogs,
        then over the questions in each dialog.

        qid : List[List[str]]
            A list of lists, consisting of question ids.
        followup : List[List[int]]
            A list of lists, consisting of continuation marker prediction index.
            (y :yes, m: maybe follow up, n: don't follow up)
        yesno : List[List[int]]
            A list of lists, consisting of affirmation marker prediction index.
            (y :yes, x: not a yes/no question, n: no)
        best_span_str : List[List[str]]
            The string from the original passage that the model thinks is the best answer to
            each question.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.  Only present when ``span_start`` is given.
        """
        batch_size, max_qa_count, max_q_len, _ = question['token_characters'].size()
        # From here on the (dialog, question) axes are flattened into one axis of
        # size ``total_qa_count`` so every question is treated as its own instance.
        total_qa_count = batch_size * max_qa_count
        # True for QA slots whose follow-up label is >= 0; negative labels are
        # presumably padding for dialogs with fewer questions - TODO confirm against the reader.
        qa_mask = torch.ge(followup_list, 0).view(total_qa_count)
        embedded_question = self._text_field_embedder(question, num_wrapping_dims=1)
        embedded_question = embedded_question.reshape(total_qa_count, max_q_len,
                                                      self._text_field_embedder.get_output_dim())
        embedded_question = self._variational_dropout(embedded_question)
        embedded_passage = self._variational_dropout(self._text_field_embedder(passage))
        passage_length = embedded_passage.size(1)

        question_mask = util.get_text_field_mask(question, num_wrapping_dims=1).float()
        question_mask = question_mask.reshape(total_qa_count, max_q_len)
        passage_mask = util.get_text_field_mask(passage).float()

        # Each dialog has a single passage; copy its mask once per question so it
        # lines up with the flattened question axis.
        repeated_passage_mask = passage_mask.unsqueeze(1).repeat(1, max_qa_count, 1)
        repeated_passage_mask = repeated_passage_mask.view(total_qa_count, passage_length)

        if self._num_context_answers > 0:
            # Encode question turn number inside the dialog into question embedding.
            question_num_ind = util.get_range_vector(max_qa_count, util.get_device_of(embedded_question))
            question_num_ind = question_num_ind.unsqueeze(-1).repeat(1, max_q_len)
            question_num_ind = question_num_ind.unsqueeze(0).repeat(batch_size, 1, 1)
            question_num_ind = question_num_ind.reshape(total_qa_count, max_q_len)
            question_num_marker_emb = self._question_num_marker(question_num_ind)
            embedded_question = torch.cat([embedded_question, question_num_marker_emb], dim=-1)

            # Encode the previous answers in passage embedding.
            repeated_embedded_passage = embedded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1). \
                view(total_qa_count, passage_length, self._text_field_embedder.get_output_dim())
            # batch_size * max_qa_count, passage_length, word_embed_dim
            p1_answer_marker = p1_answer_marker.view(total_qa_count, passage_length)
            p1_answer_marker_emb = self._prev_ans_marker(p1_answer_marker)
            repeated_embedded_passage = torch.cat([repeated_embedded_passage, p1_answer_marker_emb], dim=-1)
            # The same marker embedding module is reused for every history depth;
            # each extra depth appends another marker embedding to the passage features.
            if self._num_context_answers > 1:
                p2_answer_marker = p2_answer_marker.view(total_qa_count, passage_length)
                p2_answer_marker_emb = self._prev_ans_marker(p2_answer_marker)
                repeated_embedded_passage = torch.cat([repeated_embedded_passage, p2_answer_marker_emb], dim=-1)
                if self._num_context_answers > 2:
                    p3_answer_marker = p3_answer_marker.view(total_qa_count, passage_length)
                    p3_answer_marker_emb = self._prev_ans_marker(p3_answer_marker)
                    repeated_embedded_passage = torch.cat([repeated_embedded_passage, p3_answer_marker_emb],
                                                          dim=-1)

            # The passage features now differ per question (different answer markers),
            # so the passage has to be encoded once per question.
            repeated_encoded_passage = self._variational_dropout(self._phrase_layer(repeated_embedded_passage,
                                                                                    repeated_passage_mask))
        else:
            # No dialog history: encode the passage once per dialog and then copy
            # the encoding for every question.
            encoded_passage = self._variational_dropout(self._phrase_layer(embedded_passage, passage_mask))
            repeated_encoded_passage = encoded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1)
            repeated_encoded_passage = repeated_encoded_passage.view(total_qa_count,
                                                                     passage_length,
                                                                     self._encoding_dim)

        encoded_question = self._variational_dropout(self._phrase_layer(embedded_question, question_mask))

        # Shape: (batch_size * max_qa_count, passage_length, question_length)
        passage_question_similarity = self._matrix_attention(repeated_encoded_passage, encoded_question)
        # Shape: (batch_size * max_qa_count, passage_length, question_length)
        passage_question_attention = util.masked_softmax(passage_question_similarity, question_mask)
        # Shape: (batch_size * max_qa_count, passage_length, encoding_dim)
        passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention)

        # We replace masked values with something really negative here, so they don't affect the
        # max below.
        masked_similarity = util.replace_masked_values(passage_question_similarity,
                                                       question_mask.unsqueeze(1),
                                                       -1e7)

        # For every passage token keep its best similarity to any question token,
        # then normalise those scores over the passage to get question-to-passage attention.
        question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1)
        question_passage_attention = util.masked_softmax(question_passage_similarity, repeated_passage_mask)
        # Shape: (batch_size * max_qa_count, encoding_dim)
        question_passage_vector = util.weighted_sum(repeated_encoded_passage, question_passage_attention)
        tiled_question_passage_vector = question_passage_vector.unsqueeze(1).expand(total_qa_count,
                                                                                    passage_length,
                                                                                    self._encoding_dim)

        # Shape: (batch_size * max_qa_count, passage_length, encoding_dim * 4)
        final_merged_passage = torch.cat([repeated_encoded_passage,
                                          passage_question_vectors,
                                          repeated_encoded_passage * passage_question_vectors,
                                          repeated_encoded_passage * tiled_question_passage_vector],
                                         dim=-1)

        final_merged_passage = F.relu(self._merge_atten(final_merged_passage))

        # Residual self-attention block over the merged passage representation.
        residual_layer = self._variational_dropout(self._residual_encoder(final_merged_passage,
                                                                          repeated_passage_mask))
        self_attention_matrix = self._self_attention(residual_layer, residual_layer)

        # Pairwise mask: position (i, j) is valid only when both passage tokens are
        # unmasked.  The identity matrix is then subtracted out so that a token
        # cannot attend to itself.
        mask = repeated_passage_mask.reshape(total_qa_count, passage_length, 1) \
               * repeated_passage_mask.reshape(total_qa_count, 1, passage_length)
        self_mask = torch.eye(passage_length, passage_length, device=self_attention_matrix.device)
        self_mask = self_mask.reshape(1, passage_length, passage_length)
        mask = mask * (1 - self_mask)

        self_attention_probs = util.masked_softmax(self_attention_matrix, mask)

        # (batch, passage_len, passage_len) * (batch, passage_len, dim) -> (batch, passage_len, dim)
        self_attention_vecs = torch.matmul(self_attention_probs, residual_layer)
        self_attention_vecs = torch.cat([self_attention_vecs, residual_layer,
                                         residual_layer * self_attention_vecs],
                                        dim=-1)
        residual_layer = F.relu(self._merge_self_attention(self_attention_vecs))

        final_merged_passage = final_merged_passage + residual_layer
        # batch_size * maxqa_pair_len * max_passage_len * 200
        final_merged_passage = self._variational_dropout(final_merged_passage)
        start_rep = self._span_start_encoder(final_merged_passage, repeated_passage_mask)
        span_start_logits = self._span_start_predictor(start_rep).squeeze(-1)

        # The end encoder is conditioned on the start representation.
        end_rep = self._span_end_encoder(torch.cat([final_merged_passage, start_rep], dim=-1),
                                         repeated_passage_mask)
        span_end_logits = self._span_end_predictor(end_rep).squeeze(-1)

        # Yes/no and follow-up logits are produced for every passage position from
        # the end representation; the ones at the chosen span end are selected below.
        span_yesno_logits = self._span_yesno_predictor(end_rep).squeeze(-1)
        span_followup_logits = self._span_followup_predictor(end_rep).squeeze(-1)

        span_start_logits = util.replace_masked_values(span_start_logits, repeated_passage_mask, -1e7)
        # batch_size * maxqa_len_pair, max_document_len
        span_end_logits = util.replace_masked_values(span_end_logits, repeated_passage_mask, -1e7)

        # One row per QA pair; as used below, columns 0 and 1 are the predicted
        # start/end token indices and columns 2 and 3 the yes/no and follow-up predictions.
        best_span = self._get_best_span_yesno_followup(span_start_logits, span_end_logits,
                                                       span_yesno_logits, span_followup_logits,
                                                       self._max_span_length)

        output_dict: Dict[str, Any] = {}

        # Compute the loss.
        if span_start is not None:
            # Targets equal to -1 are skipped by the losses (``ignore_index=-1``).
            loss = nll_loss(util.masked_log_softmax(span_start_logits, repeated_passage_mask), span_start.view(-1),
                            ignore_index=-1)
            self._span_start_accuracy(span_start_logits, span_start.view(-1), mask=qa_mask)
            loss += nll_loss(util.masked_log_softmax(span_end_logits,
                                                     repeated_passage_mask), span_end.view(-1), ignore_index=-1)
            self._span_end_accuracy(span_end_logits, span_end.view(-1), mask=qa_mask)
            self._span_accuracy(best_span[:, 0:2],
                                torch.stack([span_start, span_end], -1).view(total_qa_count, 2),
                                mask=qa_mask.unsqueeze(1).expand(-1, 2).long())
            # add a select for the right span to compute loss
            # The yes/no and follow-up logits are flattened with ``view(-1)`` below, so the
            # three logits for passage position p of QA pair i are addressed as
            # (i * passage_length + p) * 3 + {0, 1, 2}.  ``max(..., 0)`` keeps the index
            # non-negative; presumably this guards padded entries whose span_end is -1 - TODO confirm.
            gold_span_end_loc = []
            span_end = span_end.view(total_qa_count).squeeze().data.cpu().numpy()
            for i in range(0, total_qa_count):
                gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3, 0))
                gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3 + 1, 0))
                gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3 + 2, 0))
            # ``new`` builds the index tensor with the same type as ``span_start``.
            gold_span_end_loc = span_start.new(gold_span_end_loc)

            # Same flat addressing, but at the predicted span end (column 1 of best_span).
            pred_span_end_loc = []
            for i in range(0, total_qa_count):
                pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3, 0))
                pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3 + 1, 0))
                pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3 + 2, 0))
            predicted_end = span_start.new(pred_span_end_loc)

            # The loss uses the logits at the gold span end ...
            _yesno = span_yesno_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3)
            _followup = span_followup_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3)
            loss += nll_loss(F.log_softmax(_yesno, dim=-1), yesno_list.view(-1), ignore_index=-1)
            loss += nll_loss(F.log_softmax(_followup, dim=-1), followup_list.view(-1), ignore_index=-1)

            # ... while the accuracy metrics use the logits at the predicted span end.
            _yesno = span_yesno_logits.view(-1).index_select(0, predicted_end).view(-1, 3)
            _followup = span_followup_logits.view(-1).index_select(0, predicted_end).view(-1, 3)
            self._span_yesno_accuracy(_yesno, yesno_list.view(-1), mask=qa_mask)
            self._span_followup_accuracy(_followup, followup_list.view(-1), mask=qa_mask)
            output_dict["loss"] = loss

        # Compute F1 and preparing the output dictionary.
        output_dict['best_span_str'] = []
        output_dict['qid'] = []
        output_dict['followup'] = []
        output_dict['yesno'] = []
        best_span_cpu = best_span.detach().cpu().numpy()
        for i in range(batch_size):
            passage_str = metadata[i]['original_passage']
            offsets = metadata[i]['token_offsets']
            # NOTE(review): f1_score is reset once per dialog, not once per question, so a
            # question with no answer_texts reports the previous question's score to
            # ``_official_f1`` - confirm whether that is intended.
            f1_score = 0.0
            per_dialog_best_span_list = []
            per_dialog_yesno_list = []
            per_dialog_followup_list = []
            per_dialog_query_id_list = []
            # Iterating over the metadata lists (not range(max_qa_count)) visits only as
            # many questions as the metadata holds for this dialog.
            for per_dialog_query_index, (iid, answer_texts) in enumerate(
                    zip(metadata[i]["instance_id"], metadata[i]["answer_texts_list"])):
                # Row of this question in the flattened (dialog, question) axis.
                predicted_span = tuple(best_span_cpu[i * max_qa_count + per_dialog_query_index])

                # Map the predicted token indices to character offsets:
                # first element of the start token's offset pair, second of the end token's.
                start_offset = offsets[predicted_span[0]][0]
                end_offset = offsets[predicted_span[1]][1]

                yesno_pred = predicted_span[2]
                followup_pred = predicted_span[3]
                per_dialog_yesno_list.append(yesno_pred)
                per_dialog_followup_list.append(followup_pred)
                per_dialog_query_id_list.append(iid)

                best_span_string = passage_str[start_offset:end_offset]
                per_dialog_best_span_list.append(best_span_string)
                if answer_texts:
                    if len(answer_texts) > 1:
                        t_f1 = []
                        # Compute F1 over N-1 human references and averages the scores.
                        for answer_index in range(len(answer_texts)):
                            # Leave one reference out each round.
                            idxes = list(range(len(answer_texts)))
                            idxes.pop(answer_index)
                            refs = [answer_texts[z] for z in idxes]
                            t_f1.append(squad_eval.metric_max_over_ground_truths(squad_eval.f1_score,
                                                                                 best_span_string,
                                                                                 refs))
                        f1_score = 1.0 * sum(t_f1) / len(t_f1)
                    else:
                        f1_score = squad_eval.metric_max_over_ground_truths(squad_eval.f1_score,
                                                                            best_span_string,
                                                                            answer_texts)
                # The metric is fed the score scaled by 100.
                self._official_f1(100 * f1_score)
            output_dict['qid'].append(per_dialog_query_id_list)
            output_dict['best_span_str'].append(per_dialog_best_span_list)
            output_dict['yesno'].append(per_dialog_yesno_list)
            output_dict['followup'].append(per_dialog_followup_list)
        return output_dict

Example #45

0

Show file

File: biattentive_classification_network.py Project: pyknife/allennlp

    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Classify a batch of token sequences with the biattentive classification network.

        The text is embedded (optionally together with ELMo representations), encoded,
        attended against itself, integrated, pooled four different ways (max, min, mean
        and self-attentive) and finally projected to class logits.

        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``.  An ``"elmo"`` entry, if present, is
            removed while the text field embedder runs and put back afterwards, so the
            dictionary holds the same entries on return.
        label : torch.LongTensor, optional (default = None)
            A variable representing the label for each instance in the batch.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            The output of the final projection layer.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_classes)`` representing a
            distribution over the label classes for each instance.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.  Only present when ``label`` is given.

        Raises
        ------
        ConfigurationError
            If the model has an ELMo module but ``tokens`` carries no ``"elmo"`` entry.
        """
        # The mask must be computed before the "elmo" entry is taken out of ``tokens``.
        text_mask = util.get_text_field_mask(tokens).float()

        # The text field embedder must not see the elmo indices, so hide them for the
        # duration of the call and restore them right after: the tests and later
        # training epochs rely on ``tokens`` not being modified by forward().
        elmo_tokens = tokens.pop("elmo", None)
        embedded_text = self._text_field_embedder(tokens)
        if elmo_tokens is not None:
            tokens["elmo"] = elmo_tokens

        # Build the ELMo representations when the model was configured with ELMo.
        if self._elmo:
            if elmo_tokens is None:
                raise ConfigurationError(
                        "Model was built to use Elmo, but input text is not tokenized for Elmo.")
            elmo_representations = self._elmo(elmo_tokens)["elmo_representations"]
            # Representations are consumed from the end of the list (cheap for lists):
            # the integrator-output one first, then the input one.
            if self._use_integrator_output_elmo:
                integrator_output_elmo = elmo_representations.pop()
            if self._use_input_elmo:
                input_elmo = elmo_representations.pop()
            # Every representation produced must have been claimed above.
            assert not elmo_representations

        if self._use_input_elmo:
            embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)

        # Dropout -> feed-forward -> sequence encoder.
        encoded_tokens = self._encoder(
                self._pre_encode_feedforward(self._embedding_dropout(embedded_text)),
                text_mask)

        # Biattention. Both sides are the same sequence, so one similarity matrix suffices.
        attention_logits = torch.bmm(encoded_tokens, encoded_tokens.transpose(1, 2).contiguous())
        attention_weights = util.last_dim_softmax(attention_logits, text_mask)
        encoded_text = util.weighted_sum(encoded_tokens, attention_weights)

        # Integrator input: the encoding, its difference from and its product with the context.
        integrator_features = [encoded_tokens,
                               encoded_tokens - encoded_text,
                               encoded_tokens * encoded_text]
        integrated_encodings = self._integrator(torch.cat(integrator_features, dim=2), text_mask)

        # Optionally append the ELMo representation to the integrator output.
        if self._use_integrator_output_elmo:
            integrated_encodings = torch.cat([integrated_encodings, integrator_output_elmo], dim=-1)

        # Simple pooling over the sequence dimension.  Masked positions are replaced with a
        # very small / very large value so they cannot win the max / min respectively.
        broadcast_mask = text_mask.unsqueeze(2)
        max_pool = util.replace_masked_values(integrated_encodings, broadcast_mask, -1e7).max(dim=1)[0]
        min_pool = util.replace_masked_values(integrated_encodings, broadcast_mask, +1e7).min(dim=1)[0]
        mean_pool = integrated_encodings.sum(dim=1) / text_mask.sum(dim=1, keepdim=True)

        # Self-attentive pooling: project each position to one score (the trailing size-1
        # dimension is squeezed away), softmax over the sequence, then take the weighted sum.
        self_attentive_logits = self._self_attentive_pooling_projection(integrated_encodings).squeeze(2)
        self_weights = util.masked_softmax(self_attentive_logits, text_mask)
        self_attentive_pool = util.weighted_sum(integrated_encodings, self_weights)

        pooled_representations = torch.cat([max_pool, min_pool, mean_pool, self_attentive_pool], dim=1)
        logits = self._output_layer(self._integrator_dropout(pooled_representations))

        output_dict = {'logits': logits,
                       'class_probabilities': F.softmax(logits, dim=-1)}
        if label is None:
            return output_dict

        loss = self.loss(logits, label)
        for metric in self.metrics.values():
            metric(logits, label)
        output_dict["loss"] = loss
        return output_dict

Example #46

0

Show file

File: self_attentive_span_extractor.py Project: apmoore1/allennlp

    def forward(self,
                sequence_tensor: torch.FloatTensor,
                span_indices: torch.LongTensor,
                sequence_mask: torch.LongTensor = None,
                span_indices_mask: torch.LongTensor = None) -> torch.FloatTensor:
        """
        Embed each span as an attention-weighted sum of the token vectors it covers.

        A single attention logit is computed per token of ``sequence_tensor``; for every
        span the logits of its tokens are softmax-normalised and used to average the
        corresponding token vectors.

        Parameters
        ----------
        sequence_tensor : torch.FloatTensor
            The sequence to extract spans from.
        span_indices : torch.LongTensor
            Start and end index of every span in its last dimension; the ends are `inclusive`.
        sequence_mask : torch.LongTensor, optional
            Accepted for interface compatibility; this method does not read it.
        span_indices_mask : torch.LongTensor, optional
            When given, the embeddings of spans masked out here are multiplied by zero.

        Returns
        -------
        A tensor of shape ``(batch_size, num_spans, embedding_dim)``.
        """
        # Both of shape (batch_size, num_spans, 1).
        span_starts, span_ends = span_indices.split(1, dim=-1)

        # Shape (batch_size, num_spans, 1). Because the ends are `inclusive`,
        # these widths are one less than the number of tokens in each span.
        span_widths = span_ends - span_starts

        # The widest span in the batch fixes how many positions are gathered for every
        # span; shorter spans have their surplus positions masked out below.
        max_batch_span_width = span_widths.max().item() + 1

        # Shape: (1, 1, max_batch_span_width)
        offsets_from_end = util.get_range_vector(max_batch_span_width,
                                                 util.get_device_of(sequence_tensor)).view(1, 1, -1)

        # Shape: (batch_size, num_spans, max_batch_span_width)
        # Walk backwards from each span end. The broadcasted ``<=`` comparison (rather
        # than ``<``) is deliberate: ends are inclusive, so an offset equal to the span
        # width is still inside the span.
        raw_span_indices = span_ends - offsets_from_end

        # A position is valid when it lies within the span's width AND its index did not
        # run off the front of the sequence (possible for spans close to the start whose
        # end index is smaller than max_batch_span_width).
        span_mask = (offsets_from_end <= span_widths).float() * (raw_span_indices >= 0).float()

        # Clamp the negative (already masked) indices to zero so they can be used for gathering.
        gather_indices = torch.nn.functional.relu(raw_span_indices.float()).long()

        # Shape: (batch_size * num_spans * max_batch_span_width)
        flat_gather_indices = util.flatten_and_batch_shift_indices(gather_indices,
                                                                   sequence_tensor.size(1))

        # Shape: (batch_size, num_spans, max_batch_span_width, embedding_dim)
        span_embeddings = util.batched_index_select(sequence_tensor,
                                                    gather_indices,
                                                    flat_gather_indices)

        # Shape: (batch_size, sequence_length, 1)
        global_attention_logits = self._global_attention(sequence_tensor)

        # Shape: (batch_size, num_spans, max_batch_span_width)
        span_attention_logits = util.batched_index_select(global_attention_logits,
                                                          gather_indices,
                                                          flat_gather_indices).squeeze(-1)
        # Shape: (batch_size, num_spans, max_batch_span_width)
        span_attention_weights = util.masked_softmax(span_attention_logits, span_mask)

        # Weighted sum of each span's token vectors under its normalised attention
        # distribution. Shape: (batch_size, num_spans, embedding_dim)
        attended_text_embeddings = util.weighted_sum(span_embeddings, span_attention_weights)

        if span_indices_mask is None:
            return attended_text_embeddings

        # The mask above handled span widths relative to the widest span in the batch;
        # this one zeroes out spans that were passed in purely as padding.
        return attended_text_embeddings * span_indices_mask.unsqueeze(-1).float()