Example #1
    def forward(self, input_ids, text_masks):
        # input_ids packs the token id tensor and its padding mask
        inputs, input_masks = input_ids
        batch_size = len(inputs)
        segment_ids, position_ids = self.get_segment_and_position_ids(inputs)
        # encode the full input sequence
        inputs_embedded, _ = self.transformer_encoder(inputs, input_masks,
                                                      segments=segment_ids, position_ids=position_ids)
        # prepend a zero column so the text mask lines up with the extra leading position
        # of the encoded sequence (that position stays masked for span extraction)
        text_masks = torch.cat([ops.zeros_var_cuda([batch_size, 1]), text_masks.float()], dim=1)
        # keep only the leading position plus the text tokens
        text_embedded = inputs_embedded[:, :text_masks.size(1), :]
        # translatability prediction from the encoding of the leading position
        output = self.translatability_pred(text_embedded[:, 0, :])
        # span prediction over the masked text tokens
        span_extractor_output = self.span_extractor(text_embedded, text_masks)
        return output, span_extractor_output
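A small, self-contained sketch of the mask adjustment performed above (assumed shapes, plain torch.zeros in place of the ops.zeros_var_cuda helper): a zero column is prepended to text_masks so it aligns with the one extra leading position of the encoded sequence, and the embeddings are then truncated to that mask length.

import torch

batch_size, text_len, seq_len, dim = 2, 4, 7, 8
text_masks = torch.ones(batch_size, text_len)             # mask over the text tokens
inputs_embedded = torch.randn(batch_size, seq_len, dim)   # encoder output (longer than the text)

# prepend a zero column: the leading position stays masked for span extraction
text_masks = torch.cat([torch.zeros(batch_size, 1), text_masks.float()], dim=1)
# keep only the leading position plus the text tokens
text_embedded = inputs_embedded[:, :text_masks.size(1), :]
print(text_masks.shape, text_embedded.shape)   # torch.Size([2, 5]) torch.Size([2, 5, 8])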
Example #2
    def forward(self,
                input_embedded,
                hidden,
                encoder_hiddens,
                encoder_hidden_masks,
                pointer_context=None,
                encoder_ptr_value_ids=None,
                decoder_ptr_value_ids=None,
                last_output=None):
        """
        :param input_embedded: [batch_size, seq_len(=1), input_dim]
        :param hidden: (h, c)
            h - [num_layers, batch_size, hidden_dim]
            c - [num_layers, batch_size, hidden_dim]
        :param encoder_hiddens: [batch_size, encoder_seq_len, hidden_dim]
        :param encoder_hidden_masks: [batch_size, encoder_seq_len]
        :param pointer_context
            p_pointer - [batch_size, seq_len(=1), 1]
            attn_weights - [batch_size, num_head, seq_len(=1), encoder_seq_len]
        :param encoder_ptr_value_ids: [batch_size, encoder_seq_len]
            Maps each element in the memory to the pointing-generating vocabulary. If None, the pointing and
            generating vocabularies do not overlap.
        :param decoder_ptr_value_ids: [batch_size, decoder_seq_len]
            Decoder output ground truth. Used during training only.
        :param last_output: [batch_size, seq_len(=1)]
            Decoding result of previous step.
        :return outputs: [batch_size, seq_len(=1), output_vocab_size]
        :return hidden: updated decoder hidden state
        :return pointer_context
            p_pointer - [batch_size, seq_len(=1), 1]
            attn_weights - [batch_size, num_head, seq_len(=1), encoder_seq_len]
        """
        if not self.training:
            assert (decoder_ptr_value_ids is None)
        batch_size = len(input_embedded)

        # unpack or initialize the pointer context
        if pointer_context:
            p_pointer, attn_weights = pointer_context
        else:
            p_pointer = ops.zeros_var_cuda([batch_size, 1, 1])
            attn_weights = ops.zeros_var_cuda(
                [batch_size, self.attn.num_heads, 1,
                 encoder_hiddens.size(1)])

        outputs, hiddens = [], []
        seq_attn_weights = []
        seq_p_pointers = []

        for i in range(input_embedded.size(1)):
            input_ = input_embedded[:, i:i + 1, :]
            # compute selective read
            if self.training and decoder_ptr_value_ids is not None:
                if i == 0:
                    last_output = ops.int_fill_var_cuda([batch_size, 1],
                                                        self.vocab.start_id)
                else:
                    last_output = decoder_ptr_value_ids[:, i:i + 1]
            else:
                assert (last_output is not None)
            # [batch_size, encoder_seq_len]
            selective_attn = selective_read(
                encoder_ptr_value_ids, encoder_hiddens,
                self.merge_multi_head_attention(attn_weights), last_output)
            # [batch_size, 1, input_dim + self.attn_value_dim]
            input_ = torch.cat([input_, selective_attn], dim=2)

            output, hidden = self.rnn(input_, hidden)
            if self.training and self.return_hiddens:
                hiddens.append(hidden)
            # a) compute attention vector and attention weights
            # [batch_size, 1, attn_value_dim], [batch_size, num_head, 1, encoder_seq_len]
            attn_vec, attn_weights = self.attn(output, encoder_hiddens,
                                               encoder_hiddens,
                                               encoder_hidden_masks)
            # b) compute pointer-generator switch
            # [batch_size, 1, 1]
            p_pointer = self.pointer_switch(output, attn_vec)
            # c) update output state
            output = self.attn_combine(output, attn_vec)
            # d.1) compute generation prob
            # [batch_size, 1, output_vocab_size]
            gen_logit = self.out(output)
            # d.2) merge pointing and generation prob
            # [batch_size, 1, output_vocab_size + max_in_seq_len]
            if encoder_ptr_value_ids is None:
                point_gen_prob = torch.cat(
                    [(1 - p_pointer) * torch.exp(gen_logit),
                     p_pointer * self.merge_multi_head_attention(attn_weights)],
                    dim=2)
            else:
                gen_prob_zeros_pad = ops.zeros_var_cuda(
                    (batch_size, 1, encoder_ptr_value_ids.size(1)))
                weighted_gen_prob = torch.cat(
                    [(1 - p_pointer) * torch.exp(gen_logit), gen_prob_zeros_pad],
                    dim=2)
                weighted_point_prob = p_pointer * self.merge_multi_head_attention(
                    attn_weights)
                point_gen_prob = weighted_gen_prob.scatter_add_(
                    dim=2,
                    index=encoder_ptr_value_ids.unsqueeze(1),
                    src=weighted_point_prob)
            point_gen_logit = ops.safe_log(point_gen_prob)

            outputs.append(point_gen_logit)
            seq_attn_weights.append(attn_weights)
            seq_p_pointers.append(p_pointer)

        if self.training and self.return_hiddens:
            return torch.cat(outputs, dim=1), hidden, \
                   (torch.cat(seq_p_pointers, dim=1), torch.cat(seq_attn_weights, dim=2)), \
                   self.cat_lstm_hiddens(hiddens)
        else:
            return torch.cat(outputs, dim=1), hidden, \
                   (torch.cat(seq_p_pointers, dim=1), torch.cat(seq_attn_weights, dim=2))
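As a standalone illustration of step d.2 above (hypothetical sizes, plain torch in place of the ops helpers), the sketch below shows how the pointer mass over the encoder memory is scattered into the joint vocabulary at the indices given by encoder_ptr_value_ids: memory entries that map to in-vocabulary tokens add their probability onto the corresponding generation slots, and the merged distribution still sums to one.

import torch

batch_size, vocab_size, enc_len = 1, 5, 3
p_pointer = torch.tensor([[[0.4]]])                                  # [batch, 1, 1]
gen_prob = torch.softmax(torch.randn(batch_size, 1, vocab_size), dim=2)
attn = torch.softmax(torch.randn(batch_size, 1, enc_len), dim=2)     # merged attention weights
# memory position -> token id in the joint vocabulary of size vocab_size + enc_len
encoder_ptr_value_ids = torch.tensor([[2, 5, 6]])                    # [batch, enc_len]

weighted_gen_prob = torch.cat(
    [(1 - p_pointer) * gen_prob, torch.zeros(batch_size, 1, enc_len)], dim=2)
point_gen_prob = weighted_gen_prob.scatter_add_(
    dim=2, index=encoder_ptr_value_ids.unsqueeze(1), src=p_pointer * attn)
print(point_gen_prob.sum())  # tensor(1.0000): a valid distribution over vocab + memory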
Example #3
    def forward(self,
                input_embedded,
                hidden,
                encoder_hiddens,
                encoder_hidden_masks,
                pointer_context=None,
                vocab_masks=None,
                memory_masks=None,
                encoder_ptr_value_ids=None,
                decoder_ptr_value_ids=None,
                last_output=None):
        """
        :param input_embedded: [batch_size, seq_len(=1), input_dim]
        :param hidden: (h, c)
            h - [num_layers, batch_size, hidden_dim]
            c - [num_layers, batch_size, hidden_dim]
        :param encoder_hiddens: [batch_size, encoder_seq_len, hidden_dim]
        :param encoder_hidden_masks: [batch_size, encoder_seq_len]
        :param pointer_context
            p_pointer - [batch_size, seq_len(=1), 1]
            attn_weights - [batch_size, num_head, seq_len(=1), encoder_seq_len]
        :param vocab_masks: [batch_size, vocab_size] binary mask in which the banned vocab entries are set to 0 and the
            rest are set to 1.
        :param memory_masks: [batch_size, memory_seq_len] binary mask in which the banned memory entries are set
            to 0 and the rest are set to 1.
        :param encoder_ptr_value_ids: [batch_size, encoder_seq_len] maps each element in the memory to the
            pointing-generating vocabulary. If None, the pointing and generating vocabularies do not overlap.
        :param decoder_ptr_value_ids: [batch_size, decoder_seq_len]
            Decoder output ground truth. Used during training only.
        :param last_output: [batch_size, seq_len(=1)]
            Decoding result of previous step.
        :return outputs: [batch_size, seq_len(=1), output_vocab_size]
        :return hidden: updated decoder hidden state
        :return pointer_context
            p_pointer - [batch_size, seq_len(=1), 1]
            attn_weights - [batch_size, num_head, seq_len(=1), encoder_seq_len]
        """
        assert (encoder_ptr_value_ids is None or
                encoder_hiddens.size(1) == encoder_ptr_value_ids.size(1))
        batch_size = len(input_embedded)

        # unpack or initialize the pointer context
        if pointer_context:
            p_pointer, attn_weights = pointer_context
        else:
            p_pointer = ops.zeros_var_cuda([batch_size, 1, 1])
            attn_weights = ops.zeros_var_cuda(
                [batch_size, self.attn.num_heads, 1,
                 encoder_hiddens.size(1)])

        outputs, hiddens = [], []
        seq_attn_weights = []
        seq_p_pointers = []

        for i in range(input_embedded.size(1)):
            input_ = input_embedded[:, i:i + 1, :]
            # compute selective read
            if self.training and decoder_ptr_value_ids is not None:
                last_output = decoder_ptr_value_ids[:, i:i + 1]
            else:
                assert (last_output is not None)

            # [batch_size, encoder_seq_len]
            select_attn = selective_read(
                encoder_ptr_value_ids, encoder_hiddens,
                self.merge_multi_head_attention(attn_weights), last_output)
            # [batch_size, 1, input_dim + self.attn_value_dim]
            input_sa = torch.cat([input_, p_pointer * select_attn], dim=2)
            output, hidden = self.rnn(input_sa, hidden)
            if self.training and self.return_hiddens:
                hiddens.append(hidden)
            # a) compute attention vector and attention weights
            # [batch_size, 1, attn_value_dim], [batch_size, num_head, 1, encoder_seq_len]
            attn_vec, attn_weights = self.attn(output, encoder_hiddens,
                                               encoder_hiddens,
                                               encoder_hidden_masks)
            # b) compute pointer-generator switch
            # [batch_size, 1, 1]
            p_pointer = self.pointer_switch(output, attn_vec)
            # c) update output state
            output = self.attn_combine(output, attn_vec)
            # d.1) compute generation prob
            # [batch_size, 1, output_vocab_size]
            gen_logit = self.out(output)
            gen_prob = torch.exp(gen_logit)
            # TODO: vocab_mask implementation in progress
            # assert(vocab_masks is None)
            # if vocab_masks is not None:
            #     gen_prob *= vocab_masks.float().unsqueeze(1)
            # d.2) compute schema element pointing prob

            # d.3) compute text span pointing prob

            # d.4) merge d.1, d.2 and d.3
            # [batch_size, 1, output_vocab_size + max_in_seq_len]
            point_prob = self.merge_multi_head_attention(attn_weights)
            if memory_masks is not None:
                point_prob *= memory_masks.float().unsqueeze(1)
            weighted_point_prob = p_pointer * point_prob
            if encoder_ptr_value_ids is None:
                point_gen_prob = torch.cat(
                    [(1 - p_pointer) * gen_prob, weighted_point_prob], dim=2)
            else:
                gen_prob_zeros_pad = ops.zeros_var_cuda(
                    (batch_size, 1, encoder_hiddens.size(1)))
                weighted_gen_prob = torch.cat(
                    [(1 - p_pointer) * gen_prob, gen_prob_zeros_pad], dim=2)
                point_gen_prob = weighted_gen_prob.scatter_add_(
                    index=encoder_ptr_value_ids.unsqueeze(1),
                    src=weighted_point_prob,
                    dim=2)
            point_gen_logit = ops.safe_log(point_gen_prob)

            outputs.append(point_gen_logit)
            seq_attn_weights.append(attn_weights)
            seq_p_pointers.append(p_pointer)

        if self.training and self.return_hiddens:
            return torch.cat(outputs, dim=1), hidden, \
                   (torch.cat(seq_p_pointers, dim=1), torch.cat(seq_attn_weights, dim=2)), \
                   self.cat_lstm_hiddens(hiddens)
        else:
            return torch.cat(outputs, dim=1), hidden, \
                   (torch.cat(seq_p_pointers, dim=1), torch.cat(seq_attn_weights, dim=2))
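selective_read is not defined in these examples; the sketch below is an assumption about its behavior, written in the spirit of a CopyNet-style selective read consistent with how it is called above: encoder positions whose pointed vocabulary id equals the previously emitted token are selected, the previous attention weights over them are renormalized, and the matching encoder hidden states are combined with those weights.

import torch

def selective_read_sketch(encoder_ptr_value_ids,  # [batch, enc_len]
                          encoder_hiddens,        # [batch, enc_len, hidden_dim]
                          attn_weights,           # [batch, 1, enc_len] (heads already merged)
                          last_output):           # [batch, 1] id emitted at the previous step
    # 1.0 where the memory entry maps to the last emitted token, else 0.0
    match = (encoder_ptr_value_ids == last_output).float()
    weights = attn_weights.squeeze(1) * match                       # [batch, enc_len]
    weights = weights / weights.sum(dim=1, keepdim=True).clamp(min=1e-12)
    # weighted sum of the matching encoder states
    return torch.bmm(weights.unsqueeze(1), encoder_hiddens)         # [batch, 1, hidden_dim]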