def test_augmented_lstm_computes_same_function_as_pytorch_lstm(self):
        augmented_lstm = AugmentedLstm(10, 11)
        pytorch_lstm = LSTM(10, 11, num_layers=1, batch_first=True)
        # Initialize all weights to be == 1.
        initializer = InitializerApplicator([(".*", lambda tensor: torch.nn.init.constant_(tensor, 1.))])
        initializer(augmented_lstm)
        initializer(pytorch_lstm)

        initial_state = torch.zeros([1, 5, 11])
        initial_memory = torch.zeros([1, 5, 11])

        # Use bigger numbers to avoid floating point instability.
        sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor * 5., self.sequence_lengths)
        lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)

        augmented_output, augmented_state = augmented_lstm(lstm_input, (initial_state, initial_memory))
        pytorch_output, pytorch_state = pytorch_lstm(lstm_input, (initial_state, initial_memory))
        pytorch_output_sequence, _ = pad_packed_sequence(pytorch_output, batch_first=True)
        augmented_output_sequence, _ = pad_packed_sequence(augmented_output, batch_first=True)

        numpy.testing.assert_array_almost_equal(pytorch_output_sequence.data.numpy(),
                                                augmented_output_sequence.data.numpy(), decimal=4)
        numpy.testing.assert_array_almost_equal(pytorch_state[0].data.numpy(),
                                                augmented_state[0].data.numpy(), decimal=4)
        numpy.testing.assert_array_almost_equal(pytorch_state[1].data.numpy(),
                                                augmented_state[1].data.numpy(), decimal=4)
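A minimal, standalone version of the sort → pack → run → unpack round trip this test relies on (a sketch using a plain torch.nn.LSTM; the sizes are illustrative):

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = torch.nn.LSTM(input_size=10, hidden_size=11, num_layers=1, batch_first=True)
batch = torch.randn(5, 7, 10)            # (batch_size, max_len, input_size)
lengths = [7, 6, 4, 3, 2]                # already sorted in descending order

packed = pack_padded_sequence(batch, lengths, batch_first=True)
packed_output, (h_n, c_n) = lstm(packed)
output, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
assert output.shape == (5, 7, 11)        # padded positions come back zero-filled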
    def forward(self,  # pylint: disable=arguments-differ
                inputs: PackedSequence,
                initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
            A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            A tuple (state, memory) representing the initial hidden state and memory
            of the LSTM. Each tensor has shape (num_layers, batch_size, hidden_size).

        Returns
        -------
        output_sequence : PackedSequence
            The encoded sequence of shape (batch_size, sequence_length, hidden_size * 2)
        final_states : Tuple[torch.Tensor, torch.Tensor]
            The per-layer final (state, memory) states of the LSTM, each with shape
            (num_layers * 2, batch_size, hidden_size).
        """
        if not initial_state:
            hidden_states = [None] * len(self.lstm_layers)
        elif initial_state[0].size()[0] != len(self.lstm_layers):
            raise ConfigurationError("Initial states were passed to forward() but the number of "
                                     "initial states does not match the number of layers.")
        else:
            hidden_states = list(zip(initial_state[0].split(1, 0),
                                     initial_state[1].split(1, 0)))

        output_sequence = inputs
        final_h = []
        final_c = []
        for i, state in enumerate(hidden_states):
            forward_layer = getattr(self, 'forward_layer_{}'.format(i))
            backward_layer = getattr(self, 'backward_layer_{}'.format(i))
            # The state is duplicated to mirror the Pytorch API for LSTMs.
            forward_output, final_forward_state = forward_layer(output_sequence, state)
            backward_output, final_backward_state = backward_layer(output_sequence, state)

            forward_output, lengths = pad_packed_sequence(forward_output, batch_first=True)
            backward_output, _ = pad_packed_sequence(backward_output, batch_first=True)

            output_sequence = torch.cat([forward_output, backward_output], -1)
            output_sequence = pack_padded_sequence(output_sequence, lengths, batch_first=True)

            final_h.extend([final_forward_state[0], final_backward_state[0]])
            final_c.extend([final_forward_state[1], final_backward_state[1]])

        final_h = torch.cat(final_h, dim=0)
        final_c = torch.cat(final_c, dim=0)
        final_state_tuple = (final_h, final_c)
        return output_sequence, final_state_tuple
 def forward(self, input, *args):
     args, seq_lengths = args[:-1], args[-1]
     input = rnn_utils.pack_padded_sequence(input, seq_lengths, self.batch_first)
     rets = self.model(input, *args)
     ret, rets = rets[0], rets[1:]
     ret, _ = rnn_utils.pad_packed_sequence(ret, self.batch_first)
     return tuple([ret] + list(rets))
Example #4
    def forward(self, xs):
        bsz = len(xs)

        # embed input tokens
        xes = F.dropout(self.lt(xs), p=self.dropout, training=self.training)
        x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data]
        xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True)

        zeros = self.zeros(xs)
        if zeros.size(1) != bsz:
            zeros.resize_(self.layers * self.dirs, bsz, self.hsz).fill_(0)
        h0 = Variable(zeros, requires_grad=False)

        if type(self.rnn) == nn.LSTM:
            encoder_output_packed, hidden = self.rnn(xes_packed, (h0, h0))
            # take elementwise max between forward and backward hidden states
            hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).max(1)[0],
                      hidden[1].view(-1, self.dirs, bsz, self.hsz).max(1)[0])
        else:
            encoder_output_packed, hidden = self.rnn(xes_packed, h0)

            # take elementwise max between forward and backward hidden states
            hidden = hidden.view(-1, self.dirs, bsz, self.hsz).max(1)[0]
        encoder_output, _ = pad_packed_sequence(encoder_output_packed,
                                                batch_first=True)
        return encoder_output, hidden
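The direction-wise max used above, shown on a standalone tensor (a sketch; the dimension names follow the snippet):

import torch

layers, dirs, bsz, hsz = 1, 2, 4, 8
h_n = torch.randn(layers * dirs, bsz, hsz)            # final hidden states of a bidirectional RNN
# elementwise max between the forward and backward states of each layer
h_max = h_n.view(layers, dirs, bsz, hsz).max(1)[0]    # -> (layers, bsz, hsz)
assert h_max.shape == (layers, bsz, hsz)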
Example #5
    def encode_table_header(self, tables):
        # input, ids of table word: (batch_size, max_column_num)
        # encode_output: (max_head_word_num, batch_size, max_column_num, hidden_size)

        # (batch_size, max_column_num, max_head_word_num)
        # table_head_mask: (batch_size, max_column_num)
        # table_col_lens: (batch_size, max_column_num)
        table_head_wids, table_col_lens = WikiSqlBatch.get_table_header_input_tensor(tables,
                                                                                     self.vocab.source,
                                                                                     cuda=self.args.cuda)

        # hack: pack_padded_sequence requires every sequence length to be at least 1
        for tbl in table_col_lens:
            for i in range(len(tbl)):
                if tbl[i] == 0: tbl[i] = 1

        table_header_mask = WikiSqlBatch.get_table_header_mask(tables, cuda=self.args.cuda)

        # (batch_size, max_column_num, max_head_word_num, word_embed_size)
        table_head_word_embeds = self.src_embed(table_head_wids.view(-1)).view(list(table_head_wids.size()) + [self.src_embed.embedding_dim])

        batch_size = table_head_word_embeds.size(0)
        max_col_num = table_head_word_embeds.size(1)
        max_col_word_num = table_head_word_embeds.size(2)

        # (batch_size * max_column_num, max_head_word_num, word_embed_size)
        table_head_word_embeds_flatten = table_head_word_embeds.view(batch_size * max_col_num,
                                                                     max_col_word_num, -1)
        table_col_lens_flatten = list(chain.from_iterable(table_col_lens))
        sorted_col_ids = sorted(list(range(len(table_col_lens_flatten))), key=lambda x: -table_col_lens_flatten[x])
        sorted_table_col_lens_flatten = [table_col_lens_flatten[i] for i in sorted_col_ids]

        col_old_pos_map = [-1] * len(sorted_col_ids)
        for new_pos, old_pos in enumerate(sorted_col_ids):
            col_old_pos_map[old_pos] = new_pos

        # (batch_size * max_column_num, max_head_word_num, word_embed_size)
        sorted_table_head_word_embeds = table_head_word_embeds_flatten[sorted_col_ids, :, :]

        packed_table_head_word_embeds = pack_padded_sequence(sorted_table_head_word_embeds, sorted_table_col_lens_flatten, batch_first=True)

        # column_word_encodings: (batch_size * max_column_num, max_head_word_num, hidden_size)
        column_word_encodings, (table_header_encoding, table_head_last_cell) = self.table_header_lstm(packed_table_head_word_embeds)
        column_word_encodings, _ = pad_packed_sequence(column_word_encodings, batch_first=True)

        # (batch_size * max_column_num, max_head_word_num, hidden_size)
        column_word_encodings = column_word_encodings[col_old_pos_map]
        # (batch_size, max_column_num, max_head_word_num, hidden_size)
        column_word_encodings = column_word_encodings.view(batch_size, max_col_num, max_col_word_num, -1)

        # (batch_size, hidden_size * 2)
        table_header_encoding = torch.cat([table_header_encoding[0], table_header_encoding[1]], -1)
        # table_head_last_cell = torch.cat([table_head_last_cell[0], table_head_last_cell[1]], -1)

        # restore the original column order, as above
        table_header_encoding = table_header_encoding[col_old_pos_map]
        # (batch_size, max_column_num, hidden_size)
        table_header_encoding = table_header_encoding.view(batch_size, max_col_num, -1)

        return column_word_encodings, table_header_encoding, table_header_mask
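The sort-then-restore bookkeeping above in a minimal form: the inverse of the sorting permutation puts rows back in their original order once the RNN has run (a sketch with illustrative lengths):

lengths = [3, 5, 1, 4]
sorted_ids = sorted(range(len(lengths)), key=lambda i: -lengths[i])   # [1, 3, 0, 2]
old_pos_map = [0] * len(sorted_ids)
for new_pos, old_pos in enumerate(sorted_ids):
    old_pos_map[old_pos] = new_pos
# indexing the sorted outputs with old_pos_map (e.g. outputs[old_pos_map])
# recovers the original row order.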
Example #6
    def encode(self, src_sents_var, src_sents_len):
        """Encode the input natural language utterance

        Args:
            src_sents_var: a variable of shape (src_sent_len, batch_size), representing word ids of the input
            src_sents_len: a list of lengths of input source sentences, sorted by descending order

        Returns:
            src_encodings: source encodings of shape (batch_size, src_sent_len, hidden_size * 2)
            last_state, last_cell: the last hidden state and cell state of the encoder,
                                   each of shape (batch_size, hidden_size * 2)
        """

        # (tgt_query_len, batch_size, embed_size)
        # apply word dropout
        if self.training and self.args.word_dropout:
            mask = Variable(self.new_tensor(src_sents_var.size()).fill_(1. - self.args.word_dropout).bernoulli().long())
            src_sents_var = src_sents_var * mask + (1 - mask) * self.vocab.source.unk_id

        src_token_embed = self.src_embed(src_sents_var)
        packed_src_token_embed = pack_padded_sequence(src_token_embed, src_sents_len)

        # src_encodings: (tgt_query_len, batch_size, hidden_size)
        src_encodings, (last_state, last_cell) = self.encoder_lstm(packed_src_token_embed)
        src_encodings, _ = pad_packed_sequence(src_encodings)
        # src_encodings: (batch_size, tgt_query_len, hidden_size)
        src_encodings = src_encodings.permute(1, 0, 2)

        # (batch_size, hidden_size * 2)
        last_state = torch.cat([last_state[0], last_state[1]], 1)
        last_cell = torch.cat([last_cell[0], last_cell[1]], 1)

        return src_encodings, (last_state, last_cell)
    def forward(self, question, length):
        length = list(length.data.cpu().numpy())

        emb = self.drop(self.encoder(question))
        emb = self.tanh(emb)

        hidden = self.init_hidden(len(length))
        seqs = trnn.pack_padded_sequence(emb, length, batch_first=True)

        seqs, hidden = self.rnn(seqs, hidden)
        h,_ = trnn.pad_packed_sequence(seqs, batch_first=True)

        #attention
        weights = self.softmax(self.att2(torch.transpose(h, 1, 2)).squeeze(1)).unsqueeze(-1)
        weights = weights.expand_as(h)
        
        bilstmout = torch.sum(h*weights, 1).squeeze(1)


        #bilstmout = torch.cat([hidden[0][0],hidden[0][1]],-1)


        fc1fea = self.fc1(bilstmout)

        return fc1fea
    def forward(self,  # pylint: disable=arguments-differ
                inputs: torch.Tensor,
                mask: torch.Tensor,
                hidden_state: torch.Tensor = None) -> torch.Tensor:

        if self.stateful and mask is None:
            raise ValueError("Always pass a mask with stateful RNNs.")
        if self.stateful and hidden_state is not None:
            raise ValueError("Stateful RNNs provide their own initial hidden_state.")

        if mask is None:
            return self._module(inputs, hidden_state)[0]

        batch_size, total_sequence_length = mask.size()

        packed_sequence_output, final_states, restoration_indices = \
            self.sort_and_run_forward(self._module, inputs, mask, hidden_state)

        unpacked_sequence_tensor, _ = pad_packed_sequence(packed_sequence_output, batch_first=True)

        num_valid = unpacked_sequence_tensor.size(0)
        # Some RNNs (GRUs) only return one state as a Tensor.  Others (LSTMs) return two.
        # If one state, use a single element list to handle in a consistent manner below.
        if not isinstance(final_states, (list, tuple)) and self.stateful:
            final_states = [final_states]

        # Add back invalid rows.
        if num_valid < batch_size:
            _, length, output_dim = unpacked_sequence_tensor.size()
            zeros = unpacked_sequence_tensor.data.new(batch_size - num_valid, length, output_dim).fill_(0)
            zeros = Variable(zeros)
            unpacked_sequence_tensor = torch.cat([unpacked_sequence_tensor, zeros], 0)

            # The states also need to have invalid rows added back.
            if self.stateful:
                new_states = []
                for state in final_states:
                    num_layers, _, state_dim = state.size()
                    zeros = state.data.new(num_layers, batch_size - num_valid, state_dim).fill_(0)
                    zeros = Variable(zeros)
                    new_states.append(torch.cat([state, zeros], 1))
                final_states = new_states

        # It's possible to need to pass sequences which are padded to longer than the
        # max length of the sequence to a Seq2SeqEncoder. However, packing and unpacking
        # the sequences mean that the returned tensor won't include these dimensions, because
        # the RNN did not need to process them. We add them back on in the form of zeros here.
        sequence_length_difference = total_sequence_length - unpacked_sequence_tensor.size(1)
        if sequence_length_difference > 0:
            zeros = unpacked_sequence_tensor.data.new(batch_size,
                                                      sequence_length_difference,
                                                      unpacked_sequence_tensor.size(-1)).fill_(0)
            zeros = Variable(zeros)
            unpacked_sequence_tensor = torch.cat([unpacked_sequence_tensor, zeros], 1)

        if self.stateful:
            self._update_states(final_states, restoration_indices)

        # Restore the original indices and return the sequence.
        return unpacked_sequence_tensor.index_select(0, restoration_indices)
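The zero-padding that the code above re-attaches by hand along the time dimension can also be requested from pad_packed_sequence directly via its total_length argument (a sketch; available in recent PyTorch releases):

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

x = torch.randn(3, 4, 5)                                    # (batch, max_len, dim)
packed = pack_padded_sequence(x, [4, 2, 1], batch_first=True)
unpacked, lengths = pad_packed_sequence(packed, batch_first=True, total_length=9)
assert unpacked.shape == (3, 9, 5)                          # padded out to the requested length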
    def test_forward_pulls_out_correct_tensor_for_unsorted_batches(self):
        lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
        encoder = PytorchSeq2SeqWrapper(lstm)
        input_tensor = torch.rand([5, 7, 3])
        input_tensor[0, 3:, :] = 0
        input_tensor[1, 4:, :] = 0
        input_tensor[2, 2:, :] = 0
        input_tensor[3, 6:, :] = 0
        mask = torch.ones(5, 7)
        mask[0, 3:] = 0
        mask[1, 4:] = 0
        mask[2, 2:] = 0
        mask[3, 6:] = 0

        sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
        sorted_inputs, sorted_sequence_lengths, restoration_indices, _ = sort_batch_by_length(input_tensor,
                                                                                              sequence_lengths)
        packed_sequence = pack_padded_sequence(sorted_inputs,
                                               sorted_sequence_lengths.data.tolist(),
                                               batch_first=True)
        lstm_output, _ = lstm(packed_sequence)
        encoder_output = encoder(input_tensor, mask)
        lstm_tensor, _ = pad_packed_sequence(lstm_output, batch_first=True)
        assert_almost_equal(encoder_output.data.numpy(),
                            lstm_tensor.index_select(0, restoration_indices).data.numpy())
Example #10
    def encode(self, src_sents_var: torch.Tensor, src_sent_lens: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Use a GRU/LSTM to encode source sentences into hidden states

        Args:
            src_sents_var: a tensor of shape (src_sent_len, batch_size) holding word ids of the input sentences
            src_sent_lens: a list of input sentence lengths, sorted in descending order

        Returns:
            src_encodings: hidden states of tokens in source sentences, this could be a variable
                with shape (batch_size, source_sentence_length, encoding_dim), or in other formats
            decoder_init_state: decoder GRU/LSTM's initial state, computed from source encodings
        """

        # (src_sent_len, batch_size, embed_size)
        src_word_embeds = self.src_embed(src_sents_var)
        packed_src_embed = pack_padded_sequence(src_word_embeds, src_sent_lens)

        # src_encodings: (src_sent_len, batch_size, hidden_size * 2)
        src_encodings, (last_state, last_cell) = self.encoder_lstm(packed_src_embed)
        src_encodings, _ = pad_packed_sequence(src_encodings)

        # (batch_size, src_sent_len, hidden_size * 2)
        src_encodings = src_encodings.permute(1, 0, 2)

        dec_init_cell = self.decoder_cell_init(torch.cat([last_cell[0], last_cell[1]], dim=1))
        dec_init_state = torch.tanh(dec_init_cell)

        return src_encodings, (dec_init_state, dec_init_cell)
Example #11
 def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
     """
         input:
             word_inputs: (batch_size, sent_len)
             word_seq_lengths: list of batch_size, (batch_size,1)
             char_inputs: (batch_size*sent_len, word_length)
             char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
             char_seq_recover: variable which records the char order information, used to recover char order
         output:
             Variable(batch_size, sent_len, hidden_dim)
     """
     word_represent = self.wordrep(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover)
     ## word_embs (batch_size, seq_len, embed_size)
     if self.word_feature_extractor == "CNN":
         word_in = F.tanh(self.word2cnn(word_represent)).transpose(2,1).contiguous()
         for idx in range(self.cnn_layer):
             if idx == 0:
                 cnn_feature = F.relu(self.cnn_list[idx](word_in))
             else:
                 cnn_feature = F.relu(self.cnn_list[idx](cnn_feature))
             cnn_feature = self.cnn_drop_list[idx](cnn_feature)
             cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature)
         feature_out = cnn_feature.transpose(2,1).contiguous()
     else:
         packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True)
         hidden = None
         lstm_out, hidden = self.lstm(packed_words, hidden)
         lstm_out, _ = pad_packed_sequence(lstm_out)
          ## lstm_out (seq_len, batch_size, hidden_size)
         feature_out = self.droplstm(lstm_out.transpose(1,0))
     ## feature_out (batch_size, seq_len, hidden_size)
     outputs = self.hidden2tag(feature_out)
     return outputs
Example #12
    def forward(self, sentences, sentences_len, hidden):
        sentences_len = sentences_len.cpu().data.numpy()

        idx = np.argsort(sentences_len).tolist()[::-1]
        ridx = np.argsort(idx).tolist()

        sentences = sentences[idx, :]
        sentences_len = sentences_len[idx, ]
        embedding = self.embedding(sentences)
        embedding = nn.Dropout(0.1)(embedding)

        packed_embedding = pack_padded_sequence(embedding, sentences_len, batch_first=True)
        packed_rnn_feature, hidden = self.rnn_feature(packed_embedding, hidden)
        sentence_feature, _ = pad_packed_sequence(packed_rnn_feature, batch_first=True)

        idx = Variable(LongTensor(sentences_len - 1))
        idx = idx.view(-1, 1).expand(sentence_feature.size(0), sentence_feature.size(2)).unsqueeze(1)
        if sentence_feature.is_cuda:
            idx = idx.cuda()
        sentence_feature = sentence_feature.gather(1, idx).squeeze()

        sentence_feature = sentence_feature[ridx, :]
        sentences_len = sentences_len[ridx, ]

        logits = self.classifier(sentence_feature)
        pred = F.log_softmax(logits, dim=1)  # normalize over the class dimension
        return pred
Example #13
 def forward(self, input, h0, c0, lens=None):
     output, hn, cn = self.encoder(input, h0, c0, lens)
     if lens:
         output, _ = pad_packed_sequence(output)
     logprobs = self.scorer(output.contiguous().view(output.size(0)*output.size(1), output.size(2)))
     logprobs = logprobs.view(output.size(0), output.size(1), logprobs.size(1))
     return logprobs, hn, cn
    def forward(self, vocab):
        with torch.no_grad():
            batch_shape = vocab['sentence'].shape
            s_embedding = self.embedding(vocab['sentence'].cuda())
            a_embedding = self.embedding(vocab['aspect'].cuda())

            packed_s = pack_padded_sequence(s_embedding, vocab['sent_len'], batch_first=True)

        out_s, (h_s, c1) = self.lstm_s(packed_s) # packed output
        out_a, (h_a, c2) = self.lstm_a(a_embedding)

        with torch.no_grad():
            unpacked_out_s, _ = pad_packed_sequence(out_s, batch_first=True)

        # Pair-wise interaction matrix
        I_matrix = torch.bmm(unpacked_out_s, out_a.permute(0,2,1))

        # Column-wise softmax
        a2s_attn = F.softmax(I_matrix, dim=1)

        # Row-wise softmax => Column-wise average => aspect attention
        s2a_attn = F.softmax(I_matrix, dim=2)
        a_attn = torch.mean(s2a_attn, dim=1)

        # Final sentence attn => weighted sum of each individual a2s_attn
        s_attn = torch.bmm(a2s_attn, a_attn.unsqueeze(-1))

        final_rep = torch.bmm(unpacked_out_s.permute(0,2,1), s_attn).squeeze(-1)
        pred = self.fc(final_rep)
        return pred
Example #15
    def forward(self, embs, lengths):
        """
        This is the heart of the model. This function, defines how the data
        passes through the network.
        Args:
            embs (torch.Tensor): word embeddings
            lengths (torch.Tensor): the lengths of each sentence

        Returns: the RNN outputs for all timesteps and the output of the last
            non-masked timestep for each sentence

        """
        # pack the batch
        packed = pack_padded_sequence(embs, list(lengths.data),
                                      batch_first=True)

        out_packed, _ = self.rnn(packed)

        # unpack output - no need if we are going to use only the last outputs
        outputs, _ = pad_packed_sequence(out_packed, batch_first=True)

        # get the outputs from the last *non-masked* timestep for each sentence
        last_outputs = self.last_timestep(outputs, lengths,
                                          self.rnn.bidirectional)

        # apply dropout to the outputs of the RNN
        last_outputs = self.drop_rnn(last_outputs)

        return outputs, last_outputs
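A sketch of what a last_timestep helper like the one called above might look like (hypothetical implementation; the actual helper may differ). It gathers the output at position length - 1 for each sequence and, for a bidirectional RNN, concatenates that with the backward direction's output at t = 0:

import torch

def last_timestep(outputs, lengths, bidirectional=False):
    # outputs: (batch, max_len, hidden), lengths: (batch,) LongTensor of true lengths
    if bidirectional:
        forward, backward = outputs.chunk(2, dim=-1)
        return torch.cat([last_timestep(forward, lengths), backward[:, 0]], dim=-1)
    idx = (lengths - 1).view(-1, 1).expand(-1, outputs.size(2)).unsqueeze(1)
    return outputs.gather(1, idx).squeeze(1)   # (batch, hidden)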
    def forward(self, x):
        """Receives a Variable of indices (n_timesteps, n_samples) and
        returns their recurrent representations."""
        # sort the batch by decreasing length of sequences
        # oidxs: to recover original order
        # sidxs: idxs to sort the batch
        # slens: lengths in sorted order for pack_padded_sequence()
        oidxs, sidxs, slens, mask = sort_batch(x)

        # Fetch embeddings for the sorted batch
        embs = self.emb(x[:, sidxs])

        if self.dropout_emb > 0:
            embs = self.do_emb(embs)

        # Pack and encode
        packed_emb = pack_padded_sequence(embs, slens)
        packed_hs, h_t = self.enc(packed_emb)

        # Get hidden states and revert the order
        hs = pad_packed_sequence(packed_hs)[0][:, oidxs]

        if self.dropout_ctx > 0:
            hs = self.do_ctx(hs)

        return hs, mask
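A sketch of the sort_batch helper the comments above describe (hypothetical; assumes a time-major LongTensor with 0 as the padding index):

import torch

def sort_batch(x):
    # x: (n_timesteps, n_samples), padded with 0
    slens, sidxs = x.ne(0).long().sum(0).sort(descending=True)   # lengths and sort order
    oidxs = sidxs.sort()[1]                                      # indices that restore the original order
    mask = x.ne(0).float()
    return oidxs, sidxs, slens, mask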
    def _run_rnns(self, inputs, structures, lengths):
        '''
            Run desired rnns
        '''
        for rnn, structure in zip(self.rnns, [structures]):
            if isinstance(rnn, ChildSumTreeLSTM):
                h_all, h_last = rnn(inputs, structure)
            elif isinstance(rnn, LSTM):
                packed = pack_padded_sequence(inputs, list(lengths.data), batch_first=True)
                h_all, (h_last, c_last) = rnn(packed)
                h_all, _ = pad_packed_sequence(h_all, batch_first=True)
            elif isinstance(rnn, GRU):
                packed = pack_padded_sequence(inputs, list(lengths.data), batch_first=True)
                h_all, h_last = rnn(packed)
                h_all, _ = pad_packed_sequence(h_all, batch_first=True)
            inputs = h_all.squeeze()

        return h_all, h_last
Example #18
    def forward(self, input, seq_lens):
        embedded = self.embedding(input)

        packed = pack_padded_sequence(embedded, seq_lens, batch_first=True)
        output, hidden = self.lstm(packed)

        h, _ = pad_packed_sequence(output, batch_first=True)  # h dim = B x t_k x n
        h = h.contiguous()
        max_h, _ = h.max(dim=1)

        return h, hidden, max_h
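One caveat about the max over dim=1 above: pad_packed_sequence fills padded steps with zeros, so the pooled value can come from padding whenever all real activations are negative. A masked variant (a sketch, assuming the true lengths are available as a LongTensor) avoids this:

import torch

def masked_max(h, seq_lens):
    # h: (batch, max_len, hidden); seq_lens: (batch,) true lengths
    mask = torch.arange(h.size(1), device=h.device).unsqueeze(0) < seq_lens.unsqueeze(1)
    h = h.masked_fill(~mask.unsqueeze(-1), float('-inf'))
    return h.max(dim=1)[0]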
    def test_variable_length_sequences_run_backward_return_correctly_padded_outputs(self):
        sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor, self.sequence_lengths)
        tensor = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
        lstm = AugmentedLstm(10, 11, go_forward=False)
        output, _ = lstm(tensor)
        output_sequence, _ = pad_packed_sequence(output, batch_first=True)

        numpy.testing.assert_array_equal(output_sequence.data[1, 6:, :].numpy(), 0.0)
        numpy.testing.assert_array_equal(output_sequence.data[2, 4:, :].numpy(), 0.0)
        numpy.testing.assert_array_equal(output_sequence.data[3, 3:, :].numpy(), 0.0)
        numpy.testing.assert_array_equal(output_sequence.data[4, 2:, :].numpy(), 0.0)
 def test_stacked_bidirectional_lstm_completes_forward_pass(self):
     input_tensor = torch.rand(4, 5, 3)
     input_tensor[1, 4:, :] = 0.
     input_tensor[2, 2:, :] = 0.
     input_tensor[3, 1:, :] = 0.
     input_tensor = pack_padded_sequence(input_tensor, [5, 4, 2, 1], batch_first=True)
     lstm = StackedBidirectionalLstm(3, 7, 3)
     output, _ = lstm(input_tensor)
     output_sequence, _ = pad_packed_sequence(output, batch_first=True)
     numpy.testing.assert_array_equal(output_sequence.data[1, 4:, :].numpy(), 0.0)
     numpy.testing.assert_array_equal(output_sequence.data[2, 2:, :].numpy(), 0.0)
     numpy.testing.assert_array_equal(output_sequence.data[3, 1:, :].numpy(), 0.0)
    def forward(self, inputs: PackedSequence,  # pylint: disable=arguments-differ
                # pylint: disable=unused-argument
                initial_state: torch.Tensor = None)-> Tuple[PackedSequence, torch.Tensor]:
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
            A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            Currently, this is ignored.

        Returns
        -------
        output_sequence : ``PackedSequence``
            The encoded sequence of shape (batch_size, sequence_length, hidden_size)
        final_states: ``torch.Tensor``
            The per-layer final (state, memory) states of the LSTM, each with shape
            (num_layers, batch_size, hidden_size).
        """
        inputs, lengths = pad_packed_sequence(inputs, batch_first=True)

        # Kernel takes sequence length first tensors.
        inputs = inputs.transpose(0, 1)

        sequence_length, batch_size, _ = inputs.size()
        accumulator_shape = [self.num_layers, sequence_length + 1, batch_size, self.hidden_size]
        state_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)
        memory_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)

        dropout_weights = inputs.data.new().resize_(self.num_layers, batch_size, self.hidden_size).fill_(1.0)
        if self.training:
            # Normalize by 1 - dropout_prob to preserve the output statistics of the layer.
            dropout_weights.bernoulli_(1 - self.recurrent_dropout_probability)\
                .div_((1 - self.recurrent_dropout_probability))

        dropout_weights = Variable(dropout_weights, requires_grad=False)
        gates = Variable(inputs.data.new().resize_(self.num_layers,
                                                   sequence_length,
                                                   batch_size, 6 * self.hidden_size))

        lengths_variable = Variable(torch.IntTensor(lengths))
        implementation = _AlternatingHighwayLSTMFunction(self.input_size,
                                                         self.hidden_size,
                                                         num_layers=self.num_layers,
                                                         train=self.training)
        output, _ = implementation(inputs, self.weight, self.bias, state_accumulator,
                                   memory_accumulator, dropout_weights, lengths_variable, gates)

        # TODO(Mark): Also return the state here by using index_select with the lengths so we can use
        # it as a Seq2VecEncoder.
        output = output.transpose(0, 1)
        output = pack_padded_sequence(output, lengths, batch_first=True)
        return output, None
 def test_stacked_alternating_lstm_completes_forward_pass(self):
     input_tensor = torch.autograd.Variable(torch.rand(4, 5, 3))
     input_tensor[1, 4:, :] = 0.
     input_tensor[2, 2:, :] = 0.
     input_tensor[3, 1:, :] = 0.
     input_tensor = pack_padded_sequence(input_tensor, [5, 4, 2, 1], batch_first=True)
     lstm = StackedAlternatingLstm(3, 7, 3)
     output, _ = lstm(input_tensor)
     output_sequence, _ = pad_packed_sequence(output, batch_first=True)
     numpy.testing.assert_array_equal(output_sequence.data[1, 4:, :].numpy(), 0.0)
     numpy.testing.assert_array_equal(output_sequence.data[2, 2:, :].numpy(), 0.0)
     numpy.testing.assert_array_equal(output_sequence.data[3, 1:, :].numpy(), 0.0)
Example #23
    def forward(self, x, lens):
        B, T = x.shape
        # look up the word embedding vectors
        x = self.embed(x)
        x = self.drop(x)

        x = pack_padded_sequence(x, lens, True)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, True)
        x = self.drop(x)

        return self.out(x)
Example #24
 def forward(self, input, hidden, no_pack=False):
     emb = self.drop(self.encoder(input))
     # if eval, pack padded sequence (we don't pack during training because
     # we have no padding in our input samples)
     if not self.training and not no_pack:
         emb_lens = [x for x in torch.sum((input > 0).int(), dim=0).data]
         emb_packed = pack_padded_sequence(emb, emb_lens, batch_first=False)
         packed_output, hidden = self.rnn(emb_packed, hidden)
         output, _ = pad_packed_sequence(packed_output, batch_first=False)
     else:
         output, hidden = self.rnn(emb, hidden)
     output = self.drop(output)
     decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
     return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
    def _lstm_forward(self, inputs, initial_state=None):
        if initial_state is None:
            hidden_states = [None] * len(self.forward_layers)
        else:
            assert initial_state[0].size()[0] == len(self.forward_layers)
            hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0)))

        inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True)
        forward_output_sequence = inputs
        backward_output_sequence = inputs

        final_states = []
        sequence_outputs = []
        for i, state in enumerate(hidden_states):
            forward_layer = getattr(self, "forward_layer_%d" % i)
            backward_layer = getattr(self, "backward_layer_%d" % i)

            forward_cache = forward_output_sequence
            backward_cache = backward_output_sequence

            if state is not None:
                forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2)
                forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2)
                forward_state = (forward_hidden_state, forward_memory_state)
                backward_state = (backward_hidden_state, backward_memory_state)
            else:
                forward_state = None
                backward_state = None

            forward_output_sequence, forward_state = forward_layer(forward_output_sequence, batch_lengths,
                                                                   forward_state)
            backward_output_sequence, backward_state = backward_layer(backward_output_sequence, batch_lengths,
                                                                      backward_state)

            if i != 0:
                forward_output_sequence += forward_cache
                backward_output_sequence += backward_cache

            sequence_outputs.append(torch.cat([forward_output_sequence,
                                               backward_output_sequence], -1))

            final_states.append((torch.cat([forward_state[0], backward_state[0]], -1),
                                 torch.cat([forward_state[1], backward_state[1]], -1)))

        stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs)

        final_hidden_states, final_memory_states = zip(*final_states)
        final_state_tuple = (torch.cat(final_hidden_states, 0), torch.cat(final_memory_states, 0))
        return stacked_sequence_outputs, final_state_tuple
 def _get_lstm_features(self, names, lengths):
     self.hidden = self.init_hidden(names.size(-1))
     embeds = self.char_embeds(names)  # Figure 4
     packed_input = pack_padded_sequence(embeds, lengths)  # Figure 5
     packed_output, (ht, ct) = self.lstm(packed_input, self.hidden)  # Figure 6
     lstm_out, _ = pad_packed_sequence(packed_output)  # Figure 7
     lstm_out = torch.transpose(lstm_out, 0, 1)
     lstm_out = torch.transpose(lstm_out, 1, 2)
     lstm_out = F.tanh(lstm_out)  # Figure 8
     lstm_out, indices = F.max_pool1d(lstm_out, lstm_out.size(2), return_indices=True)  # Figure 9
      lstm_out = lstm_out.squeeze(2)  # drop the pooled dimension so the shape matches the expected input format
     lstm_out = F.tanh(lstm_out)
     lstm_feats = self.fully_connected_layer(lstm_out)
     output = self.softmax(lstm_feats)  # Figure 10
     return output
Example #27
    def forward(self, input, doc_lens):
        """
        :param input: (B, L)
        :param doc_lens: (B)
        :return: (B, L, H), ((2, B, H), (2, B, H)), (B, 2*H)
        """
        input = self.embed(input)  # (B, L) -> (B, L, D)

        packed = pack_padded_sequence(input, doc_lens, batch_first=True)
        output, hidden = self.lstm(packed)  # hidden: ((2, B, H), (2, B, H))

        h, _ = pad_packed_sequence(output, batch_first=True)  # (B, L, 2*H)
        h = h.contiguous()  # (B, L, 2*H)
        max_h, _ = h.max(dim=1)  # (B, 2*H)

        return h, hidden, max_h  # (B, L, 2*H), ((2, B, H), (2, B, H)), (B, 2*H)
Example #28
 def get_all_hiddens(self, input, seq_lengths):
     """
         input:
             input: Variable(batch_size,  word_length)
             seq_lengths: numpy array (batch_size,  1)
         output:
             Variable(batch_size, word_length, char_hidden_dim)
         Note it only accepts ordered (length) variable, length size is recorded in seq_lengths
     """
     batch_size = input.size(0)
     char_embeds = self.char_drop(self.char_embeddings(input))
     char_hidden = None
     pack_input = pack_padded_sequence(char_embeds, seq_lengths, True)
     char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden)
     char_rnn_out, _ = pad_packed_sequence(char_rnn_out)
     return char_rnn_out.transpose(1,0)
    def forward(self, hidden, src_tokens, lengths):
        
        x = self.embedding(src_tokens)

        x = pack_padded_sequence(x, lengths)
        x, hidden = self.rnn(x, hidden)
        x, output_lengths = pad_packed_sequence(x)

        if self.num_directions == 2:
            x = x[:, :, :self.args.hidden_size] + x[:, :, self.args.hidden_size:]
            
        # pad_packed_sequence calculates length of longest sequence and returns that as the length
        # sometimes that may not be equal to max_len (set to 50 by default)
        new_len = x.size()[0]
        encoder_padding_mask = src_tokens[:new_len, :].eq(self.padding_idx)
                
        return x, hidden, encoder_padding_mask
Example #30
    def forward(self,
                img_feats,
                question_feats,
                actions_in,
                action_lengths,
                hidden=False):
        input_feats = Variable()

        T = False
        if self.image_input == True:
            N, T, _ = img_feats.size()
            input_feats = img_feats

        if self.question_input == True:
            N, D = question_feats.size()
            question_feats = question_feats.view(N, 1, D)
            if T == False:
                T = actions_in.size(1)
            question_feats = question_feats.repeat(1, T, 1)
            if len(input_feats) == 0:
                input_feats = question_feats
            else:
                input_feats = torch.cat([input_feats, question_feats], 2)

        if self.action_input == True:
            if len(input_feats) == 0:
                input_feats = self.action_embed(actions_in)
            else:
                input_feats = torch.cat(
                    [input_feats, self.action_embed(actions_in)], 2)

        packed_input_feats = pack_padded_sequence(
            input_feats, action_lengths, batch_first=True)
        packed_output, hidden = self.rnn(packed_input_feats)
        rnn_output, _ = pad_packed_sequence(packed_output, batch_first=True)

        output = self.decoder(rnn_output.contiguous().view(
            rnn_output.size(0) * rnn_output.size(1), rnn_output.size(2)))

        if self.return_states == True:
            return rnn_output, output, hidden
        else:
            return output, hidden
Example #31
    def forward(self, fea_v, length, target_start, target_end):
        if self.add_char:
            word_v = fea_v[0]
            char_v = fea_v[1]
        else:
            word_v = fea_v
        batch_size = word_v.size(0)
        seq_length = word_v.size(1)

        word_emb = self.embedding(word_v)
        word_emb = self.dropout_emb(word_emb)
        if self.static:
            word_static = self.embedding_static(word_v)
            word_static = self.dropout_emb(word_static)
            word_emb = torch.cat([word_emb, word_static], 2)

        x = torch.transpose(word_emb, 0, 1)
        packed_words = pack_padded_sequence(x, length)
        lstm_out, self.hidden = self.lstm(packed_words, self.hidden)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        ##### lstm_out: (seq_len, batch_size, hidden_size)
        lstm_out = self.dropout_lstm(lstm_out)
        x = lstm_out
        x = x.transpose(0, 1)
        ##### batch version
        # x = torch.squeeze(lstm_out, 1)
        # x: variable (seq_len, batch_size, hidden_size)
        # target_start: variable (batch_size)
        # _, start = torch.max(target_start.unsqueeze(0), dim=1)
        # max_start = utils.to_scalar(target_start[start])
        # _, end = torch.min(target_end.unsqueeze(0), dim=1)
        # min_end = utils.to_scalar(target_end[end])
        max_length = 0
        for index in range(batch_size):
            x_len = x[index].size(0)
            start = utils.to_scalar(target_start[index])
            end = utils.to_scalar(target_end[index])
            none_t = x_len - (end - start + 1)
            if none_t > max_length: max_length = none_t

        # left_save = []
        # mask_left_save = []
        # right_save = []
        # mask_right_save = []
        none_target = []
        mask_none_target = []
        target_save = []
        for idx in range(batch_size):
            mask_none_t = []
            none_t = None
            x_len_cur = x[idx].size(0)
            start_cur = utils.to_scalar(target_start[idx])
            end_cur = utils.to_scalar(target_end[idx])
            # left_len_cur = start_cur
            # left_len_max = max_start
            x_target = x[idx][start_cur:(end_cur + 1)]
            x_average_target = torch.mean(x_target, 0)
            target_save.append(x_average_target.unsqueeze(0))
            if start_cur != 0:
                left = x[idx][:start_cur]
                none_t = left
                mask_none_t.extend([1] * start_cur)
            if end_cur != (x_len_cur - 1):
                right = x[idx][(end_cur + 1):]
                if none_t is not None: none_t = torch.cat([none_t, right], 0)
                else: none_t = right
                mask_none_t.extend([1] * (x_len_cur - end_cur - 1))
            if len(mask_none_t) != max_length:
                add_t = Variable(
                    torch.zeros((max_length - len(mask_none_t)),
                                self.lstm_hiddens))
                if self.use_cuda: add_t = add_t.cuda()
                mask_none_t.extend([0] * (max_length - len(mask_none_t)))
                # print(add_t)
                none_t = torch.cat([none_t, add_t], 0)
            mask_none_target.append(mask_none_t)
            none_target.append(none_t.unsqueeze(0))
            # if start_cur != 0:
            #     x_cur_left = x[idx][:start_cur]
            #     left_len_sub = left_len_max - left_len_cur
            #     mask_cur_left = [1 for _ in range(left_len_cur)]
            # else:
            #     x_cur_left = x[idx][0].unsqueeze(0)
            #     left_len_sub = left_len_max - 1
            #     # mask_cur_left = [-1e+20]
            #     mask_cur_left = [0]
            # # x_cur_left: variable (start_cur, two_hidden_size)
            # # mask_cur_left = [1 for _ in range(start_cur)]
            # # mask_cur_left: list (start_cur)
            # if start_cur < max_start:
            #     add = Variable(torch.zeros(left_len_sub, self.lstm_hiddens))
            #     if self.use_cuda: add = add.cuda()
            #     x_cur_left = torch.cat([x_cur_left, add], 0)
            #     # x_cur_left: variable (max_start, two_hidden_size)
            #     left_save.append(x_cur_left.unsqueeze(0))
            #     # mask_cur_left.extend([-1e+20 for _ in range(left_len_sub)])
            #     mask_cur_left.extend([0 for _ in range(left_len_sub)])
            #     # mask_cur_left: list (max_start)
            #     mask_left_save.append(mask_cur_left)
            # else:
            #     left_save.append(x_cur_left.unsqueeze(0))
            #     mask_left_save.append(mask_cur_left)
            #
            # end_cur = utils.to_scalar(target_end[idx])
            # right_len_cur = x_len_cur - end_cur - 1
            # right_len_max = x_len_cur - min_end - 1
            # if (end_cur + 1) != x_len_cur:
            #     x_cur_right = x[idx][(end_cur + 1):]
            #     right_len_sub = right_len_max - right_len_cur
            #     mask_cur_right = [1 for _ in range(right_len_cur)]
            # else:
            #     x_cur_right = x[idx][end_cur].unsqueeze(0)
            #     right_len_sub = right_len_max - right_len_cur - 1
            #     # mask_cur_right = [-1e+20]
            #     mask_cur_right = [0]
            # # x_cur_right: variable ((x_len_cur-end_cur-1), two_hidden_size)
            # # mask_cur_right = [1 for _ in range(right_len_cur)]
            # # mask_cur_right: list (x_len_cur-end_cur-1==right_len)
            # if end_cur > min_end:
            #     add = Variable(torch.zeros(right_len_sub, self.lstm_hiddens))
            #     if self.use_cuda: add = add.cuda()
            #     x_cur_right = torch.cat([x_cur_right, add], 0)
            #     right_save.append(x_cur_right.unsqueeze(0))
            #     # mask_cur_right.extend([-1e+20 for _ in range(right_len_sub)])
            #     mask_cur_right.extend([0 for _ in range(right_len_sub)])
            #     mask_right_save.append(mask_cur_right)
            # else:
            #     right_save.append(x_cur_right.unsqueeze(0))
            #     mask_right_save.append(mask_cur_right)

        # mask_left_save = Variable(torch.ByteTensor(mask_left_save))
        # # mask_left_save: variable (batch_size, left_len_max)
        # mask_right_save = Variable(torch.ByteTensor(mask_right_save))
        # # mask_right_save: variable (batch_size, right_len_max)
        # left_save = torch.cat(left_save, 0)
        # right_save = torch.cat(right_save, 0)
        target_save = torch.cat(target_save, 0)
        # print(none_target)
        none_target = torch.cat(none_target, 0)
        mask_none_target = Variable(torch.ByteTensor(mask_none_target))
        # left_save: variable (batch_size, left_len_max, two_hidden_size)
        # right_save: variable (batch_size, right_len_max, two_hidden_size)
        # target_save: variable (batch_size, two_hidden_size)
        if self.use_cuda:
            # mask_right_save = mask_right_save.cuda()
            # mask_left_save = mask_left_save.cuda()
            # left_save = left_save.cuda()
            # right_save = right_save.cuda()
            target_save = target_save.cuda()
            mask_none_target = mask_none_target.cuda()
            none_target = none_target.cuda()

        # squence = torch.cat(none_target, 1)
        s = self.attention(none_target, target_save, mask_none_target)
        # s = self.attention(x, target_save, None)
        # s_l = self.attention_l(left_save, target_save, mask_left_save)
        # s_r = self.attention_r(right_save, target_save, mask_right_save)

        result = self.linear(s)  # result: variable (1, label_num)
        # result = self.linear_l(s_l)
        # result = torch.add(result, self.linear_l(s_l))
        # result = torch.add(result, self.linear_r(s_r))
        # result: variable (batch_size, label_num)
        # print(result)
        return result
Example #32
    def forward(self, src_tokens, src_lengths):
        if LanguagePairDataset.LEFT_PAD_SOURCE:
            # convert left-padding to right-padding
            src_tokens = utils.convert_padding_direction(src_tokens,
                                                         self.padding_idx,
                                                         left_to_right=True)
        if self.word_dropout_module is not None:
            src_tokens = self.word_dropout_module(src_tokens)

        bsz, seqlen = src_tokens.size()

        # embed tokens
        x = self.embed_tokens(src_tokens)
        x = F.dropout(x, p=self.dropout_in, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # Allows compatibility with Caffe2 inputs for tracing (int32)
        # as well as the current format of Fairseq-Py inputs (int64)
        if src_lengths.dtype is torch.int64:
            src_lengths = src_lengths.int()

        # Generate packed seq to deal with varying source seq length
        # packed_input is of type PackedSequence, which consists of:
        # element [0]: a tensor, the packed data, and
        # element [1]: a list of integers, the batch size for each step
        packed_input = pack_padded_sequence(x, src_lengths)

        final_hiddens, final_cells = [], []
        for i, rnn_layer in enumerate(self.layers):
            if self.bidirectional and i == 0:
                h0 = x.new(2, bsz, self.hidden_dim // 2).zero_()
                c0 = x.new(2, bsz, self.hidden_dim // 2).zero_()
            else:
                h0 = x.new(1, bsz, self.hidden_dim).zero_()
                c0 = x.new(1, bsz, self.hidden_dim).zero_()

            # apply LSTM along entire sequence
            current_output, (h_last,
                             c_last) = rnn_layer(packed_input, (h0, c0))

            # final state shapes: (bsz, hidden_dim)
            if self.bidirectional and i == 0:
                # concatenate last states for forward and backward LSTM
                h_last = torch.cat((h_last[0, :, :], h_last[1, :, :]), dim=1)
                c_last = torch.cat((c_last[0, :, :], c_last[1, :, :]), dim=1)
            else:
                h_last = h_last.squeeze(dim=0)
                c_last = c_last.squeeze(dim=0)

            final_hiddens.append(h_last)
            final_cells.append(c_last)

            if self.residual_level is not None and i >= self.residual_level:
                packed_input[0] = packed_input.clone()[0] + current_output[0]
            else:
                packed_input = current_output

        # Reshape to [num_layer, batch_size, hidden_dim]
        final_hiddens = torch.cat(final_hiddens,
                                  dim=0).view(self.num_layers,
                                              *final_hiddens[0].size())
        final_cells = torch.cat(final_cells,
                                dim=0).view(self.num_layers,
                                            *final_cells[0].size())

        #  [max_seqlen, batch_size, hidden_dim]
        unpacked_output, _ = pad_packed_sequence(
            packed_input, padding_value=self.padding_value)

        return (unpacked_output, final_hiddens, final_cells, src_lengths,
                src_tokens)
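The PackedSequence structure described in the comment above (the packed data plus the per-step batch sizes) can be inspected directly (a sketch):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

x = torch.randn(6, 3, 4)                 # (seq_len, batch, dim), time-major as in this encoder
packed = pack_padded_sequence(x, [6, 4, 1])
print(packed.data.shape)                 # torch.Size([11, 4]): 6 + 4 + 1 packed timesteps
print(packed.batch_sizes)                # tensor([3, 2, 2, 2, 1, 1])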
Example #33
    def forward(self,
                sentence,
                p_sentence,
                pos_tags,
                lengths,
                target_idx_in,
                region_marks,
                local_roles_voc,
                frames,
                local_roles_mask,
                sent_pred_lemmas_idx,
                dep_tags,
                dep_heads,
                targets,
                specific_dep_tags,
                specific_dep_relations,
                test=False):

        # construct input for DEP
        pos_embeds = self.pos_embeddings(pos_tags)
        region_marks = region_marks.view(self.batch_size, len(sentence[0]), 1)
        fixed_embeds = self.word_fixed_embeddings(p_sentence)
        fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]),
                                         self.word_emb_dim)

        h_label_embeddings = self.DEP_Label_embeddings(dep_tags)
        h_link_embeddings = self.DEP_Link_embeddings(specific_dep_relations)

        sent_pred_lemmas_embeds = self.p_lemma_embeddings(sent_pred_lemmas_idx)
        embeds_SRL = self.word_embeddings_SRL(sentence)
        embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)
        SRL_hidden_states = torch.cat(
            (embeds_SRL, fixed_embeds, sent_pred_lemmas_embeds, pos_embeds,
             region_marks, h_label_embeddings), 2)
        SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

        # SRL layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            SRL_hidden_states, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort.cpu().numpy(),
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_4 = self.BiLSTM_SRL(embeds_sort,
                                                       self.hidden_4)
        # the hidden states appear to be batch first already, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states = hidden_states[unsort_idx]
        hidden_states = self.hidden_state_dropout(hidden_states)

        # B * H
        hidden_states_3 = hidden_states
        predicate_embeds = hidden_states_3[
            np.arange(0,
                      hidden_states_3.size()[0]), target_idx_in]
        # T * B * H
        added_embeds = torch.zeros(hidden_states_3.size()[1],
                                   hidden_states_3.size()[0],
                                   hidden_states_3.size()[2]).to(device)
        predicate_embeds = added_embeds + predicate_embeds
        # B * T * H
        predicate_embeds = predicate_embeds.transpose(0, 1)
        hidden_states = torch.cat((hidden_states_3, predicate_embeds), 2)
        # print(hidden_states)
        # non-linear map and rectify the roles' embeddings
        # roles = Variable(torch.from_numpy(np.arange(0, self.tagset_size)))

        # B * roles
        # log(local_roles_voc)
        # log(frames)

        # B * roles * h
        role_embeds = self.role_embeddings(local_roles_voc)
        frame_embeds = self.frame_embeddings(frames)

        role_embeds = torch.cat((role_embeds, frame_embeds), 2)
        mapped_roles = F.relu(self.role_map(role_embeds))
        mapped_roles = torch.transpose(mapped_roles, 1, 2)

        # b, times, roles
        tag_space = torch.matmul(hidden_states, mapped_roles)
        #tag_space = hidden_states.mm(mapped_roles)

        # b, roles
        #sub = torch.div(torch.add(local_roles_mask, -1.0), _BIG_NUMBER)
        sub = torch.add(local_roles_mask, -1.0) * _BIG_NUMBER
        sub = torch.FloatTensor(sub.cpu().numpy()).to(device)
        # b, roles, times
        tag_space = torch.transpose(tag_space, 0, 1)
        tag_space += sub
        # b, T, roles
        tag_space = torch.transpose(tag_space, 0, 1)
        tag_space = tag_space.view(len(sentence[0]) * self.batch_size, -1)

        SRLprobs = F.softmax(tag_space, dim=1)

        #loss_function = nn.NLLLoss(ignore_index=0)
        targets = targets.view(-1)
        #tag_scores = F.log_softmax(tag_space)
        #loss = loss_function(tag_scores, targets)
        loss_function = nn.CrossEntropyLoss(ignore_index=0)

        SRLloss = loss_function(tag_space, targets)

        #weight = float(SRLloss.cpu().data.numpy())
        #if weight > 0.1:
        #    weight = 0.1
        #p = nr.rand()
        #if p<0.2:
        #    loss = SRLloss + DEPloss + SPEDEPloss
        #else:
        #    loss = SRLloss
        loss = SRLloss
        return SRLloss, SRLloss, SRLloss, loss, SRLprobs, 1, 1, 1, 1,  \
               1, 1, 1,\
               1, 1, 1
Example #34
    def forward(self, batch):
        """
            input:
                word_inputs: (batch_size, sent_len)
                feature_inputs: [(batch_size, sent_len), ...] list of variables
                word_seq_lengths: list of batch_size, (batch_size,1)
                char_inputs: (batch_size*sent_len, word_length)
                char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
                char_seq_recover: variable which records the char order information, used to recover char order
            output:
                Variable(batch_size, sent_len, hidden_dim)
        """

        word_inputs = batch["word"]
        word_seq_lengths = batch["word_length"]
        char_inputs = batch["char"]
        char_seq_lengths = batch["char_length"]
        char_seq_recover = batch["char_recover"]
        xupos_inputs = batch["xupos"]
        mask = batch["mask"].transpose(1, 0)
        emb = batch.get("emb", None)

        if self.use_bert:
            raw = batch["raw"]

            seq_max_len = len(raw[0])

            all_input_ids = np.zeros((len(raw), 2048), dtype=int)
            all_input_mask = np.zeros((len(raw), 2048), dtype=int)
            all_word_end_mask = np.zeros((len(raw), 2048), dtype=int)

            subword_max_len = 0

            for snum, sentence in enumerate(raw):
                tokens = []
                word_end_mask = []

                tokens.append("[CLS]")
                word_end_mask.append(1)

                cleaned_words = []
                for word in sentence[1:]:
                    word = BERT_TOKEN_MAPPING.get(word, word)
                    if word == "n't" and cleaned_words:
                        cleaned_words[-1] = cleaned_words[-1] + "n"
                        word = "'t"
                    cleaned_words.append(word)

                for word in cleaned_words:
                    word_tokens = self.bert_tokenizer.tokenize(word)
                    if len(word_tokens) == 0:
                        word_tokens = ['.']
                    for _ in range(len(word_tokens)):
                        word_end_mask.append(0)
                    word_end_mask[-1] = 1
                    tokens.extend(word_tokens)

                tokens.append("[SEP]")

                # pad to sequence length for every sentence
                for i in range(seq_max_len - len(sentence)):
                    word_end_mask.append(1)

                input_ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
                input_mask = [1] * len(input_ids)

                subword_max_len = max(subword_max_len, len(word_end_mask) + 1)

                all_input_ids[snum, :len(input_ids)] = input_ids
                all_input_mask[snum, :len(input_mask)] = input_mask
                all_word_end_mask[snum, :len(word_end_mask)] = word_end_mask

            all_input_ids = from_numpy(
                np.ascontiguousarray(all_input_ids[:, :subword_max_len]))
            all_input_mask = from_numpy(
                np.ascontiguousarray(all_input_mask[:, :subword_max_len]))
            all_word_end_mask = from_numpy(
                np.ascontiguousarray(all_word_end_mask[:, :subword_max_len]))
            all_encoder_layers, _ = self.bert_model(
                all_input_ids, attention_mask=all_input_mask)
            del _

            features = all_encoder_layers

            features_packed = features.masked_select(
                all_word_end_mask.to(torch.bool).unsqueeze(-1)).reshape(
                    len(raw), seq_max_len, features.shape[-1])

            outputs = self.bert_project(features_packed)
        elif self.use_transformer:
            word_represent = self.wordrep(word_inputs,
                                          word_seq_lengths,
                                          char_inputs,
                                          char_seq_lengths,
                                          char_seq_recover,
                                          xupos_inputs,
                                          emb=emb)
            outputs = self.lstm(word_represent, (1 - batch["mask_h"]))
        else:
            word_represent = self.wordrep(word_inputs,
                                          word_seq_lengths,
                                          char_inputs,
                                          char_seq_lengths,
                                          char_seq_recover,
                                          xupos_inputs,
                                          emb=emb)
            packed_words = pack_padded_sequence(word_represent,
                                                word_seq_lengths, True)
            hidden = None
            lstm_out, hidden = self.lstm(packed_words, hidden)
            lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
            feature_out = self.droplstm(lstm_out)
            outputs = feature_out

        return outputs
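A minimal sketch, with made-up shapes, of the word-end-mask trick in the BERT branch above: keep only the encoder output at the last subword of each word, then reshape back to one vector per word.

import torch

batch, subword_len, seq_len, dim = 2, 6, 3, 4
features = torch.randn(batch, subword_len, dim)            # subword-level encoder outputs
word_end_mask = torch.tensor([[1, 0, 1, 1, 0, 0],          # exactly seq_len ones per row,
                              [0, 1, 1, 0, 1, 0]]).bool()  # marking each word's last subword

word_features = features.masked_select(
    word_end_mask.unsqueeze(-1)).reshape(batch, seq_len, dim)
print(word_features.shape)   # torch.Size([2, 3, 4])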
Example #35
    def forward(self, inputs: List[List[Tweet]], tweets_per_user: List[int]):
        """
        TODO:
        1) use word2vec to create as sequence of vectors for each tweet
            (i.e each tweet is a sequence of words)
        2) create a batch out of all of the sequences (num_users*tweets_per_user, max_seq_len, embedding_dim)
            (make sure to remember which tweets belong to which users)
        3) feed the batch into the recurrent feature extractor (num_users*tweets_per_user, max_seq_len, hidden_dim)
        4) use only the last output (or the last few outputs) of each sequence.
        5) create a tensor for each user made out of the tensors that came out of their tweets (concat or something)
        6) add some other relevant data about each tweet to the tensors (like post time and stuff like that)
        7) create a batch from those tensors (num_users, hidden_dim*tweets_per_user)
        8) feed these tensors into the linear feature extractor and return its output

        """
        handmade_features = []

        # TASK 1
        device = next(self.parameters()).device
        sequences = embed(self.word2vec_model, sum(inputs, []), device)

        sorted_indices, sorted_lengths = self.sorted_seq_by_len(sequences)
        num_tweets = len(sorted_indices)

        # TASK 2
        # DON'T FORGET TO USE PADDING AND PACKING FOR INPUT
        padded_seq_batch = pad_sequence(sequences, batch_first=True)
        if self.use_TCN:
            padded_seq_batch = torch.stack(
                [m.t() for m in padded_seq_batch[sorted_indices]])
        else:
            packed_seq_batch = pack_padded_sequence(
                padded_seq_batch[sorted_indices],
                sorted_lengths,
                batch_first=True)
        # TASK 3
        # DON'T FORGET TO UNDO THE PADDING AND PACKING FROM TASK 2
        if self.use_TCN:
            recurrent_features = torch.stack(
                [m.t() for m in self.temporal_extractor(padded_seq_batch)])
        else:
            recurrent_features, _ = self.temporal_extractor(packed_seq_batch)
            recurrent_features, _ = pad_packed_sequence(recurrent_features,
                                                        batch_first=True)

        # TASK 4
        seq_end_indices = [l - 1 for l in sorted_lengths]
        used_recurrent_features = recurrent_features[range(num_tweets),
                                                     seq_end_indices]
        # also reorder the tweets back
        used_recurrent_features = used_recurrent_features[sorted_indices]

        # ADD HANDMADE FEATURES
        # favorite count
        handmade_features.append(
            torch.Tensor([tweet.favorite_count for tweet in sum(inputs, [])
                          ]).to(device).unsqueeze(1))
        # is the tweet a quote?
        handmade_features.append(
            torch.Tensor([tweet.is_quote for tweet in sum(inputs, [])
                          ]).to(device).unsqueeze(1))
        # whether the tweet has no retweeted status (i.e. it is not a retweet)
        handmade_features.append(
            torch.Tensor([
                tweet.retweeted_status is None for tweet in sum(inputs, [])
            ]).to(device).unsqueeze(1))
        # the number of entities in the tweet
        handmade_features.append(
            torch.Tensor([
                sum([len(entity) for entity in tweet.entities.values()])
                for tweet in sum(inputs, [])
            ]).to(device).unsqueeze(1))

        handmade_features = (torch.cat(handmade_features, dim=1) -
                             self.means.to(device)) / self.stds.to(device)

        features_dim = self.hidden_dim + self.num_handmade_features
        used_recurrent_features = torch.cat(
            (used_recurrent_features, handmade_features), dim=1)

        # TASK 5
        used_recurrent_features = list(
            torch.split(used_recurrent_features, tweets_per_user))

        for i, urf in enumerate(used_recurrent_features):
            dim0 = urf.shape[0]
            if dim0 != 100:
                used_recurrent_features[i] = torch.cat([
                    urf,
                    torch.zeros(100 - dim0, features_dim, device=urf.device)
                ], 0)

        used_recurrent_features = torch.cat(used_recurrent_features)

        recurrent_features_batch = used_recurrent_features.view(
            len(inputs), -1, features_dim)

        # TASK 7
        recurrent_features_batch = recurrent_features_batch.view(
            -1, features_dim)

        # TASK 8
        return self.feature_extractor(recurrent_features_batch).view(
            len(inputs), -1, self.output_dim)
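A minimal sketch, under assumed shapes, of steps 2-4 of the TODO above: pad the variable-length sequences, pack them, run an LSTM, unpack, and pick the output at the last valid timestep of each sequence.

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

seqs = [torch.randn(5, 8), torch.randn(3, 8), torch.randn(2, 8)]  # variable-length inputs
lengths = torch.tensor([len(s) for s in seqs])                    # already sorted descending here

lstm = nn.LSTM(input_size=8, hidden_size=16, batch_first=True)
padded = pad_sequence(seqs, batch_first=True)                     # (batch, max_len, 8)
packed = pack_padded_sequence(padded, lengths, batch_first=True)
out, _ = lstm(packed)
out, _ = pad_packed_sequence(out, batch_first=True)               # (batch, max_len, 16)

last = out[torch.arange(len(seqs)), lengths - 1]                  # (batch, 16), last valid step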
Example #36
    def loss(self, batch):
        xq = batch['xq_padded']  # n_class * n_query * max_len * mfcc_dim
        xs = batch['xs_padded']  # n_class * n_support * max_len * mfcc_dim
        xq_len = batch['xq_len']  # n_class * n_query
        xs_len = batch['xs_len']  # n_class * n_support

        assert xq.shape[0] == xq_len.shape[0]
        assert xs.shape[0] == xs_len.shape[0]

        n_class = xq_len.shape[0]
        n_query = xq_len.shape[1]
        n_support = xs_len.shape[1]

        target_inds = torch.arange(0, n_class).view(n_class, 1, 1).expand(
            n_class, n_query, 1).long()
        target_inds = Variable(target_inds, requires_grad=False)

        if xq.is_cuda:
            target_inds = target_inds.cuda()

        seq_len = torch.cat([
            xq_len.view(n_class * n_query, -1).squeeze(-1),
            xs_len.view(n_class * n_support, -1).squeeze(-1)
        ], 0)
        seq_len = Variable(seq_len, requires_grad=False)

        x = torch.cat([
            xs.view(n_class * n_support,
                    *xs.size()[2:]),
            xq.view(n_class * n_query,
                    *xq.size()[2:])
        ], 0)

        _len, perm_idx = seq_len.sort(0, descending=True)

        x = x[perm_idx]

        packed_input = pack_padded_sequence(
            x, _len.cpu().numpy().astype(dtype=np.int32), batch_first=True)

        packed_output, _ = self.encoder_rnn.forward(packed_input)

        z, _ = pad_packed_sequence(packed_output, batch_first=True)

        _, unperm_idx = perm_idx.sort(0)
        z = z[unperm_idx]

        #z, _ = self.encoder_rnn.forward(x)

        z = self.encoder_linear.forward((z, seq_len))

        z_dim = z.size(-1)
        z_proto = z[:n_class * n_support].view(n_class, n_support,
                                               z_dim).mean(1)
        zq = z[n_class * n_support:]

        dists = euclidean_dist(zq, z_proto)

        log_p_y = F.log_softmax(-dists, dim=1).view(n_class, n_query, -1)

        loss_val = -log_p_y.gather(2, target_inds).squeeze().view(-1).mean()

        _, y_hat = log_p_y.max(2)
        acc_val = torch.eq(y_hat, target_inds.squeeze(-1)).float().mean()

        logger.info(f'loss: {loss_val.item()}, acc: {acc_val.item()}')

        return loss_val, {'loss': loss_val.item(), 'acc': acc_val.item()}
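A minimal sketch of the sort -> pack -> run -> unsort pattern used above, assuming an unsorted batch of lengths; `unperm_idx` is the inverse of the sorting permutation.

import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

x = torch.randn(4, 7, 10)                        # (batch, max_len, feat), zero-padded
seq_len = torch.tensor([3, 7, 5, 2])

sorted_len, perm_idx = seq_len.sort(0, descending=True)
packed = pack_padded_sequence(x[perm_idx], sorted_len, batch_first=True)
out, _ = nn.GRU(10, 16, batch_first=True)(packed)
out, _ = pad_packed_sequence(out, batch_first=True)

_, unperm_idx = perm_idx.sort(0)                 # invert the permutation
out = out[unperm_idx]                            # back in the original batch order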
Example #37
    def forward(self, e_q, e_s, qseq_len, n_sents, seq_len):
        # e : word embedding
        # e_q - shape - [batch_size, q_len, emb_size]
        # e_d - shape - [batch_size, max_s, s_len, emb_size]

        #
        #	SHARED Q&D MODELLING
        #

        batch_size = e_q.shape[0]
        emb_size = self.config.emb_size
        max_s = self.config.max_s
        s_len = self.config.max_slen
        q_len = self.config.max_qlen
        hidden_size = self.config.hidden_size
        device = self.config.device

        # shape of e_q - [batch_size, q_len, emb_size]

        emb_q = pack_padded_sequence(e_q,
                                     qseq_len,
                                     batch_first=True,
                                     enforce_sorted=False)
        u_q, _ = self.bilstm1(
            emb_q)  # shape of u_q - [batch_size, q_len, 2*hidden_size]
        u_q, _ = pad_packed_sequence(u_q, batch_first=True)

        q_len = u_q.shape[1]

        e_s = e_s.view(batch_size * max_s, s_len, emb_size)
        seq_len = seq_len.view(batch_size * max_s)

        emb_d = pack_padded_sequence(e_s,
                                     seq_len,
                                     batch_first=True,
                                     enforce_sorted=False)
        u_d, _ = self.bilstm2(emb_d)
        u_d, _ = pad_packed_sequence(u_d, batch_first=True)

        # shape of u_d - [batch_size*max_s, s_len, 2*hidden_size]

        s_len = u_d.shape[1]

        # shape of u_d - [batch_size, max_s, s_len, 2*hidden_size]
        u_d = u_d.view(batch_size, max_s, s_len, 2 * hidden_size)
        seq_len = seq_len.view(batch_size, max_s)

        #
        # Co-attention and Fusion
        #

        # shape of u_q_temp - [batch_size, q_len, 2*hidden_size]
        u_q_temp = F.relu(self.linear1(u_q))

        # shape of u_d_temp - [batch_size, max_s, s_len, 2*hidden_size]
        u_d_temp = F.relu(self.linear2(u_d))

        u_q_temp2 = u_q_temp.transpose(
            1, 2)  # shape - [batch_size, 2*hidden_size, q_len]

        s = torch.matmul(u_d_temp,
                         u_q_temp2.view(batch_size, 1, 2 * hidden_size, q_len))

        alpha = F.softmax(s, dim=3)  # shape -[batch_size, max_s, s_len, q_len]

        u_d_att = torch.matmul(
            alpha, u_q_temp.view(batch_size, 1, q_len, 2 * hidden_size))

        #shape - [batch_size, max_s, s_len, 2*hidden_size]

        v_d = self.fuse_linear1(torch.cat(
            [u_d, u_d_att],
            dim=3))  #shape - [batch_size, max_s, s_len, 2*hidden_size]

        #
        # Self-attention and Fusion
        #

        v_d_temp = self.linear3(v_d)

        s = torch.matmul(v_d_temp, v_d_temp.transpose(2, 3))

        beta = F.softmax(s, dim=3)  # shape - [batch_size, max_s, s_len, s_len]

        v_d_att = torch.matmul(beta, v_d)

        d_d = self.fuse_linear2(torch.cat(
            [v_d, v_d_att],
            dim=3))  #shape - [batch_size, max_s, s_len, 2*hidden_size]

        #
        # Self-align for query
        #

        s = self.align_linear1(u_q).view(batch_size, q_len)

        gamma = F.softmax(s, dim=1)

        r_q = torch.matmul(gamma.view(batch_size, 1, q_len), u_q)
        r_q = r_q.view(batch_size, 2 * hidden_size)

        #
        #	SENTENCE RANKING
        #

        #shape of d_d - [batch_size, max_s, s_len, 2*hidden_size]
        s = self.align_linear2(d_d).view(batch_size, max_s, s_len)
        mu = F.softmax(s, dim=2)

        r_d = torch.matmul(mu.view(batch_size, max_s, 1, s_len), d_d)
        r_d = r_d.view(batch_size, max_s, 2 * hidden_size)

        r_d2 = r_d.transpose(0, 1)
        s_d = torch.zeros(max_s, batch_size, device=device)
        for n in range(max_s):
            s_d[n] = self.bilinear(r_q, r_d2[n]).view(batch_size)

        s_d_norm = torch.sigmoid(s_d)

        s_d_norm2 = s_d_norm.transpose(0, 1)

        return s_d_norm2
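A minimal sketch (illustrative shapes only) of the co-attention step above: score every sentence token against every query token, softmax over the query axis, and mix the query representations into sentence-aligned vectors.

import torch
import torch.nn.functional as F

batch, max_s, s_len, q_len, h2 = 2, 3, 6, 4, 8
u_d = torch.randn(batch, max_s, s_len, h2)        # sentence token representations
u_q = torch.randn(batch, q_len, h2)               # query token representations

scores = torch.matmul(u_d, u_q.transpose(1, 2).unsqueeze(1))  # (batch, max_s, s_len, q_len)
alpha = F.softmax(scores, dim=3)                              # attend over query tokens
u_d_att = torch.matmul(alpha, u_q.unsqueeze(1))               # (batch, max_s, s_len, h2)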
Example #38
    def forward(self, word, word_mask, wordchars, wordchars_mask, upos, xpos,
                ufeats, pretrained, lemma, head, deprel, word_orig_idx,
                sentlens, wordlens):
        def pack(x):
            return pack_padded_sequence(x, sentlens, batch_first=True)

        inputs = []
        if self.args['pretrain']:
            pretrained_emb = self.pretrained_emb(pretrained)
            pretrained_emb = self.trans_pretrained(pretrained_emb)
            pretrained_emb = pack(pretrained_emb)
            inputs += [pretrained_emb]

        #def pad(x):
        #    return pad_packed_sequence(PackedSequence(x, pretrained_emb.batch_sizes), batch_first=True)[0]

        if self.args['word_emb_dim'] > 0:
            word_emb = self.word_emb(word)
            word_emb = pack(word_emb)
            lemma_emb = self.lemma_emb(lemma)
            lemma_emb = pack(lemma_emb)
            inputs += [word_emb, lemma_emb]

        if self.args['tag_emb_dim'] > 0:
            pos_emb = self.upos_emb(upos)

            if isinstance(self.vocab['xpos'], CompositeVocab):
                for i in range(len(self.vocab['xpos'])):
                    pos_emb += self.xpos_emb[i](xpos[:, :, i])
            else:
                pos_emb += self.xpos_emb(xpos)
            pos_emb = pack(pos_emb)

            feats_emb = 0
            for i in range(len(self.vocab['feats'])):
                feats_emb += self.ufeats_emb[i](ufeats[:, :, i])
            feats_emb = pack(feats_emb)

            inputs += [pos_emb, feats_emb]

        if self.args['char'] and self.args['char_emb_dim'] > 0:
            char_reps = self.charmodel(wordchars, wordchars_mask,
                                       word_orig_idx, sentlens, wordlens)
            char_reps = PackedSequence(
                self.trans_char(self.drop(char_reps.data)),
                char_reps.batch_sizes)
            inputs += [char_reps]

        lstm_inputs = torch.cat([x.data for x in inputs], 1)

        lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
        lstm_inputs = self.drop(lstm_inputs)

        lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

        lstm_outputs, _ = self.parserlstm(
            lstm_inputs,
            sentlens,
            hx=(self.parserlstm_h_init.expand(
                2 * self.args['num_layers'], word.size(0),
                self.args['hidden_dim']).contiguous(),
                self.parserlstm_c_init.expand(
                    2 * self.args['num_layers'], word.size(0),
                    self.args['hidden_dim']).contiguous()))
        lstm_outputs, _ = pad_packed_sequence(lstm_outputs, batch_first=True)

        unlabeled_scores = self.unlabeled(self.drop(lstm_outputs),
                                          self.drop(lstm_outputs)).squeeze(3)
        deprel_scores = self.deprel(self.drop(lstm_outputs),
                                    self.drop(lstm_outputs))

        #goldmask = head.new_zeros(*head.size(), head.size(-1)+1, dtype=torch.uint8)
        #goldmask.scatter_(2, head.unsqueeze(2), 1)

        if self.args['linearization'] or self.args['distance']:
            head_offset = torch.arange(word.size(1), device=head.device).view(
                1, 1, -1).expand(word.size(0), -1, -1) - torch.arange(
                    word.size(1), device=head.device).view(1, -1, 1).expand(
                        word.size(0), -1, -1)

        if self.args['linearization']:
            lin_scores = self.linearization(self.drop(lstm_outputs),
                                            self.drop(lstm_outputs)).squeeze(3)
            unlabeled_scores += F.logsigmoid(
                lin_scores * torch.sign(head_offset).float()).detach()

        if self.args['distance']:
            dist_scores = self.distance(self.drop(lstm_outputs),
                                        self.drop(lstm_outputs)).squeeze(3)
            dist_pred = 1 + F.softplus(dist_scores)
            dist_target = torch.abs(head_offset)
            dist_kld = -torch.log((dist_target.float() - dist_pred)**2 / 2 + 1)
            unlabeled_scores += dist_kld.detach()

        diag = torch.eye(head.size(-1) + 1,
                         dtype=torch.bool,
                         device=head.device).unsqueeze(0)
        unlabeled_scores.masked_fill_(diag, -float('inf'))

        preds = []

        if self.training:
            unlabeled_scores = unlabeled_scores[:,
                                                1:, :]  # exclude attachment for the root symbol
            unlabeled_scores = unlabeled_scores.masked_fill(
                word_mask.unsqueeze(1), -float('inf'))
            unlabeled_target = head.masked_fill(word_mask[:, 1:], -1)
            loss = self.crit(
                unlabeled_scores.contiguous().view(-1,
                                                   unlabeled_scores.size(2)),
                unlabeled_target.view(-1))

            deprel_scores = deprel_scores[:,
                                          1:]  # exclude attachment for the root symbol
            #deprel_scores = deprel_scores.masked_select(goldmask.unsqueeze(3)).view(-1, len(self.vocab['deprel']))
            deprel_scores = torch.gather(
                deprel_scores, 2,
                head.unsqueeze(2).unsqueeze(3).expand(
                    -1, -1, -1,
                    len(self.vocab['deprel']))).view(-1,
                                                     len(self.vocab['deprel']))
            deprel_target = deprel.masked_fill(word_mask[:, 1:], -1)
            loss += self.crit(deprel_scores.contiguous(),
                              deprel_target.view(-1))

            if self.args['linearization']:
                #lin_scores = lin_scores[:, 1:].masked_select(goldmask)
                lin_scores = torch.gather(lin_scores[:, 1:], 2,
                                          head.unsqueeze(2)).view(-1)
                lin_scores = torch.cat([
                    -lin_scores.unsqueeze(1) / 2,
                    lin_scores.unsqueeze(1) / 2
                ], 1)
                #lin_target = (head_offset[:, 1:] > 0).long().masked_select(goldmask)
                lin_target = torch.gather((head_offset[:, 1:] > 0).long(), 2,
                                          head.unsqueeze(2))
                loss += self.crit(lin_scores.contiguous(), lin_target.view(-1))

            if self.args['distance']:
                #dist_kld = dist_kld[:, 1:].masked_select(goldmask)
                dist_kld = torch.gather(dist_kld[:, 1:], 2, head.unsqueeze(2))
                loss -= dist_kld.sum()

            loss /= wordchars.size(0)  # number of words
        else:
            loss = 0
            preds.append(
                F.log_softmax(unlabeled_scores, 2).detach().cpu().numpy())
            preds.append(deprel_scores.max(3)[1].detach().cpu().numpy())

        return loss, preds
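A minimal sketch of the PackedSequence rewrapping used above: apply a transform to the flat `.data` tensor and reuse the original `batch_sizes`, so the packing layout is preserved.

import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence

x = torch.randn(3, 5, 12)                        # (batch, max_len, feat)
lengths = torch.tensor([5, 4, 2])
packed = pack_padded_sequence(x, lengths, batch_first=True)

proj = nn.Linear(12, 8)
packed = PackedSequence(proj(packed.data), packed.batch_sizes)  # still a valid packing

padded, _ = pad_packed_sequence(packed, batch_first=True)       # (3, 5, 8)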
    def forward(self, data, id_):
        #         features = self.LL(features)
        batch_size = 1

        if (data[id_][3] == 0):
            #             print "data"
            features_fast = self.get_imputed_feats(
                data[id_][0], data[id_][1], data[id_][2],
                self.dict_selected_feats_fast, self.imputation_layer_in_fast,
                len(self.fast_features_indexes))
            lengths_fast = [features_fast.shape[1]]
            lengths_fast = torch.cuda.LongTensor(lengths_fast)
            lengths_fast = autograd.Variable(lengths_fast)
            packed_fast = pack_padded_sequence(features_fast,
                                               lengths_fast,
                                               batch_first=True)
            self.hidden_fast = self.init_hidden(batch_size,
                                                self.hidden_dim_fast)
            packed_output_fast, self.hidden_fast = self.lstm_fast(
                packed_fast, self.hidden_fast)
            lstm_out_fast = pad_packed_sequence(packed_output_fast,
                                                batch_first=True)[0]
            if (self.attn_category == 'dot'):
                pad_attn_fast = self.attn_fast(
                    (lstm_out_fast, torch.cuda.LongTensor(lengths_fast)))
        else:
            pad_attn_fast = torch.cuda.FloatTensor(
                np.zeros([1, self.hidden_dim_fast]))

        if (data[id_][7] == 0):
            #             print "data"
            features_slow = self.get_imputed_feats(
                data[id_][4], data[id_][5], data[id_][6],
                self.dict_selected_feats_slow, self.imputation_layer_in_slow,
                len(self.slow_features_indexes))
            lengths_slow = [features_slow.shape[1]]
            lengths_slow = torch.cuda.LongTensor(lengths_slow)
            lengths_slow = autograd.Variable(lengths_slow)
            packed_slow = pack_padded_sequence(features_slow,
                                               lengths_slow,
                                               batch_first=True)
            self.hidden_slow = self.init_hidden(batch_size,
                                                self.hidden_dim_slow)
            packed_output_slow, self.hidden_slow = self.lstm_slow(
                packed_slow, self.hidden_slow)
            lstm_out_slow = pad_packed_sequence(packed_output_slow,
                                                batch_first=True)[0]
            if (self.attn_category == 'dot'):
                pad_attn_slow = self.attn_slow(
                    (lstm_out_slow, torch.cuda.LongTensor(lengths_slow)))
        else:
            pad_attn_slow = torch.cuda.FloatTensor(
                np.zeros([1, self.hidden_dim_slow]))
#             print pad_attn_final.shape
#         else:
# For now this won't work
#             tag_space = self.hidden2tag(lstm_out[:,-1,:])
#         print pad_attn_fast.shape
#         print pad_attn_slow.shape
        pad_attn_final = torch.cat([pad_attn_fast, pad_attn_slow], 1)
        tag_space = self.hidden2tag(pad_attn_final)
        #         print tag_space
        tag_score = F.log_softmax(tag_space, dim=1)
        return tag_score
Example #40
    def forward(self, input_seqs):
        """ Forward pass.

        # Arguments:
            input_seqs: Can be one of Numpy array, Torch.LongTensor, Torch.Variable, Torch.PackedSequence.

        # Return:
            Same format as the input (except that a PackedSequence is returned as a Variable).
        """
        # Check whether the input is a Torch.LongTensor or not a Torch.Variable (assume a NumPy array in that case) and note the format so we can return the same one
        return_numpy = False
        if isinstance(input_seqs, (torch.LongTensor, torch.cuda.LongTensor)):
            input_seqs = Variable(input_seqs)
        elif not isinstance(input_seqs, Variable):
            input_seqs = Variable(
                torch.from_numpy(input_seqs.astype('int64')).long())
            return_numpy = True

        # If we don't have a packed inputs, let's pack it
        reorder_output = False
        if not isinstance(input_seqs, PackedSequence):
            ho = self.lstm_0.weight_hh_l0.data.new(2,
                                                   input_seqs.size()[0],
                                                   self.hidden_size).zero_()
            co = self.lstm_0.weight_hh_l0.data.new(2,
                                                   input_seqs.size()[0],
                                                   self.hidden_size).zero_()

            # Reorder batch by sequence length
            input_lengths = torch.LongTensor([
                torch.max(input_seqs[i, :].data.nonzero()) + 1
                for i in range(input_seqs.size()[0])
            ])
            input_lengths, perm_idx = input_lengths.sort(0, descending=True)
            input_seqs = input_seqs[perm_idx][:, :input_lengths.max()]

            # Pack sequence and work on data tensor to reduce embeddings/dropout computations
            packed_input = pack_padded_sequence(input_seqs,
                                                input_lengths.cpu().numpy(),
                                                batch_first=True)
            reorder_output = True
        else:
            ho = self.lstm_0.weight_hh_l0.data.data.new(
                2,
                input_seqs.size()[0], self.hidden_size).zero_()
            co = self.lstm_0.weight_hh_l0.data.data.new(
                2,
                input_seqs.size()[0], self.hidden_size).zero_()
            input_lengths = input_seqs.batch_sizes
            packed_input = input_seqs

        hidden = (Variable(ho, requires_grad=False),
                  Variable(co, requires_grad=False))

        # Embed with an activation function to bound the values of the embeddings
        x = self.embed(packed_input.data)
        x = nn.Tanh()(x)

        # PyTorch's dropout2d operates on axis 1, which is fine for us
        x = self.embed_dropout(x)

        # Update packed sequence data for RNN
        packed_input = PackedSequence(x, packed_input.batch_sizes)

        # skip-connection from embedding to output eases gradient-flow and allows access to lower-level features
        # the order in which the outputs are merged matters for consistency with the pretrained model
        lstm_0_output, _ = self.lstm_0(packed_input, hidden)
        lstm_1_output, _ = self.lstm_1(lstm_0_output, hidden)

        # Update packed sequence data for attention layer
        packed_input = PackedSequence(
            torch.cat(
                (lstm_1_output.data, lstm_0_output.data, packed_input.data),
                dim=1), packed_input.batch_sizes)

        input_seqs, _ = pad_packed_sequence(packed_input, batch_first=True)

        x, att_weights = self.attention_layer(input_seqs, input_lengths)

        # output class probabilities or penultimate feature vector
        if not self.feature_output:
            x = self.final_dropout(x)
            outputs = self.output_layer(x)
        else:
            outputs = x

        # Reorder output if needed
        if reorder_output:
            reordered = Variable(outputs.data.new(outputs.size()))
            reordered[perm_idx] = outputs
            outputs = reordered

        # Adapt return format if needed
        if return_numpy:
            outputs = outputs.data.numpy()

        if self.return_attention:
            return outputs, att_weights
        else:
            return outputs
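A minimal sketch of how the non-packed branch above recovers sequence lengths from a zero-padded LongTensor; counting non-pad tokens per row gives the same result when padding only appears at the end.

import torch

input_seqs = torch.tensor([[4, 7, 9, 0, 0],
                           [3, 5, 0, 0, 0]])
lengths_a = torch.tensor([int(input_seqs[i].nonzero().max()) + 1
                          for i in range(input_seqs.size(0))])
lengths_b = (input_seqs != 0).sum(dim=1)       # equivalent for right-padded batches
print(lengths_a.tolist(), lengths_b.tolist())  # [3, 2] [3, 2]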
Example #41
    def forward(
        self,
        embedded_tokens: torch.Tensor,
        seq_lengths: torch.Tensor,
        states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Given an input batch of sequential data such as word embeddings, produces
        a bidirectional LSTM representation of the sequential input and new state
        tensors.

        Args:
            embedded_tokens (torch.Tensor): Input tensor of shape
                (bsize x seq_len x input_dim).
            seq_lengths (torch.Tensor): List of sequence lengths of each batch element.
            states (Tuple[torch.Tensor, torch.Tensor]): Tuple of tensors containing
                the initial hidden state and the cell state of each element in
                the batch. Each of these tensors has a dimension of
                (bsize x num_layers * num_directions x nhid). Defaults to `None`.

        Returns:
            Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: Bidirectional
                LSTM representation of input and the state of the LSTM `t = seq_len`.
                Shape of representation is (bsize x seq_len x representation_dim).
                Shape of each state is (bsize x num_layers * num_directions x nhid).

        """
        if self.dropout.p > 0.0:
            embedded_tokens = self.dropout(embedded_tokens)

        if states is not None:
            # convert (h0, c0) from (bsz x num_layers*num_directions x nhid) to
            # (num_layers*num_directions x bsz x nhid)
            states = (
                states[0].transpose(0, 1).contiguous(),
                states[1].transpose(0, 1).contiguous(),
            )
        else:
            # We need to send in a zero state that matches the batch size, because
            # torch.jit tracing currently traces this as constant and therefore
            # locks the traced model into a static batch size.
            # see https://github.com/pytorch/pytorch/issues/16664
            state = torch.zeros(
                self.config.num_layers *
                (2 if self.config.bidirectional else 1),
                embedded_tokens.size(0),  # batch size
                self.config.lstm_dim,
                device=torch.cuda.current_device()
                if cuda.CUDA_ENABLED else None,
            )
            states = (state, state)
        if torch.onnx.is_in_onnx_export():
            lstm_in = [embedded_tokens, states[0], states[1]] + [
                param.detach() for param in self.lstm._flat_weights
            ]
            rep, new_state_0, new_state_1 = torch.ops._caffe2.InferenceLSTM(
                lstm_in,
                self.lstm.num_layers,
                self.lstm.bias,
                True,
                self.lstm.bidirectional,
            )
            new_state = (new_state_0, new_state_1)
        else:
            if self.pack_sequence:
                rnn_input = pack_padded_sequence(embedded_tokens,
                                                 seq_lengths,
                                                 batch_first=True,
                                                 enforce_sorted=False)
            else:
                rnn_input = embedded_tokens
            rep, new_state = self.lstm(rnn_input, states)
            if self.pack_sequence:
                rep, _ = pad_packed_sequence(
                    rep,
                    padding_value=self.padding_value,
                    batch_first=True,
                    total_length=embedded_tokens.size(1),
                )
        # Make sure the output from LSTM is padded to input's sequence length.
        # convert states back to (bsz x num_layers*num_directions x nhid) to be
        # used in data parallel model
        new_state = (new_state[0].transpose(0,
                                            1), new_state[1].transpose(0, 1))
        return rep, new_state
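A minimal sketch of the `total_length` argument used above: without it, pad_packed_sequence only pads to the longest sequence in this (sub-)batch, which breaks downstream code (e.g. data-parallel gathering) that expects the original padded length.

import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

embedded = torch.randn(4, 10, 6)                 # (batch, padded_len=10, feat)
seq_lengths = torch.tensor([7, 5, 3, 2])         # longest real sequence is only 7

lstm = nn.LSTM(6, 8, batch_first=True)
packed = pack_padded_sequence(embedded, seq_lengths, batch_first=True)
rep, _ = lstm(packed)

short, _ = pad_packed_sequence(rep, batch_first=True)                  # (4, 7, 8)
full, _ = pad_packed_sequence(rep, batch_first=True, total_length=10)  # (4, 10, 8)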
criterion = nn.L1Loss()


def set_requires_grad(net, requires_grad=False):
    for param in net.parameters():
        param.requires_grad = requires_grad


step = 0
t = trange(config.epoch)

for epoch in t:
    for l, r in train_loader:
        b_size = l.size(0)

        seq_padded, lens = rnn_utils.pad_packed_sequence(r, batch_first=False)
        max_len = seq_padded.shape[0]
        pad_mask = torch.arange(max_len)[None, :] < lens[:, None]

        seq_padded = seq_padded.to(device)
        pad_mask = ~pad_mask.to(device)

        optimizer_g.zero_grad()

        fake_y = netG(seq_padded, pad_mask)

        y = l.to(device)

        ########################### train G ############################

        loss = criterion(fake_y, y)  # compare against the target already moved to the device
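A minimal sketch of the length-to-mask trick in the loop above, with illustrative names: broadcasting a position index against the per-sequence lengths marks the real tokens, and the negation is the padding mask handed to the generator.

import torch

lens = torch.tensor([5, 3, 2])
max_len = int(lens.max())
valid = torch.arange(max_len)[None, :] < lens[:, None]  # (batch, max_len), True = real token
pad_mask = ~valid                                        # True = padding position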
    def forward(self,
                sentence,
                p_sentence,
                pos_tags,
                lengths,
                target_idx_in,
                region_marks,
                local_roles_voc,
                frames,
                local_roles_mask,
                sent_pred_lemmas_idx,
                dep_tags,
                dep_heads,
                targets,
                specific_dep_tags,
                specific_dep_relations,
                test=False):
        """
        elmo_embedding_0 = self.elmo_embeddings_0(sentence).view(self.batch_size, len(sentence[0]), 1024)
        elmo_embedding_1 = self.elmo_embeddings_1(sentence).view(self.batch_size, len(sentence[0]), 1024)
        w = F.softmax(self.elmo_word, dim=0)
        elmo_emb = self.elmo_gamma_word * (w[0] * elmo_embedding_0 + w[1] * elmo_embedding_1)
        elmo_emb_word = self.elmo_mlp_word(elmo_emb)
        """

        log(sentence)
        log(p_sentence)
        log(pos_tags)
        log(region_marks)
        log(sent_pred_lemmas_idx)

        embeds_DEP = self.word_embeddings_DEP(sentence)
        add_zero = torch.zeros(
            (self.batch_size, 1, self.word_emb_dim)).to(device)
        embeds_DEP = embeds_DEP.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)
        embeds_DEP_cat = torch.cat(
            (self.VR_word_embedding_random + add_zero, embeds_DEP), 1)

        pos_embeds = self.pos_embeddings(pos_tags)
        add_zero = torch.zeros((self.batch_size, 1, 16)).to(device)
        pos_embeds_cat = torch.cat(
            (self.VR_POS_embedding + add_zero, pos_embeds), 1)

        fixed_embeds_DEP = self.word_fixed_embeddings(p_sentence)
        fixed_embeds_DEP = fixed_embeds_DEP.view(self.batch_size,
                                                 len(sentence[0]),
                                                 self.word_emb_dim)
        add_zero = torch.zeros(
            (self.batch_size, 1, self.word_emb_dim)).to(device)
        fixed_embeds_DEP_cat = torch.cat(
            (self.VR_word_embedding + add_zero, fixed_embeds_DEP), 1)
        embeds_forDEP = torch.cat(
            (embeds_DEP_cat, fixed_embeds_DEP_cat, pos_embeds_cat), 2)
        embeds_forDEP = self.DEP_input_dropout(embeds_forDEP)

        # first layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            embeds_forDEP, lengths + 1)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden = self.BiLSTM_0(embeds_sort, self.hidden)
        # hidden states seem to be batch-first already, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_0 = hidden_states[unsort_idx]

        # second_layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            hidden_states_0, lengths + 1)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_2 = self.BiLSTM_1(embeds_sort,
                                                     self.hidden_2)
        # hidden states seem to be batch-first already, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_1 = hidden_states[unsort_idx]

        ##########################################
        Head_hidden = F.relu(self.hidLayerFOH(hidden_states_1))
        Dependent_hidden = F.relu(self.hidLayerFOM(hidden_states_1))

        bias_one = torch.ones(
            (self.batch_size, len(sentence[0]) + 1, 1)).to(device)
        Head_hidden = torch.cat((Head_hidden, Variable(bias_one)), 2)

        bias_one = torch.ones(
            (self.batch_size, len(sentence[0]) + 1, 1)).to(device)
        Dependent_hidden = torch.cat((Dependent_hidden, Variable(bias_one)), 2)

        left_part = torch.mm(
            Dependent_hidden.view(self.batch_size * (len(sentence[0]) + 1),
                                  -1), self.W_R)
        left_part = left_part.view(self.batch_size, (len(sentence[0]) + 1), -1)
        Head_hidden = Head_hidden.view(self.batch_size, (len(sentence[0]) + 1),
                                       -1).transpose(1, 2)
        tag_space = torch.bmm(left_part, Head_hidden).view(
            (len(sentence[0]) + 1) * self.batch_size,
            len(sentence[0]) + 1)

        heads = np.argmax(tag_space.cpu().data.numpy(), axis=1)

        nums = 0.0
        wrong_nums = 0.0
        for a, b in zip(heads, dep_heads.flatten()):
            if b == -1:
                continue
            nums += 1
            if a != b:
                wrong_nums += 1

        loss_function = nn.CrossEntropyLoss(ignore_index=-1)
        DEPloss = loss_function(
            tag_space,
            torch.from_numpy(dep_heads).to(device).view(-1))

        #+++++++++++++++++++++++++++++++++++++++++++++++++++++
        Head_hidden_tag = F.relu(self.hidLayerFOH_tag(hidden_states_1))
        Dependent_hidden_tag = F.relu(self.hidLayerFOM_tag(hidden_states_1))

        bias_one = torch.ones(
            (self.batch_size, len(sentence[0]) + 1, 1)).to(device)
        Head_hidden_tag = torch.cat((Head_hidden_tag, Variable(bias_one)), 2)

        bias_one = torch.ones(
            (self.batch_size, len(sentence[0]) + 1, 1)).to(device)
        Dependent_hidden_tag = torch.cat(
            (Dependent_hidden_tag, Variable(bias_one)), 2)

        left_part = torch.mm(
            Dependent_hidden_tag.view(self.batch_size * (len(sentence[0]) + 1),
                                      -1), self.W_R_tag)
        left_part = left_part.view(self.batch_size,
                                   (len(sentence[0]) + 1) * self.dep_size, -1)
        Head_hidden_tag = Head_hidden_tag.view(self.batch_size,
                                               (len(sentence[0]) + 1),
                                               -1).transpose(1, 2)
        tag_space_tag = torch.bmm(left_part, Head_hidden_tag).view(
            (len(sentence[0]) + 1) * self.batch_size, self.dep_size,
            len(sentence[0]) + 1).transpose(1, 2)

        tag_space_tag = tag_space_tag[np.arange(0, (len(sentence[0]) + 1) *
                                                self.batch_size),
                                      dep_heads.flatten()]
        tag_space_tag = tag_space_tag.view(
            (len(sentence[0]) + 1) * self.batch_size, -1)
        heads_tag = np.argmax(tag_space_tag.cpu().data.numpy(), axis=1)

        nums_tag = 0.0
        wrong_nums_tag = 0.0
        for a, b in zip(heads_tag, dep_tags.view(-1).cpu().data.numpy()):
            if b == -1 or b == 0:
                continue
            nums_tag += 1
            if a != b:
                wrong_nums_tag += 1

        loss_function = nn.CrossEntropyLoss(ignore_index=0)
        DEPloss_tag = loss_function(tag_space_tag, dep_tags.view(-1))

        h_layer_0 = hidden_states_0[:, 1:]  # .detach()
        h_layer_1 = hidden_states_1[:, 1:]  # .detach()

        w = F.softmax(self.elmo_w, dim=0)
        SRL_composer = self.elmo_gamma * (w[0] * h_layer_0 + w[1] * h_layer_1)
        SRL_composer = self.elmo_mlp(SRL_composer)

        fixed_embeds = self.word_fixed_embeddings(p_sentence)
        fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]),
                                         self.word_emb_dim)
        sent_pred_lemmas_embeds = self.p_lemma_embeddings(sent_pred_lemmas_idx)
        embeds_SRL = self.word_embeddings_SRL(sentence)
        embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)

        region_marks = self.region_embeddings(region_marks).view(
            self.batch_size, len(sentence[0]), 16)
        SRL_hidden_states = torch.cat(
            (embeds_SRL, fixed_embeds, sent_pred_lemmas_embeds, pos_embeds,
             region_marks, SRL_composer), 2)
        SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

        # SRL layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            SRL_hidden_states, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort.cpu().numpy(),
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_4 = self.BiLSTM_SRL(embeds_sort,
                                                       self.hidden_4)
        # hidden states seem to be batch-first already, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states = hidden_states[unsort_idx]
        hidden_states = self.hidden_state_dropout(hidden_states)

        # B * H
        hidden_states_3 = hidden_states
        predicate_embeds = hidden_states_3[
            np.arange(0,
                      hidden_states_3.size()[0]), target_idx_in]
        # T * B * H
        added_embeds = torch.zeros(hidden_states_3.size()[1],
                                   hidden_states_3.size()[0],
                                   hidden_states_3.size()[2]).to(device)
        predicate_embeds = added_embeds + predicate_embeds
        # B * T * H
        predicate_embeds = predicate_embeds.transpose(0, 1)
        hidden_states = torch.cat((hidden_states_3, predicate_embeds), 2)
        # print(hidden_states)
        # non-linear map and rectify the roles' embeddings
        # roles = Variable(torch.from_numpy(np.arange(0, self.tagset_size)))

        # B * roles
        # log(local_roles_voc)
        # log(frames)

        # B * roles * h
        role_embeds = self.role_embeddings(local_roles_voc)
        frame_embeds = self.frame_embeddings(frames)

        role_embeds = torch.cat((role_embeds, frame_embeds), 2)
        mapped_roles = F.relu(self.role_map(role_embeds))
        mapped_roles = torch.transpose(mapped_roles, 1, 2)

        # b, times, roles
        tag_space = torch.matmul(hidden_states, mapped_roles)
        #tag_space = hidden_states.mm(mapped_roles)

        # b, roles
        #sub = torch.div(torch.add(local_roles_mask, -1.0), _BIG_NUMBER)
        sub = torch.add(local_roles_mask, -1.0) * _BIG_NUMBER
        sub = torch.FloatTensor(sub.cpu().numpy()).to(device)
        # b, roles, times
        tag_space = torch.transpose(tag_space, 0, 1)
        tag_space += sub
        # b, T, roles
        tag_space = torch.transpose(tag_space, 0, 1)
        tag_space = tag_space.view(len(sentence[0]) * self.batch_size, -1)

        SRLprobs = F.softmax(tag_space, dim=1)

        targets = targets.view(-1)

        loss_function = nn.CrossEntropyLoss(ignore_index=0)

        SRLloss = loss_function(tag_space, targets)

        loss = SRLloss + DEPloss + DEPloss_tag
        return SRLloss, DEPloss, DEPloss_tag, loss, SRLprobs, wrong_nums, nums, wrong_nums, nums,  \
               wrong_nums, nums, nums,\
               wrong_nums_tag, nums_tag, nums_tag
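A minimal sketch, with illustrative shapes, of the biaffine arc scoring above: append a bias column of ones to both projections, multiply the dependent side by a weight matrix, then take a batched product against the head side to get one score per (dependent, head) pair.

import torch
import torch.nn.functional as F

batch, T, h = 2, 4, 5
dep = torch.cat((torch.randn(batch, T, h), torch.ones(batch, T, 1)), 2)   # (batch, T, h+1)
head = torch.cat((torch.randn(batch, T, h), torch.ones(batch, T, 1)), 2)  # (batch, T, h+1)
W = torch.randn(h + 1, h + 1)

left = torch.mm(dep.view(batch * T, -1), W).view(batch, T, -1)            # (batch, T, h+1)
scores = torch.bmm(left, head.transpose(1, 2))                            # (batch, T, T)
probs = F.softmax(scores, dim=2)                                          # head distribution per token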
Example #44
def pad_unsort_packed_sequence(input, inv_ix):
    tmp, _ = pad_packed_sequence(input, batch_first=True)
    tmp = tmp[inv_ix]
    return tmp
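A hedged usage sketch for the helper above, assuming the caller sorted the batch by length before packing and kept `inv_ix`, the inverse of the sorting permutation.

import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence

x = torch.randn(3, 6, 4)
lengths = torch.tensor([2, 6, 4])

sorted_lengths, ix = lengths.sort(0, descending=True)
_, inv_ix = ix.sort(0)                                   # inverse permutation

packed = pack_padded_sequence(x[ix], sorted_lengths, batch_first=True)
out, _ = nn.LSTM(4, 7, batch_first=True)(packed)
restored = pad_unsort_packed_sequence(out, inv_ix)       # (3, 6, 7), original batch order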
Example #45
    def forward(self, src_tokens, src_lengths):
        if LanguagePairDataset.LEFT_PAD_SOURCE:
            # convert left-padding to right-padding
            src_tokens = utils.convert_padding_direction(src_tokens,
                                                         self.padding_idx,
                                                         left_to_right=True)
        if self.word_dropout_module is not None:
            src_tokens = self.word_dropout_module(src_tokens)
        bsz, seqlen = src_tokens.size()

        # embed tokens
        x = self.embed_tokens(src_tokens)
        x = F.dropout(x, p=self.dropout_in, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # Generate packed seq to deal with varying source seq length
        packed_input, batch_sizes = pack_padded_sequence(x, src_lengths)
        final_hiddens, final_cells = [], []
        next_hiddens = []
        for i, rnn_layer in enumerate(self.layers):
            current_hidden_size = (self.hidden_dim //
                                   2 if rnn_layer.is_bidirectional else
                                   self.hidden_dim)

            if self.cell_type in ["lstm", "milstm", "layer_norm_lstm"]:
                prev_hidden = (
                    x.new(bsz, current_hidden_size).zero_(),
                    x.new(bsz, current_hidden_size).zero_(),
                )
            else:
                raise Exception(f"{self.cell_type} not implemented")

            hidden, current_output = rnn_layer.forward(packed_input,
                                                       prev_hidden,
                                                       batch_sizes)
            next_hiddens.append(hidden)
            prev_hidden = next_hiddens[-1]

            if self.dropout_out != 0:
                current_output = F.dropout(current_output,
                                           p=self.dropout_out,
                                           training=self.training)

            if self.residual_level is not None and i >= self.residual_level:
                packed_input = packed_input.clone() + current_output
            else:
                packed_input = current_output

        final_hiddens, final_cells = zip(*next_hiddens)
        # Reshape to [num_layer, batch_size, hidden_dim]
        final_hiddens = torch.cat(final_hiddens,
                                  dim=0).view(self.num_layers,
                                              *final_hiddens[0].size())
        final_cells = torch.cat(final_cells,
                                dim=0).view(self.num_layers,
                                            *final_cells[0].size())

        #  [max_seqlen, batch_size, hidden_dim]
        unpacked_output, _ = pad_packed_sequence(
            PackedSequence(packed_input, batch_sizes),
            padding_value=self.padding_value)

        return (unpacked_output, final_hiddens, final_cells, src_lengths,
                src_tokens)
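A minimal pure-PyTorch sketch (not fairseq's implementation) of what `convert_padding_direction` is asked to do above: move each row's padding from the left side to the right side before packing.

import torch

def left_pad_to_right_pad(tokens: torch.Tensor, padding_idx: int) -> torch.Tensor:
    # Move every row's pad tokens from the front to the back, keeping token order.
    out = torch.full_like(tokens, padding_idx)
    for i, row in enumerate(tokens):
        kept = row[row != padding_idx]
        out[i, :kept.numel()] = kept
    return out

src = torch.tensor([[0, 0, 5, 6, 7],
                    [0, 3, 4, 8, 9]])
print(left_pad_to_right_pad(src, padding_idx=0))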
Example #46
    def forward(self,
                sentence,
                p_sentence,
                pos_tags,
                lengths,
                target_idx_in,
                region_marks,
                local_roles_voc,
                frames,
                local_roles_mask,
                sent_pred_lemmas_idx,
                dep_tags,
                dep_heads,
                targets,
                P_identification,
                all_l_ids,
                Predicate_link,
                Predicate_Labels_nd,
                Predicate_Labels,
                unlabeled_sentence=None,
                p_unlabeled_sentence=None,
                unlabeled_lengths=None,
                test=False,
                cvt_train=False):

        if cvt_train:
            CVT_SRL_Loss = self.CVT_train(unlabeled_sentence,
                                          p_unlabeled_sentence,
                                          unlabeled_lengths)
            return CVT_SRL_Loss
        """
        perform predicate identification first
        """
        Predicate_Identification_Space = self.Predicate_Id(
            sentence, p_sentence, lengths)

        # +++++++++++++++++++++++
        wrong_l_nums = 0.0
        all_l_nums = 0.0

        right_noNull_predict = 0.0
        noNull_predict = 0.0
        noNUll_truth = 0.0

        PI_labels = np.argmax(
            Predicate_Identification_Space.cpu().data.numpy(), axis=1)
        for predict_l, gold_l in zip(
                PI_labels,
                P_identification.cpu().view(-1).data.numpy()):
            if predict_l > 1 and gold_l != 0:
                noNull_predict += 1
            if gold_l != 0:
                all_l_nums += 1
                if gold_l != 1:
                    noNUll_truth += 1
                    if gold_l == predict_l:
                        right_noNull_predict += 1
            if predict_l != gold_l and gold_l != 0:
                wrong_l_nums += 1
        """
        construct DEP_input
        """
        # construct input for DEP
        embeds_DEP = self.word_embeddings_DEP(sentence)
        embeds_DEP = embeds_DEP.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)
        region_marks = self.region_embeddings(region_marks).view(
            self.batch_size, len(sentence[0]), 16)
        # sharing pretrained word_embeds
        fixed_embeds_DEP = self.word_fixed_embeddings_DEP(p_sentence)
        fixed_embeds_DEP = fixed_embeds_DEP.view(self.batch_size,
                                                 len(sentence[0]),
                                                 self.word_emb_dim)

        embeds_forDEP = torch.cat((embeds_DEP, fixed_embeds_DEP, region_marks),
                                  2)
        embeds_forDEP = self.DEP_input_dropout(embeds_forDEP)

        # first layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            embeds_forDEP, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_DEP_base = self.BiLSTM_0(
            embeds_sort, self.hidden_DEP_base)
        # hidden states seem to be batch-first already, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_0 = hidden_states[unsort_idx]

        # second_layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            hidden_states_0, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_DEP = self.BiLSTM_DEP(
            embeds_sort, self.hidden_DEP)
        # hidden states seem to be batch-first already, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_1 = hidden_states[unsort_idx]

        ###########################################
        hidden_states_3 = self.hidden_state_dropout_DEP(hidden_states_1)
        hidden_states_word = self.dropout_1_DEP(
            F.relu(self.Non_Predicate_Proj_DEP(hidden_states_3)))
        predicate_embeds = hidden_states_3[
            np.arange(0,
                      hidden_states_3.size()[0]), target_idx_in]
        hidden_states_predicate = self.dropout_2_DEP(
            F.relu(self.Predicate_Proj_DEP(predicate_embeds)))

        bias_one = torch.ones(
            (self.batch_size, len(sentence[0]), 1)).to(device)
        hidden_states_word = torch.cat(
            (hidden_states_word, Variable(bias_one)), 2)

        left_part = torch.mm(
            hidden_states_word.view(self.batch_size * len(sentence[0]), -1),
            self.W_R_DEP)
        left_part = left_part.view(self.batch_size,
                                   len(sentence[0]) * self.dep_size, -1)
        hidden_states_predicate = hidden_states_predicate.view(
            self.batch_size, -1, 1)
        tag_space_DEP = torch.bmm(left_part, hidden_states_predicate).view(
            len(sentence[0]) * self.batch_size, -1)

        # +++++++++++++++++++++++
        h_layer_0 = hidden_states_0  # .detach()
        h_layer_1 = hidden_states_1  # .detach()

        w = F.softmax(self.elmo_w, dim=0)
        SRL_composer = self.elmo_gamma * (w[0] * h_layer_0 + w[1] * h_layer_1)
        SRL_composer = self.elmo_mlp(SRL_composer)

        fixed_embeds = self.word_fixed_embeddings(p_sentence)
        fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]),
                                         self.word_emb_dim)
        embeds_SRL = self.word_embeddings_SRL(sentence)
        embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)

        SRL_hidden_states = torch.cat(
            (embeds_SRL, fixed_embeds, region_marks, SRL_composer), 2)
        SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

        # SRL layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            SRL_hidden_states, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_SRL_base = self.BiLSTM_1(
            embeds_sort, self.hidden_SRL_base)
        # hidden states seem to be batch-first already, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_0 = hidden_states[unsort_idx]

        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            hidden_states_0, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort.cpu().numpy(),
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_SRL = self.BiLSTM_SRL(
            embeds_sort, self.hidden_SRL)
        # hidden states seem to be batch-first already, so we don't need to swap the dims
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states = hidden_states[unsort_idx]
        hidden_states = self.hidden_state_dropout_SRL(hidden_states)

        # B * H
        hidden_states_3 = hidden_states
        hidden_states_word = self.dropout_1(
            F.relu(self.Non_Predicate_Proj(hidden_states_3)))
        predicate_embeds = hidden_states_3[
            np.arange(0,
                      hidden_states_3.size()[0]), target_idx_in]
        hidden_states_predicate = self.dropout_2(
            F.relu(self.Predicate_Proj(predicate_embeds)))

        bias_one = torch.ones(
            (self.batch_size, len(sentence[0]), 1)).to(device)
        hidden_states_word = torch.cat(
            (hidden_states_word, Variable(bias_one)), 2)

        bias_one = torch.ones((self.batch_size, 1)).to(device)
        hidden_states_predicate = torch.cat(
            (hidden_states_predicate, Variable(bias_one)), 1)

        left_part = torch.mm(
            hidden_states_word.view(self.batch_size * len(sentence[0]), -1),
            self.W_R)
        left_part = left_part.view(self.batch_size,
                                   len(sentence[0]) * self.tagset_size, -1)
        hidden_states_predicate = hidden_states_predicate.view(
            self.batch_size, -1, 1)
        tag_space = torch.bmm(left_part, hidden_states_predicate).view(
            len(sentence[0]) * self.batch_size, -1)
        SRLprobs = F.softmax(tag_space, dim=1)

        right_noNull_predict_DEP = 0.0
        noNull_predict_DEP = 0.0
        noNUll_truth_DEP = 0.0

        PI_labels = np.argmax(tag_space_DEP.cpu().data.numpy(), axis=1)
        for predict_l, gold_l in zip(
                PI_labels,
                Predicate_Labels_nd.cpu().view(-1).data.numpy()):
            if predict_l > 1 and gold_l != 0:
                noNull_predict_DEP += 1
            if gold_l != 0:
                all_l_nums += 1
                if gold_l != 1:
                    noNUll_truth_DEP += 1
                    if gold_l == predict_l:
                        right_noNull_predict_DEP += 1
            if predict_l != gold_l and gold_l != 0:
                wrong_l_nums += 1

        loss_function = nn.CrossEntropyLoss(ignore_index=0)

        SRLloss = loss_function(tag_space, targets.view(-1))
        DEPloss = loss_function(tag_space_DEP, Predicate_Labels_nd.view(-1))
        IDloss = loss_function(Predicate_Identification_Space,
                               P_identification.view(-1))

        return SRLloss, IDloss, DEPloss, SRLprobs, wrong_l_nums, all_l_nums, wrong_l_nums, all_l_nums,  \
               right_noNull_predict, noNull_predict, noNUll_truth,\
               right_noNull_predict_DEP, noNull_predict_DEP, noNUll_truth_DEP
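# Hedged sketch (not part of the snippet above): a minimal, self-contained
# illustration of the bilinear scoring used above. Word representations (with a
# bias column appended) are projected through a shared weight W_R and scored
# against the predicate vector with a batched matmul. All sizes are made up for
# illustration only.
import torch

batch_size, seq_len, hid, tagset_size = 2, 5, 8, 4
words = torch.randn(batch_size, seq_len, hid)
predicate = torch.randn(batch_size, hid + 1)                         # bias term already appended
words = torch.cat((words, torch.ones(batch_size, seq_len, 1)), 2)    # (B, T, hid + 1)

W_R = torch.randn(hid + 1, tagset_size * (hid + 1))                  # shared bilinear weight
left_part = torch.mm(words.view(batch_size * seq_len, -1), W_R)
left_part = left_part.view(batch_size, seq_len * tagset_size, -1)
tag_space = torch.bmm(left_part, predicate.unsqueeze(2))             # (B, T * tagset_size, 1)
tag_space = tag_space.view(batch_size * seq_len, tagset_size)
print(tag_space.shape)                                               # torch.Size([10, 4])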
# Example #47
    def forward(self, mode, original_words_batch, word_inputs, feature_inputs,
                word_seq_lengths, char_inputs, char_seq_lengths,
                char_seq_recover, mask):
        """
            input:
                word_inputs: (batch_size, sent_len)
                feature_inputs: [(batch_size, sent_len), ...] list of variables
                word_seq_lengths: list of batch_size, (batch_size,1)
                char_inputs: (batch_size*sent_len, word_length)
                char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
                char_seq_recover: variable which records the char order information, used to recover char order
            output:
                Variable(batch_size, sent_len, hidden_dim)
        """

        word_represent = self.wordrep(original_words_batch, word_inputs,
                                      feature_inputs, word_seq_lengths,
                                      char_inputs, char_seq_lengths,
                                      char_seq_recover, mask)
        ## word_embs (batch_size, seq_len, embed_size)
        if self.word_feature_extractor == "CNN":
            batch_size = word_inputs.size(0)
            word_in = torch.tanh(self.word2cnn(word_represent)).transpose(
                2, 1).contiguous()
            for idx in range(self.cnn_layer):
                if idx == 0:
                    cnn_feature = F.relu(self.cnn_list[idx](word_in))
                else:
                    cnn_feature = F.relu(self.cnn_list[idx](cnn_feature))
                cnn_feature = self.cnn_drop_list[idx](cnn_feature)
                if batch_size > 1:
                    cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature)
            feature_out = cnn_feature.transpose(2, 1).contiguous()
            outputs = self.hidden2tag(feature_out)
        elif self.word_feature_extractor == "LSTM":  # lstm
            packed_words = pack_padded_sequence(word_represent,
                                                word_seq_lengths.cpu().numpy(),
                                                True)
            hidden = None
            lstm_out, hidden = self.lstm(packed_words, hidden)
            lstm_out, _ = pad_packed_sequence(lstm_out)
            ## lstm_out (seq_len, batch_size, hidden_size)
            feature_out = self.droplstm(lstm_out.transpose(1, 0))
            outputs = self.hidden2tag(feature_out)
        ## feature_out (batch_size, seq_len, hidden_size)
        # outputs = self.hidden2tag(feature_out)
        elif self.word_feature_extractor == "MultiCellLSTM":  # MultiCellLSTM
            hidden = None
            # (batch_size, seq_len, cell_num, hidden_size)
            hidden_outputs_forward, cell_states_forward, atten_probs_forward = self.lstm(
                word_represent, mask, hidden)
            if self.bilstm_flag:
                back_hidden = None
                hidden_outputs_back, cell_states_back, atten_probs_back = self.lstm_back(
                    word_represent, mask, back_hidden)
                hidden_outputs = torch.cat(
                    [hidden_outputs_forward, hidden_outputs_back], dim=-1)
                cell_states = torch.cat(
                    [cell_states_forward, cell_states_back], dim=-1)
                atten_probs = (atten_probs_forward + atten_probs_back) / 2
            hidden_outputs = self.droplstm(hidden_outputs)
            cell_states = self.droplstm(cell_states)
            cell_out = self.cell2entity(cell_states)
            if mode == 'LM':
                return hidden_outputs_forward, hidden_outputs_back, cell_out, atten_probs
            elif mode == 'NER':
                outputs = self.hidden2tag(hidden_outputs)
                return outputs, cell_out, atten_probs
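# Hedged sketch (illustration only): a stand-alone version of the CNN branch
# above. Word representations are projected, transposed to (batch, channels,
# time) for Conv1d, run through a stack of ReLU + dropout convolutions, and
# transposed back. Layer sizes and kernel width are assumptions, not the
# model's actual configuration.
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size, seq_len, hidden_dim, cnn_layer = 4, 12, 64, 2
word_represent = torch.randn(batch_size, seq_len, hidden_dim)

word2cnn = nn.Linear(hidden_dim, hidden_dim)
cnn_list = [nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
            for _ in range(cnn_layer)]
drop_list = [nn.Dropout(0.5) for _ in range(cnn_layer)]

cnn_feature = torch.tanh(word2cnn(word_represent)).transpose(2, 1)   # (B, H, T)
for idx in range(cnn_layer):
    cnn_feature = drop_list[idx](F.relu(cnn_list[idx](cnn_feature)))
feature_out = cnn_feature.transpose(2, 1)                            # back to (B, T, H)
print(feature_out.shape)                                             # torch.Size([4, 12, 64])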
    def forward(self, inputs, input_raw, hidden=None):
        """
        forward
        """
        if isinstance(inputs, tuple):
            inputs, lengths = inputs
        else:
            inputs, lengths = inputs, None

        if self.embedder is not None:
            rnn_inputs = self.embedder(inputs)
        else:
            rnn_inputs = inputs

        elmo_embed = self.elmo_embedder.sents2elmo(input_raw)
        elmo_length = [x.shape[0] for x in elmo_embed]
        batch_size_1 = len(elmo_length)
        max_l = max(elmo_length)
        size = (batch_size_1, max_l, 1024)
        tensor_1 = torch.zeros(size, dtype=torch.float)
        for i in range(batch_size_1):
            tensor_1[i][:elmo_length[i]] = torch.tensor(elmo_embed[i])

        elmo_embed = tensor_1.cuda()
        rnn_inputs = torch.cat([rnn_inputs, elmo_embed], dim=-1)

        batch_size = rnn_inputs.size(0)

        if lengths is not None:
            num_valid = lengths.gt(0).int().sum().item()
            sorted_lengths, indices = lengths.sort(descending=True)
            rnn_inputs = rnn_inputs.index_select(0, indices)

            rnn_inputs = pack_padded_sequence(
                rnn_inputs[:num_valid],
                sorted_lengths[:num_valid].tolist(),
                batch_first=True)

            if hidden is not None:
                hidden = hidden.index_select(1, indices)[:, :num_valid]

        outputs, last_hidden = self.rnn(rnn_inputs, hidden)

        if self.bidirectional:
            last_hidden = self._bridge_bidirectional_hidden(last_hidden)

        if lengths is not None:
            outputs, _ = pad_packed_sequence(outputs, batch_first=True)

            if num_valid < batch_size:
                zeros = outputs.new_zeros(batch_size - num_valid,
                                          outputs.size(1), self.hidden_size)
                outputs = torch.cat([outputs, zeros], dim=0)

                zeros = last_hidden.new_zeros(self.num_layers,
                                              batch_size - num_valid,
                                              self.hidden_size)
                last_hidden = torch.cat([last_hidden, zeros], dim=1)

            _, inv_indices = indices.sort()
            outputs = outputs.index_select(0, inv_indices)
            last_hidden = last_hidden.index_select(1, inv_indices)

        return outputs, last_hidden
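# Hedged sketch (illustration only): the sort -> pack -> run -> unpack -> unsort
# pattern used by the encoder above, with a plain nn.GRU standing in for
# self.rnn. Names and sizes are assumptions.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

batch_size, max_len, in_dim, hid_dim = 3, 6, 10, 16
inputs = torch.randn(batch_size, max_len, in_dim)
lengths = torch.tensor([6, 3, 5])

sorted_lengths, indices = lengths.sort(descending=True)
packed = pack_padded_sequence(inputs.index_select(0, indices),
                              sorted_lengths.tolist(), batch_first=True)
rnn = nn.GRU(in_dim, hid_dim, batch_first=True)
outputs, last_hidden = rnn(packed)
outputs, _ = pad_packed_sequence(outputs, batch_first=True)

# Restore the original batch order for both the outputs and the hidden state.
_, inv_indices = indices.sort()
outputs = outputs.index_select(0, inv_indices)
last_hidden = last_hidden.index_select(1, inv_indices)
print(outputs.shape, last_hidden.shape)   # torch.Size([3, 6, 16]) torch.Size([1, 3, 16])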
    def forward(self, word_info, mode, *input_tensor):

        if mode == 'train':
            teacher_forcing = True
            query_batch, query, context_batch, context, doc_batch, document, response_batch, response, p_star_copy = input_tensor
            decoder_ip_batch = response_batch[:, :-1]
        else:
            teacher_forcing = False
            query_batch, query, context_batch, context, doc_batch, document, response_batch, response, target_length = input_tensor
            decoder_ip_batch = response_batch[:, 0]
        query_embed, context_embed, doc_embed, decoder_ip_embed = self.word_embed(
            [query_batch, context_batch, doc_batch, decoder_ip_batch])
        query_encoding, query_repr, doc_encoding, doc_ht, context_encoding, context_ht = self.passage_context_encode(
            query_embed, query, doc_embed, document, context_embed, context)
        doc_unpack, _ = pad_packed_sequence(doc_encoding, batch_first=True)
        context_unpack, _ = pad_packed_sequence(context_encoding,
                                                batch_first=True)
        query_repr, doc_ht = self.check_direction(query_repr, doc_ht)
        query_repr_original = query.get_original_order(query_repr)
        doc_padded_original = document.get_original_order(doc_unpack)
        context_padded_original = context.get_original_order(context_unpack)
        context_repr_original, context_attn_wts, context_attn_scores = self.context_enc(
            context_padded_original, query_repr_original, context.masks)
        doc_repr_original, start_attn_wts, start_attn_scores = self.doc_enc(
            doc_padded_original, context_repr_original, query_repr_original,
            document.masks)
        start_max_score, start_max_ind = torch.max(start_attn_scores, dim=1)
        end_attn_scores = self.span_pred(doc_padded_original,
                                         doc_repr_original, start_attn_wts,
                                         start_max_ind, context_repr_original,
                                         query_repr_original, document)
        end_max_score, end_max_ind = torch.max(end_attn_scores, dim=1)
        hidden_state = doc_ht  # in-place operation check
        if teacher_forcing:
            all_gen_op, all_gen_prob, all_copy_prob, add_ind = self.create_batch_variable(
                decoder_ip_batch.size(1), doc_batch.size(1), mode,
                decoder_ip_batch.size(0))
            for step, batch_data in enumerate(decoder_ip_embed.transpose(0,
                                                                         1)):
                decoder_step_ip = batch_data
                if self.prev_connection and step >= 1:
                    to_put, non_zero_ind = self.get_prev_connection(
                        p_star_copy.transpose(0, 1)[step - 1], doc_batch,
                        end_max_ind, add_ind)
                    if non_zero_ind.size() != torch.LongTensor().size(
                    ) and to_put.size() != torch.LongTensor().size():
                        last_word_embed = self.word_embed([to_put.squeeze(1)
                                                           ])[0]
                        decoder_step_ip.data.index_copy_(
                            0, non_zero_ind.data, last_word_embed.data)
                gen_prob, copy_prob = self.gen_copy_prob(
                    context_repr_original, doc_repr_original, hidden_state)
                output_scores, output_prob, hidden_state = self.decoder(
                    decoder_step_ip, context_repr_original, doc_repr_original,
                    hidden_state)
                all_gen_op[step] = output_scores
                all_gen_prob[step] = gen_prob.squeeze(1)
                all_copy_prob[step] = copy_prob.squeeze(1)
            return all_gen_op.transpose(
                0,
                1), start_attn_scores, end_attn_scores, all_gen_prob.transpose(
                    0, 1), all_copy_prob.transpose(0, 1)
        else:
            all_gen_op, all_gen_prob, all_copy_prob, add_ind, all_top_ind = self.create_batch_variable(
                target_length, doc_batch.size(1), mode,
                decoder_ip_batch.size(0))
            decoder_step_ip = decoder_ip_embed.clone()
            for step in range(target_length):
                gen_prob, copy_prob = self.gen_copy_prob(
                    context_repr_original, doc_repr_original, hidden_state)
                output_scores, output_prob, hidden_state = self.decoder(
                    decoder_step_ip, context_repr_original, doc_repr_original,
                    hidden_state)
                comp_prob = copy_prob > gen_prob
                all_gen_op[step] = output_scores
                all_gen_prob[step] = gen_prob.squeeze(1)
                all_copy_prob[step] = copy_prob.squeeze(1)
                top_elem, top_ind = torch.topk(output_prob, 1, dim=1)
                all_top_ind[step] = top_ind.squeeze()
                changed_top_ind = top_ind.clone()
                if self.prev_connection and step >= 1:
                    to_put, non_zero_ind = self.get_prev_connection(
                        prev_comp_prob, doc_batch, end_max_ind, add_ind)
                    if non_zero_ind.size() != torch.LongTensor().size(
                    ) and to_put.size() != torch.LongTensor().size():
                        changed_top_ind.data.index_copy_(
                            0, non_zero_ind.data, to_put.data)
                decoder_step_ip = self.word_embed([changed_top_ind.squeeze(1)
                                                   ])[0]
                prev_comp_prob = comp_prob.clone().squeeze(1)
            return all_gen_op.transpose(
                0, 1
            ), start_attn_scores, end_attn_scores, start_max_ind, end_max_ind, all_gen_prob.transpose(
                0, 1), all_copy_prob.transpose(0,
                                               1), all_top_ind.transpose(0, 1)
# Example #50
    def forward(
        self, inputs: PackedSequence, states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
    ) -> Tuple[PackedSequence, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Warning: Would be better to use the BiAugmentedLstm class in a regular model

        Given an input batch of sequential data such as word embeddings, produces a single layer unidirectional
        AugmentedLSTM representation of the sequential input and new state tensors.

        # Parameters

        inputs : `PackedSequence`
            `bsize` sequences of shape `(len, input_dim)` each, in PackedSequence format
        states : `Tuple[torch.Tensor, torch.Tensor]`
            Tuple of tensors containing the initial hidden state and
            the cell state of each element in the batch. Each of these tensors have a dimension of
            (1 x bsize x nhid). Defaults to `None`.

        # Returns

        `Tuple[PackedSequence, Tuple[torch.Tensor, torch.Tensor]]`
            AugmentedLSTM representation of input and the state of the LSTM `t = seq_len`.
            Shape of representation is (bsize x seq_len x representation_dim).
            Shape of each state is (1 x bsize x nhid).

        """
        if not isinstance(inputs, PackedSequence):
            raise Exception("inputs must be PackedSequence but got %s" % (type(inputs)))

        sequence_tensor, batch_lengths = pad_packed_sequence(inputs, batch_first=True)
        batch_size = sequence_tensor.size()[0]
        total_timesteps = sequence_tensor.size()[1]
        output_accumulator = sequence_tensor.new_zeros(batch_size, total_timesteps, self.lstm_dim)
        if states is None:
            full_batch_previous_memory = sequence_tensor.new_zeros(batch_size, self.lstm_dim)
            full_batch_previous_state = sequence_tensor.data.new_zeros(batch_size, self.lstm_dim)
        else:
            full_batch_previous_state = states[0].squeeze(0)
            full_batch_previous_memory = states[1].squeeze(0)
        current_length_index = batch_size - 1 if self.go_forward else 0
        if self.recurrent_dropout_probability > 0.0:
            dropout_mask = get_dropout_mask(
                self.recurrent_dropout_probability, full_batch_previous_memory
            )
        else:
            dropout_mask = None

        for timestep in range(total_timesteps):
            index = timestep if self.go_forward else total_timesteps - timestep - 1

            if self.go_forward:
                while batch_lengths[current_length_index] <= index:
                    current_length_index -= 1
            # If we're going backwards, we are _picking up_ more indices.
            else:
                # First conditional: Are we already at the maximum
                # number of elements in the batch?
                # Second conditional: Does the next shortest
                # sequence beyond the current batch
                # index require computation use this timestep?
                while (
                    current_length_index < (len(batch_lengths) - 1)
                    and batch_lengths[current_length_index + 1] > index
                ):
                    current_length_index += 1

            previous_memory = full_batch_previous_memory[0: current_length_index + 1].clone()
            previous_state = full_batch_previous_state[0: current_length_index + 1].clone()
            timestep_input = sequence_tensor[0: current_length_index + 1, index]
            timestep_output, memory = self.cell(
                timestep_input,
                (previous_state, previous_memory),
                dropout_mask[0: current_length_index + 1] if dropout_mask is not None else None,
            )
            full_batch_previous_memory = full_batch_previous_memory.data.clone()
            full_batch_previous_state = full_batch_previous_state.data.clone()
            full_batch_previous_memory[0: current_length_index + 1] = memory
            full_batch_previous_state[0: current_length_index + 1] = timestep_output
            output_accumulator[0: current_length_index + 1, index, :] = timestep_output

        output_accumulator = pack_padded_sequence(
            output_accumulator, batch_lengths, batch_first=True
        )

        # Mimic the pytorch API by returning state in the following shape:
        # (num_layers * num_directions, batch_size, lstm_dim). As this
        # LSTM cannot be stacked, the first dimension here is just 1.
        final_state = (
            full_batch_previous_state.unsqueeze(0),
            full_batch_previous_memory.unsqueeze(0),
        )
        return output_accumulator, final_state
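# Hedged sketch (illustration only): the "active rows" bookkeeping used in the
# loop above. After pad_packed_sequence, batch_lengths says how many sequences
# are still running at each timestep, so only the leading rows of the batch are
# updated. The tensors and lengths here are made up.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lengths = [5, 3, 2]
packed = pack_padded_sequence(torch.randn(3, 5, 4), lengths, batch_first=True)
sequence_tensor, batch_lengths = pad_packed_sequence(packed, batch_first=True)

current_length_index = sequence_tensor.size(0) - 1   # going forward
for index in range(sequence_tensor.size(1)):
    while batch_lengths[current_length_index] <= index:
        current_length_index -= 1
    active_rows = sequence_tensor[0: current_length_index + 1, index]
    print(index, active_rows.size(0))                # 3, 3, 2, 1, 1 active rows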
# Example #51
    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### YOUR CODE HERE (~ 8 Lines)
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.
        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###
        ### See the following docs, as you may need to use some of the following functions in your implementation:
        ###     Pack the padded sequence X before passing to the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Permute:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute
        # 1
        X = self.model_embeddings.source(source_padded)
        #2
        enc_hiddens, (last_hidden, last_cell) = self.encoder(
            pack_padded_sequence(X, lengths=source_lengths))
        enc_hiddens, _ = pad_packed_sequence(enc_hiddens)
        enc_hiddens = enc_hiddens.permute(1, 0, 2)
        #3
        batch_size = enc_hiddens.size(0)
        init_decoder_hidden = self.h_projection(
            last_hidden.permute(1, 0, 2).contiguous().view(
                [batch_size, 2 * self.hidden_size]))
        init_decoder_cell = self.c_projection(
            last_cell.permute(1, 0,
                              2).contiguous().view(batch_size,
                                                   2 * self.hidden_size))
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        ### END YOUR CODE

        return enc_hiddens, dec_init_state
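# Hedged sketch (illustration only): the encode() steps above with a stand-in
# bidirectional LSTM: pack the (sequence-first, length-sorted) embeddings, run
# the encoder, unpack, permute to batch-first, and project the concatenated
# final states. Module names and sizes are assumptions.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

src_len, b, e, h = 7, 3, 12, 16
X = torch.randn(src_len, b, e)        # already-embedded source batch
source_lengths = [7, 5, 4]            # sorted longest to shortest

encoder = nn.LSTM(e, h, bidirectional=True)
h_projection = nn.Linear(2 * h, h, bias=False)
c_projection = nn.Linear(2 * h, h, bias=False)

enc_hiddens, (last_hidden, last_cell) = encoder(
    pack_padded_sequence(X, lengths=source_lengths))
enc_hiddens, _ = pad_packed_sequence(enc_hiddens)
enc_hiddens = enc_hiddens.permute(1, 0, 2)                             # (b, src_len, 2h)

init_decoder_hidden = h_projection(
    last_hidden.permute(1, 0, 2).contiguous().view(b, 2 * h))          # (b, h)
init_decoder_cell = c_projection(
    last_cell.permute(1, 0, 2).contiguous().view(b, 2 * h))            # (b, h)
print(enc_hiddens.shape, init_decoder_hidden.shape)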
    def get_decoded_output(self, decoder_input, hidden, lens):
        # TODO: adapt this
        output, hidden = self.decoder(decoder_input, hidden, lens)
        output, _ = pad_packed_sequence(output)
        output = self.out_layer(output)
        return output, hidden
# Example #53
    def CVT_train(self, sentence, p_sentence, lengths):
        ## start unlabeled training:

        Predicate_Identification_Space = self.Predicate_Id(
            sentence, p_sentence, lengths)

        Predicate_probs = Predicate_Identification_Space.view(
            self.batch_size, len(sentence[0]), -1).cpu().data.numpy()
        Predicate_idx_batch = [0] * self.batch_size
        for i in range(self.batch_size):
            candidate_set = []
            for j in range(len(sentence[0])):
                if j >= lengths[i]:
                    break
                if Predicate_probs[i][j][2] > Predicate_probs[i][j][
                        1] and Predicate_probs[i][j][2] > Predicate_probs[i][
                            j][0]:
                    candidate_set.append(j)
            if len(candidate_set) > 0:
                index = random.sample(candidate_set, 1)
                Predicate_idx_batch[i] = index[0]

        # +++++++++++++++++++++++
        """
        construct DEP_input
        """
        # construct input for DEP
        unlabeled_region_mark = np.zeros(sentence.size(), dtype='int64')
        for i in range(len(unlabeled_region_mark)):
            unlabeled_region_mark[i][Predicate_idx_batch[i]] = 1

        unlabeled_region_mark_in = torch.from_numpy(unlabeled_region_mark).to(
            device)
        region_marks = self.region_embeddings(unlabeled_region_mark_in).view(
            self.batch_size, len(sentence[0]), 16)

        embeds_DEP = self.word_embeddings_DEP(sentence)
        embeds_DEP = embeds_DEP.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)

        # sharing pretrained word_embeds
        fixed_embeds_DEP = self.word_fixed_embeddings_DEP(p_sentence)
        fixed_embeds_DEP = fixed_embeds_DEP.view(self.batch_size,
                                                 len(sentence[0]),
                                                 self.word_emb_dim)

        embeds_forDEP = torch.cat((embeds_DEP, fixed_embeds_DEP, region_marks),
                                  2)
        embeds_forDEP = self.DEP_input_dropout(embeds_forDEP)

        # first layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            embeds_forDEP, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_DEP_base = self.BiLSTM_0(
            embeds_sort, self.hidden_DEP_base)
        # pad_packed_sequence with batch_first=True already returns batch-first tensors, so no transpose is needed
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_0_DEP = hidden_states[unsort_idx]

        # second_layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            hidden_states_0_DEP, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_DEP = self.BiLSTM_DEP(
            embeds_sort, self.hidden_DEP)
        # pad_packed_sequence with batch_first=True already returns batch-first tensors, so no transpose is needed
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_1 = hidden_states[unsort_idx]

        ###########################################

        # +++++++++++++++++++++++
        h_layer_0 = hidden_states_0_DEP.detach()
        h_layer_1 = hidden_states_1.detach()

        w = F.softmax(self.elmo_w, dim=0)
        SRL_composer = self.elmo_gamma * (w[0] * h_layer_0 + w[1] * h_layer_1)
        SRL_composer = self.elmo_mlp(SRL_composer)

        fixed_embeds = self.word_fixed_embeddings(p_sentence)
        fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]),
                                         self.word_emb_dim)
        embeds_SRL = self.word_embeddings_SRL(sentence)
        embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]),
                                     self.word_emb_dim)

        SRL_hidden_states = torch.cat(
            (embeds_SRL, fixed_embeds, region_marks, SRL_composer), 2)
        SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

        # SRL layer
        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            SRL_hidden_states, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort,
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_SRL_base = self.BiLSTM_1(
            embeds_sort, self.hidden_SRL_base)
        # pad_packed_sequence with batch_first=True already returns batch-first tensors, so no transpose is needed
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states_0 = hidden_states[unsort_idx]

        embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
            hidden_states_0, lengths)
        embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                               lengths_sort.cpu().numpy(),
                                               batch_first=True)
        # hidden states [time_steps * batch_size * hidden_units]
        hidden_states, self.hidden_SRL = self.BiLSTM_SRL(
            embeds_sort, self.hidden_SRL)
        # pad_packed_sequence with batch_first=True already returns batch-first tensors, so no transpose is needed
        # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
        hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                      batch_first=True)
        # hidden_states = hidden_states.transpose(0, 1)
        hidden_states = hidden_states[unsort_idx]
        hidden_states = self.hidden_state_dropout_SRL(hidden_states)

        # B * H
        hidden_states_3 = hidden_states
        hidden_states_word = self.dropout_1(
            F.relu(self.Non_Predicate_Proj(hidden_states_3)))
        predicate_embeds = hidden_states_3[
            np.arange(0,
                      hidden_states_3.size()[0]), Predicate_idx_batch]
        hidden_states_predicate = self.dropout_2(
            F.relu(self.Predicate_Proj(predicate_embeds)))

        bias_one = torch.ones(
            (self.batch_size, len(sentence[0]), 1)).to(device)
        hidden_states_word = torch.cat(
            (hidden_states_word, Variable(bias_one)), 2)

        bias_one = torch.ones((self.batch_size, 1)).to(device)
        hidden_states_predicate = torch.cat(
            (hidden_states_predicate, Variable(bias_one)), 1)

        left_part = torch.mm(
            hidden_states_word.view(self.batch_size * len(sentence[0]), -1),
            self.W_R)
        left_part = left_part.view(self.batch_size,
                                   len(sentence[0]) * self.tagset_size, -1)
        hidden_states_predicate = hidden_states_predicate.view(
            self.batch_size, -1, 1)
        tag_space = torch.bmm(left_part, hidden_states_predicate).view(
            self.batch_size, len(sentence[0]), -1)

        ## obtain the teacher probs
        SRLprobs_teacher = tag_space.detach()
        hidden_forward, hidden_backward = hidden_states_0_DEP.split(
            self.hidden_dim, 2)
        CVT_SRL_Loss = self.Semi_SRL_Loss(hidden_forward, hidden_backward,
                                          Predicate_idx_batch, sentence,
                                          SRLprobs_teacher, lengths)

        return CVT_SRL_Loss
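# Hedged sketch (illustration only): the two-layer scalar mixture used above,
# i.e. an ELMo-style weighted sum: softmax-normalized weights over the layer
# outputs, scaled by a learned gamma, then passed through a small MLP. Shapes
# and the MLP definition are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

batch, seq_len, dim = 2, 5, 8
h_layer_0 = torch.randn(batch, seq_len, dim)
h_layer_1 = torch.randn(batch, seq_len, dim)

elmo_w = nn.Parameter(torch.zeros(2))      # learned mixing weights
elmo_gamma = nn.Parameter(torch.ones(1))   # learned overall scale
elmo_mlp = nn.Sequential(nn.Linear(dim, dim), nn.ReLU())

w = F.softmax(elmo_w, dim=0)
SRL_composer = elmo_gamma * (w[0] * h_layer_0 + w[1] * h_layer_1)
SRL_composer = elmo_mlp(SRL_composer)
print(SRL_composer.shape)                  # torch.Size([2, 5, 8])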
# Example #54
    def _lstm_forward(self,
                      inputs: PackedSequence,
                      initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \
            Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
          A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
          A tuple (state, memory) representing the initial hidden state and memory
          of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and
          (num_layers, batch_size, 2 * cell_size) respectively.
        Returns
        -------
        output_sequence : ``torch.FloatTensor``
          The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size)
        final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]``
          The per-layer final (state, memory) states of the LSTM, with shape
          (num_layers, batch_size, 2 * hidden_size) and  (num_layers, batch_size, 2 * cell_size)
          respectively. The last dimension is duplicated because it contains the state/memory
          for both the forward and backward layers.
        """

        if initial_state is None:
            hidden_states: List[Optional[Tuple[torch.Tensor,
                                               torch.Tensor]]] = [None] * len(
                                                   self.forward_layers)
        elif initial_state[0].size()[0] != len(self.forward_layers):
            raise Exception(
                "Initial states were passed to forward() but the number of "
                "initial states does not match the number of layers.")
        else:
            hidden_states = list(
                zip(initial_state[0].split(1, 0), initial_state[1].split(1,
                                                                         0)))

        inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True)
        forward_output_sequence = inputs
        backward_output_sequence = inputs

        final_states = []
        sequence_outputs = []
        for layer_index, state in enumerate(hidden_states):
            forward_layer = getattr(self,
                                    'forward_layer_{}'.format(layer_index))
            backward_layer = getattr(self,
                                     'backward_layer_{}'.format(layer_index))

            forward_cache = forward_output_sequence
            backward_cache = backward_output_sequence

            if state is not None:
                forward_hidden_state, backward_hidden_state = state[0].split(
                    self.hidden_size, 2)
                forward_memory_state, backward_memory_state = state[1].split(
                    self.cell_size, 2)
                forward_state = (forward_hidden_state, forward_memory_state)
                backward_state = (backward_hidden_state, backward_memory_state)
            else:
                forward_state = None
                backward_state = None

            forward_output_sequence, forward_state = forward_layer(
                forward_output_sequence, batch_lengths, forward_state)
            backward_output_sequence, backward_state = backward_layer(
                backward_output_sequence, batch_lengths, backward_state)
            # Skip connections, just adding the input to the output.
            if layer_index != 0:
                forward_output_sequence += forward_cache
                backward_output_sequence += backward_cache

            sequence_outputs.append(
                torch.cat([forward_output_sequence, backward_output_sequence],
                          -1))
            # Append the state tuples in a list, so that we can return
            # the final states for all the layers.
            final_states.append(
                (torch.cat([forward_state[0], backward_state[0]], -1),
                 torch.cat([forward_state[1], backward_state[1]], -1)))

        stacked_sequence_outputs: torch.FloatTensor = torch.stack(
            sequence_outputs)
        # Stack the hidden state and memory for each layer into 2 tensors of shape
        # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size)
        # respectively.
        final_hidden_states, final_memory_states = zip(*final_states)
        final_state_tuple: Tuple[torch.FloatTensor,
                                 torch.FloatTensor] = (torch.cat(
                                     final_hidden_states,
                                     0), torch.cat(final_memory_states, 0))
        return stacked_sequence_outputs, final_state_tuple
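# Hedged sketch (illustration only): the state splitting used above. Stacked
# (state, memory) tensors of shape (num_layers, batch, 2 * size) are split into
# per-layer tuples, and each layer's state is split again into its forward and
# backward halves along the last dimension. Sizes are made up.
import torch

num_layers, batch_size, hidden_size, cell_size = 2, 3, 4, 5
initial_state = (torch.randn(num_layers, batch_size, 2 * hidden_size),
                 torch.randn(num_layers, batch_size, 2 * cell_size))

hidden_states = list(zip(initial_state[0].split(1, 0),
                         initial_state[1].split(1, 0)))
for state in hidden_states:
    forward_hidden, backward_hidden = state[0].split(hidden_size, 2)
    forward_memory, backward_memory = state[1].split(cell_size, 2)
    print(forward_hidden.shape, forward_memory.shape)
    # torch.Size([1, 3, 4]) torch.Size([1, 3, 5])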
# Example #55
    def forward(self, query, keys, keys_length, mask=None):
        """
        Parameters
        ----------
        query: 2D tensor, [B, H]
        keys: (masked_interests), 3D tensor, [b, T, H]
        keys_length: 1D tensor, [B]

        Returns
        -------
        outputs: 2D tensor, [B, H]
        """
        batch_size, dim = query.size()
        max_length = keys.size()[1]

        # filter out zero-length sequences from the batch
        zero_outputs = torch.zeros(batch_size, dim, device=query.device)
        mask = keys_length > 0
        # [B] -> [b]
        keys_length = keys_length[mask]
        if keys_length.shape[0] == 0:
            return zero_outputs

        # [B, H] -> [b, 1, H]
        query = torch.masked_select(query, mask.view(-1,
                                                     1)).view(-1,
                                                              dim).unsqueeze(1)

        if self.gru_type == 'GRU':
            packed_keys = pack_padded_sequence(keys,
                                               lengths=keys_length,
                                               batch_first=True,
                                               enforce_sorted=False)
            packed_interests, _ = self.interest_evolution(packed_keys)
            interests, _ = pad_packed_sequence(packed_interests,
                                               batch_first=True,
                                               padding_value=0.0,
                                               total_length=max_length)
            outputs = self.attention(query, interests,
                                     keys_length.unsqueeze(1))  # [b, 1, H]
            outputs = outputs.squeeze(1)  # [b, H]
        elif self.gru_type == 'AIGRU':
            att_scores = self.attention(query, keys,
                                        keys_length.unsqueeze(1))  # [b, 1, T]
            interests = keys * att_scores.transpose(1, 2)  # [b, T, H]
            packed_interests = pack_padded_sequence(interests,
                                                    lengths=keys_length,
                                                    batch_first=True,
                                                    enforce_sorted=False)
            _, outputs = self.interest_evolution(packed_interests)
            outputs = outputs.squeeze(0)  # [b, H]
        elif self.gru_type == 'AGRU' or self.gru_type == 'AUGRU':
            att_scores = self.attention(
                query, keys, keys_length.unsqueeze(1)).squeeze(1)  # [b, T]
            packed_interests = pack_padded_sequence(keys,
                                                    lengths=keys_length,
                                                    batch_first=True,
                                                    enforce_sorted=False)
            packed_scores = pack_padded_sequence(att_scores,
                                                 lengths=keys_length,
                                                 batch_first=True,
                                                 enforce_sorted=False)
            outputs = self.interest_evolution(packed_interests, packed_scores)
            outputs, _ = pad_packed_sequence(outputs,
                                             batch_first=True,
                                             padding_value=0.0,
                                             total_length=max_length)
            # pick last state
            outputs = InterestEvolving._get_last_state(outputs,
                                                       keys_length)  # [b, H]
        # [b, H] -> [B, H]
        zero_outputs[mask] = outputs
        return zero_outputs
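# Hedged sketch (illustration only): the plain 'GRU' branch above. With
# enforce_sorted=False the batch does not need to be pre-sorted by length, and
# total_length on unpacking pads the result back to the original max_length.
# Values are made up.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

b, max_length, H = 3, 6, 8
keys = torch.randn(b, max_length, H)
keys_length = torch.tensor([4, 6, 2])

gru = nn.GRU(H, H, batch_first=True)
packed_keys = pack_padded_sequence(keys, lengths=keys_length,
                                   batch_first=True, enforce_sorted=False)
packed_interests, _ = gru(packed_keys)
interests, _ = pad_packed_sequence(packed_interests, batch_first=True,
                                   padding_value=0.0, total_length=max_length)
print(interests.shape)   # torch.Size([3, 6, 8])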
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        # def forward(self, args, input_ids=None, attention_mask=None, labels=None):

        # BERT
        outputs = self.bert_layer(input_ids, attention_mask=attention_mask)

        # outputs[0] = sequence output: batch size 6, tokens 512, each token dimension 768
        # outputs[1] = pooled [CLS] output: batch size 6, dimension 768
        # outputs[2] = hidden states of all 13 layers: each batch size 6, tokens 512, dimension 768
        sequence_output = outputs[2]  # hidden states from every layer

        num_layer_sum = 4
        summed_last_4_layers = torch.stack(
            sequence_output[-num_layer_sum:]).mean(0)

        # lstm with masks (same as attention masks)
        packed_input, perm_idx, seq_lengths = get_packed_padded_output(
            summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
        packed_output, (ht, ct) = self.lstm_layer(packed_input)

        # Unpack and reorder the output
        output, input_sizes = pad_packed_sequence(packed_output,
                                                  batch_first=True)
        _, unperm_idx = perm_idx.sort(0)
        lstm_output = output[
            unperm_idx]  # padded only to the longest sequence in the batch, e.g. torch.Size([6, 388, 512])
        seq_lengths_ordered = seq_lengths[unperm_idx]

        # shorten the labels as per the batchsize
        labels = labels[:, :lstm_output.shape[1]]

        # Apply mask before calculating the matmul of inputs and the K, Q, V weights
        attention_mask_ = attention_mask[:, :lstm_output.shape[1]]
        attention_mask_ = attention_mask_.bool()

        # Apply attention here
        attention_applied, attention_weights = self.self_attention(
            lstm_output,
            lstm_output,
            lstm_output,
            key_padding_mask=None,
            need_weights=True,
            attn_mask=None)

        # mask the unimportant tokens before attention is applied
        mask = (
            (input_ids[:, :attention_applied.shape[1]] !=
             self.tokenizer.pad_token_id)
            & (input_ids[:, :attention_applied.shape[1]] !=
               self.tokenizer.convert_tokens_to_ids(self.tokenizer.sep_token))
            & (labels != 100))

        mask_expanded = mask.unsqueeze(-1).expand(attention_applied.size())
        attention_applied *= mask_expanded.float()
        labels *= mask.long()

        # logistic-regression-style projection to tag space
        probability = F.relu(self.hidden2tag(attention_applied))

        # CRF emissions (coarse)
        loss = self.crf_layer(probability,
                              labels,
                              mask=mask,
                              reduction='token_mean',
                              weights=self.weights)

        emissions = self.crf_layer.decode(probability, mask=mask)
        emissions_ = [item for sublist in emissions
                      for item in sublist]  # flatten the nested list of emissions

        # mask labels here according to masks
        labels_masked = labels[mask]

        return loss, emissions_, labels_masked, mask
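# Hedged sketch (an assumption, not the actual helper): one plausible shape for
# get_packed_padded_output() used above. It derives per-sequence lengths from
# the attention mask, sorts by length, and packs the token embeddings for the
# LSTM, keeping perm_idx so the output can later be restored to input order.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

batch, seq_len, dim = 2, 6, 8
token_embeddings = torch.randn(batch, seq_len, dim)
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1, 1]])

seq_lengths = attention_mask.sum(dim=1)
seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
packed_input = pack_padded_sequence(token_embeddings[perm_idx],
                                    seq_lengths.cpu().numpy(),
                                    batch_first=True)
print(seq_lengths.tolist())   # [6, 4]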
# Example #57
    def _get_instr_embedding(self, instr):
        if self.lang_model == 'gru':
            _, hidden = self.instr_rnn(self.word_embedding(instr))
            return hidden[-1]

        elif self.lang_model in ['bigru', 'attgru']:
            lengths = (instr != 0).sum(1).long()
            masks = (instr != 0).float()

            if lengths.shape[0] > 1:
                seq_lengths, perm_idx = lengths.sort(0, descending=True)
                iperm_idx = torch.LongTensor(perm_idx.shape).fill_(0)
                if instr.is_cuda: iperm_idx = iperm_idx.cuda()
                for i, v in enumerate(perm_idx):
                    iperm_idx[v.data] = i

                inputs = self.word_embedding(instr)
                inputs = inputs[perm_idx]

                inputs = pack_padded_sequence(inputs,
                                              seq_lengths.data.cpu().numpy(),
                                              batch_first=True)

                outputs, final_states = self.instr_rnn(inputs)
            else:
                instr = instr[:, 0:lengths[0]]
                outputs, final_states = self.instr_rnn(
                    self.word_embedding(instr))
                iperm_idx = None
            final_states = final_states.transpose(0, 1).contiguous()
            final_states = final_states.view(final_states.shape[0], -1)
            if iperm_idx is not None:
                outputs, _ = pad_packed_sequence(outputs, batch_first=True)
                outputs = outputs[iperm_idx]
                final_states = final_states[iperm_idx]

            if outputs.shape[1] < masks.shape[1]:
                masks = masks[:, :(outputs.shape[1] - masks.shape[1])]
                # packing truncated the sequences to the longest real length,
                # so trim the mask to match

            return outputs if self.lang_model == 'attgru' else final_states

        elif self.lang_model == 'conv':
            inputs = self.word_embedding(instr).unsqueeze(1)  # (B,1,T,D)
            inputs = [
                F.relu(conv(inputs)).squeeze(3) for conv in self.instr_convs
            ]
            inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs]

            return torch.cat(inputs, 1)

        elif self.lang_model == 'bow':
            device = torch.device("cuda" if instr.is_cuda else "cpu")
            input_dim = self.obs_space["instr"]
            input = torch.zeros((instr.size(0), input_dim), device=device)
            idx = torch.arange(instr.size(0), dtype=torch.int64)
            input[idx.unsqueeze(1), instr] = 1.
            return self.instr_bow(input)
        else:
            raise ValueError("Undefined instruction architecture: {}".format(
                self.lang_model))
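# Hedged sketch (illustration only): the 'bow' branch above builds a multi-hot
# bag-of-words encoding by indexing into a zero tensor with the instruction
# token ids, then feeds it through a projection. Dimensions are assumptions.
import torch
import torch.nn as nn

batch, input_dim = 2, 10                 # input_dim plays the role of the vocabulary size
instr = torch.tensor([[1, 3, 3, 7],
                      [2, 0, 0, 0]])

bow = torch.zeros((instr.size(0), input_dim))
idx = torch.arange(instr.size(0), dtype=torch.int64)
bow[idx.unsqueeze(1), instr] = 1.        # set a 1 for every token id that appears
instr_bow = nn.Linear(input_dim, 16)
print(instr_bow(bow).shape)              # torch.Size([2, 16])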
# Example #58
    def forward(self, input, lengths, h_0=None):
        input_packed = pack_padded_sequence(input, lengths=lengths, batch_first=self.batch_first)
        out_packed, h_n = self.rnn(input_packed, h_0)
        out = pad_packed_sequence(out_packed, batch_first=self.batch_first)[0]
        return out.contiguous(), h_n
# Example #59
    def forward(self, words, feats):
        r"""
        Args:
            words (~torch.LongTensor): ``[batch_size, seq_len]``.
                Word indices.
            feats (list[~torch.LongTensor]):
                A list of feat indices.
                The size of indices is ``[batch_size, seq_len, fix_len]`` if feat is ``'char'`` or ``'bert'``,
                or ``[batch_size, seq_len]`` otherwise.

        Returns:
            ~torch.Tensor, ~torch.Tensor, ~torch.Tensor, ~torch.Tensor, ~torch.Tensor:
                Scores of all possible edges of shape ``[batch_size, seq_len, seq_len]``,
                dependent-head-sibling, dependent-head-coparent and dependent-head-grandparent
                triples, each of shape ``[batch_size, seq_len, seq_len, seq_len]``, and
                all possible labels on each edge of shape ``[batch_size, seq_len, seq_len, n_labels]``.
        """

        _, seq_len = words.shape
        # get the mask and lengths of given batch
        mask = words.ne(self.pad_index)
        ext_words = words
        # set the indices larger than num_embeddings to unk_index
        if hasattr(self, 'pretrained'):
            ext_mask = words.ge(self.word_embed.num_embeddings)
            ext_words = words.masked_fill(ext_mask, self.unk_index)

        # get outputs from embedding layers
        word_embed = self.word_embed(ext_words)
        if hasattr(self, 'pretrained'):
            word_embed = torch.cat(
                (word_embed, self.embed_proj(self.pretrained(words))), -1)

        feat_embeds = []
        if 'tag' in self.args.feat:
            feat_embeds.append(self.tag_embed(feats.pop()))
        if 'char' in self.args.feat:
            feat_embeds.append(self.char_embed(feats.pop(0)))
        if 'bert' in self.args.feat:
            feat_embeds.append(self.bert_embed(feats.pop(0)))
        if 'lemma' in self.args.feat:
            feat_embeds.append(self.lemma_embed(feats.pop(0)))
        word_embed, feat_embed = self.embed_dropout(word_embed,
                                                    torch.cat(feat_embeds, -1))
        # concatenate the word and feat representations
        embed = torch.cat((word_embed, feat_embed), -1)

        x = pack_padded_sequence(embed, mask.sum(1), True, False)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, True, total_length=seq_len)
        x = self.lstm_dropout(x)

        # apply MLPs to the BiLSTM output states
        un_d = self.mlp_un_d(x)
        un_h = self.mlp_un_h(x)
        bin_d = self.mlp_bin_d(x)
        bin_h = self.mlp_bin_h(x)
        bin_g = self.mlp_bin_g(x)
        label_d = self.mlp_label_d(x)
        label_h = self.mlp_label_h(x)

        # [batch_size, seq_len, seq_len]
        s_edge = self.edge_attn(un_d, un_h)
        # [batch_size, seq_len, seq_len, seq_len]
        s_sib = self.sib_attn(bin_d, bin_d, bin_h).triu_()
        s_sib = (s_sib + s_sib.transpose(-1, -2)).permute(0, 3, 1, 2)
        # [batch_size, seq_len, seq_len, seq_len]
        s_cop = self.cop_attn(bin_h, bin_d, bin_h).permute(0, 3, 1, 2).triu_()
        s_cop = s_cop + s_cop.transpose(-1, -2)
        # [batch_size, seq_len, seq_len, seq_len]
        s_grd = self.grd_attn(bin_g, bin_d, bin_h).permute(0, 3, 1, 2)
        # [batch_size, seq_len, seq_len, n_labels]
        s_label = self.label_attn(label_d, label_h).permute(0, 2, 3, 1)

        return s_edge, s_sib, s_cop, s_grd, s_label
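# Hedged note (illustration only): the packing call above uses positional
# arguments, i.e. pack_padded_sequence(input, lengths, batch_first=True,
# enforce_sorted=False), and total_length=seq_len on unpacking keeps the padded
# length equal to the original sequence length even when the longest kept
# sequence is shorter. Values below are made up.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

batch_size, seq_len, dim = 2, 8, 4
words = torch.tensor([[2, 5, 7, 1, 0, 0, 0, 0],
                      [3, 4, 0, 0, 0, 0, 0, 0]])
embed = torch.randn(batch_size, seq_len, dim)
mask = words.ne(0)                        # pad index assumed to be 0

x = pack_padded_sequence(embed, mask.sum(1), True, False)
x, _ = pad_packed_sequence(x, True, total_length=seq_len)
print(x.shape)   # torch.Size([2, 8, 4]) even though the longest sequence has length 4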
# Example #60
    def forward(
        self,
        inputs: PackedSequence,  # pylint: disable=arguments-differ
        # pylint: disable=unused-argument
        initial_state: torch.Tensor = None
    ) -> Tuple[PackedSequence, torch.Tensor]:
        """
        Parameters
        ----------
        inputs : ``PackedSequence``, required.
            A batch first ``PackedSequence`` to run the stacked LSTM over.
        initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
            Currently, this is ignored.

        Returns
        -------
        output_sequence : ``PackedSequence``
            The encoded sequence of shape (batch_size, sequence_length, hidden_size)
        final_states: ``torch.Tensor``
            The per-layer final (state, memory) states of the LSTM, each with shape
            (num_layers, batch_size, hidden_size).
        """
        inputs, lengths = pad_packed_sequence(inputs, batch_first=True)

        # Kernel takes sequence length first tensors.
        inputs = inputs.transpose(0, 1)

        sequence_length, batch_size, _ = inputs.size()
        accumulator_shape = [
            self.num_layers, sequence_length + 1, batch_size, self.hidden_size
        ]
        state_accumulator = Variable(
            inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)
        memory_accumulator = Variable(
            inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)

        dropout_weights = inputs.data.new().resize_(
            self.num_layers, batch_size, self.hidden_size).fill_(1.0)
        if self.training:
            # Normalize by 1 - dropout_prob to preserve the output statistics of the layer.
            dropout_weights.bernoulli_(1 - self.recurrent_dropout_probability)\
                .div_((1 - self.recurrent_dropout_probability))

        dropout_weights = Variable(dropout_weights, requires_grad=False)
        gates = Variable(inputs.data.new().resize_(self.num_layers,
                                                   sequence_length, batch_size,
                                                   6 * self.hidden_size))

        lengths_variable = Variable(torch.IntTensor(lengths))
        implementation = _AlternatingHighwayLSTMFunction(
            self.input_size,
            self.hidden_size,
            num_layers=self.num_layers,
            train=self.training)
        output, _ = implementation(inputs, self.weight, self.bias,
                                   state_accumulator, memory_accumulator,
                                   dropout_weights, lengths_variable, gates)

        # TODO(Mark): Also return the state here by using index_select with the lengths so we can use
        # it as a Seq2VecEncoder.
        output = output.transpose(0, 1)
        output = pack_padded_sequence(output, lengths, batch_first=True)
        return output, None
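# Hedged sketch (illustration only): the framing around the custom kernel above.
# The PackedSequence is unpacked to a padded batch-first tensor, moved to
# time-first for the kernel, then transposed back and re-packed so downstream
# modules still see a PackedSequence. A plain nn.LSTM stands in for the
# highway-LSTM kernel here.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

packed = pack_padded_sequence(torch.randn(3, 5, 6), [5, 4, 2], batch_first=True)

inputs, lengths = pad_packed_sequence(packed, batch_first=True)
inputs = inputs.transpose(0, 1)            # kernel expects (time, batch, dim)
output, _ = nn.LSTM(6, 6)(inputs)          # stand-in for the custom kernel
output = output.transpose(0, 1)            # back to batch-first
output = pack_padded_sequence(output, lengths, batch_first=True)
print(type(output).__name__)               # PackedSequence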