def test_augmented_lstm_computes_same_function_as_pytorch_lstm(self):
    augmented_lstm = AugmentedLstm(10, 11)
    pytorch_lstm = LSTM(10, 11, num_layers=1, batch_first=True)
    # Initialize all weights to be == 1.
    initializer = InitializerApplicator([(".*", lambda tensor: torch.nn.init.constant_(tensor, 1.))])
    initializer(augmented_lstm)
    initializer(pytorch_lstm)

    initial_state = torch.zeros([1, 5, 11])
    initial_memory = torch.zeros([1, 5, 11])

    # Use bigger numbers to avoid floating point instability.
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor * 5.,
                                                                self.sequence_lengths)
    lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)

    augmented_output, augmented_state = augmented_lstm(lstm_input, (initial_state, initial_memory))
    pytorch_output, pytorch_state = pytorch_lstm(lstm_input, (initial_state, initial_memory))
    pytorch_output_sequence, _ = pad_packed_sequence(pytorch_output, batch_first=True)
    augmented_output_sequence, _ = pad_packed_sequence(augmented_output, batch_first=True)

    numpy.testing.assert_array_almost_equal(pytorch_output_sequence.data.numpy(),
                                            augmented_output_sequence.data.numpy(), decimal=4)
    numpy.testing.assert_array_almost_equal(pytorch_state[0].data.numpy(),
                                            augmented_state[0].data.numpy(), decimal=4)
    numpy.testing.assert_array_almost_equal(pytorch_state[1].data.numpy(),
                                            augmented_state[1].data.numpy(), decimal=4)
def forward(self,  # pylint: disable=arguments-differ
            inputs: PackedSequence,
            initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
    """
    Parameters
    ----------
    inputs : ``PackedSequence``, required.
        A batch first ``PackedSequence`` to run the stacked LSTM over.
    initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
        A tuple (state, memory) representing the initial hidden state and memory of the LSTM.
        Each tensor has shape (num_layers, batch_size, hidden_size * 2).

    Returns
    -------
    output_sequence : ``PackedSequence``
        The encoded sequence of shape (batch_size, sequence_length, hidden_size * 2).
    final_states : Tuple[torch.Tensor, torch.Tensor]
        The per-layer final (state, memory) states of the LSTM, each with shape
        (num_layers, batch_size, hidden_size * 2).
    """
    if not initial_state:
        hidden_states = [None] * len(self.lstm_layers)
    elif initial_state[0].size()[0] != len(self.lstm_layers):
        raise ConfigurationError("Initial states were passed to forward() but the number of "
                                 "initial states does not match the number of layers.")
    else:
        hidden_states = list(zip(initial_state[0].split(1, 0),
                                 initial_state[1].split(1, 0)))

    output_sequence = inputs
    final_h = []
    final_c = []
    for i, state in enumerate(hidden_states):
        forward_layer = getattr(self, 'forward_layer_{}'.format(i))
        backward_layer = getattr(self, 'backward_layer_{}'.format(i))
        # The state is duplicated to mirror the Pytorch API for LSTMs.
        forward_output, final_forward_state = forward_layer(output_sequence, state)
        backward_output, final_backward_state = backward_layer(output_sequence, state)

        forward_output, lengths = pad_packed_sequence(forward_output, batch_first=True)
        backward_output, _ = pad_packed_sequence(backward_output, batch_first=True)

        output_sequence = torch.cat([forward_output, backward_output], -1)
        output_sequence = pack_padded_sequence(output_sequence, lengths, batch_first=True)

        final_h.extend([final_forward_state[0], final_backward_state[0]])
        final_c.extend([final_forward_state[1], final_backward_state[1]])

    final_h = torch.cat(final_h, dim=0)
    final_c = torch.cat(final_c, dim=0)
    final_state_tuple = (final_h, final_c)
    return output_sequence, final_state_tuple
def forward(self, input, *args):
    args, seq_lengths = args[:-1], args[-1]
    input = rnn_utils.pack_padded_sequence(input, seq_lengths, self.batch_first)
    rets = self.model(input, *args)
    ret, rets = rets[0], rets[1:]
    ret, _ = rnn_utils.pad_packed_sequence(ret, self.batch_first)
    return tuple([ret] + list(rets))
def forward(self, xs):
    bsz = len(xs)

    # embed input tokens
    xes = F.dropout(self.lt(xs), p=self.dropout, training=self.training)
    x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data]
    xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True)

    zeros = self.zeros(xs)
    if zeros.size(1) != bsz:
        zeros.resize_(self.layers * self.dirs, bsz, self.hsz).fill_(0)
    h0 = Variable(zeros, requires_grad=False)

    if type(self.rnn) == nn.LSTM:
        encoder_output_packed, hidden = self.rnn(xes_packed, (h0, h0))
        # take elementwise max between forward and backward hidden states
        hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).max(1)[0],
                  hidden[1].view(-1, self.dirs, bsz, self.hsz).max(1)[0])
    else:
        encoder_output_packed, hidden = self.rnn(xes_packed, h0)
        # take elementwise max between forward and backward hidden states
        hidden = hidden.view(-1, self.dirs, bsz, self.hsz).max(1)[0]

    encoder_output, _ = pad_packed_sequence(encoder_output_packed, batch_first=True)
    return encoder_output, hidden
def encode_table_header(self, tables):
    # input, ids of table word: (batch_size, max_column_num)
    # encode_output: (max_head_word_num, batch_size, max_column_num, hidden_size)

    # (batch_size, max_column_num, max_head_word_num)
    # table_head_mask: (batch_size, max_column_num)
    # table_col_lens: (batch_size, max_column_num)
    table_head_wids, table_col_lens = WikiSqlBatch.get_table_header_input_tensor(tables,
                                                                                 self.vocab.source,
                                                                                 cuda=self.args.cuda)

    # hack: pack_padded_sequence requires every sequence length to be at least 1,
    # so bump zero-length columns up to length 1 (they are masked out later)
    for tbl in table_col_lens:
        for i in range(len(tbl)):
            if tbl[i] == 0:
                tbl[i] = 1

    table_header_mask = WikiSqlBatch.get_table_header_mask(tables, cuda=self.args.cuda)

    # (batch_size, max_column_num, max_head_word_num, word_embed_size)
    table_head_word_embeds = self.src_embed(table_head_wids.view(-1)).view(
        list(table_head_wids.size()) + [self.src_embed.embedding_dim])

    batch_size = table_head_word_embeds.size(0)
    max_col_num = table_head_word_embeds.size(1)
    max_col_word_num = table_head_word_embeds.size(2)

    # (batch_size * max_column_num, max_head_word_num, word_embed_size)
    table_head_word_embeds_flatten = table_head_word_embeds.view(batch_size * max_col_num,
                                                                 max_col_word_num, -1)
    table_col_lens_flatten = list(chain.from_iterable(table_col_lens))
    sorted_col_ids = sorted(list(range(len(table_col_lens_flatten))),
                            key=lambda x: -table_col_lens_flatten[x])
    sorted_table_col_lens_flatten = [table_col_lens_flatten[i] for i in sorted_col_ids]

    col_old_pos_map = [-1] * len(sorted_col_ids)
    for new_pos, old_pos in enumerate(sorted_col_ids):
        col_old_pos_map[old_pos] = new_pos

    # (batch_size * max_column_num, max_head_word_num, word_embed_size)
    sorted_table_head_word_embeds = table_head_word_embeds_flatten[sorted_col_ids, :, :]

    packed_table_head_word_embeds = pack_padded_sequence(sorted_table_head_word_embeds,
                                                         sorted_table_col_lens_flatten,
                                                         batch_first=True)

    # column_word_encodings: (batch_size * max_column_num, max_head_word_num, hidden_size)
    column_word_encodings, (table_header_encoding, table_head_last_cell) = \
        self.table_header_lstm(packed_table_head_word_embeds)
    column_word_encodings, _ = pad_packed_sequence(column_word_encodings, batch_first=True)

    # (batch_size * max_column_num, max_head_word_num, hidden_size)
    column_word_encodings = column_word_encodings[col_old_pos_map]
    # (batch_size, max_column_num, max_head_word_num, hidden_size)
    column_word_encodings = column_word_encodings.view(batch_size, max_col_num, max_col_word_num, -1)

    # (batch_size, hidden_size * 2)
    table_header_encoding = torch.cat([table_header_encoding[0], table_header_encoding[1]], -1)
    # table_head_last_cell = torch.cat([table_head_last_cell[0], table_head_last_cell[1]], -1)
    # same
    table_header_encoding = table_header_encoding[col_old_pos_map]
    # (batch_size, max_column_num, hidden_size)
    table_header_encoding = table_header_encoding.view(batch_size, max_col_num, -1)

    return column_word_encodings, table_header_encoding, table_header_mask
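# Minimal, self-contained sketch (illustrative only, not taken from the functions above)
# of why the "bump zero lengths to 1" hack is needed: torch.nn.utils.rnn.pack_padded_sequence
# rejects zero-length sequences, so empty rows are clamped to length 1 before packing and the
# corresponding positions are masked out after unpacking.
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

embeds = torch.randn(4, 6, 8)            # (batch, max_len, embed_size)
lengths = torch.tensor([6, 3, 1, 0])     # last row is empty
clamped = lengths.clamp(min=1)           # avoid zero-length sequences when packing
packed = pack_padded_sequence(embeds, clamped, batch_first=True)
lstm = nn.LSTM(8, 5, batch_first=True)
out, _ = lstm(packed)
out, _ = pad_packed_sequence(out, batch_first=True)
mask = torch.arange(out.size(1)).unsqueeze(0) < lengths.unsqueeze(1)
out = out * mask.unsqueeze(-1)           # zero out positions that were only padding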
def encode(self, src_sents_var, src_sents_len):
    """Encode the input natural language utterance

    Args:
        src_sents_var: a variable of shape (src_sent_len, batch_size), representing word ids of the input
        src_sents_len: a list of lengths of input source sentences, sorted by descending order

    Returns:
        src_encodings: source encodings of shape (batch_size, src_sent_len, hidden_size * 2)
        last_state, last_cell: the last hidden state and cell state of the encoder,
                               of shape (batch_size, hidden_size)
    """
    # (tgt_query_len, batch_size, embed_size)
    # apply word dropout
    if self.training and self.args.word_dropout:
        mask = Variable(self.new_tensor(src_sents_var.size()).fill_(
            1. - self.args.word_dropout).bernoulli().long())
        src_sents_var = src_sents_var * mask + (1 - mask) * self.vocab.source.unk_id

    src_token_embed = self.src_embed(src_sents_var)
    packed_src_token_embed = pack_padded_sequence(src_token_embed, src_sents_len)

    # src_encodings: (tgt_query_len, batch_size, hidden_size)
    src_encodings, (last_state, last_cell) = self.encoder_lstm(packed_src_token_embed)
    src_encodings, _ = pad_packed_sequence(src_encodings)
    # src_encodings: (batch_size, tgt_query_len, hidden_size)
    src_encodings = src_encodings.permute(1, 0, 2)

    # (batch_size, hidden_size * 2)
    last_state = torch.cat([last_state[0], last_state[1]], 1)
    last_cell = torch.cat([last_cell[0], last_cell[1]], 1)

    return src_encodings, (last_state, last_cell)
def forward(self, question, length):
    length = list(length.data.cpu().numpy())

    emb = self.drop(self.encoder(question))
    emb = self.tanh(emb)

    hidden = self.init_hidden(len(length))
    seqs = trnn.pack_padded_sequence(emb, length, batch_first=True)
    seqs, hidden = self.rnn(seqs, hidden)
    h, _ = trnn.pad_packed_sequence(seqs, batch_first=True)

    # attention
    weights = self.softmax(self.att2(torch.transpose(h, 1, 2)).squeeze(1)).unsqueeze(-1)
    weights = weights.expand_as(h)
    bilstmout = torch.sum(h * weights, 1).squeeze(1)

    # bilstmout = torch.cat([hidden[0][0], hidden[0][1]], -1)

    fc1fea = self.fc1(bilstmout)
    return fc1fea
def forward(self,  # pylint: disable=arguments-differ
            inputs: torch.Tensor,
            mask: torch.Tensor,
            hidden_state: torch.Tensor = None) -> torch.Tensor:
    if self.stateful and mask is None:
        raise ValueError("Always pass a mask with stateful RNNs.")
    if self.stateful and hidden_state is not None:
        raise ValueError("Stateful RNNs provide their own initial hidden_state.")

    if mask is None:
        return self._module(inputs, hidden_state)[0]

    batch_size, total_sequence_length = mask.size()

    packed_sequence_output, final_states, restoration_indices = \
        self.sort_and_run_forward(self._module, inputs, mask, hidden_state)

    unpacked_sequence_tensor, _ = pad_packed_sequence(packed_sequence_output, batch_first=True)

    num_valid = unpacked_sequence_tensor.size(0)
    # Some RNNs (GRUs) only return one state as a Tensor. Others (LSTMs) return two.
    # If one state, use a single element list to handle in a consistent manner below.
    if not isinstance(final_states, (list, tuple)) and self.stateful:
        final_states = [final_states]

    # Add back invalid rows.
    if num_valid < batch_size:
        _, length, output_dim = unpacked_sequence_tensor.size()
        zeros = unpacked_sequence_tensor.data.new(batch_size - num_valid,
                                                  length, output_dim).fill_(0)
        zeros = Variable(zeros)
        unpacked_sequence_tensor = torch.cat([unpacked_sequence_tensor, zeros], 0)

        # The states also need to have invalid rows added back.
        if self.stateful:
            new_states = []
            for state in final_states:
                num_layers, _, state_dim = state.size()
                zeros = state.data.new(num_layers, batch_size - num_valid, state_dim).fill_(0)
                zeros = Variable(zeros)
                new_states.append(torch.cat([state, zeros], 1))
            final_states = new_states

    # It's possible to need to pass sequences which are padded to longer than the
    # max length of the sequence to a Seq2SeqEncoder. However, packing and unpacking
    # the sequences mean that the returned tensor won't include these dimensions, because
    # the RNN did not need to process them. We add them back on in the form of zeros here.
    sequence_length_difference = total_sequence_length - unpacked_sequence_tensor.size(1)
    if sequence_length_difference > 0:
        zeros = unpacked_sequence_tensor.data.new(batch_size, sequence_length_difference,
                                                  unpacked_sequence_tensor.size(-1)).fill_(0)
        zeros = Variable(zeros)
        unpacked_sequence_tensor = torch.cat([unpacked_sequence_tensor, zeros], 1)

    if self.stateful:
        self._update_states(final_states, restoration_indices)

    # Restore the original indices and return the sequence.
    return unpacked_sequence_tensor.index_select(0, restoration_indices)
def test_forward_pulls_out_correct_tensor_for_unsorted_batches(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2SeqWrapper(lstm)
    input_tensor = torch.rand([5, 7, 3])
    input_tensor[0, 3:, :] = 0
    input_tensor[1, 4:, :] = 0
    input_tensor[2, 2:, :] = 0
    input_tensor[3, 6:, :] = 0
    mask = torch.ones(5, 7)
    mask[0, 3:] = 0
    mask[1, 4:] = 0
    mask[2, 2:] = 0
    mask[3, 6:] = 0

    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices, _ = sort_batch_by_length(
        input_tensor, sequence_lengths)
    packed_sequence = pack_padded_sequence(sorted_inputs,
                                           sorted_sequence_lengths.data.tolist(),
                                           batch_first=True)
    lstm_output, _ = lstm(packed_sequence)
    encoder_output = encoder(input_tensor, mask)
    lstm_tensor, _ = pad_packed_sequence(lstm_output, batch_first=True)
    assert_almost_equal(encoder_output.data.numpy(),
                        lstm_tensor.index_select(0, restoration_indices).data.numpy())
def encode(self, src_sents_var: torch.Tensor,
           src_sent_lens: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    Use a GRU/LSTM to encode source sentences into hidden states

    Args:
        src_sents_var: tensor of source sentence token ids, shape (src_sent_len, batch_size)
        src_sent_lens: list of lengths of the source sentences

    Returns:
        src_encodings: hidden states of tokens in source sentences, this could be a variable
            with shape (batch_size, source_sentence_length, encoding_dim), or in other formats
        decoder_init_state: decoder GRU/LSTM's initial state, computed from source encodings
    """
    # (src_sent_len, batch_size, embed_size)
    src_word_embeds = self.src_embed(src_sents_var)
    packed_src_embed = pack_padded_sequence(src_word_embeds, src_sent_lens)

    # src_encodings: (src_sent_len, batch_size, hidden_size * 2)
    src_encodings, (last_state, last_cell) = self.encoder_lstm(packed_src_embed)
    src_encodings, _ = pad_packed_sequence(src_encodings)

    # (batch_size, src_sent_len, hidden_size * 2)
    src_encodings = src_encodings.permute(1, 0, 2)

    dec_init_cell = self.decoder_cell_init(torch.cat([last_cell[0], last_cell[1]], dim=1))
    dec_init_state = torch.tanh(dec_init_cell)

    return src_encodings, (dec_init_state, dec_init_cell)
def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
    """
    input:
        word_inputs: (batch_size, sent_len)
        word_seq_lengths: list of batch_size, (batch_size, 1)
        char_inputs: (batch_size*sent_len, word_length)
        char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
        char_seq_recover: variable which records the char order information, used to recover char order
    output:
        Variable(batch_size, sent_len, hidden_dim)
    """
    word_represent = self.wordrep(word_inputs, feature_inputs, word_seq_lengths,
                                  char_inputs, char_seq_lengths, char_seq_recover)
    ## word_represent (batch_size, seq_len, embed_size)
    if self.word_feature_extractor == "CNN":
        word_in = F.tanh(self.word2cnn(word_represent)).transpose(2, 1).contiguous()
        for idx in range(self.cnn_layer):
            if idx == 0:
                cnn_feature = F.relu(self.cnn_list[idx](word_in))
            else:
                cnn_feature = F.relu(self.cnn_list[idx](cnn_feature))
            cnn_feature = self.cnn_drop_list[idx](cnn_feature)
            cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature)
        feature_out = cnn_feature.transpose(2, 1).contiguous()
    else:
        packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True)
        hidden = None
        lstm_out, hidden = self.lstm(packed_words, hidden)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        ## lstm_out (seq_len, batch_size, hidden_size)
        feature_out = self.droplstm(lstm_out.transpose(1, 0))
    ## feature_out (batch_size, seq_len, hidden_size)
    outputs = self.hidden2tag(feature_out)
    return outputs
def forward(self, sentences, sentences_len, hidden):
    sentences_len = sentences_len.cpu().data.numpy()
    idx = np.argsort(sentences_len).tolist()[::-1]
    ridx = np.argsort(idx).tolist()
    sentences = sentences[idx, :]
    sentences_len = sentences_len[idx, ]

    embedding = self.embedding(sentences)
    embedding = nn.Dropout(0.1)(embedding)
    packed_embedding = pack_padded_sequence(embedding, sentences_len, batch_first=True)
    packed_rnn_feature, hidden = self.rnn_feature(packed_embedding, hidden)

    sentence_feature, _ = pad_packed_sequence(packed_rnn_feature, batch_first=True)
    idx = Variable(LongTensor(sentences_len - 1))
    idx = idx.view(-1, 1).expand(sentence_feature.size(0), sentence_feature.size(2)).unsqueeze(1)
    if sentence_feature.is_cuda:
        idx = idx.cuda()
    sentence_feature = sentence_feature.gather(1, idx).squeeze()
    sentence_feature = sentence_feature[ridx, :]
    sentences_len = sentences_len[ridx, ]

    logits = self.classifier(sentence_feature)
    pred = F.log_softmax(logits, dim=1)
    return pred
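# Minimal sketch (illustrative only, not taken from the functions above) of the
# sort -> pack -> RNN -> unpack -> restore-order pattern that several of these
# forward() methods implement by hand with argsort and an inverse permutation.
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

batch = torch.randn(3, 7, 16)                 # (batch, max_len, feature)
lengths = torch.tensor([5, 7, 2])
sort_idx = torch.argsort(lengths, descending=True)
restore_idx = torch.argsort(sort_idx)         # inverse permutation

rnn = nn.LSTM(16, 32, batch_first=True)
packed = pack_padded_sequence(batch[sort_idx], lengths[sort_idx], batch_first=True)
out, _ = rnn(packed)
out, _ = pad_packed_sequence(out, batch_first=True)
out = out[restore_idx]                        # back to the original batch order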
def forward(self, input, h0, c0, lens=None):
    output, hn, cn = self.encoder(input, h0, c0, lens)
    if lens:
        output, _ = pad_packed_sequence(output)
    logprobs = self.scorer(output.contiguous().view(output.size(0) * output.size(1), output.size(2)))
    logprobs = logprobs.view(output.size(0), output.size(1), logprobs.size(1))
    return logprobs, hn, cn
def forward(self, vocab):
    with torch.no_grad():
        batch_shape = vocab['sentence'].shape
        s_embedding = self.embedding(vocab['sentence'].cuda())
        a_embedding = self.embedding(vocab['aspect'].cuda())
        packed_s = pack_padded_sequence(s_embedding, vocab['sent_len'], batch_first=True)

    out_s, (h_s, c1) = self.lstm_s(packed_s)  # packed output
    out_a, (h_a, c2) = self.lstm_a(a_embedding)

    with torch.no_grad():
        unpacked_out_s, _ = pad_packed_sequence(out_s, batch_first=True)

    # Pair-wise interaction matrix
    I_matrix = torch.bmm(unpacked_out_s, out_a.permute(0, 2, 1))

    # Column-wise softmax
    a2s_attn = F.softmax(I_matrix, dim=1)

    # Row-wise softmax => Column-wise average => aspect attention
    s2a_attn = F.softmax(I_matrix, dim=2)
    a_attn = torch.mean(s2a_attn, dim=1)

    # Final sentence attn => weighted sum of each individual a2s_attn
    s_attn = torch.bmm(a2s_attn, a_attn.unsqueeze(-1))

    final_rep = torch.bmm(unpacked_out_s.permute(0, 2, 1), s_attn).squeeze(-1)
    pred = self.fc(final_rep)
    return pred
def forward(self, embs, lengths):
    """
    This is the heart of the model. This function defines how the data passes
    through the network.
    Args:
        embs (): word embeddings
        lengths (): the lengths of each sentence
    Returns: the logits for each class
    """
    # pack the batch
    packed = pack_padded_sequence(embs, list(lengths.data), batch_first=True)

    out_packed, _ = self.rnn(packed)

    # unpack output - no need if we are going to use only the last outputs
    outputs, _ = pad_packed_sequence(out_packed, batch_first=True)

    # get the outputs from the last *non-masked* timestep for each sentence
    last_outputs = self.last_timestep(outputs, lengths, self.rnn.bidirectional)

    # apply dropout to the outputs of the RNN
    last_outputs = self.drop_rnn(last_outputs)

    return outputs, last_outputs
def forward(self, x):
    """Receives a Variable of indices (n_timesteps, n_samples) and
    returns their recurrent representations."""
    # sort the batch by decreasing length of sequences
    # oidxs: to recover original order
    # sidxs: idxs to sort the batch
    # slens: lengths in sorted order for pack_padded_sequence()
    oidxs, sidxs, slens, mask = sort_batch(x)

    # Fetch embeddings for the sorted batch
    embs = self.emb(x[:, sidxs])

    if self.dropout_emb > 0:
        embs = self.do_emb(embs)

    # Pack and encode
    packed_emb = pack_padded_sequence(embs, slens)
    packed_hs, h_t = self.enc(packed_emb)

    # Get hidden states and revert the order
    hs = pad_packed_sequence(packed_hs)[0][:, oidxs]

    if self.dropout_ctx > 0:
        hs = self.do_ctx(hs)

    return hs, mask
def _run_rnns(self, inputs, structures, lengths):
    ''' Run desired rnns '''
    for rnn, structure in zip(self.rnns, [structures]):
        if isinstance(rnn, ChildSumTreeLSTM):
            h_all, h_last = rnn(inputs, structure)
        elif isinstance(rnn, LSTM):
            packed = pack_padded_sequence(inputs, list(lengths.data), batch_first=True)
            h_all, (h_last, c_last) = rnn(packed)
            h_all, _ = pad_packed_sequence(h_all, batch_first=True)
        elif isinstance(rnn, GRU):
            packed = pack_padded_sequence(inputs, list(lengths.data), batch_first=True)
            h_all, h_last = rnn(packed)
            h_all, _ = pad_packed_sequence(h_all, batch_first=True)
        inputs = h_all.squeeze()

    return h_all, h_last
def forward(self, input, seq_lens):
    embedded = self.embedding(input)
    packed = pack_padded_sequence(embedded, seq_lens, batch_first=True)
    output, hidden = self.lstm(packed)

    h, _ = pad_packed_sequence(output, batch_first=True)  # h dim = B x t_k x n
    h = h.contiguous()
    max_h, _ = h.max(dim=1)

    return h, hidden, max_h
def test_variable_length_sequences_run_backward_return_correctly_padded_outputs(self):
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor,
                                                                self.sequence_lengths)
    tensor = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
    lstm = AugmentedLstm(10, 11, go_forward=False)
    output, _ = lstm(tensor)
    output_sequence, _ = pad_packed_sequence(output, batch_first=True)
    numpy.testing.assert_array_equal(output_sequence.data[1, 6:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[2, 4:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[3, 3:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[4, 2:, :].numpy(), 0.0)
def test_stacked_bidirectional_lstm_completes_forward_pass(self):
    input_tensor = torch.rand(4, 5, 3)
    input_tensor[1, 4:, :] = 0.
    input_tensor[2, 2:, :] = 0.
    input_tensor[3, 1:, :] = 0.
    input_tensor = pack_padded_sequence(input_tensor, [5, 4, 2, 1], batch_first=True)
    lstm = StackedBidirectionalLstm(3, 7, 3)
    output, _ = lstm(input_tensor)
    output_sequence, _ = pad_packed_sequence(output, batch_first=True)
    numpy.testing.assert_array_equal(output_sequence.data[1, 4:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[2, 2:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[3, 1:, :].numpy(), 0.0)
def forward(self,  # pylint: disable=arguments-differ
            inputs: PackedSequence,
            initial_state: torch.Tensor = None) -> Tuple[PackedSequence, torch.Tensor]:  # pylint: disable=unused-argument
    """
    Parameters
    ----------
    inputs : ``PackedSequence``, required.
        A batch first ``PackedSequence`` to run the stacked LSTM over.
    initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
        Currently, this is ignored.

    Returns
    -------
    output_sequence : ``PackedSequence``
        The encoded sequence of shape (batch_size, sequence_length, hidden_size).
    final_states : ``torch.Tensor``
        The per-layer final (state, memory) states of the LSTM, each with shape
        (num_layers, batch_size, hidden_size).
    """
    inputs, lengths = pad_packed_sequence(inputs, batch_first=True)

    # Kernel takes sequence length first tensors.
    inputs = inputs.transpose(0, 1)

    sequence_length, batch_size, _ = inputs.size()
    accumulator_shape = [self.num_layers, sequence_length + 1, batch_size, self.hidden_size]
    state_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)
    memory_accumulator = Variable(inputs.data.new(*accumulator_shape).zero_(), requires_grad=False)

    dropout_weights = inputs.data.new().resize_(self.num_layers, batch_size, self.hidden_size).fill_(1.0)
    if self.training:
        # Normalize by 1 - dropout_prob to preserve the output statistics of the layer.
        dropout_weights.bernoulli_(1 - self.recurrent_dropout_probability)\
                       .div_((1 - self.recurrent_dropout_probability))

    dropout_weights = Variable(dropout_weights, requires_grad=False)
    gates = Variable(inputs.data.new().resize_(self.num_layers,
                                               sequence_length,
                                               batch_size,
                                               6 * self.hidden_size))

    lengths_variable = Variable(torch.IntTensor(lengths))
    implementation = _AlternatingHighwayLSTMFunction(self.input_size,
                                                     self.hidden_size,
                                                     num_layers=self.num_layers,
                                                     train=self.training)
    output, _ = implementation(inputs, self.weight, self.bias, state_accumulator,
                               memory_accumulator, dropout_weights, lengths_variable, gates)

    # TODO(Mark): Also return the state here by using index_select with the lengths so we can use
    # it as a Seq2VecEncoder.
    output = output.transpose(0, 1)
    output = pack_padded_sequence(output, lengths, batch_first=True)
    return output, None
def test_stacked_alternating_lstm_completes_forward_pass(self):
    input_tensor = torch.autograd.Variable(torch.rand(4, 5, 3))
    input_tensor[1, 4:, :] = 0.
    input_tensor[2, 2:, :] = 0.
    input_tensor[3, 1:, :] = 0.
    input_tensor = pack_padded_sequence(input_tensor, [5, 4, 2, 1], batch_first=True)
    lstm = StackedAlternatingLstm(3, 7, 3)
    output, _ = lstm(input_tensor)
    output_sequence, _ = pad_packed_sequence(output, batch_first=True)
    numpy.testing.assert_array_equal(output_sequence.data[1, 4:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[2, 2:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[3, 1:, :].numpy(), 0.0)
def forward(self, x, lens):
    B, T = x.shape
    # look up the word embedding vectors
    x = self.embed(x)
    x = self.drop(x)

    x = pack_padded_sequence(x, lens, True)
    x, _ = self.lstm(x)
    x, _ = pad_packed_sequence(x, True)
    x = self.drop(x)

    return self.out(x)
def forward(self, input, hidden, no_pack=False):
    emb = self.drop(self.encoder(input))

    # if eval, pack padded sequence (we don't pack during training because
    # we have no padding in our input samples)
    if not self.training and not no_pack:
        emb_lens = [x for x in torch.sum((input > 0).int(), dim=0).data]
        emb_packed = pack_padded_sequence(emb, emb_lens, batch_first=False)
        packed_output, hidden = self.rnn(emb_packed, hidden)
        output, _ = pad_packed_sequence(packed_output, batch_first=False)
    else:
        output, hidden = self.rnn(emb, hidden)

    output = self.drop(output)
    decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
    return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden
def _lstm_forward(self, inputs, initial_state=None):
    if initial_state is None:
        hidden_states = [None] * len(self.forward_layers)
    else:
        assert initial_state[0].size()[0] == len(self.forward_layers)
        hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0)))

    inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True)
    forward_output_sequence = inputs
    backward_output_sequence = inputs

    final_states = []
    sequence_outputs = []
    for i, state in enumerate(hidden_states):
        forward_layer = getattr(self, "forward_layer_%d" % i)
        backward_layer = getattr(self, "backward_layer_%d" % i)

        forward_cache = forward_output_sequence
        backward_cache = backward_output_sequence

        if state is not None:
            forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2)
            forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2)
            forward_state = (forward_hidden_state, forward_memory_state)
            backward_state = (backward_hidden_state, backward_memory_state)
        else:
            forward_state = None
            backward_state = None

        forward_output_sequence, forward_state = forward_layer(forward_output_sequence,
                                                               batch_lengths,
                                                               forward_state)
        backward_output_sequence, backward_state = backward_layer(backward_output_sequence,
                                                                  batch_lengths,
                                                                  backward_state)
        if i != 0:
            forward_output_sequence += forward_cache
            backward_output_sequence += backward_cache

        sequence_outputs.append(torch.cat([forward_output_sequence,
                                           backward_output_sequence], -1))
        final_states.append((torch.cat([forward_state[0], backward_state[0]], -1),
                             torch.cat([forward_state[1], backward_state[1]], -1)))

    stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs)
    final_hidden_states, final_memory_states = zip(*final_states)
    final_state_tuple = (torch.cat(final_hidden_states, 0),
                         torch.cat(final_memory_states, 0))
    return stacked_sequence_outputs, final_state_tuple
def _get_lstm_features(self, names, lengths):
    self.hidden = self.init_hidden(names.size(-1))
    embeds = self.char_embeds(names)  # Figure 4

    packed_input = pack_padded_sequence(embeds, lengths)  # Figure 5
    packed_output, (ht, ct) = self.lstm(packed_input, self.hidden)  # Figure 6
    lstm_out, _ = pad_packed_sequence(packed_output)  # Figure 7

    lstm_out = torch.transpose(lstm_out, 0, 1)
    lstm_out = torch.transpose(lstm_out, 1, 2)
    lstm_out = F.tanh(lstm_out)  # Figure 8

    lstm_out, indices = F.max_pool1d(lstm_out, lstm_out.size(2), return_indices=True)  # Figure 9
    lstm_out = lstm_out.squeeze(2)  # adjust the dimensions so the tensor matches the expected input format
    lstm_out = F.tanh(lstm_out)
    lstm_feats = self.fully_connected_layer(lstm_out)
    output = self.softmax(lstm_feats)  # Figure 10
    return output
def forward(self, input, doc_lens):
    """
    :param input: (B, L)
    :param doc_lens: (B)
    :return: (B, L, H), ((2, B, H), (2, B, H)), (B, 2*H)
    """
    input = self.embed(input)  # (B, L) -> (B, L, D)
    packed = pack_padded_sequence(input, doc_lens, batch_first=True)
    output, hidden = self.lstm(packed)  # hidden: ((2, B, H), (2, B, H))
    h, _ = pad_packed_sequence(output, batch_first=True)  # (B, L, 2*H)
    h = h.contiguous()  # (B, L, 2*H)
    max_h, _ = h.max(dim=1)  # (B, 2*H)
    return h, hidden, max_h  # (B, L, 2*H), ((2, B, H), (2, B, H)), (B, 2*H)
def get_all_hiddens(self, input, seq_lengths):
    """
    input:
        input: Variable(batch_size, word_length)
        seq_lengths: numpy array (batch_size, 1)
    output:
        Variable(batch_size, word_length, char_hidden_dim)
    Note it only accepts ordered (length) variable, length size is recorded in seq_lengths
    """
    batch_size = input.size(0)
    char_embeds = self.char_drop(self.char_embeddings(input))
    char_hidden = None
    pack_input = pack_padded_sequence(char_embeds, seq_lengths, True)
    char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden)
    char_rnn_out, _ = pad_packed_sequence(char_rnn_out)
    return char_rnn_out.transpose(1, 0)
def forward(self, hidden, src_tokens, lengths):
    x = self.embedding(src_tokens)
    x = pack_padded_sequence(x, lengths)
    x, hidden = self.rnn(x, hidden)
    x, output_lengths = pad_packed_sequence(x)

    if self.num_directions == 2:
        x = x[:, :, :self.args.hidden_size] + x[:, :, self.args.hidden_size:]

    # pad_packed_sequence pads only up to the length of the longest sequence in the batch,
    # which may not be equal to max_len (set to 50 by default)
    new_len = x.size()[0]
    encoder_padding_mask = src_tokens[:new_len, :].eq(self.padding_idx)

    return x, hidden, encoder_padding_mask
def forward(self, img_feats, question_feats, actions_in, action_lengths, hidden=False):
    input_feats = Variable()

    T = False
    if self.image_input == True:
        N, T, _ = img_feats.size()
        input_feats = img_feats

    if self.question_input == True:
        N, D = question_feats.size()
        question_feats = question_feats.view(N, 1, D)
        if T == False:
            T = actions_in.size(1)
        question_feats = question_feats.repeat(1, T, 1)
        if len(input_feats) == 0:
            input_feats = question_feats
        else:
            input_feats = torch.cat([input_feats, question_feats], 2)

    if self.action_input == True:
        if len(input_feats) == 0:
            input_feats = self.action_embed(actions_in)
        else:
            input_feats = torch.cat([input_feats, self.action_embed(actions_in)], 2)

    packed_input_feats = pack_padded_sequence(input_feats, action_lengths, batch_first=True)
    packed_output, hidden = self.rnn(packed_input_feats)
    rnn_output, _ = pad_packed_sequence(packed_output, batch_first=True)

    output = self.decoder(rnn_output.contiguous().view(
        rnn_output.size(0) * rnn_output.size(1), rnn_output.size(2)))

    if self.return_states == True:
        return rnn_output, output, hidden
    else:
        return output, hidden
def forward(self, fea_v, length, target_start, target_end):
    if self.add_char:
        word_v = fea_v[0]
        char_v = fea_v[1]
    else:
        word_v = fea_v

    batch_size = word_v.size(0)
    seq_length = word_v.size(1)

    word_emb = self.embedding(word_v)
    word_emb = self.dropout_emb(word_emb)
    if self.static:
        word_static = self.embedding_static(word_v)
        word_static = self.dropout_emb(word_static)
        word_emb = torch.cat([word_emb, word_static], 2)

    x = torch.transpose(word_emb, 0, 1)
    packed_words = pack_padded_sequence(x, length)
    lstm_out, self.hidden = self.lstm(packed_words, self.hidden)
    lstm_out, _ = pad_packed_sequence(lstm_out)
    ##### lstm_out: (seq_len, batch_size, hidden_size)
    lstm_out = self.dropout_lstm(lstm_out)

    x = lstm_out
    x = x.transpose(0, 1)
    ##### batch version
    # x = torch.squeeze(lstm_out, 1)
    # x: variable (seq_len, batch_size, hidden_size)
    # target_start: variable (batch_size)
    # _, start = torch.max(target_start.unsqueeze(0), dim=1)
    # max_start = utils.to_scalar(target_start[start])
    # _, end = torch.min(target_end.unsqueeze(0), dim=1)
    # min_end = utils.to_scalar(target_end[end])

    max_length = 0
    for index in range(batch_size):
        x_len = x[index].size(0)
        start = utils.to_scalar(target_start[index])
        end = utils.to_scalar(target_end[index])
        none_t = x_len - (end - start + 1)
        if none_t > max_length:
            max_length = none_t

    # left_save = []
    # mask_left_save = []
    # right_save = []
    # mask_right_save = []
    none_target = []
    mask_none_target = []
    target_save = []
    for idx in range(batch_size):
        mask_none_t = []
        none_t = None
        x_len_cur = x[idx].size(0)
        start_cur = utils.to_scalar(target_start[idx])
        end_cur = utils.to_scalar(target_end[idx])
        # left_len_cur = start_cur
        # left_len_max = max_start
        x_target = x[idx][start_cur:(end_cur + 1)]
        x_average_target = torch.mean(x_target, 0)
        target_save.append(x_average_target.unsqueeze(0))
        if start_cur != 0:
            left = x[idx][:start_cur]
            none_t = left
            mask_none_t.extend([1] * start_cur)
        if end_cur != (x_len_cur - 1):
            right = x[idx][(end_cur + 1):]
            if none_t is not None:
                none_t = torch.cat([none_t, right], 0)
            else:
                none_t = right
            mask_none_t.extend([1] * (x_len_cur - end_cur - 1))
        if len(mask_none_t) != max_length:
            add_t = Variable(torch.zeros((max_length - len(mask_none_t)), self.lstm_hiddens))
            if self.use_cuda:
                add_t = add_t.cuda()
            mask_none_t.extend([0] * (max_length - len(mask_none_t)))
            # print(add_t)
            none_t = torch.cat([none_t, add_t], 0)
        mask_none_target.append(mask_none_t)
        none_target.append(none_t.unsqueeze(0))
        # if start_cur != 0:
        #     x_cur_left = x[idx][:start_cur]
        #     left_len_sub = left_len_max - left_len_cur
        #     mask_cur_left = [1 for _ in range(left_len_cur)]
        # else:
        #     x_cur_left = x[idx][0].unsqueeze(0)
        #     left_len_sub = left_len_max - 1
        #     # mask_cur_left = [-1e+20]
        #     mask_cur_left = [0]
        # # x_cur_left: variable (start_cur, two_hidden_size)
        # # mask_cur_left = [1 for _ in range(start_cur)]
        # # mask_cur_left: list (start_cur)
        # if start_cur < max_start:
        #     add = Variable(torch.zeros(left_len_sub, self.lstm_hiddens))
        #     if self.use_cuda: add = add.cuda()
        #     x_cur_left = torch.cat([x_cur_left, add], 0)
        #     # x_cur_left: variable (max_start, two_hidden_size)
        #     left_save.append(x_cur_left.unsqueeze(0))
        #     # mask_cur_left.extend([-1e+20 for _ in range(left_len_sub)])
        #     mask_cur_left.extend([0 for _ in range(left_len_sub)])
        #     # mask_cur_left: list (max_start)
        #     mask_left_save.append(mask_cur_left)
        # else:
        #     left_save.append(x_cur_left.unsqueeze(0))
        #     mask_left_save.append(mask_cur_left)
        #
        # end_cur = utils.to_scalar(target_end[idx])
        # right_len_cur = x_len_cur - end_cur - 1
        # right_len_max = x_len_cur - min_end - 1
        # if (end_cur + 1) != x_len_cur:
        #     x_cur_right = x[idx][(end_cur + 1):]
        #     right_len_sub = right_len_max - right_len_cur
        #     mask_cur_right = [1 for _ in range(right_len_cur)]
        # else:
        #     x_cur_right = x[idx][end_cur].unsqueeze(0)
        #     right_len_sub = right_len_max - right_len_cur - 1
        #     # mask_cur_right = [-1e+20]
        #     mask_cur_right = [0]
        # # x_cur_right: variable ((x_len_cur-end_cur-1), two_hidden_size)
        # # mask_cur_right = [1 for _ in range(right_len_cur)]
        # # mask_cur_right: list (x_len_cur-end_cur-1==right_len)
        # if end_cur > min_end:
        #     add = Variable(torch.zeros(right_len_sub, self.lstm_hiddens))
        #     if self.use_cuda: add = add.cuda()
        #     x_cur_right = torch.cat([x_cur_right, add], 0)
        #     right_save.append(x_cur_right.unsqueeze(0))
        #     # mask_cur_right.extend([-1e+20 for _ in range(right_len_sub)])
        #     mask_cur_right.extend([0 for _ in range(right_len_sub)])
        #     mask_right_save.append(mask_cur_right)
        # else:
        #     right_save.append(x_cur_right.unsqueeze(0))
        #     mask_right_save.append(mask_cur_right)

    # mask_left_save = Variable(torch.ByteTensor(mask_left_save))
    # # mask_left_save: variable (batch_size, left_len_max)
    # mask_right_save = Variable(torch.ByteTensor(mask_right_save))
    # # mask_right_save: variable (batch_size, right_len_max)
    # left_save = torch.cat(left_save, 0)
    # right_save = torch.cat(right_save, 0)
    target_save = torch.cat(target_save, 0)
    # print(none_target)
    none_target = torch.cat(none_target, 0)
    mask_none_target = Variable(torch.ByteTensor(mask_none_target))
    # left_save: variable (batch_size, left_len_max, two_hidden_size)
    # right_save: variable (batch_size, right_len_max, two_hidden_size)
    # target_save: variable (batch_size, two_hidden_size)
    if self.use_cuda:
        # mask_right_save = mask_right_save.cuda()
        # mask_left_save = mask_left_save.cuda()
        # left_save = left_save.cuda()
        # right_save = right_save.cuda()
        target_save = target_save.cuda()
        mask_none_target = mask_none_target.cuda()
        none_target = none_target.cuda()

    # squence = torch.cat(none_target, 1)
    s = self.attention(none_target, target_save, mask_none_target)
    # s = self.attention(x, target_save, None)
    # s_l = self.attention_l(left_save, target_save, mask_left_save)
    # s_r = self.attention_r(right_save, target_save, mask_right_save)
    result = self.linear(s)
    # result: variable (1, label_num)
    # result = self.linear_l(s_l)
    # result = torch.add(result, self.linear_l(s_l))
    # result = torch.add(result, self.linear_r(s_r))
    # result: variable (batch_size, label_num)
    # print(result)
    return result
def forward(self, src_tokens, src_lengths):
    if LanguagePairDataset.LEFT_PAD_SOURCE:
        # convert left-padding to right-padding
        src_tokens = utils.convert_padding_direction(src_tokens,
                                                     self.padding_idx,
                                                     left_to_right=True)
    if self.word_dropout_module is not None:
        src_tokens = self.word_dropout_module(src_tokens)

    bsz, seqlen = src_tokens.size()

    # embed tokens
    x = self.embed_tokens(src_tokens)
    x = F.dropout(x, p=self.dropout_in, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # Allows compatibility with Caffe2 inputs for tracing (int32)
    # as well as the current format of Fairseq-Py inputs (int64)
    if src_lengths.dtype is torch.int64:
        src_lengths = src_lengths.int()

    # Generate packed seq to deal with varying source seq length
    # packed_input is of type PackedSequence, which consists of:
    # element [0]: a tensor, the packed data, and
    # element [1]: a list of integers, the batch size for each step
    packed_input = pack_padded_sequence(x, src_lengths)

    final_hiddens, final_cells = [], []
    for i, rnn_layer in enumerate(self.layers):
        if self.bidirectional and i == 0:
            h0 = x.new(2, bsz, self.hidden_dim // 2).zero_()
            c0 = x.new(2, bsz, self.hidden_dim // 2).zero_()
        else:
            h0 = x.new(1, bsz, self.hidden_dim).zero_()
            c0 = x.new(1, bsz, self.hidden_dim).zero_()

        # apply LSTM along entire sequence
        current_output, (h_last, c_last) = rnn_layer(packed_input, (h0, c0))

        # final state shapes: (bsz, hidden_dim)
        if self.bidirectional and i == 0:
            # concatenate last states for forward and backward LSTM
            h_last = torch.cat((h_last[0, :, :], h_last[1, :, :]), dim=1)
            c_last = torch.cat((c_last[0, :, :], c_last[1, :, :]), dim=1)
        else:
            h_last = h_last.squeeze(dim=0)
            c_last = c_last.squeeze(dim=0)

        final_hiddens.append(h_last)
        final_cells.append(c_last)

        if self.residual_level is not None and i >= self.residual_level:
            packed_input[0] = packed_input.clone()[0] + current_output[0]
        else:
            packed_input = current_output

    # Reshape to [num_layer, batch_size, hidden_dim]
    final_hiddens = torch.cat(final_hiddens, dim=0).view(self.num_layers, *final_hiddens[0].size())
    final_cells = torch.cat(final_cells, dim=0).view(self.num_layers, *final_cells[0].size())

    # [max_seqlen, batch_size, hidden_dim]
    unpacked_output, _ = pad_packed_sequence(packed_input, padding_value=self.padding_value)

    return (unpacked_output, final_hiddens, final_cells, src_lengths, src_tokens)
def forward(self, sentence, p_sentence, pos_tags, lengths, target_idx_in, region_marks,
            local_roles_voc, frames, local_roles_mask, sent_pred_lemmas_idx, dep_tags, dep_heads,
            targets, specific_dep_tags, specific_dep_relations, test=False):
    # construct input for DEP
    pos_embeds = self.pos_embeddings(pos_tags)
    region_marks = region_marks.view(self.batch_size, len(sentence[0]), 1)
    fixed_embeds = self.word_fixed_embeddings(p_sentence)
    fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]), self.word_emb_dim)
    h_label_embeddings = self.DEP_Label_embeddings(dep_tags)
    h_link_embeddings = self.DEP_Link_embeddings(specific_dep_relations)

    sent_pred_lemmas_embeds = self.p_lemma_embeddings(sent_pred_lemmas_idx)

    embeds_SRL = self.word_embeddings_SRL(sentence)
    embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]), self.word_emb_dim)

    SRL_hidden_states = torch.cat((embeds_SRL, fixed_embeds, sent_pred_lemmas_embeds, pos_embeds,
                                   region_marks, h_label_embeddings), 2)
    SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

    # SRL layer
    embeds_sort, lengths_sort, unsort_idx = self.sort_batch(SRL_hidden_states, lengths)
    embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort.cpu().numpy(), batch_first=True)
    # hidden states [time_steps * batch_size * hidden_units]
    hidden_states, self.hidden_4 = self.BiLSTM_SRL(embeds_sort, self.hidden_4)
    # it seems that hidden states is already batch first, we don't need swap the dims
    # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, )
    hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True)
    # hidden_states = hidden_states.transpose(0, 1)
    hidden_states = hidden_states[unsort_idx]
    hidden_states = self.hidden_state_dropout(hidden_states)

    # B * H
    hidden_states_3 = hidden_states
    predicate_embeds = hidden_states_3[np.arange(0, hidden_states_3.size()[0]), target_idx_in]
    # T * B * H
    added_embeds = torch.zeros(hidden_states_3.size()[1], hidden_states_3.size()[0],
                               hidden_states_3.size()[2]).to(device)
    predicate_embeds = added_embeds + predicate_embeds
    # B * T * H
    predicate_embeds = predicate_embeds.transpose(0, 1)
    hidden_states = torch.cat((hidden_states_3, predicate_embeds), 2)
    # print(hidden_states)

    # non-linear map and rectify the roles' embeddings
    # roles = Variable(torch.from_numpy(np.arange(0, self.tagset_size)))

    # B * roles
    # log(local_roles_voc)
    # log(frames)

    # B * roles * h
    role_embeds = self.role_embeddings(local_roles_voc)
    frame_embeds = self.frame_embeddings(frames)

    role_embeds = torch.cat((role_embeds, frame_embeds), 2)
    mapped_roles = F.relu(self.role_map(role_embeds))
    mapped_roles = torch.transpose(mapped_roles, 1, 2)

    # b, times, roles
    tag_space = torch.matmul(hidden_states, mapped_roles)
    # tag_space = hidden_states.mm(mapped_roles)

    # b, roles
    # sub = torch.div(torch.add(local_roles_mask, -1.0), _BIG_NUMBER)
    sub = torch.add(local_roles_mask, -1.0) * _BIG_NUMBER
    sub = torch.FloatTensor(sub.cpu().numpy()).to(device)
    # b, roles, times
    tag_space = torch.transpose(tag_space, 0, 1)
    tag_space += sub
    # b, T, roles
    tag_space = torch.transpose(tag_space, 0, 1)
    tag_space = tag_space.view(len(sentence[0]) * self.batch_size, -1)

    SRLprobs = F.softmax(tag_space, dim=1)

    # loss_function = nn.NLLLoss(ignore_index=0)
    targets = targets.view(-1)
    # tag_scores = F.log_softmax(tag_space)
    # loss = loss_function(tag_scores, targets)
    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    SRLloss = loss_function(tag_space, targets)

    # weight = float(SRLloss.cpu().data.numpy())
    # if weight > 0.1:
    #     weight = 0.1
    # p = nr.rand()
    # if p < 0.2:
    #     loss = SRLloss + DEPloss + SPEDEPloss
    # else:
    #     loss = SRLloss
    loss = SRLloss
    return SRLloss, SRLloss, SRLloss, loss, SRLprobs, 1, 1, 1, 1, \
           1, 1, 1, \
           1, 1, 1
def forward(self, batch):
    """
    input:
        word_inputs: (batch_size, sent_len)
        feature_inputs: [(batch_size, sent_len), ...] list of variables
        word_seq_lengths: list of batch_size, (batch_size, 1)
        char_inputs: (batch_size*sent_len, word_length)
        char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
        char_seq_recover: variable which records the char order information, used to recover char order
    output:
        Variable(batch_size, sent_len, hidden_dim)
    """
    word_inputs = batch["word"]
    word_seq_lengths = batch["word_length"]
    char_inputs = batch["char"]
    char_seq_lengths = batch["char_length"]
    char_seq_recover = batch["char_recover"]
    xupos_inputs = batch["xupos"]
    mask = batch["mask"].transpose(1, 0)
    emb = batch.get("emb", None)

    if self.use_bert:
        raw = batch["raw"]
        seq_max_len = len(raw[0])
        all_input_ids = np.zeros((len(raw), 2048), dtype=int)
        all_input_mask = np.zeros((len(raw), 2048), dtype=int)
        all_word_end_mask = np.zeros((len(raw), 2048), dtype=int)

        subword_max_len = 0
        for snum, sentence in enumerate(raw):
            tokens = []
            word_end_mask = []

            tokens.append("[CLS]")
            word_end_mask.append(1)

            cleaned_words = []
            for word in sentence[1:]:
                word = BERT_TOKEN_MAPPING.get(word, word)
                if word == "n't" and cleaned_words:
                    cleaned_words[-1] = cleaned_words[-1] + "n"
                    word = "'t"
                cleaned_words.append(word)

            for word in cleaned_words:
                word_tokens = self.bert_tokenizer.tokenize(word)
                if len(word_tokens) == 0:
                    word_tokens = ['.']
                for _ in range(len(word_tokens)):
                    word_end_mask.append(0)
                word_end_mask[-1] = 1
                tokens.extend(word_tokens)
            tokens.append("[SEP]")

            # pad to sequence length for every sentence
            for i in range(seq_max_len - len(sentence)):
                word_end_mask.append(1)

            input_ids = self.bert_tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            subword_max_len = max(subword_max_len, len(word_end_mask) + 1)

            all_input_ids[snum, :len(input_ids)] = input_ids
            all_input_mask[snum, :len(input_mask)] = input_mask
            all_word_end_mask[snum, :len(word_end_mask)] = word_end_mask

        all_input_ids = from_numpy(np.ascontiguousarray(all_input_ids[:, :subword_max_len]))
        all_input_mask = from_numpy(np.ascontiguousarray(all_input_mask[:, :subword_max_len]))
        all_word_end_mask = from_numpy(np.ascontiguousarray(all_word_end_mask[:, :subword_max_len]))

        all_encoder_layers, _ = self.bert_model(all_input_ids, attention_mask=all_input_mask)
        del _

        features = all_encoder_layers
        features_packed = features.masked_select(
            all_word_end_mask.to(torch.bool).unsqueeze(-1)).reshape(
                len(raw), seq_max_len, features.shape[-1])

        outputs = self.bert_project(features_packed)
    elif self.use_transformer:
        word_represent = self.wordrep(word_inputs, word_seq_lengths, char_inputs,
                                      char_seq_lengths, char_seq_recover, xupos_inputs, emb=emb)
        outputs = self.lstm(word_represent, (1 - batch["mask_h"]))
    else:
        word_represent = self.wordrep(word_inputs, word_seq_lengths, char_inputs,
                                      char_seq_lengths, char_seq_recover, xupos_inputs, emb=emb)
        packed_words = pack_padded_sequence(word_represent, word_seq_lengths, True)
        hidden = None
        lstm_out, hidden = self.lstm(packed_words, hidden)
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        feature_out = self.droplstm(lstm_out)
        outputs = feature_out

    return outputs
def forward(self, inputs: List[List[Tweet]], tweets_per_user: List[int]):
    """
    TODO:
    1) use word2vec to create a sequence of vectors for each tweet
       (i.e. each tweet is a sequence of words)
    2) create a batch out of all of the sequences
       (num_users*tweets_per_user, max_seq_len, embedding_dim)
       (make sure to remember which tweets belong to which users)
    3) feed the batch into the recurrent feature extractor
       (num_users*tweets_per_user, max_seq_len, hidden_dim)
    4) use only the last output (or the last few outputs) of each sequence
    5) create a tensor for each user made out of the tensors that came out of
       their tweets (concat or something)
    6) add some other relevant data about each tweet to the tensors
       (like post time and stuff like that)
    7) create a batch from those tensors (num_users, hidden_dim*tweets_per_user)
    8) feed these tensors into the linear feature extractor and return its output
    """
    handmade_features = []

    # TASK 1
    device = next(self.parameters()).device
    sequences = embed(self.word2vec_model, sum(inputs, []), device)
    sorted_indices, sorted_lengths = self.sorted_seq_by_len(sequences)
    num_tweets = len(sorted_indices)

    # TASK 2
    # DON'T FORGET TO USE PADDING AND PACKING FOR INPUT
    padded_seq_batch = pad_sequence(sequences, batch_first=True)
    if self.use_TCN:
        padded_seq_batch = torch.stack([m.t() for m in padded_seq_batch[sorted_indices]])
    else:
        packed_seq_batch = pack_padded_sequence(padded_seq_batch[sorted_indices],
                                                sorted_lengths,
                                                batch_first=True)

    # TASK 3
    # DON'T FORGET TO UNDO THE PADDING AND PACKING FROM TASK 3
    if self.use_TCN:
        recurrent_features = torch.stack([m.t() for m in self.temporal_extractor(padded_seq_batch)])
    else:
        recurrent_features, _ = self.temporal_extractor(packed_seq_batch)
        recurrent_features, _ = pad_packed_sequence(recurrent_features, batch_first=True)

    # TASK 4
    seq_end_indices = [l - 1 for l in sorted_lengths]
    used_recurrent_features = recurrent_features[range(num_tweets), seq_end_indices]
    # also reorder the tweets back
    used_recurrent_features = used_recurrent_features[sorted_indices]

    # ADD HANDMADE FEATURES
    # favorite count
    handmade_features.append(
        torch.Tensor([tweet.favorite_count for tweet in sum(inputs, [])]).to(device).unsqueeze(1))
    # is the tweet a quote?
    handmade_features.append(
        torch.Tensor([tweet.is_quote for tweet in sum(inputs, [])]).to(device).unsqueeze(1))
    # whether or not there is a retweeted status
    handmade_features.append(
        torch.Tensor([tweet.retweeted_status is None
                      for tweet in sum(inputs, [])]).to(device).unsqueeze(1))
    # the number of entities in the tweet
    handmade_features.append(
        torch.Tensor([sum([len(entity) for entity in tweet.entities.values()])
                      for tweet in sum(inputs, [])]).to(device).unsqueeze(1))

    handmade_features = (torch.cat(handmade_features, dim=1) -
                         self.means.to(device)) / self.stds.to(device)

    features_dim = self.hidden_dim + self.num_handmade_features
    used_recurrent_features = torch.cat((used_recurrent_features, handmade_features), dim=1)

    # TASK 5
    used_recurrent_features = list(torch.split(used_recurrent_features, tweets_per_user))
    for i, urf in enumerate(used_recurrent_features):
        dim0 = urf.shape[0]
        if dim0 != 100:
            used_recurrent_features[i] = torch.cat(
                [urf, torch.zeros(100 - dim0, features_dim, device=urf.device)], 0)
    used_recurrent_features = torch.cat(used_recurrent_features)
    recurrent_features_batch = used_recurrent_features.view(len(inputs), -1, features_dim)

    # TASK 7
    recurrent_features_batch = recurrent_features_batch.view(-1, features_dim)

    # TASK 8
    return self.feature_extractor(recurrent_features_batch).view(len(inputs), -1, self.output_dim)
def loss(self, batch):
    xq = batch['xq_padded']   # n_class * n_query * max_len * mfcc_dim
    xs = batch['xs_padded']   # n_class * n_support * max_len * mfcc_dim
    xq_len = batch['xq_len']  # n_class * n_query
    xs_len = batch['xs_len']  # n_class * n_support

    assert xq.shape[0] == xq_len.shape[0]
    assert xs.shape[0] == xs_len.shape[0]

    n_class = xq_len.shape[0]
    n_query = xq_len.shape[1]
    n_support = xs_len.shape[1]

    target_inds = torch.arange(0, n_class).view(n_class, 1, 1).expand(n_class, n_query, 1).long()
    target_inds = Variable(target_inds, requires_grad=False)
    if xq.is_cuda:
        target_inds = target_inds.cuda()

    seq_len = torch.cat([xq_len.view(n_class * n_query, -1).squeeze(-1),
                         xs_len.view(n_class * n_support, -1).squeeze(-1)], 0)
    seq_len = Variable(seq_len, requires_grad=False)

    x = torch.cat([xs.view(n_class * n_support, *xs.size()[2:]),
                   xq.view(n_class * n_query, *xq.size()[2:])], 0)

    _len, perm_idx = seq_len.sort(0, descending=True)
    x = x[perm_idx]
    packed_input = pack_padded_sequence(x, _len.cpu().numpy().astype(dtype=np.int32),
                                        batch_first=True)
    packed_output, _ = self.encoder_rnn.forward(packed_input)
    z, _ = pad_packed_sequence(packed_output, batch_first=True)
    _, unperm_idx = perm_idx.sort(0)
    z = z[unperm_idx]
    # z, _ = self.encoder_rnn.forward(x)

    z = self.encoder_linear.forward((z, seq_len))
    z_dim = z.size(-1)

    z_proto = z[:n_class * n_support].view(n_class, n_support, z_dim).mean(1)
    zq = z[n_class * n_support:]

    dists = euclidean_dist(zq, z_proto)
    log_p_y = F.log_softmax(-dists, dim=1).view(n_class, n_query, -1)

    loss_val = -log_p_y.gather(2, target_inds).squeeze().view(-1).mean()

    _, y_hat = log_p_y.max(2)
    acc_val = torch.eq(y_hat, target_inds.squeeze(-1)).float().mean()

    logger.info(f'loss: {loss_val.item()}, acc: {acc_val.item()}')
    return loss_val, {'loss': loss_val.item(), 'acc': acc_val.item()}
def forward(self, e_q, e_s, qseq_len, n_sents, seq_len):
    # e : word embedding
    # e_q - shape - [batch_size, q_len, emb_size]
    # e_d - shape - [batch_size, max_s, s_len, emb_size]

    #
    # SHARED Q&D MODELLING
    #
    batch_size = e_q.shape[0]
    emb_size = self.config.emb_size
    max_s = self.config.max_s
    s_len = self.config.max_slen
    q_len = self.config.max_qlen
    hidden_size = self.config.hidden_size
    device = self.config.device

    # shape of e_q - [batch_size, q_len, emb_size]
    emb_q = pack_padded_sequence(e_q, qseq_len, batch_first=True, enforce_sorted=False)
    u_q, _ = self.bilstm1(emb_q)  # shape of u_q - [batch_size, q_len, 2*hidden_size]
    u_q, _ = pad_packed_sequence(u_q, batch_first=True)
    q_len = u_q.shape[1]

    e_s = e_s.view(batch_size * max_s, s_len, emb_size)
    seq_len = seq_len.view(batch_size * max_s)
    emb_d = pack_padded_sequence(e_s, seq_len, batch_first=True, enforce_sorted=False)
    u_d, _ = self.bilstm2(emb_d)
    u_d, _ = pad_packed_sequence(u_d, batch_first=True)
    # shape of u_d - [batch_size*max_s, s_len, 2*hidden_size]
    s_len = u_d.shape[1]
    # shape of u_d - [batch_size, max_s, s_len, 2*hidden_size]
    u_d = u_d.view(batch_size, max_s, s_len, 2 * hidden_size)
    seq_len = seq_len.view(batch_size, max_s)

    #
    # Co-attention and Fusion
    #
    # shape of u_q_temp - [batch_size, q_len, 2*hidden_size]
    u_q_temp = F.relu(self.linear1(u_q))
    # shape of u_d_temp - [batch_size, max_s, s_len, 2*hidden_size]
    u_d_temp = F.relu(self.linear2(u_d))
    u_q_temp2 = u_q_temp.transpose(1, 2)  # shape - [batch_size, 2*hidden_size, q_len]
    s = torch.matmul(u_d_temp, u_q_temp2.view(batch_size, 1, 2 * hidden_size, q_len))
    alpha = F.softmax(s, dim=3)  # shape - [batch_size, max_s, s_len, q_len]
    u_d_att = torch.matmul(alpha, u_q_temp.view(batch_size, 1, q_len, 2 * hidden_size))
    # shape - [batch_size, max_s, s_len, 2*hidden_size]
    v_d = self.fuse_linear1(torch.cat([u_d, u_d_att], dim=3))
    # shape - [batch_size, max_s, s_len, 2*hidden_size]

    #
    # Self-attention and Fusion
    #
    v_d_temp = self.linear3(v_d)
    s = torch.matmul(v_d_temp, v_d_temp.transpose(2, 3))
    beta = F.softmax(s, dim=3)  # shape - [batch_size, max_s, s_len, s_len]
    v_d_att = torch.matmul(beta, v_d)
    d_d = self.fuse_linear2(torch.cat([v_d, v_d_att], dim=3))
    # shape - [batch_size, max_s, s_len, 2*hidden_size]

    #
    # Self-align for query
    #
    s = self.align_linear1(u_q).view(batch_size, q_len)
    gamma = F.softmax(s, dim=1)
    r_q = torch.matmul(gamma.view(batch_size, 1, q_len), u_q)
    r_q = r_q.view(batch_size, 2 * hidden_size)

    #
    # SENTENCE RANKING
    #
    # shape of d_d - [batch_size, max_s, s_len, 2*hidden_size]
    s = self.align_linear2(d_d).view(batch_size, max_s, s_len)
    mu = F.softmax(s, dim=2)
    r_d = torch.matmul(mu.view(batch_size, max_s, 1, s_len), d_d)
    r_d = r_d.view(batch_size, max_s, 2 * hidden_size)

    r_d2 = r_d.transpose(0, 1)
    s_d = torch.zeros(max_s, batch_size, device=device)
    for n in range(max_s):
        s_d[n] = self.bilinear(r_q, r_d2[n]).view(batch_size)

    s_d_norm = torch.sigmoid(s_d)
    s_d_norm2 = s_d_norm.transpose(0, 1)

    return s_d_norm2
def forward(self, word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained,
            lemma, head, deprel, word_orig_idx, sentlens, wordlens):

    def pack(x):
        return pack_padded_sequence(x, sentlens, batch_first=True)

    inputs = []
    if self.args['pretrain']:
        pretrained_emb = self.pretrained_emb(pretrained)
        pretrained_emb = self.trans_pretrained(pretrained_emb)
        pretrained_emb = pack(pretrained_emb)
        inputs += [pretrained_emb]

    #def pad(x):
    #    return pad_packed_sequence(PackedSequence(x, pretrained_emb.batch_sizes), batch_first=True)[0]

    if self.args['word_emb_dim'] > 0:
        word_emb = self.word_emb(word)
        word_emb = pack(word_emb)
        lemma_emb = self.lemma_emb(lemma)
        lemma_emb = pack(lemma_emb)
        inputs += [word_emb, lemma_emb]

    if self.args['tag_emb_dim'] > 0:
        pos_emb = self.upos_emb(upos)

        if isinstance(self.vocab['xpos'], CompositeVocab):
            for i in range(len(self.vocab['xpos'])):
                pos_emb += self.xpos_emb[i](xpos[:, :, i])
        else:
            pos_emb += self.xpos_emb(xpos)
        pos_emb = pack(pos_emb)

        feats_emb = 0
        for i in range(len(self.vocab['feats'])):
            feats_emb += self.ufeats_emb[i](ufeats[:, :, i])
        feats_emb = pack(feats_emb)

        inputs += [pos_emb, feats_emb]

    if self.args['char'] and self.args['char_emb_dim'] > 0:
        char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx, sentlens, wordlens)
        char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)), char_reps.batch_sizes)
        inputs += [char_reps]

    lstm_inputs = torch.cat([x.data for x in inputs], 1)
    lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
    lstm_inputs = self.drop(lstm_inputs)
    lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

    lstm_outputs, _ = self.parserlstm(
        lstm_inputs, sentlens,
        hx=(self.parserlstm_h_init.expand(2 * self.args['num_layers'], word.size(0),
                                          self.args['hidden_dim']).contiguous(),
            self.parserlstm_c_init.expand(2 * self.args['num_layers'], word.size(0),
                                          self.args['hidden_dim']).contiguous()))
    lstm_outputs, _ = pad_packed_sequence(lstm_outputs, batch_first=True)

    unlabeled_scores = self.unlabeled(self.drop(lstm_outputs), self.drop(lstm_outputs)).squeeze(3)
    deprel_scores = self.deprel(self.drop(lstm_outputs), self.drop(lstm_outputs))

    #goldmask = head.new_zeros(*head.size(), head.size(-1)+1, dtype=torch.uint8)
    #goldmask.scatter_(2, head.unsqueeze(2), 1)

    if self.args['linearization'] or self.args['distance']:
        head_offset = torch.arange(word.size(1), device=head.device).view(1, 1, -1).expand(
            word.size(0), -1, -1) - torch.arange(word.size(1), device=head.device).view(
            1, -1, 1).expand(word.size(0), -1, -1)

    if self.args['linearization']:
        lin_scores = self.linearization(self.drop(lstm_outputs), self.drop(lstm_outputs)).squeeze(3)
        unlabeled_scores += F.logsigmoid(lin_scores * torch.sign(head_offset).float()).detach()

    if self.args['distance']:
        dist_scores = self.distance(self.drop(lstm_outputs), self.drop(lstm_outputs)).squeeze(3)
        dist_pred = 1 + F.softplus(dist_scores)
        dist_target = torch.abs(head_offset)
        dist_kld = -torch.log((dist_target.float() - dist_pred)**2 / 2 + 1)
        unlabeled_scores += dist_kld.detach()

    diag = torch.eye(head.size(-1) + 1, dtype=torch.bool, device=head.device).unsqueeze(0)
    unlabeled_scores.masked_fill_(diag, -float('inf'))

    preds = []

    if self.training:
        unlabeled_scores = unlabeled_scores[:, 1:, :]  # exclude attachment for the root symbol
        unlabeled_scores = unlabeled_scores.masked_fill(word_mask.unsqueeze(1), -float('inf'))
        unlabeled_target = head.masked_fill(word_mask[:, 1:], -1)
        loss = self.crit(unlabeled_scores.contiguous().view(-1, unlabeled_scores.size(2)),
                         unlabeled_target.view(-1))

        deprel_scores = deprel_scores[:, 1:]  # exclude attachment for the root symbol
        #deprel_scores = deprel_scores.masked_select(goldmask.unsqueeze(3)).view(-1, len(self.vocab['deprel']))
        deprel_scores = torch.gather(
            deprel_scores, 2,
            head.unsqueeze(2).unsqueeze(3).expand(-1, -1, -1, len(self.vocab['deprel']))
        ).view(-1, len(self.vocab['deprel']))
        deprel_target = deprel.masked_fill(word_mask[:, 1:], -1)
        loss += self.crit(deprel_scores.contiguous(), deprel_target.view(-1))

        if self.args['linearization']:
            #lin_scores = lin_scores[:, 1:].masked_select(goldmask)
            lin_scores = torch.gather(lin_scores[:, 1:], 2, head.unsqueeze(2)).view(-1)
            lin_scores = torch.cat([-lin_scores.unsqueeze(1) / 2, lin_scores.unsqueeze(1) / 2], 1)
            #lin_target = (head_offset[:, 1:] > 0).long().masked_select(goldmask)
            lin_target = torch.gather((head_offset[:, 1:] > 0).long(), 2, head.unsqueeze(2))
            loss += self.crit(lin_scores.contiguous(), lin_target.view(-1))

        if self.args['distance']:
            #dist_kld = dist_kld[:, 1:].masked_select(goldmask)
            dist_kld = torch.gather(dist_kld[:, 1:], 2, head.unsqueeze(2))
            loss -= dist_kld.sum()

        loss /= wordchars.size(0)  # number of words
    else:
        loss = 0
        preds.append(F.log_softmax(unlabeled_scores, 2).detach().cpu().numpy())
        preds.append(deprel_scores.max(3)[1].detach().cpu().numpy())

    return loss, preds
def forward(self, data, id_): # features = self.LL(features) batch_size = 1 if (data[id_][3] == 0): # print "data" features_fast = self.get_imputed_feats( data[id_][0], data[id_][1], data[id_][2], self.dict_selected_feats_fast, self.imputation_layer_in_fast, len(self.fast_features_indexes)) lenghts_fast = [features_fast.shape[1]] lengths_fast = torch.cuda.LongTensor(lenghts_fast) lengths_fast = autograd.Variable(lengths_fast) packed_fast = pack_padded_sequence(features_fast, lengths_fast, batch_first=True) self.hidden_fast = self.init_hidden(batch_size, self.hidden_dim_fast) packed_output_fast, self.hidden_fast = self.lstm_fast( packed_fast, self.hidden_fast) lstm_out_fast = pad_packed_sequence(packed_output_fast, batch_first=True)[0] if (self.attn_category == 'dot'): pad_attn_fast = self.attn_fast( (lstm_out_fast, torch.cuda.LongTensor(lengths_fast))) else: pad_attn_fast = torch.cuda.FloatTensor( np.zeros([1, self.hidden_dim_fast])) if (data[id_][7] == 0): # print "data" features_slow = self.get_imputed_feats( data[id_][4], data[id_][5], data[id_][6], self.dict_selected_feats_slow, self.imputation_layer_in_slow, len(self.slow_features_indexes)) lenghts_slow = [features_slow.shape[1]] lengths_slow = torch.cuda.LongTensor(lenghts_slow) lengths_slow = autograd.Variable(lengths_slow) packed_slow = pack_padded_sequence(features_slow, lengths_slow, batch_first=True) self.hidden_slow = self.init_hidden(batch_size, self.hidden_dim_slow) packed_output_slow, self.hidden_slow = self.lstm_slow( packed_slow, self.hidden_slow) lstm_out_slow = pad_packed_sequence(packed_output_slow, batch_first=True)[0] if (self.attn_category == 'dot'): pad_attn_slow = self.attn_slow( (lstm_out_slow, torch.cuda.LongTensor(lengths_slow))) else: pad_attn_slow = torch.cuda.FloatTensor( np.zeros([1, self.hidden_dim_slow])) # print pad_attn_final.shape # else: # For now this won't work # tag_space = self.hidden2tag(lstm_out[:,-1,:]) # print pad_attn_fast.shape # print pad_attn_slow.shape pad_attn_final = torch.cat([pad_attn_fast, pad_attn_slow], 1) tag_space = self.hidden2tag(pad_attn_final) # print tag_space tag_score = F.log_softmax(tag_space, dim=1) return tag_score
def forward(self, input_seqs): """ Forward pass. # Arguments: input_seqs: Can be one of Numpy array, Torch.LongTensor, Torch.Variable, Torch.PackedSequence. # Return: Same format as input format (except for PackedSequence returned as Variable). """ # Check if we have Torch.LongTensor inputs or not Torch.Variable (assume Numpy array in this case), take note to return same format return_numpy = False if isinstance(input_seqs, (torch.LongTensor, torch.cuda.LongTensor)): input_seqs = Variable(input_seqs) elif not isinstance(input_seqs, Variable): input_seqs = Variable( torch.from_numpy(input_seqs.astype('int64')).long()) return_numpy = True # If we don't have a packed inputs, let's pack it reorder_output = False if not isinstance(input_seqs, PackedSequence): ho = self.lstm_0.weight_hh_l0.data.new(2, input_seqs.size()[0], self.hidden_size).zero_() co = self.lstm_0.weight_hh_l0.data.new(2, input_seqs.size()[0], self.hidden_size).zero_() # Reorder batch by sequence length input_lengths = torch.LongTensor([ torch.max(input_seqs[i, :].data.nonzero()) + 1 for i in range(input_seqs.size()[0]) ]) input_lengths, perm_idx = input_lengths.sort(0, descending=True) input_seqs = input_seqs[perm_idx][:, :input_lengths.max()] # Pack sequence and work on data tensor to reduce embeddings/dropout computations packed_input = pack_padded_sequence(input_seqs, input_lengths.cpu().numpy(), batch_first=True) reorder_output = True else: ho = self.lstm_0.weight_hh_l0.data.data.new( 2, input_seqs.size()[0], self.hidden_size).zero_() co = self.lstm_0.weight_hh_l0.data.data.new( 2, input_seqs.size()[0], self.hidden_size).zero_() input_lengths = input_seqs.batch_sizes packed_input = input_seqs hidden = (Variable(ho, requires_grad=False), Variable(co, requires_grad=False)) # Embed with an activation function to bound the values of the embeddings x = self.embed(packed_input.data) x = nn.Tanh()(x) # pyTorch 2D dropout2d operate on axis 1 which is fine for us x = self.embed_dropout(x) # Update packed sequence data for RNN packed_input = PackedSequence(x, packed_input.batch_sizes) # skip-connection from embedding to output eases gradient-flow and allows access to lower-level features # ordering of the way the merge is done is important for consistency with the pretrained model lstm_0_output, _ = self.lstm_0(packed_input, hidden) lstm_1_output, _ = self.lstm_1(lstm_0_output, hidden) # Update packed sequence data for attention layer packed_input = PackedSequence( torch.cat( (lstm_1_output.data, lstm_0_output.data, packed_input.data), dim=1), packed_input.batch_sizes) input_seqs, _ = pad_packed_sequence(packed_input, batch_first=True) x, att_weights = self.attention_layer(input_seqs, input_lengths) # output class probabilities or penultimate feature vector if not self.feature_output: x = self.final_dropout(x) outputs = self.output_layer(x) else: outputs = x # Reorder output if needed if reorder_output: reorered = Variable(outputs.data.new(outputs.size())) reorered[perm_idx] = outputs outputs = reorered # Adapt return format if needed if return_numpy: outputs = outputs.data.numpy() if self.return_attention: return outputs, att_weights else: return outputs
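# --- Illustrative sketch (not taken from the model above) ---
# The forward pass above sorts the batch by descending length before packing
# and then scatters the outputs back into the original order. The toy example
# below isolates that sort / unsort bookkeeping; `outputs_sorted` is a
# stand-in for whatever the packed RNN would produce.
import torch

lengths = torch.tensor([3, 7, 5, 2])
sorted_lengths, perm_idx = lengths.sort(0, descending=True)
# ... run the packed RNN on inputs[perm_idx] here ...
outputs_sorted = sorted_lengths.float()      # stand-in for the real RNN outputs
restored = torch.empty_like(outputs_sorted)
restored[perm_idx] = outputs_sorted          # scatter back, as done above
# equivalently: restored = outputs_sorted[perm_idx.argsort()]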
def forward( self, embedded_tokens: torch.Tensor, seq_lengths: torch.Tensor, states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Given an input batch of sequential data such as word embeddings, produces a bidirectional LSTM representation of the sequential input and new state tensors. Args: embedded_tokens (torch.Tensor): Input tensor of shape (bsize x seq_len x input_dim). seq_lengths (torch.Tensor): List of sequences lengths of each batch element. states (Tuple[torch.Tensor, torch.Tensor]): Tuple of tensors containing the initial hidden state and the cell state of each element in the batch. Each of these tensors have a dimension of (bsize x num_layers * num_directions x nhid). Defaults to `None`. Returns: Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: Bidirectional LSTM representation of input and the state of the LSTM `t = seq_len`. Shape of representation is (bsize x seq_len x representation_dim). Shape of each state is (bsize x num_layers * num_directions x nhid). """ if self.dropout.p > 0.0: embedded_tokens = self.dropout(embedded_tokens) if states is not None: # convert (h0, c0) from (bsz x num_layers*num_directions x nhid) to # (num_layers*num_directions x bsz x nhid) states = ( states[0].transpose(0, 1).contiguous(), states[1].transpose(0, 1).contiguous(), ) else: # We need to send in a zero state that matches the batch size, because # torch.jit tracing currently traces this as constant and therefore # locks the traced model into a static batch size. # see https://github.com/pytorch/pytorch/issues/16664 state = torch.zeros( self.config.num_layers * (2 if self.config.bidirectional else 1), embedded_tokens.size(0), # batch size self.config.lstm_dim, device=torch.cuda.current_device() if cuda.CUDA_ENABLED else None, ) states = (state, state) if torch.onnx.is_in_onnx_export(): lstm_in = [embedded_tokens, states[0], states[1]] + [ param.detach() for param in self.lstm._flat_weights ] rep, new_state_0, new_state_1 = torch.ops._caffe2.InferenceLSTM( lstm_in, self.lstm.num_layers, self.lstm.bias, True, self.lstm.bidirectional, ) new_state = (new_state_0, new_state_1) else: if self.pack_sequence: rnn_input = pack_padded_sequence(embedded_tokens, seq_lengths, batch_first=True, enforce_sorted=False) else: rnn_input = embedded_tokens rep, new_state = self.lstm(rnn_input, states) if self.pack_sequence: rep, _ = pad_packed_sequence( rep, padding_value=self.padding_value, batch_first=True, total_length=embedded_tokens.size(1), ) # Make sure the output from LSTM is padded to input's sequence length. # convert states back to (bsz x num_layers*num_directions x nhid) to be # used in data parallel model new_state = (new_state[0].transpose(0, 1), new_state[1].transpose(0, 1)) return rep, new_state
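# --- Illustrative sketch (standalone, assumed example) ---
# Why total_length is passed to pad_packed_sequence above: without it the
# unpacked output is only padded to the longest length present in this batch,
# which can differ from the input's padded length (a problem for DataParallel
# replicas that must return identically shaped tensors).
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

tokens = torch.randn(2, 10, 4)        # batch of 2, padded to length 10
lengths = torch.tensor([6, 3])        # true lengths are shorter than 10
lstm = nn.LSTM(4, 5, batch_first=True)
packed = pack_padded_sequence(tokens, lengths, batch_first=True, enforce_sorted=False)
out, _ = lstm(packed)
short, _ = pad_packed_sequence(out, batch_first=True)                  # (2, 6, 5)
full, _ = pad_packed_sequence(out, batch_first=True, total_length=10)  # (2, 10, 5)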
criterion = nn.L1Loss()


def set_requires_grad(net, requires_grad=False):
    for param in net.parameters():
        param.requires_grad = requires_grad


step = 0
t = trange(config.epoch)
for epoch in t:
    for l, r in train_loader:
        b_size = l.size(0)
        seq_padded, lens = rnn_utils.pad_packed_sequence(r, batch_first=False)
        max_len = seq_padded.shape[0]
        pad_mask = torch.arange(max_len)[None, :] < lens[:, None]
        seq_padded = seq_padded.to(device)
        pad_mask = ~pad_mask.to(device)

        optimizer_g.zero_grad()
        fake_y = netG(seq_padded, pad_mask)
        y = l.to(device)
        ########################### train G ############################
        # Compare against the device copy of the target, not the CPU tensor.
        loss = criterion(fake_y, y)
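# --- Worked example (illustrative) of the padding mask built above ---
# Positions past each sequence's true length become True after the inversion,
# i.e. they mark padding that downstream attention / loss terms should ignore.
import torch

lens = torch.tensor([4, 2, 3])
max_len = 5
pad_mask = torch.arange(max_len)[None, :] < lens[:, None]  # True on real tokens
pad_mask = ~pad_mask                                       # True on padding
# tensor([[False, False, False, False,  True],
#         [False, False,  True,  True,  True],
#         [False, False, False,  True,  True]])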
def forward(self, sentence, p_sentence, pos_tags, lengths, target_idx_in, region_marks, local_roles_voc, frames, local_roles_mask, sent_pred_lemmas_idx, dep_tags, dep_heads, targets, specific_dep_tags, specific_dep_relations, test=False): """ elmo_embedding_0 = self.elmo_embeddings_0(sentence).view(self.batch_size, len(sentence[0]), 1024) elmo_embedding_1 = self.elmo_embeddings_1(sentence).view(self.batch_size, len(sentence[0]), 1024) w = F.softmax(self.elmo_word, dim=0) elmo_emb = self.elmo_gamma_word * (w[0] * elmo_embedding_0 + w[1] * elmo_embedding_1) elmo_emb_word = self.elmo_mlp_word(elmo_emb) """ log(sentence) log(p_sentence) log(pos_tags) log(region_marks) log(sent_pred_lemmas_idx) embeds_DEP = self.word_embeddings_DEP(sentence) add_zero = torch.zeros( (self.batch_size, 1, self.word_emb_dim)).to(device) embeds_DEP = embeds_DEP.view(self.batch_size, len(sentence[0]), self.word_emb_dim) embeds_DEP_cat = torch.cat( (self.VR_word_embedding_random + add_zero, embeds_DEP), 1) pos_embeds = self.pos_embeddings(pos_tags) add_zero = torch.zeros((self.batch_size, 1, 16)).to(device) pos_embeds_cat = torch.cat( (self.VR_POS_embedding + add_zero, pos_embeds), 1) fixed_embeds_DEP = self.word_fixed_embeddings(p_sentence) fixed_embeds_DEP = fixed_embeds_DEP.view(self.batch_size, len(sentence[0]), self.word_emb_dim) add_zero = torch.zeros( (self.batch_size, 1, self.word_emb_dim)).to(device) fixed_embeds_DEP_cat = torch.cat( (self.VR_word_embedding + add_zero, fixed_embeds_DEP), 1) embeds_forDEP = torch.cat( (embeds_DEP_cat, fixed_embeds_DEP_cat, pos_embeds_cat), 2) embeds_forDEP = self.DEP_input_dropout(embeds_forDEP) # first layer embeds_sort, lengths_sort, unsort_idx = self.sort_batch( embeds_forDEP, lengths + 1) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort, batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden = self.BiLSTM_0(embeds_sort, self.hidden) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states_0 = hidden_states[unsort_idx] # second_layer embeds_sort, lengths_sort, unsort_idx = self.sort_batch( hidden_states_0, lengths + 1) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort, batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_2 = self.BiLSTM_1(embeds_sort, self.hidden_2) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states_1 = hidden_states[unsort_idx] ########################################## Head_hidden = F.relu(self.hidLayerFOH(hidden_states_1)) Dependent_hidden = F.relu(self.hidLayerFOM(hidden_states_1)) bias_one = torch.ones( (self.batch_size, len(sentence[0]) + 1, 1)).to(device) Head_hidden = torch.cat((Head_hidden, Variable(bias_one)), 2) bias_one = torch.ones( (self.batch_size, len(sentence[0]) + 1, 1)).to(device) Dependent_hidden = torch.cat((Dependent_hidden, Variable(bias_one)), 2) left_part = torch.mm( Dependent_hidden.view(self.batch_size * (len(sentence[0]) + 1), -1), self.W_R) left_part = left_part.view(self.batch_size, 
(len(sentence[0]) + 1), -1) Head_hidden = Head_hidden.view(self.batch_size, (len(sentence[0]) + 1), -1).transpose(1, 2) tag_space = torch.bmm(left_part, Head_hidden).view( (len(sentence[0]) + 1) * self.batch_size, len(sentence[0]) + 1) heads = np.argmax(tag_space.cpu().data.numpy(), axis=1) nums = 0.0 wrong_nums = 0.0 for a, b in zip(heads, dep_heads.flatten()): if b == -1: continue nums += 1 if a != b: wrong_nums += 1 loss_function = nn.CrossEntropyLoss(ignore_index=-1) DEPloss = loss_function( tag_space, torch.from_numpy(dep_heads).to(device).view(-1)) #+++++++++++++++++++++++++++++++++++++++++++++++++++++ Head_hidden_tag = F.relu(self.hidLayerFOH_tag(hidden_states_1)) Dependent_hidden_tag = F.relu(self.hidLayerFOM_tag(hidden_states_1)) bias_one = torch.ones( (self.batch_size, len(sentence[0]) + 1, 1)).to(device) Head_hidden_tag = torch.cat((Head_hidden_tag, Variable(bias_one)), 2) bias_one = torch.ones( (self.batch_size, len(sentence[0]) + 1, 1)).to(device) Dependent_hidden_tag = torch.cat( (Dependent_hidden_tag, Variable(bias_one)), 2) left_part = torch.mm( Dependent_hidden_tag.view(self.batch_size * (len(sentence[0]) + 1), -1), self.W_R_tag) left_part = left_part.view(self.batch_size, (len(sentence[0]) + 1) * self.dep_size, -1) Head_hidden_tag = Head_hidden_tag.view(self.batch_size, (len(sentence[0]) + 1), -1).transpose(1, 2) tag_space_tag = torch.bmm(left_part, Head_hidden_tag).view( (len(sentence[0]) + 1) * self.batch_size, self.dep_size, len(sentence[0]) + 1).transpose(1, 2) tag_space_tag = tag_space_tag[np.arange(0, (len(sentence[0]) + 1) * self.batch_size), dep_heads.flatten()] tag_space_tag = tag_space_tag.view( (len(sentence[0]) + 1) * self.batch_size, -1) heads_tag = np.argmax(tag_space_tag.cpu().data.numpy(), axis=1) nums_tag = 0.0 wrong_nums_tag = 0.0 for a, b in zip(heads_tag, dep_tags.view(-1).cpu().data.numpy()): if b == -1 or b == 0: continue nums_tag += 1 if a != b: wrong_nums_tag += 1 loss_function = nn.CrossEntropyLoss(ignore_index=0) DEPloss_tag = loss_function(tag_space_tag, dep_tags.view(-1)) h_layer_0 = hidden_states_0[:, 1:] # .detach() h_layer_1 = hidden_states_1[:, 1:] # .detach() w = F.softmax(self.elmo_w, dim=0) SRL_composer = self.elmo_gamma * (w[0] * h_layer_0 + w[1] * h_layer_1) SRL_composer = self.elmo_mlp(SRL_composer) fixed_embeds = self.word_fixed_embeddings(p_sentence) fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]), self.word_emb_dim) sent_pred_lemmas_embeds = self.p_lemma_embeddings(sent_pred_lemmas_idx) embeds_SRL = self.word_embeddings_SRL(sentence) embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]), self.word_emb_dim) region_marks = self.region_embeddings(region_marks).view( self.batch_size, len(sentence[0]), 16) SRL_hidden_states = torch.cat( (embeds_SRL, fixed_embeds, sent_pred_lemmas_embeds, pos_embeds, region_marks, SRL_composer), 2) SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states) # SRL layer embeds_sort, lengths_sort, unsort_idx = self.sort_batch( SRL_hidden_states, lengths) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort.cpu().numpy(), batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_4 = self.BiLSTM_SRL(embeds_sort, self.hidden_4) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) 
hidden_states = hidden_states[unsort_idx] hidden_states = self.hidden_state_dropout(hidden_states) # B * H hidden_states_3 = hidden_states predicate_embeds = hidden_states_3[ np.arange(0, hidden_states_3.size()[0]), target_idx_in] # T * B * H added_embeds = torch.zeros(hidden_states_3.size()[1], hidden_states_3.size()[0], hidden_states_3.size()[2]).to(device) predicate_embeds = added_embeds + predicate_embeds # B * T * H predicate_embeds = predicate_embeds.transpose(0, 1) hidden_states = torch.cat((hidden_states_3, predicate_embeds), 2) # print(hidden_states) # non-linear map and rectify the roles' embeddings # roles = Variable(torch.from_numpy(np.arange(0, self.tagset_size))) # B * roles # log(local_roles_voc) # log(frames) # B * roles * h role_embeds = self.role_embeddings(local_roles_voc) frame_embeds = self.frame_embeddings(frames) role_embeds = torch.cat((role_embeds, frame_embeds), 2) mapped_roles = F.relu(self.role_map(role_embeds)) mapped_roles = torch.transpose(mapped_roles, 1, 2) # b, times, roles tag_space = torch.matmul(hidden_states, mapped_roles) #tag_space = hidden_states.mm(mapped_roles) # b, roles #sub = torch.div(torch.add(local_roles_mask, -1.0), _BIG_NUMBER) sub = torch.add(local_roles_mask, -1.0) * _BIG_NUMBER sub = torch.FloatTensor(sub.cpu().numpy()).to(device) # b, roles, times tag_space = torch.transpose(tag_space, 0, 1) tag_space += sub # b, T, roles tag_space = torch.transpose(tag_space, 0, 1) tag_space = tag_space.view(len(sentence[0]) * self.batch_size, -1) SRLprobs = F.softmax(tag_space, dim=1) targets = targets.view(-1) loss_function = nn.CrossEntropyLoss(ignore_index=0) SRLloss = loss_function(tag_space, targets) loss = SRLloss + DEPloss + DEPloss_tag return SRLloss, DEPloss, DEPloss_tag, loss, SRLprobs, wrong_nums, nums, wrong_nums, nums, \ wrong_nums, nums, nums,\ wrong_nums_tag, nums_tag, nums_tag
def pad_unsort_packed_sequence(input, inv_ix): tmp, _ = pad_packed_sequence(input, batch_first=True) tmp = tmp[inv_ix] return tmp
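# --- Hypothetical companion helper (not from the original code) ---
# pad_unsort_packed_sequence above expects an inverse permutation `inv_ix`.
# A matching sort-and-pack helper could look like the sketch below: sort by
# descending length, pack, and return the permutation needed to undo the sort.
import torch
from torch.nn.utils.rnn import pack_padded_sequence


def sort_pack_padded_sequence(input, lengths):
    sorted_lengths, sort_ix = lengths.sort(0, descending=True)
    inv_ix = sort_ix.argsort()
    packed = pack_padded_sequence(input[sort_ix], sorted_lengths.cpu(), batch_first=True)
    return packed, inv_ix

# usage sketch:
#   packed, inv_ix = sort_pack_padded_sequence(x, lengths)
#   out, _ = rnn(packed)
#   out = pad_unsort_packed_sequence(out, inv_ix)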
def forward(self, src_tokens, src_lengths): if LanguagePairDataset.LEFT_PAD_SOURCE: # convert left-padding to right-padding src_tokens = utils.convert_padding_direction(src_tokens, self.padding_idx, left_to_right=True) if self.word_dropout_module is not None: src_tokens = self.word_dropout_module(src_tokens) bsz, seqlen = src_tokens.size() # embed tokens x = self.embed_tokens(src_tokens) x = F.dropout(x, p=self.dropout_in, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) # Generate packed seq to deal with varying source seq length packed_input, batch_sizes = pack_padded_sequence(x, src_lengths) final_hiddens, final_cells = [], [] next_hiddens = [] for i, rnn_layer in enumerate(self.layers): current_hidden_size = (self.hidden_dim // 2 if rnn_layer.is_bidirectional else self.hidden_dim) if self.cell_type in ["lstm", "milstm", "layer_norm_lstm"]: prev_hidden = ( x.new(bsz, current_hidden_size).zero_(), x.new(bsz, current_hidden_size).zero_(), ) else: raise Exception(f"{self.cell_type} not implemented") hidden, current_output = rnn_layer.forward(packed_input, prev_hidden, batch_sizes) next_hiddens.append(hidden) prev_hidden = next_hiddens[-1] if self.dropout_out != 0: current_output = F.dropout(current_output, p=self.dropout_out, training=self.training) if self.residual_level is not None and i >= self.residual_level: packed_input = packed_input.clone() + current_output else: packed_input = current_output final_hiddens, final_cells = zip(*next_hiddens) # Reshape to [num_layer, batch_size, hidden_dim] final_hiddens = torch.cat(final_hiddens, dim=0).view(self.num_layers, *final_hiddens[0].size()) final_cells = torch.cat(final_cells, dim=0).view(self.num_layers, *final_cells[0].size()) # [max_seqlen, batch_size, hidden_dim] unpacked_output, _ = pad_packed_sequence( PackedSequence(packed_input, batch_sizes), padding_value=self.padding_value) return (unpacked_output, final_hiddens, final_cells, src_lengths, src_tokens)
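# --- Standalone illustration of the PackedSequence internals used above ---
# A PackedSequence is a namedtuple-like object whose flattened `data` tensor
# and `batch_sizes` can be read directly and re-wrapped after a per-element
# transformation, similar to what the encoder above does between layers.
# Note: newer PyTorch versions add sorted_indices / unsorted_indices fields,
# so attribute access is safer than tuple unpacking.
import torch
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence

x = torch.randn(7, 2, 4)                              # (seq_len, batch, feat), time-major
packed = pack_padded_sequence(x, torch.tensor([7, 5]))
data, batch_sizes = packed.data, packed.batch_sizes
repacked = PackedSequence(data * 2.0, batch_sizes)    # wrap a transformed data tensor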
def forward(self, sentence, p_sentence, pos_tags, lengths, target_idx_in, region_marks, local_roles_voc, frames, local_roles_mask, sent_pred_lemmas_idx, dep_tags, dep_heads, targets, P_identification, all_l_ids, Predicate_link, Predicate_Labels_nd, Predicate_Labels, unlabeled_sentence=None, p_unlabeled_sentence=None, unlabeled_lengths=None, test=False, cvt_train=False): if cvt_train: CVT_SRL_Loss = self.CVT_train(unlabeled_sentence, p_unlabeled_sentence, unlabeled_lengths) return CVT_SRL_Loss """ perform predicate Identificaiton first """ Predicate_Identification_Space = self.Predicate_Id( sentence, p_sentence, lengths) # +++++++++++++++++++++++ wrong_l_nums = 0.0 all_l_nums = 0.0 right_noNull_predict = 0.0 noNull_predict = 0.0 noNUll_truth = 0.0 PI_labels = np.argmax( Predicate_Identification_Space.cpu().data.numpy(), axis=1) for predict_l, gold_l in zip( PI_labels, P_identification.cpu().view(-1).data.numpy()): if predict_l > 1 and gold_l != 0: noNull_predict += 1 if gold_l != 0: all_l_nums += 1 if gold_l != 1: noNUll_truth += 1 if gold_l == predict_l: right_noNull_predict += 1 if predict_l != gold_l and gold_l != 0: wrong_l_nums += 1 """ construct DEP_input """ # contruct input for DEP embeds_DEP = self.word_embeddings_DEP(sentence) embeds_DEP = embeds_DEP.view(self.batch_size, len(sentence[0]), self.word_emb_dim) region_marks = self.region_embeddings(region_marks).view( self.batch_size, len(sentence[0]), 16) # sharing pretrained word_embeds fixed_embeds_DEP = self.word_fixed_embeddings_DEP(p_sentence) fixed_embeds_DEP = fixed_embeds_DEP.view(self.batch_size, len(sentence[0]), self.word_emb_dim) embeds_forDEP = torch.cat((embeds_DEP, fixed_embeds_DEP, region_marks), 2) embeds_forDEP = self.DEP_input_dropout(embeds_forDEP) # first layer embeds_sort, lengths_sort, unsort_idx = self.sort_batch( embeds_forDEP, lengths) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort, batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_DEP_base = self.BiLSTM_0( embeds_sort, self.hidden_DEP_base) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states_0 = hidden_states[unsort_idx] # second_layer embeds_sort, lengths_sort, unsort_idx = self.sort_batch( hidden_states_0, lengths) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort, batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_DEP = self.BiLSTM_DEP( embeds_sort, self.hidden_DEP) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states_1 = hidden_states[unsort_idx] ########################################### hidden_states_3 = self.hidden_state_dropout_DEP(hidden_states_1) hidden_states_word = self.dropout_1_DEP( F.relu(self.Non_Predicate_Proj_DEP(hidden_states_3))) predicate_embeds = hidden_states_3[ np.arange(0, hidden_states_3.size()[0]), target_idx_in] hidden_states_predicate = self.dropout_2_DEP( F.relu(self.Predicate_Proj_DEP(predicate_embeds))) bias_one = torch.ones( (self.batch_size, len(sentence[0]), 1)).to(device) 
hidden_states_word = torch.cat( (hidden_states_word, Variable(bias_one)), 2) left_part = torch.mm( hidden_states_word.view(self.batch_size * len(sentence[0]), -1), self.W_R_DEP) left_part = left_part.view(self.batch_size, len(sentence[0]) * self.dep_size, -1) hidden_states_predicate = hidden_states_predicate.view( self.batch_size, -1, 1) tag_space_DEP = torch.bmm(left_part, hidden_states_predicate).view( len(sentence[0]) * self.batch_size, -1) # +++++++++++++++++++++++ h_layer_0 = hidden_states_0 # .detach() h_layer_1 = hidden_states_1 # .detach() w = F.softmax(self.elmo_w, dim=0) SRL_composer = self.elmo_gamma * (w[0] * h_layer_0 + w[1] * h_layer_1) SRL_composer = self.elmo_mlp(SRL_composer) fixed_embeds = self.word_fixed_embeddings(p_sentence) fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]), self.word_emb_dim) embeds_SRL = self.word_embeddings_SRL(sentence) embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]), self.word_emb_dim) SRL_hidden_states = torch.cat( (embeds_SRL, fixed_embeds, region_marks, SRL_composer), 2) SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states) # SRL layer embeds_sort, lengths_sort, unsort_idx = self.sort_batch( SRL_hidden_states, lengths) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort, batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_SRL_base = self.BiLSTM_1( embeds_sort, self.hidden_SRL_base) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states_0 = hidden_states[unsort_idx] embeds_sort, lengths_sort, unsort_idx = self.sort_batch( hidden_states_0, lengths) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort.cpu().numpy(), batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_SRL = self.BiLSTM_SRL( embeds_sort, self.hidden_SRL) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states = hidden_states[unsort_idx] hidden_states = self.hidden_state_dropout_SRL(hidden_states) # B * H hidden_states_3 = hidden_states hidden_states_word = self.dropout_1( F.relu(self.Non_Predicate_Proj(hidden_states_3))) predicate_embeds = hidden_states_3[ np.arange(0, hidden_states_3.size()[0]), target_idx_in] hidden_states_predicate = self.dropout_2( F.relu(self.Predicate_Proj(predicate_embeds))) bias_one = torch.ones( (self.batch_size, len(sentence[0]), 1)).to(device) hidden_states_word = torch.cat( (hidden_states_word, Variable(bias_one)), 2) bias_one = torch.ones((self.batch_size, 1)).to(device) hidden_states_predicate = torch.cat( (hidden_states_predicate, Variable(bias_one)), 1) left_part = torch.mm( hidden_states_word.view(self.batch_size * len(sentence[0]), -1), self.W_R) left_part = left_part.view(self.batch_size, len(sentence[0]) * self.tagset_size, -1) hidden_states_predicate = hidden_states_predicate.view( self.batch_size, -1, 1) tag_space = torch.bmm(left_part, hidden_states_predicate).view( len(sentence[0]) * self.batch_size, -1) SRLprobs = F.softmax(tag_space, dim=1) right_noNull_predict_DEP = 0.0 
noNull_predict_DEP = 0.0 noNUll_truth_DEP = 0.0 PI_labels = np.argmax(tag_space_DEP.cpu().data.numpy(), axis=1) for predict_l, gold_l in zip( PI_labels, Predicate_Labels_nd.cpu().view(-1).data.numpy()): if predict_l > 1 and gold_l != 0: noNull_predict_DEP += 1 if gold_l != 0: all_l_nums += 1 if gold_l != 1: noNUll_truth_DEP += 1 if gold_l == predict_l: right_noNull_predict_DEP += 1 if predict_l != gold_l and gold_l != 0: wrong_l_nums += 1 loss_function = nn.CrossEntropyLoss(ignore_index=0) SRLloss = loss_function(tag_space, targets.view(-1)) DEPloss = loss_function(tag_space_DEP, Predicate_Labels_nd.view(-1)) IDloss = loss_function(Predicate_Identification_Space, P_identification.view(-1)) return SRLloss, IDloss, DEPloss, SRLprobs, wrong_l_nums, all_l_nums, wrong_l_nums, all_l_nums, \ right_noNull_predict, noNull_predict, noNUll_truth,\ right_noNull_predict_DEP, noNull_predict_DEP, noNUll_truth_DEP
def forward(self, mode, original_words_batch, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask): """ input: word_inputs: (batch_size, sent_len) feature_inputs: [(batch_size, sent_len), ...] list of variables word_seq_lengths: list of batch_size, (batch_size,1) char_inputs: (batch_size*sent_len, word_length) char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1) char_seq_recover: variable which records the char order information, used to recover char order output: Variable(batch_size, sent_len, hidden_dim) """ word_represent = self.wordrep(original_words_batch, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask) ## word_embs (batch_size, seq_len, embed_size) if self.word_feature_extractor == "CNN": batch_size = word_inputs.size(0) word_in = torch.tanh(self.word2cnn(word_represent)).transpose( 2, 1).contiguous() for idx in range(self.cnn_layer): if idx == 0: cnn_feature = F.relu(self.cnn_list[idx](word_in)) else: cnn_feature = F.relu(self.cnn_list[idx](cnn_feature)) cnn_feature = self.cnn_drop_list[idx](cnn_feature) if batch_size > 1: cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature) feature_out = cnn_feature.transpose(2, 1).contiguous() outputs = self.hidden2tag(feature_out) elif self.word_feature_extractor == "LSTM": # lstm packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True) hidden = None lstm_out, hidden = self.lstm(packed_words, hidden) lstm_out, _ = pad_packed_sequence(lstm_out) ## lstm_out (seq_len, seq_len, hidden_size) feature_out = self.droplstm(lstm_out.transpose(1, 0)) outputs = self.hidden2tag(feature_out) ## feature_out (batch_size, seq_len, hidden_size) # outputs = self.hidden2tag(feature_out) elif self.word_feature_extractor == "MultiCellLSTM": # MultiCellLSTM hidden = None # (batch_size, seq_len, cell_num, hidden_size) hidden_outputs_forward, cell_states_forward, atten_probs_forward = self.lstm( word_represent, mask, hidden) if self.bilstm_flag: back_hidden = None hidden_outputs_back, cell_states_back, atten_probs_back = self.lstm_back( word_represent, mask, back_hidden) hidden_outputs = torch.cat( [hidden_outputs_forward, hidden_outputs_back], dim=-1) cell_states = torch.cat( [cell_states_forward, cell_states_back], dim=-1) atten_probs = (atten_probs_forward + atten_probs_back) / 2 hidden_outputs = self.droplstm(hidden_outputs) cell_states = self.droplstm(cell_states) cell_out = self.cell2entity(cell_states) if mode == 'LM': return hidden_outputs_forward, hidden_outputs_back, cell_out, atten_probs elif mode == 'NER': outputs = self.hidden2tag(hidden_outputs) return outputs, cell_out, atten_probs
def forward(self, inputs, input_raw, hidden=None): """ forward """ if isinstance(inputs, tuple): inputs, lengths = inputs else: inputs, lengths = inputs, None if self.embedder is not None: rnn_inputs = self.embedder(inputs) else: rnn_inputs = inputs elmo_embed = self.elmo_embedder.sents2elmo(input_raw) elmo_length = [x.shape[0] for x in elmo_embed] batch_size_1 = len(elmo_length) max_l = max(elmo_length) size = (batch_size_1, max_l, 1024) tensor_1 = torch.zeros(size, dtype=torch.float) for i in range(batch_size_1): tensor_1[i][:elmo_length[i]] = torch.tensor(elmo_embed[i]) elmo_embed = tensor_1.cuda() rnn_inputs = torch.cat([rnn_inputs, elmo_embed], dim=-1) batch_size = rnn_inputs.size(0) if lengths is not None: num_valid = lengths.gt(0).int().sum().item() sorted_lengths, indices = lengths.sort(descending=True) rnn_inputs = rnn_inputs.index_select(0, indices) rnn_inputs = pack_padded_sequence( rnn_inputs[:num_valid], sorted_lengths[:num_valid].tolist(), batch_first=True) if hidden is not None: hidden = hidden.index_select(1, indices)[:, :num_valid] outputs, last_hidden = self.rnn(rnn_inputs, hidden) if self.bidirectional: last_hidden = self._bridge_bidirectional_hidden(last_hidden) if lengths is not None: outputs, _ = pad_packed_sequence(outputs, batch_first=True) if num_valid < batch_size: zeros = outputs.new_zeros(batch_size - num_valid, outputs.size(1), self.hidden_size) outputs = torch.cat([outputs, zeros], dim=0) zeros = last_hidden.new_zeros(self.num_layers, batch_size - num_valid, self.hidden_size) last_hidden = torch.cat([last_hidden, zeros], dim=1) _, inv_indices = indices.sort() outputs = outputs.index_select(0, inv_indices) last_hidden = last_hidden.index_select(1, inv_indices) return outputs, last_hidden
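# --- Aside (illustrative) ---
# The manual zero-tensor + copy loop above, which pads the per-sentence ELMo
# embeddings to a common length, can also be written with
# torch.nn.utils.rnn.pad_sequence:
import torch
from torch.nn.utils.rnn import pad_sequence

elmo_embed = [torch.randn(5, 1024), torch.randn(3, 1024)]  # per-sentence embeddings
padded = pad_sequence(elmo_embed, batch_first=True)        # shape: (2, 5, 1024)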
def forward(self, word_info, mode, *input_tensor): if mode == 'train': teacher_forcing = True query_batch, query, context_batch, context, doc_batch, document, response_batch, response, p_star_copy = input_tensor decoder_ip_batch = response_batch[:, :-1] else: teacher_forcing = False query_batch, query, context_batch, context, doc_batch, document, response_batch, respons, target_length = input_tensor decoder_ip_batch = response_batch[:, 0] query_embed, context_embed, doc_embed, decoder_ip_embed = self.word_embed( [query_batch, context_batch, doc_batch, decoder_ip_batch]) query_encoding, query_repr, doc_encoding, doc_ht, context_encoding, context_ht = self.passage_context_encode( query_embed, query, doc_embed, document, context_embed, context) doc_unpack, _ = pad_packed_sequence(doc_encoding, batch_first=True) context_unpack, _ = pad_packed_sequence(context_encoding, batch_first=True) query_repr, doc_ht = self.check_direction(query_repr, doc_ht) query_repr_original = query.get_original_order(query_repr) doc_padded_original = document.get_original_order(doc_unpack) context_padded_original = context.get_original_order(context_unpack) context_repr_original, context_attn_wts, context_attn_scores = self.context_enc( context_padded_original, query_repr_original, context.masks) doc_repr_original, start_attn_wts, start_attn_scores = self.doc_enc( doc_padded_original, context_repr_original, query_repr_original, document.masks) start_max_score, start_max_ind = torch.max(start_attn_scores, dim=1) end_attn_scores = self.span_pred(doc_padded_original, doc_repr_original, start_attn_wts, start_max_ind, context_repr_original, query_repr_original, document) end_max_score, end_max_ind = torch.max(end_attn_scores, dim=1) hidden_state = doc_ht #In place operation check if teacher_forcing == True: all_gen_op, all_gen_prob, all_copy_prob, add_ind = self.create_batch_variable( decoder_ip_batch.size(1), doc_batch.size(1), mode, decoder_ip_batch.size(0)) for step, batch_data in enumerate(decoder_ip_embed.transpose(0, 1)): decoder_step_ip = batch_data if self.prev_connection == True and step - 1 >= 0: to_put, non_zero_ind = self.get_prev_connection( p_star_copy.transpose(0, 1)[step - 1], doc_batch, end_max_ind, add_ind) if non_zero_ind.size() != torch.LongTensor().size( ) and to_put.size() != torch.LongTensor().size(): last_word_embed = self.word_embed([to_put.squeeze(1) ])[0] decoder_step_ip.data.index_copy_( 0, non_zero_ind.data, last_word_embed.data) gen_prob, copy_prob = self.gen_copy_prob( context_repr_original, doc_repr_original, hidden_state) output_scores, output_prob, hidden_state = self.decoder( decoder_step_ip, context_repr_original, doc_repr_original, hidden_state) all_gen_op[step] = output_scores all_gen_prob[step] = gen_prob.squeeze(1) all_copy_prob[step] = copy_prob.squeeze(1) return all_gen_op.transpose( 0, 1), start_attn_scores, end_attn_scores, all_gen_prob.transpose( 0, 1), all_copy_prob.transpose(0, 1) else: all_gen_op, all_gen_prob, all_copy_prob, add_ind, all_top_ind = self.create_batch_variable( target_length, doc_batch.size(1), mode, decoder_ip_batch.size(0)) decoder_step_ip = decoder_ip_embed.clone() for step in range(target_length): gen_prob, copy_prob = self.gen_copy_prob( context_repr_original, doc_repr_original, hidden_state) output_scores, output_prob, hidden_state = self.decoder( decoder_step_ip, context_repr_original, doc_repr_original, hidden_state) comp_prob = copy_prob > gen_prob all_gen_op[step] = output_scores all_gen_prob[step] = gen_prob.squeeze(1) all_copy_prob[step] = 
copy_prob.squeeze(1) top_elem, top_ind = torch.topk(output_prob, 1, dim=1) all_top_ind[step] = top_ind.squeeze() changed_top_ind = top_ind.clone() if self.prev_connection == True and step - 1 >= 0: to_put, non_zero_ind = self.get_prev_connection( prev_comp_prob, doc_batch, end_max_ind, add_ind) if non_zero_ind.size() != torch.LongTensor().size( ) and to_put.size() != torch.LongTensor().size(): changed_top_ind.data.index_copy_( 0, non_zero_ind.data, to_put.data) decoder_step_ip = self.word_embed([changed_top_ind.squeeze(1) ])[0] prev_comp_prob = comp_prob.clone().squeeze(1) return all_gen_op.transpose( 0, 1 ), start_attn_scores, end_attn_scores, start_max_ind, end_max_ind, all_gen_prob.transpose( 0, 1), all_copy_prob.transpose(0, 1), all_top_ind.transpose(0, 1)
def forward( self, inputs: PackedSequence, states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None ) -> Tuple[PackedSequence, Tuple[torch.Tensor, torch.Tensor]]: """ Warning: Would be better to use the BiAugmentedLstm class in a regular model Given an input batch of sequential data such as word embeddings, produces a single layer unidirectional AugmentedLSTM representation of the sequential input and new state tensors. # Parameters inputs : `PackedSequence` `bsize` sequences of shape `(len, input_dim)` each, in PackedSequence format states : `Tuple[torch.Tensor, torch.Tensor]` Tuple of tensors containing the initial hidden state and the cell state of each element in the batch. Each of these tensors have a dimension of (1 x bsize x nhid). Defaults to `None`. # Returns `Tuple[PackedSequence, Tuple[torch.Tensor, torch.Tensor]]` AugmentedLSTM representation of input and the state of the LSTM `t = seq_len`. Shape of representation is (bsize x seq_len x representation_dim). Shape of each state is (1 x bsize x nhid). """ if not isinstance(inputs, PackedSequence): raise Exception("inputs must be PackedSequence but got %s" % (type(inputs))) sequence_tensor, batch_lengths = pad_packed_sequence(inputs, batch_first=True) batch_size = sequence_tensor.size()[0] total_timesteps = sequence_tensor.size()[1] output_accumulator = sequence_tensor.new_zeros(batch_size, total_timesteps, self.lstm_dim) if states is None: full_batch_previous_memory = sequence_tensor.new_zeros(batch_size, self.lstm_dim) full_batch_previous_state = sequence_tensor.data.new_zeros(batch_size, self.lstm_dim) else: full_batch_previous_state = states[0].squeeze(0) full_batch_previous_memory = states[1].squeeze(0) current_length_index = batch_size - 1 if self.go_forward else 0 if self.recurrent_dropout_probability > 0.0: dropout_mask = get_dropout_mask( self.recurrent_dropout_probability, full_batch_previous_memory ) else: dropout_mask = None for timestep in range(total_timesteps): index = timestep if self.go_forward else total_timesteps - timestep - 1 if self.go_forward: while batch_lengths[current_length_index] <= index: current_length_index -= 1 # If we're going backwards, we are _picking up_ more indices. else: # First conditional: Are we already at the maximum # number of elements in the batch? # Second conditional: Does the next shortest # sequence beyond the current batch # index require computation use this timestep? while ( current_length_index < (len(batch_lengths) - 1) and batch_lengths[current_length_index + 1] > index ): current_length_index += 1 previous_memory = full_batch_previous_memory[0: current_length_index + 1].clone() previous_state = full_batch_previous_state[0: current_length_index + 1].clone() timestep_input = sequence_tensor[0: current_length_index + 1, index] timestep_output, memory = self.cell( timestep_input, (previous_state, previous_memory), dropout_mask[0: current_length_index + 1] if dropout_mask is not None else None, ) full_batch_previous_memory = full_batch_previous_memory.data.clone() full_batch_previous_state = full_batch_previous_state.data.clone() full_batch_previous_memory[0: current_length_index + 1] = memory full_batch_previous_state[0: current_length_index + 1] = timestep_output output_accumulator[0: current_length_index + 1, index, :] = timestep_output output_accumulator = pack_padded_sequence( output_accumulator, batch_lengths, batch_first=True ) # Mimic the pytorch API by returning state in the following shape: # (num_layers * num_directions, batch_size, lstm_dim). 
        # As this LSTM cannot be stacked, the first dimension here is just 1.
        final_state = (
            full_batch_previous_state.unsqueeze(0),
            full_batch_previous_memory.unsqueeze(0),
        )

        return output_accumulator, final_state
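# --- Reference sketch (plain PyTorch, not AllenNLP-specific) ---
# The return shape above mirrors the standard nn.LSTM convention: (h_n, c_n)
# have shape (num_layers * num_directions, batch, hidden), so a single
# unidirectional layer uses a leading dimension of 1.
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

lstm = nn.LSTM(input_size=4, hidden_size=6, num_layers=1, batch_first=True)
packed = pack_padded_sequence(torch.randn(3, 5, 4), torch.tensor([5, 4, 2]),
                              batch_first=True, enforce_sorted=False)
_, (h_n, c_n) = lstm(packed)
assert h_n.shape == (1, 3, 6) and c_n.shape == (1, 3, 6)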
def encode( self, source_padded: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None ### YOUR CODE HERE (~ 8 Lines) ### TODO: ### 1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings. ### src_len = maximum source sentence length, b = batch size, e = embedding size. Note ### that there is no initial hidden state or cell for the decoder. ### 2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`. ### - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X. ### - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens. ### - Note that the shape of the tensor returned by the encoder is (src_len b, h*2) and we want to ### return a tensor of shape (b, src_len, h*2) as `enc_hiddens`. ### 3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell): ### - `init_decoder_hidden`: ### `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the h_projection layer to this in order to compute init_decoder_hidden. ### This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size ### - `init_decoder_cell`: ### `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the c_projection layer to this in order to compute init_decoder_cell. ### This is c_0^{dec} in the PDF. 
Here b = batch size, h = hidden size ### ### See the following docs, as you may need to use some of the following functions in your implementation: ### Pack the padded sequence X before passing to the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence ### Pad the packed sequence, enc_hiddens, returned by the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tensor Permute: ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute # 1 X = self.model_embeddings.source(source_padded) #2 enc_hiddens, (last_hidden, last_cell) = self.encoder( pack_padded_sequence(X, lengths=source_lengths)) enc_hiddens, _ = pad_packed_sequence(enc_hiddens) enc_hiddens = enc_hiddens.permute(1, 0, 2) #3 batch_size = enc_hiddens.size(0) init_decoder_hidden = self.h_projection( last_hidden.permute(1, 0, 2).contiguous().view( [batch_size, 2 * self.hidden_size])) init_decoder_cell = self.c_projection( last_cell.permute(1, 0, 2).contiguous().view(batch_size, 2 * self.hidden_size)) dec_init_state = (init_decoder_hidden, init_decoder_cell) ### END YOUR CODE return enc_hiddens, dec_init_state
def get_decoded_output(self, decoder_input, hidden, lens): # TODO: adapt this output, hidden = self.decoder(decoder_input, hidden, lens) output, _ = pad_packed_sequence(output) output = self.out_layer(output) return output, hidden
def CVT_train(self, sentence, p_sentence, lengths): ## start unlabeled training: Predicate_Identification_Space = self.Predicate_Id( sentence, p_sentence, lengths) Predicate_probs = Predicate_Identification_Space.view( self.batch_size, len(sentence[0]), -1).cpu().data.numpy() Predicate_idx_batch = [0] * self.batch_size for i in range(self.batch_size): candidate_set = [] for j in range(len(sentence[0])): if j >= lengths[i]: break if Predicate_probs[i][j][2] > Predicate_probs[i][j][ 1] and Predicate_probs[i][j][2] > Predicate_probs[i][ j][0]: candidate_set.append(j) if len(candidate_set) > 0: index = random.sample(candidate_set, 1) Predicate_idx_batch[i] = index[0] # +++++++++++++++++++++++ """ construct DEP_input """ # contruct input for DEP unlabeled_region_mark = np.zeros(sentence.size(), dtype='int64') for i in range(len(unlabeled_region_mark)): unlabeled_region_mark[i][Predicate_idx_batch[i]] = 1 unlabeled_region_mark_in = torch.from_numpy(unlabeled_region_mark).to( device) region_marks = self.region_embeddings(unlabeled_region_mark_in).view( self.batch_size, len(sentence[0]), 16) embeds_DEP = self.word_embeddings_DEP(sentence) embeds_DEP = embeds_DEP.view(self.batch_size, len(sentence[0]), self.word_emb_dim) # sharing pretrained word_embeds fixed_embeds_DEP = self.word_fixed_embeddings_DEP(p_sentence) fixed_embeds_DEP = fixed_embeds_DEP.view(self.batch_size, len(sentence[0]), self.word_emb_dim) embeds_forDEP = torch.cat((embeds_DEP, fixed_embeds_DEP, region_marks), 2) embeds_forDEP = self.DEP_input_dropout(embeds_forDEP) # first layer embeds_sort, lengths_sort, unsort_idx = self.sort_batch( embeds_forDEP, lengths) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort, batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_DEP_base = self.BiLSTM_0( embeds_sort, self.hidden_DEP_base) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states_0_DEP = hidden_states[unsort_idx] # second_layer embeds_sort, lengths_sort, unsort_idx = self.sort_batch( hidden_states_0_DEP, lengths) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort, batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_DEP = self.BiLSTM_DEP( embeds_sort, self.hidden_DEP) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states_1 = hidden_states[unsort_idx] ########################################### # +++++++++++++++++++++++ h_layer_0 = hidden_states_0_DEP.detach() h_layer_1 = hidden_states_1.detach() w = F.softmax(self.elmo_w, dim=0) SRL_composer = self.elmo_gamma * (w[0] * h_layer_0 + w[1] * h_layer_1) SRL_composer = self.elmo_mlp(SRL_composer) fixed_embeds = self.word_fixed_embeddings(p_sentence) fixed_embeds = fixed_embeds.view(self.batch_size, len(sentence[0]), self.word_emb_dim) embeds_SRL = self.word_embeddings_SRL(sentence) embeds_SRL = embeds_SRL.view(self.batch_size, len(sentence[0]), self.word_emb_dim) SRL_hidden_states = torch.cat( (embeds_SRL, fixed_embeds, region_marks, SRL_composer), 2) 
SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states) # SRL layer embeds_sort, lengths_sort, unsort_idx = self.sort_batch( SRL_hidden_states, lengths) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort, batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_SRL_base = self.BiLSTM_1( embeds_sort, self.hidden_SRL_base) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states_0 = hidden_states[unsort_idx] embeds_sort, lengths_sort, unsort_idx = self.sort_batch( hidden_states_0, lengths) embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort.cpu().numpy(), batch_first=True) # hidden states [time_steps * batch_size * hidden_units] hidden_states, self.hidden_SRL = self.BiLSTM_SRL( embeds_sort, self.hidden_SRL) # it seems that hidden states is already batch first, we don't need swap the dims # hidden_states = hidden_states.permute(1, 2, 0).contiguous().view(self.batch_size, -1, ) hidden_states, lens = rnn.pad_packed_sequence(hidden_states, batch_first=True) # hidden_states = hidden_states.transpose(0, 1) hidden_states = hidden_states[unsort_idx] hidden_states = self.hidden_state_dropout_SRL(hidden_states) # B * H hidden_states_3 = hidden_states hidden_states_word = self.dropout_1( F.relu(self.Non_Predicate_Proj(hidden_states_3))) predicate_embeds = hidden_states_3[ np.arange(0, hidden_states_3.size()[0]), Predicate_idx_batch] hidden_states_predicate = self.dropout_2( F.relu(self.Predicate_Proj(predicate_embeds))) bias_one = torch.ones( (self.batch_size, len(sentence[0]), 1)).to(device) hidden_states_word = torch.cat( (hidden_states_word, Variable(bias_one)), 2) bias_one = torch.ones((self.batch_size, 1)).to(device) hidden_states_predicate = torch.cat( (hidden_states_predicate, Variable(bias_one)), 1) left_part = torch.mm( hidden_states_word.view(self.batch_size * len(sentence[0]), -1), self.W_R) left_part = left_part.view(self.batch_size, len(sentence[0]) * self.tagset_size, -1) hidden_states_predicate = hidden_states_predicate.view( self.batch_size, -1, 1) tag_space = torch.bmm(left_part, hidden_states_predicate).view( self.batch_size, len(sentence[0]), -1) ## obtain the teacher probs SRLprobs_teacher = tag_space.detach() hidden_forward, hidden_backward = hidden_states_0_DEP.split( self.hidden_dim, 2) CVT_SRL_Loss = self.Semi_SRL_Loss(hidden_forward, hidden_backward, Predicate_idx_batch, sentence, SRLprobs_teacher, lengths) return CVT_SRL_Loss
def _lstm_forward(self,
                  inputs: PackedSequence,
                  initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \
        Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    Parameters
    ----------
    inputs : ``PackedSequence``, required.
        A batch first ``PackedSequence`` to run the stacked LSTM over.
    initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
        A tuple (state, memory) representing the initial hidden state and memory
        of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and
        (num_layers, batch_size, 2 * cell_size) respectively.

    Returns
    -------
    output_sequence : ``torch.FloatTensor``
        The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size)
    final_states : ``Tuple[torch.FloatTensor, torch.FloatTensor]``
        The per-layer final (state, memory) states of the LSTM, with shape
        (num_layers, batch_size, 2 * hidden_size) and (num_layers, batch_size, 2 * cell_size)
        respectively. The last dimension is duplicated because it contains the
        state/memory for both the forward and backward layers.
    """
    if initial_state is None:
        hidden_states: List[Optional[Tuple[torch.Tensor, torch.Tensor]]] = [None] * len(
            self.forward_layers)
    elif initial_state[0].size()[0] != len(self.forward_layers):
        raise Exception("Initial states were passed to forward() but the number of "
                        "initial states does not match the number of layers.")
    else:
        hidden_states = list(zip(initial_state[0].split(1, 0),
                                 initial_state[1].split(1, 0)))

    inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True)
    forward_output_sequence = inputs
    backward_output_sequence = inputs

    final_states = []
    sequence_outputs = []
    for layer_index, state in enumerate(hidden_states):
        forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index))
        backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index))

        forward_cache = forward_output_sequence
        backward_cache = backward_output_sequence

        if state is not None:
            forward_hidden_state, backward_hidden_state = state[0].split(
                self.hidden_size, 2)
            forward_memory_state, backward_memory_state = state[1].split(
                self.cell_size, 2)
            forward_state = (forward_hidden_state, forward_memory_state)
            backward_state = (backward_hidden_state, backward_memory_state)
        else:
            forward_state = None
            backward_state = None

        forward_output_sequence, forward_state = forward_layer(
            forward_output_sequence, batch_lengths, forward_state)
        backward_output_sequence, backward_state = backward_layer(
            backward_output_sequence, batch_lengths, backward_state)
        # Skip connections, just adding the input to the output.
        if layer_index != 0:
            forward_output_sequence += forward_cache
            backward_output_sequence += backward_cache

        sequence_outputs.append(
            torch.cat([forward_output_sequence, backward_output_sequence], -1))
        # Append the state tuples in a list, so that we can return
        # the final states for all the layers.
        final_states.append(
            (torch.cat([forward_state[0], backward_state[0]], -1),
             torch.cat([forward_state[1], backward_state[1]], -1)))

    stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs)
    # Stack the hidden state and memory for each layer into 2 tensors of shape
    # (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size)
    # respectively.
    final_hidden_states, final_memory_states = zip(*final_states)
    final_state_tuple: Tuple[torch.FloatTensor, torch.FloatTensor] = (
        torch.cat(final_hidden_states, 0), torch.cat(final_memory_states, 0))
    return stacked_sequence_outputs, final_state_tuple
def forward(self, query, keys, keys_length, mask=None): """ Parameters ---------- query: 2D tensor, [B, H] keys: (masked_interests), 3D tensor, [b, T, H] keys_length: 1D tensor, [B] Returns ------- outputs: 2D tensor, [B, H] """ batch_size, dim = query.size() max_length = keys.size()[1] # check batch validation zero_outputs = torch.zeros(batch_size, dim, device=query.device) mask = keys_length > 0 # [B] -> [b] keys_length = keys_length[mask] if keys_length.shape[0] == 0: return zero_outputs # [B, H] -> [b, 1, H] query = torch.masked_select(query, mask.view(-1, 1)).view(-1, dim).unsqueeze(1) if self.gru_type == 'GRU': packed_keys = pack_padded_sequence(keys, lengths=keys_length, batch_first=True, enforce_sorted=False) packed_interests, _ = self.interest_evolution(packed_keys) interests, _ = pad_packed_sequence(packed_interests, batch_first=True, padding_value=0.0, total_length=max_length) outputs = self.attention(query, interests, keys_length.unsqueeze(1)) # [b, 1, H] outputs = outputs.squeeze(1) # [b, H] elif self.gru_type == 'AIGRU': att_scores = self.attention(query, keys, keys_length.unsqueeze(1)) # [b, 1, T] interests = keys * att_scores.transpose(1, 2) # [b, T, H] packed_interests = pack_padded_sequence(interests, lengths=keys_length, batch_first=True, enforce_sorted=False) _, outputs = self.interest_evolution(packed_interests) outputs = outputs.squeeze(0) # [b, H] elif self.gru_type == 'AGRU' or self.gru_type == 'AUGRU': att_scores = self.attention( query, keys, keys_length.unsqueeze(1)).squeeze(1) # [b, T] packed_interests = pack_padded_sequence(keys, lengths=keys_length, batch_first=True, enforce_sorted=False) packed_scores = pack_padded_sequence(att_scores, lengths=keys_length, batch_first=True, enforce_sorted=False) outputs = self.interest_evolution(packed_interests, packed_scores) outputs, _ = pad_packed_sequence(outputs, batch_first=True, padding_value=0.0, total_length=max_length) # pick last state outputs = InterestEvolving._get_last_state(outputs, keys_length) # [b, H] # [b, H] -> [B, H] zero_outputs[mask] = outputs return zero_outputs
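# --- Illustrative helper (assumed, analogous to the _get_last_state call above) ---
# Picking the last valid timestep from a padded, batch-first output using the
# per-sequence lengths; padded positions are never selected.
import torch


def last_valid_state(outputs, lengths):
    # outputs: [B, T, H]; lengths: [B]
    idx = (lengths - 1).clamp(min=0).view(-1, 1, 1).expand(-1, 1, outputs.size(2))
    return outputs.gather(1, idx).squeeze(1)  # [B, H]

out = torch.arange(24, dtype=torch.float).view(2, 3, 4)
lengths = torch.tensor([3, 2])
last = last_valid_state(out, lengths)  # rows for t=2 and t=1 respectively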
def forward(self, input_ids=None, attention_mask=None, labels=None):
    # BERT; with hidden states enabled the model returns:
    # output 0 = sequence output: (batch 6, tokens 512, hidden 768)
    # output 1 = pooled [CLS] output: (batch 6, hidden 768)
    # output 2 = hidden states: 13 layers, each (batch 6, tokens 512, hidden 768)
    outputs = self.bert_layer(input_ids, attention_mask=attention_mask)
    sequence_output = outputs[2]  # all per-layer hidden states

    # Average the first `num_layer_sum` hidden states for each token.
    num_layer_sum = 4
    summed_last_4_layers = torch.stack(sequence_output[:num_layer_sum]).mean(0)

    # LSTM with masks (same as attention masks)
    packed_input, perm_idx, seq_lengths = get_packed_padded_output(
        summed_last_4_layers, input_ids, attention_mask, self.tokenizer)
    packed_output, (ht, ct) = self.lstm_layer(packed_input)

    # Unpack and reorder the output
    output, input_sizes = pad_packed_sequence(packed_output, batch_first=True)
    _, unperm_idx = perm_idx.sort(0)
    lstm_output = output[unperm_idx]  # shorter than the padded input, e.g. torch.Size([6, 388, 512])
    seq_lengths_ordered = seq_lengths[unperm_idx]

    # Shorten the labels to the (possibly shorter) unpacked sequence length.
    labels = labels[:, :lstm_output.shape[1]]

    # Apply mask before calculating the matmul of inputs and the K, Q, V weights.
    attention_mask_ = attention_mask[:, :lstm_output.shape[1]]
    attention_mask_ = attention_mask_.bool()

    # Apply attention here
    attention_applied, attention_weights = self.self_attention(
        lstm_output, lstm_output, lstm_output,
        key_padding_mask=None, need_weights=True, attn_mask=None)

    # Mask the unimportant tokens (padding, [SEP] and ignored labels) after attention,
    # before the scores are passed to the CRF.
    mask = (
        (input_ids[:, :attention_applied.shape[1]] != self.tokenizer.pad_token_id)
        & (input_ids[:, :attention_applied.shape[1]] != self.tokenizer.convert_tokens_to_ids(self.tokenizer.sep_token))
        & (labels != 100))

    mask_expanded = mask.unsqueeze(-1).expand(attention_applied.size())
    attention_applied *= mask_expanded.float()
    labels *= mask.long()

    # Project to tag space (emission scores).
    probability = F.relu(self.hidden2tag(attention_applied))

    # CRF emissions (coarse)
    loss = self.crf_layer(probability, labels, mask=mask, reduction='token_mean', weights=self.weights)
    emissions = self.crf_layer.decode(probability, mask=mask)
    emissions_ = [item for sublist in emissions for item in sublist]  # flatten the nested list of emissions

    # Mask labels here according to masks
    labels_masked = labels[mask]

    return loss, emissions_, labels_masked, mask
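# A minimal sketch (made-up sizes) of the padding-mask multiplication used above: the boolean
# token mask is broadcast over the hidden dimension so padded positions contribute zeros to the
# emission scores fed into the CRF.
import torch

batch_size, seq_len, hidden = 2, 5, 3
attention_applied = torch.randn(batch_size, seq_len, hidden)
token_mask = torch.tensor([[1, 1, 1, 0, 0],
                           [1, 1, 0, 0, 0]]).bool()            # True for real tokens

mask_expanded = token_mask.unsqueeze(-1).expand(attention_applied.size())
attention_applied = attention_applied * mask_expanded.float()  # zero out padded positions
assert torch.all(attention_applied[0, 3:] == 0)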
def _get_instr_embedding(self, instr):
    if self.lang_model == 'gru':
        _, hidden = self.instr_rnn(self.word_embedding(instr))
        return hidden[-1]

    elif self.lang_model in ['bigru', 'attgru']:
        lengths = (instr != 0).sum(1).long()
        masks = (instr != 0).float()

        if lengths.shape[0] > 1:
            seq_lengths, perm_idx = lengths.sort(0, descending=True)
            iperm_idx = torch.LongTensor(perm_idx.shape).fill_(0)
            if instr.is_cuda:
                iperm_idx = iperm_idx.cuda()
            for i, v in enumerate(perm_idx):
                iperm_idx[v.data] = i

            inputs = self.word_embedding(instr)
            inputs = inputs[perm_idx]
            inputs = pack_padded_sequence(inputs,
                                          seq_lengths.data.cpu().numpy(),
                                          batch_first=True)
            outputs, final_states = self.instr_rnn(inputs)
        else:
            instr = instr[:, 0:lengths[0]]
            outputs, final_states = self.instr_rnn(self.word_embedding(instr))
            iperm_idx = None
        final_states = final_states.transpose(0, 1).contiguous()
        final_states = final_states.view(final_states.shape[0], -1)
        if iperm_idx is not None:
            outputs, _ = pad_packed_sequence(outputs, batch_first=True)
            outputs = outputs[iperm_idx]
            final_states = final_states[iperm_idx]

        if outputs.shape[1] < masks.shape[1]:
            # The packing truncated the original length,
            # so we need to change the mask to fit it.
            masks = masks[:, :(outputs.shape[1] - masks.shape[1])]

        return outputs if self.lang_model == 'attgru' else final_states

    elif self.lang_model == 'conv':
        inputs = self.word_embedding(instr).unsqueeze(1)  # (B, 1, T, D)
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.instr_convs]
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs]
        return torch.cat(inputs, 1)

    elif self.lang_model == 'bow':
        device = torch.device("cuda" if instr.is_cuda else "cpu")
        input_dim = self.obs_space["instr"]
        input = torch.zeros((instr.size(0), input_dim), device=device)
        idx = torch.arange(instr.size(0), dtype=torch.int64)
        input[idx.unsqueeze(1), instr] = 1.
        return self.instr_bow(input)
    else:
        raise ValueError("Undefined instruction architecture: {}".format(self.lang_model))
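# A minimal sketch of the sort / inverse-permutation bookkeeping in the 'bigru'/'attgru' branch
# above: older versions of pack_padded_sequence required length-sorted batches, so the batch is
# sorted before packing and the inverse permutation restores the original order afterwards.
import torch

lengths = torch.tensor([3, 5, 2])
seq_lengths, perm_idx = lengths.sort(0, descending=True)

# Inverse permutation: iperm_idx[perm_idx[i]] = i puts row i back in its original slot.
iperm_idx = torch.empty_like(perm_idx)
for i, v in enumerate(perm_idx):
    iperm_idx[v] = i

assert torch.equal(seq_lengths[iperm_idx], lengths)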
def forward(self, input, lengths, h_0=None): input_packed = pack_padded_sequence(input, lengths=lengths, batch_first=self.batch_first) out_packed, h_n = self.rnn(input_packed, h_0) out = pad_packed_sequence(out_packed, batch_first=self.batch_first)[0] return out.contiguous(), h_n
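# A minimal standalone round trip of the pack -> RNN -> pad pattern this wrapper encapsulates,
# shown with a plain nn.GRU and hypothetical sizes.
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

rnn = nn.GRU(input_size=8, hidden_size=16, batch_first=True)
padded = torch.randn(4, 10, 8)                      # (batch, max_len, features)
lengths = torch.tensor([10, 7, 5, 2])               # true lengths; enforce_sorted=False avoids pre-sorting

packed = pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)
packed_out, h_n = rnn(packed)
out, out_lengths = pad_packed_sequence(packed_out, batch_first=True)
assert out.shape == (4, 10, 16)                     # padded back to the longest sequence in the batch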
def forward(self, words, feats):
    r"""
    Args:
        words (~torch.LongTensor): ``[batch_size, seq_len]``.
            Word indices.
        feats (list[~torch.LongTensor]):
            A list of feat indices.
            The size of indices is ``[batch_size, seq_len, fix_len]`` if feat is ``'char'`` or ``'bert'``,
            or ``[batch_size, seq_len]`` otherwise.

    Returns:
        ~torch.Tensor, ~torch.Tensor, ~torch.Tensor, ~torch.Tensor, ~torch.Tensor:
            Scores of all possible edges of shape ``[batch_size, seq_len, seq_len]``,
            dependent-head-sibling triples of shape ``[batch_size, seq_len, seq_len, seq_len]``,
            dependent-head-coparent triples of shape ``[batch_size, seq_len, seq_len, seq_len]``,
            dependent-head-grandparent triples of shape ``[batch_size, seq_len, seq_len, seq_len]``
            and all possible labels on each edge of shape ``[batch_size, seq_len, seq_len, n_labels]``.
    """

    _, seq_len = words.shape
    # get the mask and lengths of given batch
    mask = words.ne(self.pad_index)
    ext_words = words
    # set the indices larger than num_embeddings to unk_index
    if hasattr(self, 'pretrained'):
        ext_mask = words.ge(self.word_embed.num_embeddings)
        ext_words = words.masked_fill(ext_mask, self.unk_index)

    # get outputs from embedding layers
    word_embed = self.word_embed(ext_words)
    if hasattr(self, 'pretrained'):
        word_embed = torch.cat((word_embed, self.embed_proj(self.pretrained(words))), -1)

    feat_embeds = []
    if 'tag' in self.args.feat:
        feat_embeds.append(self.tag_embed(feats.pop()))
    if 'char' in self.args.feat:
        feat_embeds.append(self.char_embed(feats.pop(0)))
    if 'bert' in self.args.feat:
        feat_embeds.append(self.bert_embed(feats.pop(0)))
    if 'lemma' in self.args.feat:
        feat_embeds.append(self.lemma_embed(feats.pop(0)))
    word_embed, feat_embed = self.embed_dropout(word_embed, torch.cat(feat_embeds, -1))
    # concatenate the word and feat representations
    embed = torch.cat((word_embed, feat_embed), -1)

    x = pack_padded_sequence(embed, mask.sum(1), True, False)
    x, _ = self.lstm(x)
    x, _ = pad_packed_sequence(x, True, total_length=seq_len)
    x = self.lstm_dropout(x)

    # apply MLPs to the BiLSTM output states
    un_d = self.mlp_un_d(x)
    un_h = self.mlp_un_h(x)
    bin_d = self.mlp_bin_d(x)
    bin_h = self.mlp_bin_h(x)
    bin_g = self.mlp_bin_g(x)
    label_d = self.mlp_label_d(x)
    label_h = self.mlp_label_h(x)

    # [batch_size, seq_len, seq_len]
    s_edge = self.edge_attn(un_d, un_h)
    # [batch_size, seq_len, seq_len, seq_len]
    s_sib = self.sib_attn(bin_d, bin_d, bin_h).triu_()
    s_sib = (s_sib + s_sib.transpose(-1, -2)).permute(0, 3, 1, 2)
    # [batch_size, seq_len, seq_len, seq_len]
    s_cop = self.cop_attn(bin_h, bin_d, bin_h).permute(0, 3, 1, 2).triu_()
    s_cop = s_cop + s_cop.transpose(-1, -2)
    # [batch_size, seq_len, seq_len, seq_len]
    s_grd = self.grd_attn(bin_g, bin_d, bin_h).permute(0, 3, 1, 2)
    # [batch_size, seq_len, seq_len, n_labels]
    s_label = self.label_attn(label_d, label_h).permute(0, 2, 3, 1)

    return s_edge, s_sib, s_cop, s_grd, s_label
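# A minimal sketch of the symmetrization trick used for the sibling and coparent scores above:
# keep the upper triangle of the score matrix in place, then add its transpose so the score is
# symmetric in the two sibling (or coparent) positions.
import torch

seq_len = 4
s = torch.randn(seq_len, seq_len).triu_()   # zero out everything below the diagonal, in place
s = s + s.transpose(-1, -2)                 # now s[i, j] == s[j, i]
assert torch.allclose(s, s.t())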
def forward( self, inputs: PackedSequence, # pylint: disable=arguments-differ # pylint: disable=unused-argument initial_state: torch.Tensor = None ) -> Tuple[PackedSequence, torch.Tensor]: """ Parameters ---------- inputs : ``PackedSequence``, required. A batch first ``PackedSequence`` to run the stacked LSTM over. initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None) Currently, this is ignored. Returns ------- output_sequence : ``PackedSequence`` The encoded sequence of shape (batch_size, sequence_length, hidden_size) final_states: ``torch.Tensor`` The per-layer final (state, memory) states of the LSTM, each with shape (num_layers, batch_size, hidden_size). """ inputs, lengths = pad_packed_sequence(inputs, batch_first=True) # Kernel takes sequence length first tensors. inputs = inputs.transpose(0, 1) sequence_length, batch_size, _ = inputs.size() accumulator_shape = [ self.num_layers, sequence_length + 1, batch_size, self.hidden_size ] state_accumulator = Variable( inputs.data.new(*accumulator_shape).zero_(), requires_grad=False) memory_accumulator = Variable( inputs.data.new(*accumulator_shape).zero_(), requires_grad=False) dropout_weights = inputs.data.new().resize_( self.num_layers, batch_size, self.hidden_size).fill_(1.0) if self.training: # Normalize by 1 - dropout_prob to preserve the output statistics of the layer. dropout_weights.bernoulli_(1 - self.recurrent_dropout_probability)\ .div_((1 - self.recurrent_dropout_probability)) dropout_weights = Variable(dropout_weights, requires_grad=False) gates = Variable(inputs.data.new().resize_(self.num_layers, sequence_length, batch_size, 6 * self.hidden_size)) lengths_variable = Variable(torch.IntTensor(lengths)) implementation = _AlternatingHighwayLSTMFunction( self.input_size, self.hidden_size, num_layers=self.num_layers, train=self.training) output, _ = implementation(inputs, self.weight, self.bias, state_accumulator, memory_accumulator, dropout_weights, lengths_variable, gates) # TODO(Mark): Also return the state here by using index_select with the lengths so we can use # it as a Seq2VecEncoder. output = output.transpose(0, 1) output = pack_padded_sequence(output, lengths, batch_first=True) return output, None
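# A minimal sketch of the "inverted dropout" recurrent mask built in the forward above: at train
# time a Bernoulli mask is scaled by 1 / (1 - p), so expected activations match evaluation and no
# rescaling is needed at test time. Sizes are made up.
import torch

p = 0.3                                      # recurrent dropout probability
dropout_weights = torch.ones(2, 4, 8)        # (num_layers, batch_size, hidden_size)
dropout_weights = dropout_weights.bernoulli_(1 - p).div_(1 - p)

# Each kept element is 1 / (1 - p) and each dropped element is 0, so the mean is ~1.0.
print(dropout_weights.mean())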