def encode_table_header(self, tables):
    """Encode the words of every column header with the header LSTM.

    Header word ids are embedded, flattened to one sequence per column,
    sorted by decreasing length (so they can be packed), encoded by
    ``self.table_header_lstm`` and then put back in their original order.

    Args:
        tables: batch of tables; passed straight to the ``WikiSqlBatch``
            helpers that build the id tensor, the lengths and the mask.

    Returns:
        ``(column_word_encodings, table_header_encoding, table_header_mask)``
        where the word encodings are viewed as
        ``(batch_size, max_column_num, max_head_word_num, -1)``, the header
        encoding is the concatenation of the first two final hidden states
        viewed as ``(batch_size, max_column_num, -1)``, and the mask is
        whatever ``WikiSqlBatch.get_table_header_mask`` returns.
    """
    header_wids, col_lens = WikiSqlBatch.get_table_header_input_tensor(
        tables, self.vocab.source, cuda=self.args.cuda)

    # hack: pack_padded_sequence does not accept zero-length sequences, so
    # empty (padding) columns are treated as one word long.
    for lens_of_table in col_lens:
        for pos, n_words in enumerate(lens_of_table):
            if n_words == 0:
                lens_of_table[pos] = 1

    table_header_mask = WikiSqlBatch.get_table_header_mask(tables, cuda=self.args.cuda)

    # (batch_size, max_column_num, max_head_word_num, word_embed_size)
    embed_shape = list(header_wids.size()) + [self.src_embed.embedding_dim]
    header_embeds = self.src_embed(header_wids.view(-1)).view(embed_shape)
    batch_size, max_col_num, max_col_word_num = (header_embeds.size(d) for d in range(3))

    # (batch_size * max_column_num, max_head_word_num, word_embed_size)
    flat_embeds = header_embeds.view(batch_size * max_col_num, max_col_word_num, -1)

    # Longest column first; ``restore[old_pos]`` is its row after sorting.
    flat_lens = list(chain.from_iterable(col_lens))
    order = sorted(range(len(flat_lens)), key=lambda pos: -flat_lens[pos])
    sorted_lens = [flat_lens[pos] for pos in order]
    restore = [-1] * len(order)
    for rank, pos in enumerate(order):
        restore[pos] = rank

    packed = pack_padded_sequence(flat_embeds[order, :, :], sorted_lens, batch_first=True)
    packed_out, (last_state, _last_cell) = self.table_header_lstm(packed)

    word_enc, _ = pad_packed_sequence(packed_out, batch_first=True)
    word_enc = word_enc[restore].view(batch_size, max_col_num, max_col_word_num, -1)

    # Join the first two final hidden states, then undo the length sort.
    header_enc = torch.cat([last_state[0], last_state[1]], -1)[restore]
    header_enc = header_enc.view(batch_size, max_col_num, -1)

    return word_enc, header_enc, table_header_mask
def forward(self, sentences, sentences_len, hidden):
    """Classify padded sentences from their last valid RNN output.

    Args:
        sentences: padded token ids, batch first.
        sentences_len: tensor holding the true length of every sentence.
        hidden: initial hidden state handed to ``self.rnn_feature``.

    Returns:
        Log-probabilities produced from ``self.classifier``, one row per
        sentence in the original batch order, normalised over the last
        (class) dimension.
    """
    sentences_len = sentences_len.cpu().data.numpy()
    # Sort by decreasing length for packing; ``restore`` undoes the sort.
    order = np.argsort(sentences_len).tolist()[::-1]
    restore = np.argsort(order).tolist()
    sentences = sentences[order, :]
    sentences_len = sentences_len[order, ]

    embedding = self.embedding(sentences)
    # A freshly constructed nn.Dropout is always in training mode, so it kept
    # dropping activations at evaluation time; the functional form follows
    # the module's own train/eval flag.
    embedding = F.dropout(embedding, p=0.1, training=self.training)

    packed_embedding = pack_padded_sequence(embedding, sentences_len, batch_first=True)
    packed_rnn_feature, hidden = self.rnn_feature(packed_embedding, hidden)
    sentence_feature, _ = pad_packed_sequence(packed_rnn_feature, batch_first=True)

    # Index of the last real time step of every sentence, broadcast over the
    # feature dimension so it can be used with gather().
    last_step = Variable(LongTensor(sentences_len - 1))
    last_step = last_step.view(-1, 1).expand(
        sentence_feature.size(0), sentence_feature.size(2)).unsqueeze(1)
    if sentence_feature.is_cuda:
        last_step = last_step.cuda()

    # squeeze(1) removes only the time axis, so a batch of one stays 2-D.
    sentence_feature = sentence_feature.gather(1, last_step).squeeze(1)
    sentence_feature = sentence_feature[restore, :]

    logits = self.classifier(sentence_feature)
    # Normalise over classes; dim=0 normalised across the batch instead.
    return F.log_softmax(logits, dim=-1)
def forward(self, question, length):
    """Encode questions with the RNN and pool its states by attention.

    Args:
        question: padded token ids, batch first.
        length: tensor with the true length of each question.

    Returns:
        Output of ``self.fc1`` applied to the attention-weighted sum of the
        RNN states over the time axis.
    """
    seq_lens = list(length.data.cpu().numpy())
    embedded = self.tanh(self.drop(self.encoder(question)))
    init_state = self.init_hidden(len(seq_lens))

    packed = trnn.pack_padded_sequence(embedded, seq_lens, batch_first=True)
    packed_out, _ = self.rnn(packed, init_state)
    states, _ = trnn.pad_packed_sequence(packed_out, batch_first=True)

    # attention: one score per time step, normalised by self.softmax and
    # broadcast over the feature dimension
    scores = self.att2(torch.transpose(states, 1, 2)).squeeze(1)
    weights = self.softmax(scores).unsqueeze(-1).expand_as(states)
    pooled = torch.sum(states * weights, 1).squeeze(1)

    return self.fc1(pooled)
def postprocess_sequence(self, X):
    """Embed (variable-length) sequences

    The sequences are sorted by decreasing length, zero-padded, packed and
    fed to ``self.model``; the embeddings are returned in the order of `X`.

    Parameters
    ----------
    X : list
        List of input sequences

    Returns
    -------
    fX : numpy array
        Batch of sequence embeddings.
    """
    n_samples = torch.tensor([len(sequence) for sequence in X])
    lengths_desc, order = torch.sort(n_samples, descending=True)
    # Sorting the permutation itself yields the inverse permutation.
    restore = torch.sort(order)[1]

    batch = pad_sequence(
        [torch.tensor(X[i], dtype=torch.float32, device=self.device) for i in order],
        batch_first=True, padding_value=0)
    packed = pack_padded_sequence(batch, lengths_desc, batch_first=True)

    embeddings = self.model(packed).detach().to(torch.device('cpu')).numpy()
    return embeddings[restore]
def forward(self, input, *args):
    """Run the wrapped model on a padded batch by packing it first.

    The last positional argument is the sequence lengths; any arguments
    before it are forwarded to ``self.model`` after the packed input.

    Returns:
        A tuple whose first element is the model's first output padded back
        to a dense tensor, followed by the model's remaining outputs.
    """
    seq_lengths = args[-1]
    model_args = args[:-1]

    packed = rnn_utils.pack_padded_sequence(input, seq_lengths, self.batch_first)
    outputs = self.model(packed, *model_args)
    padded, _ = rnn_utils.pad_packed_sequence(outputs[0], self.batch_first)
    return (padded, *outputs[1:])
def test_forward_pulls_out_correct_tensor_with_unsorted_batches(self):
    """The Seq2Vec wrapper returns final states in the caller's batch order."""
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)

    input_tensor = torch.rand([5, 7, 3])
    mask = torch.ones(5, 7)
    # Valid lengths of the first four rows, deliberately not sorted; the
    # fifth row keeps its full length.
    for row, valid in enumerate([3, 4, 2, 6]):
        input_tensor[row, valid:, :] = 0
        mask[row, valid:] = 0

    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_lengths, restoration_indices, _ = sort_batch_by_length(
        input_tensor, sequence_lengths)
    packed = pack_padded_sequence(sorted_inputs, sorted_lengths.tolist(), batch_first=True)
    _, (final_hidden, _) = lstm(packed)

    # Transpose the state to batch-first, restore the original order, keep
    # the last forward and backward states and join them into
    # (batch_size, 2 * hidden_size).
    restored = final_hidden.transpose(0, 1).index_select(0, restoration_indices)
    top_layer = restored[:, -2:, :].contiguous()
    expected = torch.cat([top_layer[:, 0, :].squeeze(1),
                          top_layer[:, 1, :].squeeze(1)], -1)

    encoder_output = encoder(input_tensor, mask)
    assert_almost_equal(encoder_output.data.numpy(), expected.data.numpy())
def test_forward_pulls_out_correct_tensor_for_unsorted_batches(self):
    """The Seq2Seq wrapper returns per-step outputs in the caller's batch order."""
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2SeqWrapper(lstm)

    input_tensor = torch.rand([5, 7, 3])
    mask = torch.ones(5, 7)
    # Valid lengths of the first four rows, deliberately not sorted; the
    # fifth row keeps its full length.
    for row, valid in enumerate([3, 4, 2, 6]):
        input_tensor[row, valid:, :] = 0
        mask[row, valid:] = 0

    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_lengths, restoration_indices, _ = sort_batch_by_length(
        input_tensor, sequence_lengths)
    packed = pack_padded_sequence(sorted_inputs, sorted_lengths.data.tolist(), batch_first=True)

    # Reference: run the raw LSTM on the sorted batch ...
    lstm_output, _ = lstm(packed)
    encoder_output = encoder(input_tensor, mask)
    # ... and undo the sort before comparing with the wrapper's output.
    lstm_tensor, _ = pad_packed_sequence(lstm_output, batch_first=True)
    expected = lstm_tensor.index_select(0, restoration_indices)

    assert_almost_equal(encoder_output.data.numpy(), expected.data.numpy())
def forward(self, xs):
    """Encode a batch of padded token ids.

    Args:
        xs: token ids, batch first; entries greater than zero are counted
            as real tokens when computing sequence lengths.

    Returns:
        ``(encoder_output, hidden)``: the padded RNN outputs (batch first)
        and the final hidden state reduced by an element-wise max over the
        direction axis. For an LSTM ``hidden`` is a ``(h, c)`` tuple.
    """
    bsz = len(xs)

    # embed input tokens
    xes = F.dropout(self.lt(xs), p=self.dropout, training=self.training)
    x_lens = torch.sum((xs > 0).int(), dim=1).data.tolist()
    xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True)

    # Re-shape the zero buffer when the batch size changed.
    zeros = self.zeros(xs)
    if zeros.size(1) != bsz:
        zeros.resize_(self.layers * self.dirs, bsz, self.hsz).fill_(0)
    h0 = Variable(zeros, requires_grad=False)

    # isinstance (rather than an exact type comparison) so that subclasses
    # of nn.LSTM also receive the (h0, c0) tuple they require.
    if isinstance(self.rnn, nn.LSTM):
        encoder_output_packed, hidden = self.rnn(xes_packed, (h0, h0))
        # take elementwise max between forward and backward hidden states
        hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).max(1)[0],
                  hidden[1].view(-1, self.dirs, bsz, self.hsz).max(1)[0])
    else:
        encoder_output_packed, hidden = self.rnn(xes_packed, h0)
        # take elementwise max between forward and backward hidden states
        hidden = hidden.view(-1, self.dirs, bsz, self.hsz).max(1)[0]

    encoder_output, _ = pad_packed_sequence(encoder_output_packed, batch_first=True)
    return encoder_output, hidden
def encode(self, src_sents_var: torch.Tensor, src_sent_lens: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """Encode source sentences with the encoder LSTM.

    Args:
        src_sents_var: word ids of the source sentences, time-major
            (``pack_padded_sequence`` is used with its default layout).
        src_sent_lens: length of every source sentence, as required by
            ``pack_padded_sequence``.

    Returns:
        src_encodings: encoder outputs permuted to batch-first order.
        decoder_init_state: ``(dec_init_state, dec_init_cell)`` where the
            cell is ``self.decoder_cell_init`` applied to the concatenation
            of the first two final encoder cells and the state is its tanh.
    """
    # (src_sent_len, batch_size, embed_size)
    embedded = self.src_embed(src_sents_var)
    packed = pack_padded_sequence(embedded, src_sent_lens)

    packed_out, (_, final_cell) = self.encoder_lstm(packed)
    padded_out, _ = pad_packed_sequence(packed_out)
    # time-major -> batch-major
    src_encodings = padded_out.permute(1, 0, 2)

    cell_both_dirs = torch.cat([final_cell[0], final_cell[1]], dim=1)
    dec_init_cell = self.decoder_cell_init(cell_both_dirs)

    return src_encodings, (torch.tanh(dec_init_cell), dec_init_cell)
def forward(self, embs, lengths):
    """Run the RNN over a padded batch of word embeddings.

    Args:
        embs: word embeddings, batch first.
        lengths: tensor with the length of each sentence.

    Returns:
        ``(outputs, last_outputs)``: the padded RNN outputs for every time
        step, and the result of ``self.last_timestep`` on those outputs
        after ``self.drop_rnn`` has been applied.
    """
    # pack the batch so the RNN skips the padding
    packed_embs = pack_padded_sequence(embs, list(lengths.data), batch_first=True)
    packed_outputs, _ = self.rnn(packed_embs)

    # back to a dense, padded tensor
    outputs, _ = pad_packed_sequence(packed_outputs, batch_first=True)

    # outputs of the last *non-masked* timestep of each sentence, followed
    # by dropout
    final_steps = self.last_timestep(outputs, lengths, self.rnn.bidirectional)
    final_steps = self.drop_rnn(final_steps)

    return outputs, final_steps
def forward(self, vocab):
    """Predict from a sentence/aspect pair using mutual attention.

    Args:
        vocab: mapping with the keys ``'sentence'`` and ``'aspect'`` (id
            tensors, moved to CUDA here) and ``'sent_len'`` (sentence
            lengths used for packing).

    Returns:
        Output of ``self.fc`` applied to the attention-weighted sentence
        representation.
    """
    # Embedding lookup and packing run without gradient tracking.
    with torch.no_grad():
        sentence_emb = self.embedding(vocab['sentence'].cuda())
        aspect_emb = self.embedding(vocab['aspect'].cuda())
        packed_sentence = pack_padded_sequence(sentence_emb, vocab['sent_len'], batch_first=True)

    packed_out_s, _ = self.lstm_s(packed_sentence)  # packed output
    out_a, _ = self.lstm_a(aspect_emb)

    # NOTE(review): unpacking under no_grad looks like it stops gradients
    # from reaching lstm_s - confirm this is intended.
    with torch.no_grad():
        out_s, _ = pad_packed_sequence(packed_out_s, batch_first=True)

    # Pair-wise interaction matrix between sentence and aspect states
    interaction = torch.bmm(out_s, out_a.permute(0, 2, 1))

    # Column-wise softmax
    a2s_attn = F.softmax(interaction, dim=1)
    # Row-wise softmax => column-wise average => aspect attention
    a_attn = torch.mean(F.softmax(interaction, dim=2), dim=1)

    # Final sentence attention: weighted sum of the individual a2s_attn
    s_attn = torch.bmm(a2s_attn, a_attn.unsqueeze(-1))
    final_rep = torch.bmm(out_s.permute(0, 2, 1), s_attn).squeeze(-1)

    return self.fc(final_rep)
def encode(self, src_sents_var, src_sents_len):
    """Encode the input natural language utterance

    Args:
        src_sents_var: a variable of shape (src_sent_len, batch_size),
            representing word ids of the input
        src_sents_len: a list of lengths of input source sentences, sorted
            by descending order

    Returns:
        src_encodings: source encodings in batch-first order
        last_state, last_cell: the first two final hidden states and the
            first two final cell states of the encoder, each pair
            concatenated along dimension 1
    """
    # apply word dropout: in training, randomly replace word ids by <unk>
    if self.training and self.args.word_dropout:
        keep_prob = self.new_tensor(src_sents_var.size()).fill_(1. - self.args.word_dropout)
        mask = Variable(keep_prob.bernoulli().long())
        src_sents_var = src_sents_var * mask + (1 - mask) * self.vocab.source.unk_id

    # (src_sent_len, batch_size, embed_size)
    token_embed = self.src_embed(src_sents_var)
    packed_embed = pack_padded_sequence(token_embed, src_sents_len)

    packed_out, (final_h, final_c) = self.encoder_lstm(packed_embed)
    padded_out, _ = pad_packed_sequence(packed_out)
    # time-major -> batch-major
    src_encodings = padded_out.permute(1, 0, 2)

    last_state = torch.cat([final_h[0], final_h[1]], 1)
    last_cell = torch.cat([final_c[0], final_c[1]], 1)

    return src_encodings, (last_state, last_cell)
def forward(self, x):
    """Receives a Variable of indices (n_timesteps, n_samples) and
    returns their recurrent representations together with the mask
    produced by ``sort_batch``."""
    # sort_batch() provides everything needed to pack the batch:
    #   oidxs: indices that recover the original order
    #   sidxs: indices that sort the batch by decreasing length
    #   slens: lengths in sorted order for pack_padded_sequence()
    oidxs, sidxs, slens, mask = sort_batch(x)

    # Embeddings of the sorted batch, optionally with dropout
    embs = self.emb(x[:, sidxs])
    if self.dropout_emb > 0:
        embs = self.do_emb(embs)

    # Pack and encode; the final state is not used
    packed_hs, _ = self.enc(pack_padded_sequence(embs, slens))

    # Unpack the hidden states and revert to the original sample order
    hs, _ = pad_packed_sequence(packed_hs)
    hs = hs[:, oidxs]

    if self.dropout_ctx > 0:
        hs = self.do_ctx(hs)

    return hs, mask
def test_forward_pulls_out_correct_tensor_with_sequence_lengths(self):
    """The Seq2Vec wrapper matches the raw LSTM's final states on an already sorted batch."""
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)

    tensor = torch.rand([5, 7, 3])
    mask = torch.ones(5, 7)
    # (row, valid length) - row 0 keeps its full length, lengths decrease.
    for row, valid in ((1, 6), (2, 4), (3, 2), (4, 1)):
        tensor[row, valid:, :] = 0
        mask[row, valid:] = 0

    input_tensor = Variable(tensor)
    mask = Variable(mask)
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    packed = pack_padded_sequence(input_tensor, list(sequence_lengths.data), batch_first=True)
    _, (final_hidden, _) = lstm(packed)

    # Transpose the state to batch-first, keep the last forward and backward
    # states and join them into (batch_size, 2 * hidden_size).
    top_layer = final_hidden.transpose(0, 1)[:, -2:, :].contiguous()
    expected = torch.cat([top_layer[:, 0, :].squeeze(1),
                          top_layer[:, 1, :].squeeze(1)], -1)

    encoder_output = encoder(input_tensor, mask)
    assert_almost_equal(encoder_output.data.numpy(), expected.data.numpy())
def test_augmented_lstm_computes_same_function_as_pytorch_lstm(self):
    """With identical constant weights, AugmentedLstm and the PyTorch LSTM agree."""
    augmented_lstm = AugmentedLstm(10, 11)
    pytorch_lstm = LSTM(10, 11, num_layers=1, batch_first=True)

    # Initialize all weights to be == 1.
    initializer = InitializerApplicator(
        [(".*", lambda tensor: torch.nn.init.constant_(tensor, 1.))])
    initializer(augmented_lstm)
    initializer(pytorch_lstm)

    initial_state = torch.zeros([1, 5, 11])
    initial_memory = torch.zeros([1, 5, 11])

    # Use bigger numbers to avoid floating point instability.
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(
        self.random_tensor * 5., self.sequence_lengths)
    lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(),
                                      batch_first=True)

    augmented_output, augmented_state = augmented_lstm(lstm_input, (initial_state, initial_memory))
    pytorch_output, pytorch_state = pytorch_lstm(lstm_input, (initial_state, initial_memory))
    pytorch_output_sequence, _ = pad_packed_sequence(pytorch_output, batch_first=True)
    augmented_output_sequence, _ = pad_packed_sequence(augmented_output, batch_first=True)

    # Outputs, final hidden states and final memories must all agree.
    pairs = ((pytorch_output_sequence, augmented_output_sequence),
             (pytorch_state[0], augmented_state[0]),
             (pytorch_state[1], augmented_state[1]))
    for reference, candidate in pairs:
        numpy.testing.assert_array_almost_equal(reference.data.numpy(),
                                                candidate.data.numpy(),
                                                decimal=4)
def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
    """Compute tag scores for every word with a CNN or LSTM feature extractor.

        input:
            word_inputs: (batch_size, sent_len)
            word_seq_lengths: list of batch_size, (batch_size,1)
            char_inputs: (batch_size*sent_len, word_length)
            char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
            char_seq_recover: variable which records the char order information, used to recover char order
        output:
            Variable(batch_size, sent_len, hidden_dim)
    """
    # word_represent: (batch_size, seq_len, embed_size)
    word_represent = self.wordrep(word_inputs, feature_inputs, word_seq_lengths,
                                  char_inputs, char_seq_lengths, char_seq_recover)

    if self.word_feature_extractor == "CNN":
        # Convolutions work on (batch_size, channels, seq_len).
        word_in = F.tanh(self.word2cnn(word_represent)).transpose(2, 1).contiguous()
        for idx in range(self.cnn_layer):
            layer_input = word_in if idx == 0 else cnn_feature
            cnn_feature = F.relu(self.cnn_list[idx](layer_input))
            cnn_feature = self.cnn_drop_list[idx](cnn_feature)
            cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature)
        feature_out = cnn_feature.transpose(2, 1).contiguous()
    else:
        packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True)
        lstm_out, _ = self.lstm(packed_words, None)
        # pad_packed_sequence defaults to time-major output, so swap the
        # first two axes back to batch-first before dropout.
        lstm_out, _ = pad_packed_sequence(lstm_out)
        feature_out = self.droplstm(lstm_out.transpose(1, 0))

    # feature_out: (batch_size, seq_len, hidden_size)
    return self.hidden2tag(feature_out)
def encode(self, indices, lengths, noise):
    """Encode sequences into unit-norm vectors, optionally adding noise.

    Args:
        indices: padded token ids, batch first.
        lengths: sequence lengths used for packing.
        noise: when truthy and ``self.noise_radius > 0``, Gaussian noise
            with standard deviation ``self.noise_radius`` is added.

    Returns:
        The final hidden state of the encoder's last layer, divided by its
        L2 norm along dimension 1 (plus noise when requested).
    """
    embeddings = self.embedding(indices)
    packed_embeddings = pack_padded_sequence(input=embeddings,
                                             lengths=lengths,
                                             batch_first=True)

    # Encode
    _, (hidden, _) = self.encoder(packed_embeddings)
    hidden = hidden[-1]  # batch_size x nhidden, hidden state of last layer

    # normalize to unit ball (l2 norm of 1) - p=2, dim=1.
    # keepdim=True keeps a (batch_size, 1) shape that broadcasts against
    # hidden; the previous expand_as on a 1-D norm vector fails on PyTorch
    # versions where norm() drops the reduced dimension.
    norms = torch.norm(hidden, 2, 1, keepdim=True)
    hidden = torch.div(hidden, norms)

    if noise and self.noise_radius > 0:
        # `mean` is the keyword accepted by current torch.normal
        # (the old `means` spelling is no longer supported).
        gauss_noise = torch.normal(mean=torch.zeros(hidden.size()),
                                   std=self.noise_radius)
        hidden = hidden + to_gpu(self.gpu, Variable(gauss_noise))

    return hidden
def forward(self, features, captions, lengths):
    """Decode image feature vectors and generates captions.

    The image feature is prepended to the caption embeddings as the first
    time step; the returned scores come from ``self.linear`` applied to the
    packed LSTM output data, i.e. one row per non-padded time step.
    """
    caption_embeds = self.embed(captions)
    inputs = torch.cat((features.unsqueeze(1), caption_embeds), 1)
    packed_inputs = pack_padded_sequence(inputs, lengths, batch_first=True)
    packed_hiddens, _ = self.lstm(packed_inputs)
    # First field of a PackedSequence is its flat data tensor.
    return self.linear(packed_hiddens[0])
def _run_rnns(self, inputs, structures, lengths):
    ''' Run desired rnns

    Tree LSTMs receive the raw inputs and the structure; LSTMs and GRUs
    receive the inputs packed with `lengths` and have their output padded
    back. Returns the states of all steps and the last hidden state.
    '''
    # NOTE(review): zipping with the one-element list [structures] yields at
    # most one iteration, so only the first rnn runs - confirm intended.
    for rnn, structure in zip(self.rnns, [structures]):
        if isinstance(rnn, ChildSumTreeLSTM):
            h_all, h_last = rnn(inputs, structure)
        elif isinstance(rnn, (LSTM, GRU)):
            packed = pack_padded_sequence(inputs, list(lengths.data), batch_first=True)
            h_all, state = rnn(packed)
            # An LSTM returns (h, c); a GRU returns h only.
            h_last = state[0] if isinstance(rnn, LSTM) else state
            h_all, _ = pad_packed_sequence(h_all, batch_first=True)
        # Output of this rnn would be the input of the next one.
        inputs = h_all.squeeze()
    return h_all, h_last
def test_variable_length_sequences_run_backward_return_correctly_padded_outputs(self):
    """A backward AugmentedLstm must leave the padded time steps at zero."""
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(
        self.random_tensor, self.sequence_lengths)
    packed_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(),
                                        batch_first=True)

    lstm = AugmentedLstm(10, 11, go_forward=False)
    packed_output, _ = lstm(packed_input)
    output_sequence, _ = pad_packed_sequence(packed_output, batch_first=True)

    # (row, first padded time step)
    for row, first_pad in ((1, 6), (2, 4), (3, 3), (4, 2)):
        numpy.testing.assert_array_equal(
            output_sequence.data[row, first_pad:, :].numpy(), 0.0)
def forward(self, input, seq_lens):
    """Encode a padded batch with the LSTM.

    Args:
        input: padded token ids, batch first.
        seq_lens: sequence lengths used for packing.

    Returns:
        ``(h, hidden, max_h)``: the padded per-step outputs (B x t_k x n),
        the LSTM's final state, and the maximum of ``h`` over the time axis.
    """
    packed_input = pack_padded_sequence(self.embedding(input), seq_lens, batch_first=True)
    packed_output, hidden = self.lstm(packed_input)

    h, _ = pad_packed_sequence(packed_output, batch_first=True)  # B x t_k x n
    h = h.contiguous()
    max_h = h.max(dim=1)[0]

    return h, hidden, max_h
def test_stacked_bidirectional_lstm_completes_forward_pass(self):
    """The stacked bidirectional LSTM runs and keeps padded steps at zero."""
    # (row, first padded time step); row 0 uses the full length of 5.
    padding = ((1, 4), (2, 2), (3, 1))

    input_tensor = torch.rand(4, 5, 3)
    for row, first_pad in padding:
        input_tensor[row, first_pad:, :] = 0.
    packed_input = pack_padded_sequence(input_tensor, [5, 4, 2, 1], batch_first=True)

    lstm = StackedBidirectionalLstm(3, 7, 3)
    packed_output, _ = lstm(packed_input)
    output_sequence, _ = pad_packed_sequence(packed_output, batch_first=True)

    for row, first_pad in padding:
        numpy.testing.assert_array_equal(
            output_sequence.data[row, first_pad:, :].numpy(), 0.0)
def forward(self,  # pylint: disable=arguments-differ
            inputs: PackedSequence,
            # pylint: disable=unused-argument
            initial_state: torch.Tensor = None) -> Tuple[PackedSequence, torch.Tensor]:
    """
    Run the alternating highway LSTM kernel over a packed batch.

    Parameters
    ----------
    inputs : ``PackedSequence``, required.
        A batch first ``PackedSequence`` to run the stacked LSTM over.
    initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
        Currently, this is ignored.

    Returns
    -------
    output_sequence : ``PackedSequence``
        The encoded sequence, re-packed batch first with the input lengths.
    final_states: ``None``
        The final states are not returned yet; ``None`` is returned instead.
    """
    padded, lengths = pad_packed_sequence(inputs, batch_first=True)

    # Kernel takes sequence length first tensors.
    time_major = padded.transpose(0, 1)
    sequence_length, batch_size, _ = time_major.size()

    # Scratch buffers are allocated from the input's tensor type via .new().
    storage = time_major.data
    accumulator_shape = [self.num_layers, sequence_length + 1, batch_size, self.hidden_size]
    state_accumulator = Variable(storage.new(*accumulator_shape).zero_(), requires_grad=False)
    memory_accumulator = Variable(storage.new(*accumulator_shape).zero_(), requires_grad=False)

    dropout_weights = storage.new().resize_(self.num_layers, batch_size, self.hidden_size).fill_(1.0)
    if self.training:
        # Normalize by 1 - dropout_prob to preserve the output statistics of the layer.
        keep_prob = 1 - self.recurrent_dropout_probability
        dropout_weights.bernoulli_(keep_prob).div_(keep_prob)
    dropout_weights = Variable(dropout_weights, requires_grad=False)

    gates = Variable(storage.new().resize_(self.num_layers,
                                           sequence_length,
                                           batch_size,
                                           6 * self.hidden_size))
    lengths_variable = Variable(torch.IntTensor(lengths))

    implementation = _AlternatingHighwayLSTMFunction(self.input_size,
                                                     self.hidden_size,
                                                     num_layers=self.num_layers,
                                                     train=self.training)
    output, _ = implementation(time_major, self.weight, self.bias, state_accumulator,
                               memory_accumulator, dropout_weights, lengths_variable, gates)

    # TODO(Mark): Also return the state here by using index_select with the lengths so we can use
    # it as a Seq2VecEncoder.
    batch_major = output.transpose(0, 1)
    return pack_padded_sequence(batch_major, lengths, batch_first=True), None
def test_stacked_alternating_lstm_completes_forward_pass(self):
    """The stacked alternating LSTM runs and keeps padded steps at zero."""
    # (row, first padded time step); row 0 uses the full length of 5.
    padding = ((1, 4), (2, 2), (3, 1))

    input_tensor = torch.autograd.Variable(torch.rand(4, 5, 3))
    for row, first_pad in padding:
        input_tensor[row, first_pad:, :] = 0.
    packed_input = pack_padded_sequence(input_tensor, [5, 4, 2, 1], batch_first=True)

    lstm = StackedAlternatingLstm(3, 7, 3)
    packed_output, _ = lstm(packed_input)
    output_sequence, _ = pad_packed_sequence(packed_output, batch_first=True)

    for row, first_pad in padding:
        numpy.testing.assert_array_equal(
            output_sequence.data[row, first_pad:, :].numpy(), 0.0)
def forward(self, x, lens):
    """Score every time step of a padded batch of token ids.

    Args:
        x: 2-D tensor of token ids, batch first.
        lens: sequence lengths used for packing.

    Returns:
        Output of ``self.out`` applied to the padded, dropped-out LSTM
        outputs.
    """
    # Unpacking doubles as a check that x is two-dimensional.
    _batch_size, _max_len = x.shape

    # Look up the word embedding vectors
    embedded = self.drop(self.embed(x))

    packed = pack_padded_sequence(embedded, lens, True)
    packed_out, _ = self.lstm(packed)
    padded_out, _ = pad_packed_sequence(packed_out, True)

    return self.out(self.drop(padded_out))
def sort_and_run_forward(self, module, inputs, mask):
    """Sort a batch by length, pack it and run ``module`` on it.

    Args:
        module: callable taking ``(packed_sequence, initial_state)``; it is
            called with ``None`` as the initial state.
        inputs: batch-first tensor to encode.
        mask: binary mask whose row sums give the sequence lengths.

    Returns:
        ``(module_output, final_states, restoration_indices)`` - the two
        module outputs (still in sorted order) and the indices returned by
        ``sort_batch_by_length`` for restoring the original order.
    """
    sequence_lengths = mask.long().sum(-1)
    sorted_inputs, sorted_lengths, restoration_indices, _ = sort_batch_by_length(
        inputs, sequence_lengths)

    packed_input = pack_padded_sequence(sorted_inputs[:, :, :],
                                        sorted_lengths[:].data.tolist(),
                                        batch_first=True)
    module_output, final_states = module(packed_input, None)

    return module_output, final_states, restoration_indices
def forward(self,  # pylint: disable=arguments-differ
            inputs: PackedSequence,
            initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
    """
    Run every (forward, backward) layer pair over the packed input.

    Parameters
    ----------
    inputs : ``PackedSequence``, required.
        A batch first ``PackedSequence`` to run the stacked LSTM over.
    initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None)
        A tuple (state, memory) representing the initial hidden state and memory
        of the LSTM. The first dimension of each tensor must equal the number
        of layers; it is split into one slice per layer.

    Returns
    -------
    output_sequence : PackedSequence
        The output of the last layer: forward and backward outputs
        concatenated along the last dimension and re-packed.
    final_states: Tuple[torch.Tensor, torch.Tensor]
        The final (state, memory) of every forward and backward layer,
        concatenated along dimension 0.
    """
    num_layers = len(self.lstm_layers)
    if not initial_state:
        hidden_states = [None] * num_layers
    else:
        if initial_state[0].size()[0] != num_layers:
            raise ConfigurationError("Initial states were passed to forward() but the number of "
                                     "initial states does not match the number of layers.")
        hidden_states = list(zip(initial_state[0].split(1, 0),
                                 initial_state[1].split(1, 0)))

    output_sequence = inputs
    final_h = []
    final_c = []
    for layer_index, state in enumerate(hidden_states):
        forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index))
        backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index))

        # The state is duplicated to mirror the Pytorch API for LSTMs.
        forward_packed, forward_state = forward_layer(output_sequence, state)
        backward_packed, backward_state = backward_layer(output_sequence, state)

        # Join both directions on the feature axis and re-pack for the
        # next layer.
        forward_padded, lengths = pad_packed_sequence(forward_packed, batch_first=True)
        backward_padded, _ = pad_packed_sequence(backward_packed, batch_first=True)
        joined = torch.cat([forward_padded, backward_padded], -1)
        output_sequence = pack_padded_sequence(joined, lengths, batch_first=True)

        final_h += [forward_state[0], backward_state[0]]
        final_c += [forward_state[1], backward_state[1]]

    final_state_tuple = (torch.cat(final_h, dim=0), torch.cat(final_c, dim=0))
    return output_sequence, final_state_tuple
def forward(self, input, hidden, no_pack=False):
    """Run the language model over a time-major batch of token ids.

    Args:
        input: token ids, time-major; entries greater than zero count as
            real tokens when lengths are computed.
        hidden: initial hidden state for ``self.rnn``.
        no_pack: when true, never pack the input even in evaluation mode.

    Returns:
        ``(decoded, hidden)``: decoder scores with the same two leading
        dimensions as the RNN output, and the final hidden state.
    """
    emb = self.drop(self.encoder(input))

    # if eval, pack padded sequence (we don't pack during training because
    # we have no padding in our input samples)
    use_packing = not self.training and not no_pack
    if use_packing:
        emb_lens = list(torch.sum((input > 0).int(), dim=0).data)
        emb_packed = pack_padded_sequence(emb, emb_lens, batch_first=False)
        packed_output, hidden = self.rnn(emb_packed, hidden)
        output, _ = pad_packed_sequence(packed_output, batch_first=False)
    else:
        output, hidden = self.rnn(emb, hidden)

    output = self.drop(output)

    # Flatten the two leading axes for the decoder, then restore them.
    n_steps, n_batch, width = output.size(0), output.size(1), output.size(2)
    decoded = self.decoder(output.view(n_steps * n_batch, width))
    return decoded.view(n_steps, n_batch, decoded.size(1)), hidden
def _sort_and_run_forward(self, module, inputs, mask):
    """Sort a batch by length, drop empty rows, pack it and run ``module``.

    Args:
        module: callable taking ``(packed_sequence, initial_states)``.
        inputs: batch-first tensor to encode.
        mask: binary mask; row sums give the lengths and the sum of its
            first column gives the number of non-empty sequences.

    Returns:
        ``(module_output, final_states, restoration)`` where ``restoration``
        holds the indices from ``sort_batch_by_length`` that undo the sort.
    """
    batch_size = mask.size(0)
    # just in case some instances may be of zero length.
    num_valid = torch.sum(mask[:, 0]).int().item()

    sequence_lengths = mask.long().sum(-1)
    sorted_inputs, sorted_lengths, restoration, sorting = sort_batch_by_length(
        inputs, sequence_lengths)

    # Only the first ``num_valid`` rows of the sorted batch are packed.
    valid_inputs = sorted_inputs[:num_valid, :, :]
    valid_lengths = sorted_lengths[:num_valid].data.tolist()
    packed_input = pack_padded_sequence(valid_inputs, valid_lengths, batch_first=True)

    initial_states = self._get_initial_states(batch_size, num_valid, sorting)
    module_output, final_states = module(packed_input, initial_states)

    return module_output, final_states, restoration
def forward(self, img_feats, captions, lengths):
    """Predict the next word id at every step of the given captions.

    Args:
        img_feats: image feature vectors; projected by ``self.fc`` to the
            word-embedding size.
        captions: caption word ids, time-major.
        lengths: sequence lengths used for packing (image step included).

    Returns:
        ``(pred, state)``: classifier scores for every non-padded time step
        and the final RNN state.
    """
    caption_embeds = self.embedding(captions)

    # Project the image features to the same size as the word vectors and
    # treat the result as the embedding of the first word.
    image_step = self.fc(img_feats).unsqueeze(0)
    inputs = t.cat([image_step, caption_embeds], 0)

    # PackedSequence
    packed_inputs = pack_padded_sequence(inputs, lengths)
    outputs, state = self.rnn(packed_inputs)

    # The LSTM output is used as the feature for classifying the id of the
    # next word. Because the input is a PackedSequence, so is the output;
    # its first element is the data and its second is batch_sizes.
    pred = self.classifier(outputs[0])
    return pred, state
def _reorder_by_length_and_package(self, pred_out, molecule_graphs, mol_to_graph_idx,
                                   num_initial_reactants_in_mol_to_graph_idx, original_syn_trees):
    """Sort the individual predictions by sequence length and batch them.

    Args:
        pred_out: per-tree prediction objects; each provides
            ``sequence_choices``, ``sequence_action_kinds``,
            ``dags_id_at_index``, ``dags_at_construction_stages``,
            ``dag_for_input`` and the two step-mask arrays. They are not
            modified.
        molecule_graphs: passed through unchanged into the batch.
        mol_to_graph_idx: its length (plus two stop choices) sets the width
            of the edge masks; also passed through.
        num_initial_reactants_in_mol_to_graph_idx: width of the reactant
            masks; also passed through.
        original_syn_trees: trees, re-ordered to match the batch order.

    Returns:
        ``(PredOutBatch, array_containing_original_indcs)`` where the array
        gives, for each batch position, the index of the original item.
    """
    # Now package everything together!
    PAD_VALUE = settings.PAD_VALUE

    # --> compute the order: we need to put the largest sequence first.
    seq_sizes = np.array([p.sequence_choices.size for p in pred_out])
    array_containing_original_indcs = np.argsort(seq_sizes)[::-1]
    seq_size_with_padding = seq_sizes.max()
    new_seq_sizes = seq_sizes[array_containing_original_indcs]
    num_items = len(array_containing_original_indcs)

    # --> the input DoGs can just be stacked together.
    dags_for_input = [pred_out[i].dag_for_input for i in array_containing_original_indcs]
    dags_for_input = dags_for_input[0].concatenate(dags_for_input)
    dags_for_input.inplace_from_np_to_torch()

    # We also record where the root molecule for each of these lives in the
    # graphs (last position of each DAG).
    final_molecule_indcs = np.cumsum(
        np.bincount(dags_for_input.node_to_graph_id,
                    minlength=dags_for_input.max_num_graphs)) - 1
    final_molecule_indcs = torch.tensor(final_molecule_indcs, dtype=settings.TORCH_INT)

    def pad_to_longest(values, current_size):
        # Right-pad a 1-D sequence array with PAD_VALUE up to the batch maximum.
        return np.pad(values, (0, seq_size_with_padding - current_size),
                      'constant', constant_values=PAD_VALUE)

    # --> the other parts we want to put in PackedSequence or have a clearer
    # indication of where they live inside the other parts
    construction_dag_list = []
    dags_id_at_index = []
    sequence_action_kinds = []
    sequence_choices = []

    number_edge_choices_including_both_stops = len(mol_to_graph_idx) + 2
    edge_masks = np.full(
        (num_items, seq_size_with_padding, number_edge_choices_including_both_stops),
        PAD_VALUE)
    reactant_masks = np.full(
        (num_items, seq_size_with_padding, num_initial_reactants_in_mol_to_graph_idx),
        PAD_VALUE)

    for new_idx, old_idx in enumerate(array_containing_original_indcs):
        p = pred_out[old_idx]
        p_seq_size = p.sequence_choices.size

        # --> We will deal with the construction DAGS first. The empty DAG is
        # the same for all of them (this is at index 0 and should be None so
        # can be shared).
        num_construction_dags_seen_so_far = len(construction_dag_list)
        construction_dag_list.extend(
            dag for dag in p.dags_at_construction_stages if dag is not None)

        # Shift on a copy: shifting p.dags_id_at_index in place would
        # silently modify the caller's prediction object, so a second call
        # on the same predictions would shift the ids twice.
        dags_id_with_correct_shift = p.dags_id_at_index.copy()
        # The DAG ID will get shifted when we concatenate them but not the
        # index for 0 as this is the empty DAG.
        dags_id_with_correct_shift[dags_id_with_correct_shift != 0] += \
            num_construction_dags_seen_so_far
        assert dags_id_with_correct_shift.size == p_seq_size
        dags_id_at_index.append(pad_to_longest(dags_id_with_correct_shift, p_seq_size))

        assert p.sequence_action_kinds.size == p_seq_size
        new_seq_action_kinds = pad_to_longest(p.sequence_action_kinds, p_seq_size)
        sequence_action_kinds.append(new_seq_action_kinds)

        assert p.sequence_choices.size == p_seq_size
        sequence_choices.append(pad_to_longest(p.sequence_choices, p_seq_size))

        if p.sequence_masks_for_edge_steps is not None:
            edge_masks[new_idx, new_seq_action_kinds == EDGE_ADD_STEP_VAL, :] = \
                p.sequence_masks_for_edge_steps
        else:
            assert (new_seq_action_kinds == EDGE_ADD_STEP_VAL).sum() == 0

        reactant_masks[new_idx, new_seq_action_kinds == REACTANT_CHOOSE_STEP_VAL, :] = \
            p.sequence_masks_for_reactant_steps

    # --> Put the construction DAGs together
    construction_dags = construction_dag_list[0].concatenate(construction_dag_list)
    construction_dags.inplace_from_np_to_torch()

    # --> Pack the padded sequences together
    seq_sizes = torch.tensor(new_seq_sizes)

    def pack(padded, dtype):
        # Convert a padded numpy batch to a tensor and pack it batch first.
        return rnn.pack_padded_sequence(torch.tensor(padded, dtype=dtype),
                                        seq_sizes, batch_first=True)

    dags_id_at_index = pack(np.stack(dags_id_at_index), settings.TORCH_INT)
    sequence_action_kinds = pack(np.stack(sequence_action_kinds), settings.TORCH_INT)
    sequence_choices = pack(np.stack(sequence_choices), settings.TORCH_INT)
    edge_masks = pack(edge_masks, torch.bool)
    reactant_masks = pack(reactant_masks, torch.bool)

    original_syn_trees = [original_syn_trees[i] for i in array_containing_original_indcs]

    batch = PredOutBatch(
        dags_for_inputs=dags_for_input,
        dags_at_construction_stages=construction_dags,
        molecular_graphs=molecule_graphs,
        dags_id_at_index=dags_id_at_index,
        sequence_action_kinds=sequence_action_kinds,
        sequence_choices=sequence_choices,
        sequence_masks_for_edge_steps=edge_masks,
        sequence_masks_for_reactant_steps=reactant_masks,
        mol_to_graph_idx=mol_to_graph_idx,
        num_that_are_initial_reactants=num_initial_reactants_in_mol_to_graph_idx,
        final_molecule_indcs=final_molecule_indcs,
        syn_trees=original_syn_trees)
    return batch, array_containing_original_indcs
def forward(self, sentence, p_sentence, pos_tags, lengths, target_idx_in, region_marks,
            local_roles_voc, frames, local_roles_mask, sent_pred_lemmas_idx, dep_tags,
            dep_heads, targets, P_identification, all_l_ids, Predicate_link,
            Predicate_Labels_nd, Predicate_Labels, Chars_in, unlabeled_sentence=None,
            p_unlabeled_sentence=None, unlabeled_lengths=None, test=False, cvt_train=False):
    """Run predicate identification, then score a role tag for every token.

    When ``cvt_train`` is true, only ``self.CVT_train`` is evaluated on the
    unlabeled inputs and its result is returned on its own.

    Otherwise the method
      1. scores predicate labels with ``self.Predicate_Id`` and counts how the
         arg-max predictions compare with ``P_identification``;
      2. embeds the sentence, projects it through ``self.Memory_space``,
         appends the region-mark embedding and runs two packed BiLSTMs
         (``self.BiLSTM_1`` then ``self.BiLSTM_SRL``);
      3. combines every token state with the state at ``target_idx_in``
         through the bilinear weight ``self.W_R`` to obtain tag scores.

    Only ``sentence``, ``p_sentence``, ``lengths``, ``target_idx_in``,
    ``region_marks``, ``targets``, ``P_identification`` and the three
    ``unlabeled_*`` arguments are read here; the remaining parameters are
    accepted but unused by this method.

    Side effects: ``self.hidden_SRL_base`` and ``self.hidden_SRL`` are
    overwritten with the final states returned by the two BiLSTMs.

    Returns a 14-tuple, in this order:
    ``(SRLloss, IDloss, IDloss, SRLprobs, wrong_l_nums, all_l_nums,
    wrong_l_nums, all_l_nums, right_noNull_predict, noNull_predict,
    noNUll_truth, right_noNull_predict, noNull_predict, noNUll_truth)``.
    """
    if cvt_train:
        return self.CVT_train(unlabeled_sentence, p_unlabeled_sentence, unlabeled_lengths)

    # Perform predicate identification first.
    Predicate_Identification_Space = self.Predicate_Id(sentence, p_sentence, lengths)

    wrong_l_nums = 0.0
    all_l_nums = 0.0
    right_noNull_predict = 0.0
    noNull_predict = 0.0
    noNUll_truth = 0.0

    PI_labels = np.argmax(Predicate_Identification_Space.cpu().data.numpy(), axis=1)
    gold_labels = P_identification.cpu().view(-1).data.numpy()
    # Label 0 is excluded everywhere below (and by ignore_index=0 in the loss);
    # presumably 0 is padding and 1 is the "no predicate" class -- TODO confirm.
    # NOTE(review): the nesting of the three gold-label checks was reconstructed
    # from the counter names; confirm against the original layout.
    for predict_l, gold_l in zip(PI_labels, gold_labels):
        if predict_l > 1 and gold_l != 0:
            noNull_predict += 1
        if gold_l != 0:
            all_l_nums += 1
            if gold_l != 1:
                noNUll_truth += 1
                if gold_l == predict_l:
                    right_noNull_predict += 1
        if predict_l != gold_l and gold_l != 0:
            wrong_l_nums += 1

    # Build the SRL encoder input.
    seq_len = len(sentence[0])
    embeds_SRL = self.word_embeddings_SRL(sentence)
    embeds_SRL = embeds_SRL.view(self.batch_size, seq_len, self.word_emb_dim)
    region_marks = self.region_embeddings(region_marks).view(self.batch_size, seq_len, 16)
    # torch.tanh replaces the deprecated F.tanh; the result is identical.
    embeds_Memory = torch.tanh(torch.matmul(embeds_SRL, self.Memory_space))
    SRL_hidden_states = torch.cat((embeds_Memory, region_marks), 2)
    SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

    # First BiLSTM layer over length-sorted, packed sequences.
    embeds_sort, lengths_sort, unsort_idx = self.sort_batch(SRL_hidden_states, lengths)
    # Lengths are handed over as a CPU array, exactly as the second pack call
    # below does; pack_padded_sequence does not accept lengths living on a GPU.
    embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort.cpu().numpy(),
                                           batch_first=True)
    hidden_states, self.hidden_SRL_base = self.BiLSTM_1(embeds_sort, self.hidden_SRL_base)
    # Output is already batch-first, so no dimension swap is needed.
    hidden_states, _ = rnn.pad_packed_sequence(hidden_states, batch_first=True)
    hidden_states_0 = hidden_states[unsort_idx]

    # Second BiLSTM layer, same sort / pack / unpack / unsort routine.
    embeds_sort, lengths_sort, unsort_idx = self.sort_batch(hidden_states_0, lengths)
    embeds_sort = rnn.pack_padded_sequence(embeds_sort, lengths_sort.cpu().numpy(),
                                           batch_first=True)
    hidden_states, self.hidden_SRL = self.BiLSTM_SRL(embeds_sort, self.hidden_SRL)
    hidden_states, _ = rnn.pad_packed_sequence(hidden_states, batch_first=True)
    hidden_states = hidden_states[unsort_idx]
    hidden_states = self.hidden_state_dropout_SRL(hidden_states)

    hidden_states_3 = hidden_states
    hidden_states_word = self.dropout_1(F.relu(self.Non_Predicate_Proj(hidden_states_3)))
    # Pick, for every sentence, the encoder state at its predicate position.
    predicate_embeds = hidden_states_3[np.arange(0, hidden_states_3.size()[0]), target_idx_in]
    hidden_states_predicate = self.dropout_2(F.relu(self.Predicate_Proj(predicate_embeds)))

    # Append a constant 1 feature to both sides so W_R also covers bias terms.
    # (The former Variable(...) wrappers were no-ops on tensors and are gone.)
    bias_one = torch.ones((self.batch_size, seq_len, 1)).to(device)
    hidden_states_word = torch.cat((hidden_states_word, bias_one), 2)
    bias_one = torch.ones((self.batch_size, 1)).to(device)
    hidden_states_predicate = torch.cat((hidden_states_predicate, bias_one), 1)

    left_part = torch.mm(hidden_states_word.view(self.batch_size * seq_len, -1), self.W_R)
    left_part = left_part.view(self.batch_size, seq_len * self.tagset_size, -1)
    hidden_states_predicate = hidden_states_predicate.view(self.batch_size, -1, 1)
    tag_space = torch.bmm(left_part, hidden_states_predicate).view(
        seq_len * self.batch_size, -1)
    SRLprobs = F.softmax(tag_space, dim=1)

    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    SRLloss = loss_function(tag_space, targets.view(-1))
    IDloss = loss_function(Predicate_Identification_Space, P_identification.view(-1))

    return SRLloss, IDloss, IDloss, SRLprobs, wrong_l_nums, all_l_nums, wrong_l_nums, all_l_nums, \
        right_noNull_predict, noNull_predict, noNUll_truth, \
        right_noNull_predict, noNull_predict, noNUll_truth
def forward(self, src_tokens, src_lengths):
    """Encode a batch of source tokens with the stacked recurrent layers.

    Args:
        src_tokens: 2-D tensor of token ids, batch first (it is unpacked as
            ``bsz, seqlen`` and transposed to time-major after embedding).
        src_lengths: true length of every sequence, passed unchanged to
            ``pack_padded_sequence`` (which by default expects them sorted
            in decreasing order).

    Returns:
        A 6-tuple ``(unpacked_output, final_hiddens, final_cells,
        src_lengths, src_tokens, embedded_words)`` where ``unpacked_output``
        is ``[max_seqlen, batch_size, hidden_dim]``, the two state tensors
        are ``[num_layers, batch_size, hidden_dim]`` and ``embedded_words``
        is the time-major embedding after input dropout.

    Raises:
        Exception: if ``self.cell_type`` is not one of the LSTM variants
            handled below.
    """
    if self.left_pad:
        # convert left-padding to right-padding
        src_tokens = utils.convert_padding_direction(
            src_tokens, self.padding_idx, left_to_right=True
        )
    if self.word_dropout_module is not None:
        src_tokens = self.word_dropout_module(src_tokens)

    # Unpacking into two names also enforces that the input is 2-D.
    bsz, _ = src_tokens.size()

    # embed tokens
    x = self.embed_tokens(src_tokens)
    x = F.dropout(x, p=self.dropout_in, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)
    embedded_words = x

    # Generate packed seq to deal with varying source seq length.
    # PackedSequence is read by field name: tuple-unpacking it into two
    # names breaks on PyTorch versions where it carries four fields.
    packed = pack_padded_sequence(x, src_lengths)
    packed_input, batch_sizes = packed.data, packed.batch_sizes

    next_hiddens = []
    for i, rnn_layer in enumerate(self.layers):
        current_hidden_size = (
            self.hidden_dim // 2 if rnn_layer.is_bidirectional else self.hidden_dim
        )
        if self.cell_type in ["lstm", "milstm", "layer_norm_lstm"]:
            # Every layer starts from an all-zero (hidden, cell) pair.
            prev_hidden = (
                x.new(bsz, current_hidden_size).zero_(),
                x.new(bsz, current_hidden_size).zero_(),
            )
        else:
            raise Exception(f"{self.cell_type} not implemented")

        hidden, current_output = rnn_layer.forward(
            packed_input, prev_hidden, batch_sizes
        )
        next_hiddens.append(hidden)

        if self.dropout_out != 0:
            current_output = F.dropout(
                current_output, p=self.dropout_out, training=self.training
            )

        if self.residual_level is not None and i >= self.residual_level:
            # Residual connection around this layer.
            packed_input = packed_input.clone() + current_output
        else:
            packed_input = current_output

    final_hiddens, final_cells = zip(*next_hiddens)
    # Reshape to [num_layer, batch_size, hidden_dim]
    final_hiddens = torch.cat(final_hiddens, dim=0).view(
        self.num_layers, *final_hiddens[0].size()
    )
    final_cells = torch.cat(final_cells, dim=0).view(
        self.num_layers, *final_cells[0].size()
    )

    # [max_seqlen, batch_size, hidden_dim]
    unpacked_output, _ = pad_packed_sequence(
        PackedSequence(packed_input, batch_sizes), padding_value=self.padding_value
    )

    return (
        unpacked_output,
        final_hiddens,
        final_cells,
        src_lengths,
        src_tokens,
        embedded_words,
    )
def sort_and_run_forward(self,
                         module: Callable[[PackedSequence, Optional[RnnState]],
                                          Tuple[Union[PackedSequence, torch.Tensor], RnnState]],
                         inputs: torch.Tensor,
                         mask: torch.Tensor,
                         hidden_state: Optional[RnnState] = None):
    """
    This function exists because Pytorch RNNs require that their inputs be sorted
    before being passed as input. As all of our Seq2xxxEncoders use this functionality,
    it is provided in a base class. This method can be called on any module which
    takes as input a ``PackedSequence`` and some ``hidden_state``, which can either be a
    tuple of tensors or a tensor.

    As all of our Seq2xxxEncoders have different return types, we return `sorted`
    outputs from the module, which is called directly. Additionally, we return the
    indices into the batch dimension required to restore the tensor to its correct,
    unsorted order. Un-sorting and re-padding of the module outputs is left to the
    subclasses because their outputs have different types and handling them smoothly
    here is difficult.

    Parameters
    ----------
    module : ``Callable[[PackedSequence, Optional[RnnState]],
                        Tuple[Union[PackedSequence, torch.Tensor], RnnState]]``, required.
        A function to run on the inputs. In most cases, this is a ``torch.nn.Module``.
    inputs : ``torch.Tensor``, required.
        A tensor of shape ``(batch_size, sequence_length, embedding_size)`` representing
        the inputs to the Encoder.
    mask : ``torch.Tensor``, required.
        A tensor of shape ``(batch_size, sequence_length)``, representing masked and
        non-masked elements of the sequence for each element in the batch.
    hidden_state : ``Optional[RnnState]``, (default = None).
        A single tensor of shape (num_layers, batch_size, hidden_size) representing the
        state of an RNN, or a tuple of tensors of shapes
        (num_layers, batch_size, hidden_size) and (num_layers, batch_size, memory_size),
        representing the hidden state and memory state of an LSTM-like RNN.

    Returns
    -------
    module_output : ``Union[torch.Tensor, PackedSequence]``.
        A Tensor or PackedSequence representing the output of the Pytorch Module.
        The batch size dimension will be equal to ``num_valid``, as sequences of zero
        length are clipped off before the module is called, as Pytorch cannot handle
        zero length sequences.
    final_states : ``Optional[RnnState]``
        A Tensor representing the hidden state of the Pytorch Module. This can either
        be a single tensor of shape (num_layers, num_valid, hidden_size), for instance in
        the case of a GRU, or a tuple of tensors, such as those required for an LSTM.
    restoration_indices : ``torch.LongTensor``
        A tensor of shape ``(batch_size,)``, describing the re-indexing required to transform
        the outputs back to their original batch order.
    """
    # In some circumstances you may have sequences of zero length. ``pack_padded_sequence``
    # requires all sequence lengths to be > 0, so remove sequences of zero length before
    # calling the module.

    # First count how many sequences are non-empty: a sequence is non-empty
    # exactly when its first mask entry is set.
    batch_size = mask.size(0)
    num_valid = torch.sum(mask[:, 0]).int().item()

    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices = \
        sort_batch_by_length(inputs, sequence_lengths)

    # Now create a PackedSequence with only the non-empty, sorted sequences.
    packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :],
                                                 sorted_sequence_lengths[:num_valid].tolist(),
                                                 batch_first=True)

    # Prepare the initial states.
    if not self.stateful:
        if hidden_state is None:
            initial_states = hidden_state
        elif isinstance(hidden_state, tuple):
            # Slicing the batch dimension can leave a non-contiguous view whenever
            # num_valid < batch_size; make it contiguous before handing it to the module.
            initial_states = [
                state.index_select(1, sorting_indices)[:, :num_valid, :].contiguous()
                for state in hidden_state
            ]
        else:
            initial_states = \
                hidden_state.index_select(1, sorting_indices)[:, :num_valid, :].contiguous()
    else:
        initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices)

    # Actually call the module on the sorted PackedSequence.
    module_output, final_states = module(packed_sequence_input, initial_states)

    return module_output, final_states, restoration_indices
def encode(
    self, source_padded: torch.Tensor, source_lengths: List[int]
) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """ Apply the encoder to source sentences to obtain encoder hidden states.
        Additionally, take the final states of the encoder and project them to obtain
        initial states for the decoder.

    @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b),
                                   where b = batch_size, src_len = maximum source sentence
                                   length. Note that these have already been sorted in order
                                   of longest to shortest sentence.
    @param source_lengths (List[int]): List of actual lengths for each of the source sentences
                                       in the batch
    @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                   b = batch size, src_len = maximum source sentence length,
                                   h = hidden size.
    @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's
                                                     initial hidden state and cell.
    """

    def _join_directions(state):
        # `state` is (2, b, h) with index 0 = forwards and index 1 = backwards;
        # concatenating the two along the feature axis gives (b, 2*h).
        return torch.cat([state[0], state[1]], dim=1)

    ### YOUR CODE HERE (~ 8 Lines)
    ### Steps:
    ###     1. Embed the source sentences into `X` with shape (src_len, b, e).
    ###     2. Pack `X`, run the encoder, then pad the packed output again. The
    ###        encoder output is (src_len, b, h*2); the batch axis is moved to the
    ###        front so that `enc_hiddens` is (b, src_len, h*2).
    ###     3. Concatenate the forwards and backwards final states and project them
    ###        with h_projection / c_projection to get the decoder's initial hidden
    ###        state and cell (h_0^{dec} and c_0^{dec} in the PDF).
    ###
    ### Relevant docs:
    ###     https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
    ###     https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
    ###     https://pytorch.org/docs/stable/torch.html#torch.cat
    ###     https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute
    embedded = self.model_embeddings.source(source_padded)
    packed_source = pack_padded_sequence(embedded, source_lengths)
    packed_hiddens, final_state = self.encoder(packed_source)
    last_hidden, last_cell = final_state

    padded_hiddens, _ = pad_packed_sequence(packed_hiddens)
    # (src_len, b, h*2) -> (b, src_len, h*2)
    enc_hiddens = padded_hiddens.transpose(0, 1)

    init_decoder_hidden = self.h_projection(_join_directions(last_hidden))
    init_decoder_cell = self.c_projection(_join_directions(last_cell))
    dec_init_state = (init_decoder_hidden, init_decoder_cell)
    ### END YOUR CODE

    return enc_hiddens, dec_init_state
def forward(self, sentence, p_sentence, pos_tags, lengths, target_idx_in, region_marks,
            local_roles_voc, frames, local_roles_mask, sent_pred_lemmas_idx, dep_tags,
            dep_heads, targets, predicate_identification, all_l_ids, Predicate_link,
            Predicate_Labels_nd, Predicate_Labels, unlabeled_sentence_in=False,
            p_unlabeled_sentence_in=False, unlabeled_sen_lengths=False, test=False,
            cvt_train=False):
    """Score a role tag for every token with a BiLSTM encoder and a biaffine scorer.

    The trainable and fixed word embeddings are concatenated with the
    region-mark embedding, run through ``self.BiLSTM_SRL`` as a packed,
    length-sorted batch, and each token state is then scored against the
    state at ``target_idx_in`` by ``self.rel_biaffine``.

    Side effect: ``self.hidden_4`` is replaced by the final state returned by
    the BiLSTM.

    Returns a 14-tuple ``(SRLloss, SRLloss, SRLloss, SRLprobs, wrong, all,
    wrong, all, right, predicted, truth, right, predicted, truth)`` in which
    the two leading counters are the constant ``0.0`` and the remaining
    counters are the constant ``10.0``; no counting is done in this method.

    (An ELMo-based embedding path used to be sketched at the top of this
    method; it was disabled and is not part of the computation.)
    """
    n_batch = self.batch_size
    n_tokens = len(sentence[0])

    # --- input representation -------------------------------------------
    region_marks = self.region_embeddings(region_marks).view(n_batch, n_tokens, 16)
    fixed_embeds = self.word_fixed_embeddings(p_sentence)
    fixed_embeds = fixed_embeds.view(n_batch, n_tokens, self.word_emb_dim)
    # Looked up but not consumed further down.
    sent_pred_lemmas_embeds = self.p_lemma_embeddings(sent_pred_lemmas_idx)
    embeds_SRL = self.word_embeddings_SRL(sentence)
    embeds_SRL = embeds_SRL.view(n_batch, n_tokens, self.word_emb_dim)
    # Looked up but not consumed further down.
    pos_embeds = self.pos_embeddings(pos_tags)

    encoder_input = self.SRL_input_dropout(
        torch.cat((embeds_SRL, fixed_embeds, region_marks), 2))

    # --- SRL layer: sort, pack, encode, unpack, restore order ------------
    sorted_input, sorted_lengths, restore_order = self.sort_batch(encoder_input, lengths)
    packed_input = rnn.pack_padded_sequence(
        sorted_input, sorted_lengths.cpu().numpy(), batch_first=True)
    packed_output, self.hidden_4 = self.BiLSTM_SRL(packed_input, self.hidden_4)
    # The padded output is already batch-first; no dimension swap required.
    padded_output, _ = rnn.pad_packed_sequence(packed_output, batch_first=True)
    encoder_states = self.hidden_state_dropout(padded_output[restore_order])

    # --- token and predicate projections ----------------------------------
    word_repr = self.dropout_1(F.relu(self.Non_Predicate_Proj(encoder_states)))

    row_ids = np.arange(0, encoder_states.size()[0])
    predicate_embeds = encoder_states[row_ids, target_idx_in]
    # Broadcasting against a zero tensor with a leading singleton axis and
    # swapping the first two axes inserts a length-1 axis after the batch axis.
    zero_pad = torch.zeros(1, encoder_states.size()[0], encoder_states.size()[2]).to(device)
    predicate_embeds = (zero_pad + predicate_embeds).transpose(0, 1)
    predicate_repr = self.dropout_2(F.relu(self.Predicate_Proj(predicate_embeds)))

    # --- scoring and loss --------------------------------------------------
    tag_space = self.rel_biaffine(word_repr, predicate_repr).view(
        n_batch * n_tokens, self.tagset_size)
    SRLprobs = F.softmax(tag_space, dim=1)

    # Placeholder statistics; this method returns fixed values for them.
    wrong_l_nums, all_l_nums = 0.0, 0.0
    right_noNull_predict, noNull_predict, noNUll_truth = 10.0, 10.0, 10.0

    SRLloss = nn.CrossEntropyLoss(ignore_index=0)(tag_space, targets.view(-1))

    return (SRLloss, SRLloss, SRLloss, SRLprobs,
            wrong_l_nums, all_l_nums, wrong_l_nums, all_l_nums,
            right_noNull_predict, noNull_predict, noNUll_truth,
            right_noNull_predict, noNull_predict, noNUll_truth)
def forward(self, cmaps_f, cmaps_b, cmarkers_f, cmarkers_b, wmaps, tmaps, wmap_lengths,
            cmap_lengths, pos_mask):
    """
    Forward propagation.

    :param cmaps_f: padded encoded forward character sequences, a tensor of dimensions (batch_size, char_pad_len)
    :param cmaps_b: padded encoded backward character sequences, a tensor of dimensions (batch_size, char_pad_len)
    :param cmarkers_f: padded forward character markers, a tensor of dimensions (batch_size, word_pad_len)
    :param cmarkers_b: padded backward character markers, a tensor of dimensions (batch_size, word_pad_len)
    :param wmaps: padded encoded word sequences, a tensor of dimensions (batch_size, word_pad_len)
    :param tmaps: padded tag sequences, a tensor of dimensions (batch_size, word_pad_len)
    :param wmap_lengths: word sequence lengths, a tensor of dimensions (batch_size)
    :param cmap_lengths: character sequence lengths, a tensor of dimensions (batch_size); it is
        sorted along dim 0 and used as the per-sequence lengths when packing
    :param pos_mask: one value per word, a tensor of dimensions (batch_size, word_pad_len); when
        self.use_pos_mask is set it is appended to the word features as one extra channel
        (presumably a float tensor, since it is concatenated with embeddings -- TODO confirm)
    :return: in training mode (crf_scores, lm_f_scores, lm_b_scores, wmaps, tmaps, wmap_lengths,
        word_sort_ind, char_sort_ind); otherwise (crf_scores, wmaps, tmaps, wmap_lengths,
        word_sort_ind, char_sort_ind). All returned tensors are in word-length-sorted order; the
        two sort indices allow the caller to reorder, if required.

    Side effects: self.batch_size and self.word_pad_len are overwritten from the inputs.
    """
    self.batch_size = cmaps_f.size(0)
    self.word_pad_len = wmaps.size(1)

    # Sort by decreasing true char. sequence length
    cmap_lengths, char_sort_ind = cmap_lengths.sort(dim=0, descending=True)
    cmaps_f = cmaps_f[char_sort_ind]
    cmaps_b = cmaps_b[char_sort_ind]
    cmarkers_f = cmarkers_f[char_sort_ind]
    cmarkers_b = cmarkers_b[char_sort_ind]
    wmaps = wmaps[char_sort_ind]
    tmaps = tmaps[char_sort_ind]
    pos_mask = pos_mask[char_sort_ind]
    wmap_lengths = wmap_lengths[char_sort_ind]

    # Plain Python ints, computed once and reused for both packing calls.
    char_lengths = cmap_lengths.tolist()

    # Embedding look-up for characters
    cf = self.char_embeds(cmaps_f)  # (batch_size, char_pad_len, char_emb_dim)
    cb = self.char_embeds(cmaps_b)

    # Dropout
    cf = self.dropout(cf)  # (batch_size, char_pad_len, char_emb_dim)
    cb = self.dropout(cb)

    # Pack padded sequence
    cf = pack_padded_sequence(cf, char_lengths, batch_first=True)  # packed sequence of char_emb_dim
    cb = pack_padded_sequence(cb, char_lengths, batch_first=True)

    # LSTM
    cf, _ = self.forw_char_lstm(cf)  # packed sequence of char_rnn_dim, with real sequence lengths
    cb, _ = self.back_char_lstm(cb)

    # Unpack packed sequence
    cf, _ = pad_packed_sequence(cf, batch_first=True)  # (batch_size, max_char_len_in_batch, char_rnn_dim)
    cb, _ = pad_packed_sequence(cb, batch_first=True)

    # Sanity check: unpadded width equals the longest (i.e. first, after sorting) length
    assert cf.size(1) == max(char_lengths) == char_lengths[0]

    # Select RNN outputs only at marker points (spaces in the character sequence)
    cmarkers_f = cmarkers_f.unsqueeze(2).expand(self.batch_size, self.word_pad_len, self.char_rnn_dim)
    cmarkers_b = cmarkers_b.unsqueeze(2).expand(self.batch_size, self.word_pad_len, self.char_rnn_dim)
    cf_selected = torch.gather(cf, 1, cmarkers_f)  # (batch_size, word_pad_len, char_rnn_dim)
    cb_selected = torch.gather(cb, 1, cmarkers_b)

    # Only for co-training, not useful for tagging after model is trained
    if self.training:
        lm_f = self.forw_lm_hw(self.dropout(cf_selected))  # (batch_size, word_pad_len, char_rnn_dim)
        lm_b = self.back_lm_hw(self.dropout(cb_selected))
        lm_f_scores = self.forw_lm_out(self.dropout(lm_f))  # (batch_size, word_pad_len, lm_vocab_size)
        lm_b_scores = self.back_lm_out(self.dropout(lm_b))

    # Sort by decreasing true word sequence length
    wmap_lengths, word_sort_ind = wmap_lengths.sort(dim=0, descending=True)
    wmaps = wmaps[word_sort_ind]
    tmaps = tmaps[word_sort_ind]
    pos_mask = pos_mask[word_sort_ind]
    cf_selected = cf_selected[word_sort_ind]  # for language model
    cb_selected = cb_selected[word_sort_ind]
    if self.training:
        lm_f_scores = lm_f_scores[word_sort_ind]
        lm_b_scores = lm_b_scores[word_sort_ind]

    # Embedding look-up for words
    w = self.word_embeds(wmaps)  # (batch_size, word_pad_len, word_emb_dim)
    w = self.dropout(w)

    # Sub-word information at each word
    subword = self.subword_hw(
        self.dropout(torch.cat((cf_selected, cb_selected), dim=2)))  # (batch_size, word_pad_len, 2 * char_rnn_dim)
    subword = self.dropout(subword)

    # Concatenate word embeddings and sub-word features
    w = torch.cat((w, subword), dim=2)  # (batch_size, word_pad_len, word_emb_dim + 2 * char_rnn_dim)

    # Concatenate pos tag and word embeddings
    if self.use_pos_mask:
        pos_mask = pos_mask.unsqueeze(2).to(self.device)
        w = torch.cat((w, pos_mask), dim=2)

    # Pack padded sequence; lengths go in as Python ints, same as for the characters
    w = pack_padded_sequence(w, wmap_lengths.tolist(), batch_first=True)

    # LSTM
    w, _ = self.word_blstm(w)  # packed sequence of word_rnn_dim, with real sequence lengths

    # Unpack packed sequence
    w, _ = pad_packed_sequence(w, batch_first=True)  # (batch_size, max_word_len_in_batch, word_rnn_dim)

    w = self.dropout(w)

    crf_scores = self.crf(w)  # (batch_size, max_word_len_in_batch, tagset_size, tagset_size)

    if self.training:
        return crf_scores, lm_f_scores, lm_b_scores, wmaps, tmaps, wmap_lengths, word_sort_ind, char_sort_ind
    # sort inds to reorder, if req.
    return crf_scores, wmaps, tmaps, wmap_lengths, word_sort_ind, char_sort_ind
def forward(self, pair, premise_len, hypothesis_len, mask_id, seg_id):
    """Classify premise/hypothesis pairs with a match-LSTM on top of BERT.

    Args:
        pair: token ids of the joined pair, first dimension is the batch.
        premise_len: 1-D tensor with the premise length of every example,
            not counting the closing [SEP]. It is NOT modified by this call.
        hypothesis_len: 1-D tensor with the hypothesis length of every
            example.
        mask_id: passed to ``self.bert`` as ``attention_mask``.
        seg_id: passed to ``self.bert`` as ``token_type_ids``.

    Returns:
        ``self.fc`` applied to the match-LSTM state taken, per example, at
        the last real hypothesis token.
    """
    batch_size = pair.shape[0]

    # feed the pair token ids into BertModel
    pair = self.bert(pair, token_type_ids=seg_id, attention_mask=mask_id)[0]
    pair = self.dropout_emb(pair)

    # Cut the two sentences out of the joined sequence. `.clone().detach()`
    # is what copy-constructing with torch.tensor(tensor) does, without the
    # UserWarning that form raises.
    # NOTE(review): the copies are detached, so no gradient reaches
    # self.bert / self.dropout_emb through them -- confirm this is intended.
    premise = [
        pair[i][1:2 + premise_len[i]].clone().detach()
        for i in range(batch_size)
    ]  # including the end [SEP]
    hypothesis = [
        pair[i][2 + premise_len[i]:2 + premise_len[i] + hypothesis_len[i]].clone().detach()
        for i in range(batch_size)
    ]
    premise = pad_sequence(premise, batch_first=True)
    hypothesis = pad_sequence(hypothesis, batch_first=True)

    # premise
    prem_max_len = premise.shape[1]
    # Add 1 for the ending [SEP] (premise only, not the hypothesis). Done
    # out of place so the caller's tensor is left untouched.
    premise_len = premise_len + 1
    premise_len, p_idxes = torch.sort(premise_len, descending=True)
    _, p_idx_unsort = torch.sort(p_idxes)  # in order to restore the original order
    premise = premise[p_idxes]
    # pack_padded_sequence needs the lengths on the CPU.
    packed_premise = pack_padded_sequence(premise, premise_len.cpu(), batch_first=True)
    # (max_len, batch_size, hidden_size)
    h_s, _ = self.lstm_prem(packed_premise)
    h_s, _ = pad_packed_sequence(h_s)
    # Two sentences are matched against each other, so the original batch
    # order has to be restored.
    h_s = h_s[:, p_idx_unsort]

    # hypothesis
    hypothesis_max_len = hypothesis.shape[1]
    hypothesis_len, h_idxes = torch.sort(hypothesis_len, descending=True)
    _, h_idx_unsort = torch.sort(h_idxes)
    hypothesis = hypothesis[h_idxes]
    packed_hypothesis = pack_padded_sequence(hypothesis, hypothesis_len.cpu(),
                                             batch_first=True)
    # (max_len, batch_size, hidden_size)
    h_t, _ = self.lstm_hypo(packed_hypothesis)
    h_t, _ = pad_packed_sequence(h_t)
    h_t = h_t[:, h_idx_unsort]
    hypothesis_len = hypothesis_len[h_idx_unsort]  # back to the original order as well

    # matchLSTM. This is the core of this paper.
    hidden_size = self.config.hidden_size
    h_m_k = torch.zeros((batch_size, hidden_size), device=self.device)
    c_m_k = torch.zeros((batch_size, hidden_size), device=self.device)
    h_last = torch.zeros((batch_size, hidden_size), device=self.device)
    # The attention vector does not change between steps, expand it once.
    w_e_expand = self.w_e.expand(batch_size, hidden_size)

    for k in range(hypothesis_max_len):
        h_t_k = h_t[k]

        # Equation (6)
        # e_kj: (prem_max_len, batch_size)
        e_kj = torch.zeros((prem_max_len, batch_size), device=self.device)
        for j in range(prem_max_len):
            # tanh_s_t_m: (batch_size, hidden_size)
            tanh_s_t_m = torch.tanh(
                self.w_s(h_s[j]) + self.w_t(h_t_k) + self.w_m(h_m_k))
            # dot product
            # https://github.com/pytorch/pytorch/issues/18027#issuecomment-473404765
            e_kj[j] = (w_e_expand * tanh_s_t_m).sum(-1)

        # Equation (3)
        # (prem_max_len, batch_size)
        alpha_kj = F.softmax(e_kj, dim=0)

        # Equation (2)
        # (batch_size, hidden_size)
        a_k = torch.bmm(torch.unsqueeze(alpha_kj.t(), 1), h_s.permute(1, 0, 2))
        a_k = torch.squeeze(a_k, dim=1)

        # Equation (7)
        # (batch_size, 2 * hidden_size)
        m_k = torch.cat((a_k, h_t_k), 1)

        # Equation (8)
        # (batch_size, hidden_size)
        h_m_k, c_m_k = self.lstm_match(m_k, (h_m_k, c_m_k))

        # handle variable length sequences: keep, per example, the state
        # produced at its last real hypothesis token
        for batch_idx, hl in enumerate(hypothesis_len):
            if k + 1 == hl:
                h_last[batch_idx] = h_m_k[batch_idx]

    h_last = self.dropout_fc(h_last)

    return self.fc(h_last)
def validate(val_loader, encoder, decoder, criterion):
    """
    Performs one epoch's validation.

    Relies on the module-level names ``device``, ``alpha_c``, ``print_freq`` and ``word_map``.

    :param val_loader: DataLoader for validation data; yields (imgs, caps, caplens, allcaps)
    :param encoder: encoder model, or None to feed ``imgs`` to the decoder unchanged
    :param decoder: decoder model
    :param criterion: loss layer
    :return: BLEU-4 score
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # explicitly disable gradient calculation to avoid CUDA memory error
    # solves the issue #57
    with torch.no_grad():
        # Tokens stripped from the reference captions; built once per call
        # instead of once per word.
        ignored_tokens = {word_map['<start>'], word_map['<pad>']}

        # Batches
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):

            # Move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            if encoder is not None:
                imgs = encoder(imgs)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)[0]
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)[0]

            # Calculate loss
            loss = criterion(scores, targets)

            # Add doubly stochastic attention regularization
            loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader),
                                                                                batch_time=batch_time,
                                                                                loss=losses,
                                                                                top5=top5accs))

            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

            # References
            # Images were sorted in the decoder, so reorder the captions the same
            # way. `allcaps` is never moved to `device`, so the index is brought
            # onto whatever device `allcaps` lives on before indexing.
            allcaps = allcaps[sort_ind.to(allcaps.device)]
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                # remove <start> and pads
                img_captions = [[w for w in c if w not in ignored_tokens] for c in img_caps]
                references.append(img_captions)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            # remove pads
            preds = [p[:decode_lengths[j]] for j, p in enumerate(preds)]
            hypotheses.extend(preds)

            assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses)

    print(
        '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'.format(
            loss=losses,
            top5=top5accs,
            bleu=bleu4))

    return bleu4
def train(train_loader, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, epoch):
    """
    Run the training loop over ``train_loader`` once.

    Uses the module-level names ``device``, ``alpha_c``, ``grad_clip`` and ``print_freq``.

    :param train_loader: DataLoader for training data; yields (imgs, caps, caplens)
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :param encoder_optimizer: optimizer for the encoder's weights, or None to leave them untouched
    :param decoder_optimizer: optimizer for the decoder's weights
    :param epoch: epoch number (only used in the progress line)
    """
    # train mode (dropout and batchnorm is used)
    decoder.train()
    encoder.train()

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    status_template = ('Epoch: [{0}][{1}/{2}]\t'
                       'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                       'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})')
    fine_tune_encoder = encoder_optimizer is not None

    tick = time.time()
    for step, batch in enumerate(train_loader):
        imgs, caps, caplens = batch
        data_time.update(time.time() - tick)

        # Move to GPU, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Forward prop.
        features = encoder(imgs)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(features, caps, caplens)

        # Decoding started at <start>, so the targets are every word after it, up to <end>
        targets = caps_sorted[:, 1:]

        # Packing drops the timesteps that were not decoded as well as the pads;
        # only the flat data tensor of each packed sequence is kept.
        scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
        targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

        loss = criterion(scores, targets)
        # Doubly stochastic attention regularization
        attention_penalty = alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()
        loss += attention_penalty

        # Back prop.
        decoder_optimizer.zero_grad()
        if fine_tune_encoder:
            encoder_optimizer.zero_grad()
        loss.backward()

        # Clip gradients
        if grad_clip is not None:
            clip_gradient(decoder_optimizer, grad_clip)
            if fine_tune_encoder:
                clip_gradient(encoder_optimizer, grad_clip)

        # Update weights
        decoder_optimizer.step()
        if fine_tune_encoder:
            encoder_optimizer.step()

        # Keep track of metrics
        top5 = accuracy(scores, targets, 5)
        decoded_words = sum(decode_lengths)
        losses.update(loss.item(), decoded_words)
        top5accs.update(top5, decoded_words)
        batch_time.update(time.time() - tick)

        tick = time.time()

        # Print status
        if step % print_freq == 0:
            print(status_template.format(epoch, step, len(train_loader),
                                         batch_time=batch_time,
                                         data_time=data_time,
                                         loss=losses,
                                         top5=top5accs))
def forward(self, src_tokens, src_lengths):
    """Encode a padded batch of source tokens with the stacked LSTM layers.

    Args:
        src_tokens: token ids, indexed as (batch, time).
        src_lengths: tensor with the true length of every sentence; the
            sequences are packed with the default ``enforce_sorted=True``.

    Returns:
        A tuple ``(unpacked_output, final_hiddens, final_cells, src_lengths,
        src_tokens, embedded_words)`` where ``unpacked_output`` is
        (max_seqlen, batch, hidden_dim), the final states are
        (num_layers, batch, hidden_dim) and ``embedded_words`` is the
        time-major embedding fed to the first layer.
    """
    if self.left_pad:
        # convert left-padding to right-padding
        src_tokens = utils.convert_padding_direction(
            src_tokens, self.padding_idx, left_to_right=True
        )

    # If we're generating adversarial examples we need to keep track of
    # some internal variables
    self.tracker.reset()

    if self.word_dropout_module is not None:
        src_tokens = self.word_dropout_module(src_tokens)

    bsz, _ = src_tokens.size()

    # embed tokens
    x = self.embed_tokens(src_tokens)
    # Track token embeddings
    self.tracker.track(x, "token_embeddings", retain_grad=self.track_gradients)

    x = F.dropout(x, p=self.dropout_in, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)
    embedded_words = x

    # Allows compatibility with Caffe2 inputs for tracing (int32)
    # as well as the current format of Fairseq-Py inputs (int64)
    if src_lengths.dtype is torch.int64:
        src_lengths = src_lengths.int()

    # packed_input is a PackedSequence: element [0] is the packed data and
    # element [1] the batch size of every time step.
    packed_input = pack_padded_sequence(x, src_lengths)

    final_hiddens, final_cells = [], []
    for i, rnn_layer in enumerate(self.layers):
        first_bidirectional = self.bidirectional and i == 0
        if first_bidirectional:
            h0 = x.new(2, bsz, self.hidden_dim // 2).zero_()
            c0 = x.new(2, bsz, self.hidden_dim // 2).zero_()
        else:
            h0 = x.new(1, bsz, self.hidden_dim).zero_()
            c0 = x.new(1, bsz, self.hidden_dim).zero_()

        # apply LSTM along entire sequence
        current_output, (h_last, c_last) = rnn_layer(packed_input, (h0, c0))

        # final state shapes: (bsz, hidden_dim)
        if first_bidirectional:
            # concatenate last states for forward and backward LSTM
            h_last = torch.cat((h_last[0, :, :], h_last[1, :, :]), dim=1)
            c_last = torch.cat((c_last[0, :, :], c_last[1, :, :]), dim=1)
        else:
            h_last = h_last.squeeze(dim=0)
            c_last = c_last.squeeze(dim=0)

        final_hiddens.append(h_last)
        final_cells.append(c_last)

        if self.residual_level is not None and i >= self.residual_level:
            # PackedSequence is an immutable namedtuple: item assignment
            # raises and it has no clone(). Build a new PackedSequence that
            # carries the residual sum and keeps the original batch sizes.
            packed_input = packed_input._replace(
                data=packed_input.data + current_output.data
            )
        else:
            packed_input = current_output

    # Reshape to [num_layer, batch_size, hidden_dim]
    final_hiddens = torch.cat(final_hiddens, dim=0).view(
        self.num_layers, *final_hiddens[0].size()
    )
    final_cells = torch.cat(final_cells, dim=0).view(
        self.num_layers, *final_cells[0].size()
    )

    # [max_seqlen, batch_size, hidden_dim]
    unpacked_output, _ = pad_packed_sequence(
        packed_input, padding_value=self.padding_value
    )

    return (
        unpacked_output,
        final_hiddens,
        final_cells,
        src_lengths,
        src_tokens,
        embedded_words,
    )
def forward(self, inputs, input_raw, hidden=None):
    """
    Encode a batch with the RNN after appending ELMo features to the inputs.

    inputs: either a tensor or a ``(tensor, lengths)`` tuple; when lengths
        are given, zero-length rows are skipped by the RNN and get all-zero
        outputs and hidden states.
    input_raw: tokenised sentences handed to ``self.elmo_embedder.sents2elmo``.
    hidden: optional initial hidden state, indexed (layers, batch, hidden).

    Returns ``(outputs, last_hidden)`` in the original batch order.
    """
    if isinstance(inputs, tuple):
        inputs, lengths = inputs
    else:
        inputs, lengths = inputs, None

    if self.embedder is not None:
        rnn_inputs = self.embedder(inputs)
    else:
        rnn_inputs = inputs

    # Zero-pad the per-sentence ELMo matrices into one (batch, max_len, 1024)
    # tensor so it can be concatenated with the regular inputs.
    elmo_embed = self.elmo_embedder.sents2elmo(input_raw)
    elmo_length = [x.shape[0] for x in elmo_embed]
    padded_elmo = torch.zeros((len(elmo_length), max(elmo_length), 1024),
                              dtype=torch.float)
    for row, (sent_len, sent_embed) in enumerate(zip(elmo_length, elmo_embed)):
        padded_elmo[row][:sent_len] = torch.tensor(sent_embed)
    # Follow the device of the regular inputs instead of forcing the default
    # GPU: a bare .cuda() fails on CPU-only hosts and picks the wrong device
    # when the model lives on a non-default GPU.
    padded_elmo = padded_elmo.to(rnn_inputs.device)
    rnn_inputs = torch.cat([rnn_inputs, padded_elmo], dim=-1)

    batch_size = rnn_inputs.size(0)

    if lengths is not None:
        num_valid = lengths.gt(0).int().sum().item()
        # Sorting by descending length puts the empty rows last, so slicing
        # to num_valid drops exactly those rows before packing.
        sorted_lengths, indices = lengths.sort(descending=True)
        rnn_inputs = rnn_inputs.index_select(0, indices)

        rnn_inputs = pack_padded_sequence(
            rnn_inputs[:num_valid],
            sorted_lengths[:num_valid].tolist(),
            batch_first=True)

        if hidden is not None:
            hidden = hidden.index_select(1, indices)[:, :num_valid]

    outputs, last_hidden = self.rnn(rnn_inputs, hidden)

    if self.bidirectional:
        last_hidden = self._bridge_bidirectional_hidden(last_hidden)

    if lengths is not None:
        outputs, _ = pad_packed_sequence(outputs, batch_first=True)

        if num_valid < batch_size:
            # Re-insert all-zero rows for the sequences that were skipped.
            zeros = outputs.new_zeros(batch_size - num_valid,
                                      outputs.size(1), self.hidden_size)
            outputs = torch.cat([outputs, zeros], dim=0)

            zeros = last_hidden.new_zeros(self.num_layers,
                                          batch_size - num_valid,
                                          self.hidden_size)
            last_hidden = torch.cat([last_hidden, zeros], dim=1)

        # Undo the length sort so rows line up with the caller's batch.
        _, inv_indices = indices.sort()
        outputs = outputs.index_select(0, inv_indices)
        last_hidden = last_hidden.index_select(1, inv_indices)

    return outputs, last_hidden
def train_encoder():
    """Train the CNN encoder head and RNN decoder for image captioning.

    Trains on GPU for 1000 epochs, appends the training loss to ``train.txt``
    every 100 iterations, runs a validation pass every 200 iterations (with
    sample captions written to ``val.txt``), and checkpoints both models to
    ``./models`` every 500 iterations.
    """

    def words_until_end(word_ids):
        # Map ids to words, stopping after the first '<end>' token.
        words = []
        for word_id in word_ids:
            word = vocab.idx2word[word_id]
            words.append(word)
            if word == '<end>':
                break
        return words

    transform = transforms.Compose([
        transforms.Resize((512, 512), 2),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        transforms.ToPILImage()
    ])

    train_loader = data_loader.get_loader(
        './data/images/train/', './data/annotations/captions_train2014.json',
        traincaption_ids, vocab, transform, 64, True, 0)
    val_loader = data_loader.get_loader(
        './data/images/train/', './data/annotations/captions_train2014.json',
        valcaption_ids, vocab, transform, 64, True, 0)

    criterion = nn.CrossEntropyLoss()
    encoder = EncoderCNN(embed_size=256).cuda()
    decoder = DecoderRNN(embed_size=256, hidden_size=512,
                         vocab_size=len(vocab), num_layers=1).cuda()

    # Only the encoder's linear and batch-norm layers are optimised alongside
    # the decoder.
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=1e-3)

    loss_train = []
    loss_val = []

    # Context managers close both logs even if training raises part-way.
    with open('train.txt', 'a') as f, open('val.txt', 'a') as f_val:
        for epoch in range(0, 1000):
            for i, (images, captions, lengths) in enumerate(train_loader):
                if i % 50 == 0:
                    print('Training epoch {}, iteration {}'.format(epoch, i))

                images = images.cuda()
                captions = captions.cuda()
                encoded_output = encoder.forward(images)
                outputs = decoder(encoded_output, captions, lengths)
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]
                loss = criterion(outputs, targets)
                if i % 100 == 0:
                    loss_train.append(loss.item())

                decoder.zero_grad()
                encoder.zero_grad()
                loss.backward()
                optimizer.step()

                if i % 100 == 0:
                    f.write('Epoch: {}, Iter: {}, Train loss: {}\n'.format(
                        epoch, i, loss_train))
                    f.flush()

                # NOTE(review): validation below runs with both models still
                # in train mode, so batch-norm statistics are updated on
                # validation data -- confirm whether eval() was intended.
                encoder.train()
                decoder.train()

                if i % 200 == 0:
                    with torch.no_grad():
                        val_loss = 0
                        num_val_batches = 0
                        print('Validation for epoch {}'.format(epoch))
                        f_val.write('Epoch {}\n'.format(epoch))
                        for j, (val_images, val_captions,
                                val_lengths) in enumerate(val_loader):
                            val_images = val_images.cuda()
                            val_captions = val_captions.cuda()
                            val_encoded = encoder.forward(val_images)
                            val_outputs = decoder(val_encoded, val_captions,
                                                  val_lengths)
                            val_targets = pack_padded_sequence(
                                val_captions, val_lengths,
                                batch_first=True)[0]
                            val_loss += criterion(val_outputs,
                                                  val_targets).item()
                            num_val_batches += 1
                            sampled_ids = decoder.sample(
                                val_encoded).cpu().numpy()

                            # Log five random generated/reference caption
                            # pairs from the fourth validation batch.
                            if j == 3:
                                for _ in range(0, 5):
                                    t = np.random.randint(0, len(sampled_ids))
                                    sampled_caption = words_until_end(
                                        sampled_ids[t])
                                    actual_caption = words_until_end(
                                        val_captions[t].cpu().numpy())
                                    f_val.write(
                                        'Generated Caption: {} \n Actual Caption: {}\n\n'
                                        .format(str(sampled_caption),
                                                str(actual_caption)))
                                    f_val.flush()

                        # Average over the number of batches seen. Dividing
                        # by the last index was off by one and crashed on
                        # single-batch (or empty) loaders.
                        if num_val_batches:
                            val_loss /= num_val_batches
                        loss_val.append(val_loss)
                        f_val.write('Loss for epoch {}: {}'.format(
                            epoch, loss_val))
                        f_val.flush()

                if i % 500 == 0:
                    torch.save(
                        decoder.state_dict(),
                        os.path.join('./models/decoder-{}-{}.ckpt'.format(
                            epoch + 1, i + 1)))
                    torch.save(
                        encoder.state_dict(),
                        os.path.join('./models/encoder-{}-{}.ckpt'.format(
                            epoch + 1, i + 1)))
def forward(self, query, keys, keys_length, mask=None):
    """
    Parameters
    ----------
    query: 2D tensor, [B, H]
    keys: (masked_interests), 3D tensor, [b, T, H]
    keys_length: 1D tensor, [B]
    mask: ignored; the validity mask is recomputed from ``keys_length``

    Returns
    -------
    outputs: 2D tensor, [B, H]; rows whose ``keys_length`` is 0 are all zero

    Raises
    ------
    ValueError: if ``self.gru_type`` is not one of GRU/AIGRU/AGRU/AUGRU
    """
    batch_size, dim = query.size()
    max_length = keys.size()[1]

    # check batch validation
    zero_outputs = torch.zeros(batch_size, dim, device=query.device)
    mask = keys_length > 0
    # [B] -> [b]
    keys_length = keys_length[mask]
    if keys_length.shape[0] == 0:
        return zero_outputs

    # pack_padded_sequence requires tensor lengths to live on the CPU, even
    # when the data itself is on an accelerator.
    packing_lengths = keys_length.cpu()

    # [B, H] -> [b, 1, H]
    query = torch.masked_select(query,
                                mask.view(-1, 1)).view(-1, dim).unsqueeze(1)

    if self.gru_type == 'GRU':
        packed_keys = pack_padded_sequence(keys,
                                           lengths=packing_lengths,
                                           batch_first=True,
                                           enforce_sorted=False)
        packed_interests, _ = self.interest_evolution(packed_keys)
        interests, _ = pad_packed_sequence(packed_interests,
                                           batch_first=True,
                                           padding_value=0.0,
                                           total_length=max_length)
        outputs = self.attention(query, interests,
                                 keys_length.unsqueeze(1))  # [b, 1, H]
        outputs = outputs.squeeze(1)  # [b, H]
    elif self.gru_type == 'AIGRU':
        att_scores = self.attention(query, keys,
                                    keys_length.unsqueeze(1))  # [b, 1, T]
        interests = keys * att_scores.transpose(1, 2)  # [b, T, H]
        packed_interests = pack_padded_sequence(interests,
                                                lengths=packing_lengths,
                                                batch_first=True,
                                                enforce_sorted=False)
        _, outputs = self.interest_evolution(packed_interests)
        outputs = outputs.squeeze(0)  # [b, H]
    elif self.gru_type == 'AGRU' or self.gru_type == 'AUGRU':
        att_scores = self.attention(
            query, keys, keys_length.unsqueeze(1)).squeeze(1)  # [b, T]
        packed_interests = pack_padded_sequence(keys,
                                                lengths=packing_lengths,
                                                batch_first=True,
                                                enforce_sorted=False)
        packed_scores = pack_padded_sequence(att_scores,
                                             lengths=packing_lengths,
                                             batch_first=True,
                                             enforce_sorted=False)
        outputs = self.interest_evolution(packed_interests, packed_scores)
        outputs, _ = pad_packed_sequence(outputs,
                                         batch_first=True,
                                         padding_value=0.0,
                                         total_length=max_length)
        # pick last state
        outputs = InterestEvolving._get_last_state(outputs,
                                                   keys_length)  # [b, H]
    else:
        # Previously this fell through to an unbound `outputs`.
        raise ValueError(
            "Unsupported gru_type: {}".format(self.gru_type))

    # [b, H] -> [B, H]
    zero_outputs[mask] = outputs
    return zero_outputs
def _get_instr_embedding(self, instr): if self.lang_model == 'gru': _, hidden = self.instr_rnn(self.word_embedding(instr)) return hidden[-1] elif self.lang_model in ['bigru', 'attgru']: lengths = (instr != 0).sum(1).long() masks = (instr != 0).float() if lengths.shape[0] > 1: seq_lengths, perm_idx = lengths.sort(0, descending=True) iperm_idx = torch.LongTensor(perm_idx.shape).fill_(0) if instr.is_cuda: iperm_idx = iperm_idx.cuda() for i, v in enumerate(perm_idx): iperm_idx[v.data] = i inputs = self.word_embedding(instr) inputs = inputs[perm_idx] inputs = pack_padded_sequence(inputs, seq_lengths.data.cpu().numpy(), batch_first=True) outputs, final_states = self.instr_rnn(inputs) else: instr = instr[:, 0:lengths[0]] outputs, final_states = self.instr_rnn( self.word_embedding(instr)) iperm_idx = None final_states = final_states.transpose(0, 1).contiguous() final_states = final_states.view(final_states.shape[0], -1) if iperm_idx is not None: outputs, _ = pad_packed_sequence(outputs, batch_first=True) outputs = outputs[iperm_idx] final_states = final_states[iperm_idx] if outputs.shape[1] < masks.shape[1]: masks = masks[:, :(outputs.shape[1] - masks.shape[1])] # the packing truncated the original length # so we need to change mask to fit it return outputs if self.lang_model == 'attgru' else final_states elif self.lang_model == 'conv': inputs = self.word_embedding(instr).unsqueeze(1) # (B,1,T,D) inputs = [ F.relu(conv(inputs)).squeeze(3) for conv in self.instr_convs ] inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] return torch.cat(inputs, 1) elif self.lang_model == 'bow': device = torch.device("cuda" if instr.is_cuda else "cpu") input_dim = self.obs_space["instr"] input = torch.zeros((instr.size(0), input_dim), device=device) idx = torch.arange(instr.size(0), dtype=torch.int64) input[idx.unsqueeze(1), instr] = 1. return self.instr_bow(input) else: ValueError("Undefined instruction architecture: {}".format( self.use_instr))
return efactor, images, env_sst, target def __len__(self): return len(self.efactors) def __getitem__(self, idx): return self.efactors[idx], self.images[idx], self.env_sst[ idx], self.targets[idx] if __name__ == '__main__': tc_data = TC_Data_varbatch(years=[2000]) for minibatch in tc_data.get_batches(): images, efactors, envsst, targets, batch_len = minibatch images = rnn_utils.pack_padded_sequence(images, batch_len, batch_first=True) efactors = rnn_utils.pack_padded_sequence(efactors, batch_len, batch_first=True) envsst = rnn_utils.pack_padded_sequence(envsst, batch_len, batch_first=True) targets = rnn_utils.pack_padded_sequence(targets, batch_len, batch_first=True) # targets = targets[:, -1, :] print(envsst) print(efactors) print(targets)
def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> PackedSequence:
    """Pack a padded, batch-first feature map together with its lengths.

    ``x`` is a ``(feature_map, lengths)`` pair. When ``self._permuting`` is
    set, the last two axes of the feature map are swapped before packing.
    The batch does not need to be sorted by length.
    """
    features, lengths = x
    if self._permuting:
        features = features.permute(0, 2, 1)
    return pack_padded_sequence(features,
                                lengths,
                                batch_first=True,
                                enforce_sorted=False)
def forward(self, input_sequence, length):
    """Run the sentence VAE: encode, sample a latent code, decode.

    Args:
        input_sequence: padded token ids, indexed as (batch, time).
        length: 1D tensor with the true length of every sequence.

    Returns:
        ``(logp, mean, logv, z)``. ``logp`` holds per-token log-probabilities
        over the vocabulary in the caller's batch order.

    NOTE(review): ``mean``, ``logv`` and ``z`` are returned in length-sorted
    order, as before -- confirm callers do not pair them with unsorted data.
    """
    batch_size = input_sequence.size(0)
    sorted_lengths, sorted_idx = torch.sort(length, descending=True)
    input_sequence = input_sequence[sorted_idx]
    packing_lengths = sorted_lengths.data.tolist()

    # ENCODER
    input_embedding = self.embedding(input_sequence)
    packed_input = rnn_utils.pack_padded_sequence(input_embedding,
                                                  packing_lengths,
                                                  batch_first=True)
    _, hidden = self.encoder_rnn(packed_input)

    # An LSTM returns (h_n, c_n); only h_n feeds the latent projection.
    h_n = hidden[0] if isinstance(hidden, tuple) else hidden
    stacked = self.bidirectional or self.num_layers > 1

    if stacked:
        # h_n is (hidden_factor, batch, hidden). Move the batch axis to the
        # front before flattening: a plain view() interleaves states that
        # belong to different batch elements.
        h_flat = h_n.transpose(0, 1).contiguous().view(
            batch_size, self.hidden_size * self.hidden_factor)
    else:
        # squeeze(0), not squeeze(): keep the batch axis for a batch of 1.
        h_flat = h_n.squeeze(0)

    # REPARAMETERIZATION
    mean = self.hidden2mean(h_flat)
    logv = self.hidden2logv(h_flat)
    std = torch.exp(0.5 * logv)

    z = to_var(torch.randn([batch_size, self.latent_size]))
    z = z * std + mean

    # DECODER
    is_lstm_decoder = isinstance(self.decoder_rnn, nn.LSTM)
    h_t = self.latent2hidden(z)

    if stacked:
        # Inverse of the flattening above: (batch, factor * hidden) ->
        # (factor, batch, hidden) without mixing batch elements.
        h_t = h_t.view(batch_size, self.hidden_factor,
                       self.hidden_size).transpose(0, 1).contiguous()
    else:
        h_t = h_t.unsqueeze(0)

    if is_lstm_decoder:
        # zeros_like already matches the device/dtype of h_t, so no
        # explicit .cuda() move is needed.
        hidden = (h_t, torch.zeros_like(h_t))
    else:
        hidden = h_t

    # decoder input
    if self.word_dropout_rate > 0:
        # randomly replace decoder input with <unk>
        prob = torch.rand(input_sequence.size()).to(input_sequence.device)
        # never drop <sos> or <pad> positions
        prob[(input_sequence.data - self.sos_idx) *
             (input_sequence.data - self.pad_idx) == 0] = 1
        decoder_input_sequence = input_sequence.clone()
        decoder_input_sequence[
            prob < self.word_dropout_rate] = self.unk_idx
        input_embedding = self.embedding(decoder_input_sequence)
    input_embedding = self.embedding_dropout(input_embedding)
    packed_input = rnn_utils.pack_padded_sequence(input_embedding,
                                                  packing_lengths,
                                                  batch_first=True)

    # decoder forward pass
    outputs, _ = self.decoder_rnn(packed_input, hidden)

    # process outputs: unpack and restore the caller's batch order
    padded_outputs = rnn_utils.pad_packed_sequence(outputs,
                                                   batch_first=True)[0]
    padded_outputs = padded_outputs.contiguous()
    _, reversed_idx = torch.sort(sorted_idx)
    padded_outputs = padded_outputs[reversed_idx]
    b, s, _ = padded_outputs.size()

    # project outputs to vocab
    logp = nn.functional.log_softmax(self.outputs2vocab(
        padded_outputs.view(-1, padded_outputs.size(2))), dim=-1)
    logp = logp.view(b, s, self.embedding.num_embeddings)

    return logp, mean, logv, z
def forward(self, inputs, initial_state=None, **kwargs):
    """Run the stacked LSTM layers over ``inputs``.

    ``inputs`` may be a PackedSequence. ``initial_state`` is an optional
    ``(h, c)`` pair whose first dimension must equal the number of per-layer
    states expected for ``self.stack_mode``; it is split along that
    dimension into one ``(h, c)`` pair per state.

    Returns ``(output_sequence, (final_hidden_state, final_cell_state))``
    with the final states of all layers concatenated along dimension 0.

    Raises ValueError when ``initial_state`` has the wrong number of states.
    """
    # Remember whether the caller passed a packed batch: layer outputs are
    # then padded for concatenation and re-packed afterwards.
    is_packed = isinstance(inputs, PackedSequence)
    if self.stack_mode in ['bidirectional']:
        if not initial_state:
            hidden_states = [None] * len(self.lstm_layers) * len(
                self.lstm_layers[0])
        elif initial_state[0].size()[0] != len(self.lstm_layers) * len(
                self.lstm_layers[0]):
            raise ValueError(
                f"initial states does not match the number of layers.")
        else:
            # One (h, c) pair per state, each keeping a leading dim of 1.
            hidden_states = list(
                zip(initial_state[0].split(1, 0),
                    initial_state[1].split(1, 0)))
    else:
        if not initial_state:
            hidden_states = [None] * len(self.lstm_layers)
        elif initial_state[0].size()[0] != len(self.lstm_layers):
            raise ValueError(
                f"initial states does not match the number of layers.")
        else:
            hidden_states = list(
                zip(initial_state[0].split(1, 0),
                    initial_state[1].split(1, 0)))
    # print(f"nndct_inputs:{inputs}")
    output_sequence = inputs
    if self.stack_mode in ['bidirectional']:
        final_h = []
        final_c = []
        for i in range(len(self.lstm_layers)):
            forward_layer = getattr(self, "forward_layer_{}".format(i))
            backward_layer = getattr(self, "backward_layer_{}".format(i))
            # States are consumed as [fwd_0, bwd_0, fwd_1, bwd_1, ...].
            forward_output, final_forward_state = forward_layer(
                output_sequence, hidden_states[i * 2])
            # NOTE(review): transpose_ works in place, so on the first layer
            # it mutates the caller's tensor, and it is not undone before
            # the concatenation below -- confirm this is intended.
            if self.batch_first is not True:
                output_sequence.transpose_(0, 1)
            backward_output, final_backward_state = backward_layer(
                output_sequence, hidden_states[i * 2 + 1])
            if is_packed:
                # Pad both directions so they can be concatenated.
                forward_output, lengths = pad_packed_sequence(
                    forward_output, batch_first=self.batch_first)
                backward_output, _ = pad_packed_sequence(
                    backward_output, batch_first=self.batch_first)
            # output_sequence = output_sequence.flip(1)
            # backward_output = backward_output.flip(1)
            # Concatenate the two directions along the feature axis.
            output_sequence = torch.cat([forward_output, backward_output], -1)
            if is_packed:
                # Re-pack so the next layer again receives a PackedSequence.
                output_sequence = pack_padded_sequence(
                    output_sequence, lengths, batch_first=self.batch_first)
            final_h.extend(
                [final_forward_state[0], final_backward_state[0]])
            final_c.extend(
                [final_forward_state[1], final_backward_state[1]])
        final_hidden_state = torch.cat(final_h, dim=0)
        final_cell_state = torch.cat(final_c, dim=0)
    else:
        final_states = []
        for i, state in enumerate(hidden_states):
            if self.stack_mode == 'alternating':
                # Even layers use the forward cell, odd layers the backward.
                layer = getattr(
                    self, f"forward_layer_{i}") if i % 2 == 0 else getattr(
                        self, f"backward_layer_{i}")
            else:
                layer = getattr(self, f"forward_layer_{i}")
            output_sequence, final_state = layer(output_sequence, state)
            # print(f"nndct_layer{i} output:{output_sequence}")
            final_states.append(final_state)
        # Regroup [(h_0, c_0), (h_1, c_1), ...] into stacked h and stacked c.
        final_hidden_state, final_cell_state = tuple(
            torch.cat(state_list, 0) for state_list in zip(*final_states))
    # print(f"nndct_final_output:{output_sequence}")
    return output_sequence, (final_hidden_state, final_cell_state)
# E_diff_cost = compute_diff_loss(img_embed_com_l2, img_embed_spe_l2) # E_reconst_cost = compute_reconst_loss(input_res, img_reconst) + compute_reconst_loss(input_res, img_text_reconst) + compute_reconst_loss(input_res, img_text_agg_reconst) # E_cost = 1.0 * E_sim_cost + 2.0 * E_cls_cost + 1.0 * E_diff_cost + 1.0 * E_reconst_cost # ## # optimizerE.zero_grad() # E_cost.backward(retain_graph=True) # optimizerE.step() ################################## # (2) Train Explanation network ################################## img_binary = torch.sigmoid(img_binary) lstm_outputs = netR(input_res, img_embed, img_binary, input_wordID, input_cap_len) lstm_targets = pack_padded_sequence(target_wordID, input_cap_len, batch_first=True)[0] # LSTM loss lstm_cost = lstm_criterion(lstm_outputs, lstm_targets) R_cost = lstm_cost optimizerR.zero_grad() R_cost.backward() optimizerR.step() mean_R_loss = lstm_cost.item() # evaluate mode # netE.eval() # Generalized zero-shot learning print('[%d/%d] R_loss: %.4f' % (epoch, opt.nepoch, mean_R_loss)) # Generate sentence # generated_exp = eval_explanation(netR, input_res, data.vocab) ## why it is an error?? # print(generated_exp[0]['caption'])
def forward(self,
            input,
            targets,
            input_lens,
            target_lens,
            return_decoder_all_h=False,
            use_teacher_forcing=False,
            SOS_index=0):
    """
    Encode ``input`` and decode one token per input time step with attention.

    input shape: (S, N)
    targets shape: (S, N)
    input_lens: true lengths of the input sequences (need not be sorted)
    target_lens: unused
    return_decoder_all_h: whether return every sequence value in decoder rnns
    use_teacher_forcing: feed ``targets`` instead of the previous prediction
    SOS_index: token id used as the first decoder input

    Returns ``(outputs, decoder_hidden)`` with outputs of shape (S, N, ntok),
    plus the list of per-step decoder hidden states when
    ``return_decoder_all_h`` is set.
    """
    batch_size = input.size()[1]
    emb = embedded_dropout(self.input_embedding,
                           input,
                           dropout=self.dropoute if self.training else 0)
    emb = self.lockdrop(emb, self.dropouti)
    # emb shape: (S, N, emsize)

    encoder_hidden = self.init_hidden(batch_size)
    packed_emb = pack_padded_sequence(emb,
                                      input_lens,
                                      batch_first=False,
                                      enforce_sorted=False)
    encoder_outputs, encoder_hidden = self.encoder_rnns(
        packed_emb, encoder_hidden)
    encoder_outputs, _ = pad_packed_sequence(encoder_outputs)
    encoder_outputs = self.lockdrop(encoder_outputs, self.dropout)
    # encoder_outputs shape: (S, N, nhid)
    # encoder_hidden shape: (nlayers*directions, N, nhid)

    # Both are loop invariants of the decoding loop below.
    source_len = encoder_outputs.size()[0]
    encoder_outputs_batchfirst = encoder_outputs.transpose(0, 1)

    decoder_rnns_output_list = []
    decoder_rnns_h_list = []

    decoder_input = self.input_embedding.weight.new_full(
        [1, batch_size], SOS_index, dtype=torch.long)
    # decoder_input shape: (1, N)

    decoder_hidden = encoder_hidden

    for seq_index in range(input.size()[0]):
        decoder_input = self.target_embedding(decoder_input)
        step_embedding = decoder_input.view(-1, decoder_input.size()[2])
        # step_embedding shape: (N, emsize)

        h_n_batchfirst = decoder_hidden.transpose(0, 1)
        h_n_batchfirst = h_n_batchfirst.reshape(batch_size, -1)
        # h_n_batchfirst shape: (N, nlayers*directions*nhid)

        # Normalise over source positions. dim=1 is what the deprecated
        # implicit-dim call chose for a 2D input; it is now explicit.
        attn_scores = self.attn(
            torch.cat((step_embedding, h_n_batchfirst),
                      dim=1))[:, :source_len]
        attn_weights = F.softmax(attn_scores, dim=1)
        # attn_weights shape: (N, S)

        attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                 encoder_outputs_batchfirst)
        # attn_applied shape: N, 1, nhid

        attn_combine_output = F.relu(
            self.attn_combine(
                torch.cat((step_embedding,
                           attn_applied.view(attn_applied.size()[0],
                                             attn_applied.size()[2])),
                          dim=1)))
        # attn_combine_output shape: N, nhid

        decoder_rnns_output, decoder_hidden = self.decoder_rnns(
            attn_combine_output.unsqueeze(0), decoder_hidden)
        # decoder_rnns_output shape: (1, N, nhid),
        # decoder_hidden shape: (nlayers*directions, N, nhid)

        decoder_rnns_output = self.decoder(decoder_rnns_output)
        # decoder_rnns_output shape: (1, N, ntok)

        if use_teacher_forcing:
            decoder_input = targets[seq_index].view(-1, batch_size)
        else:
            # Greedy choice; detached so no gradient flows through the
            # discrete token selection.
            _, topi = decoder_rnns_output.topk(1, dim=2)
            decoder_input = topi.view(1, batch_size).detach()

        decoder_rnns_output_list.append(decoder_rnns_output)
        decoder_rnns_h_list.append(decoder_hidden)

    decoder_rnns_output_tensor = torch.cat(tuple(decoder_rnns_output_list),
                                           dim=0)
    # decoder_rnns_output_tensor shape: (S, N, ntok)

    if not return_decoder_all_h:
        return decoder_rnns_output_tensor, decoder_hidden
    else:
        return decoder_rnns_output_tensor, decoder_hidden, \
            decoder_rnns_h_list
def forward(self, sentence, p_sentence, pos_tags, lengths, target_idx_in,
            region_marks, local_roles_voc, frames, local_roles_mask,
            sent_pred_lemmas_idx, dep_tags, dep_heads, targets,
            specific_dep_tags, specific_dep_relations, test=False):
    """Joint dependency-label (DEP) and semantic-role (SRL) forward pass.

    A two-layer BiLSTM scores a dependency label for every word; those
    labels (gold embeddings in training, detached predicted distributions
    when ``test`` is set) are fed with the word features into the SRL
    BiLSTM, whose states are scored against the role embeddings.

    Returns a 15-tuple
    ``(SRLloss, DEPloss, DEPloss, loss, SRLprobs, wrong_l_nums, all_l_nums,
    wrong_l_nums, all_l_nums, right_noNull_predict, noNull_predict,
    noNUll_truth, right_noNull_predict, noNull_predict, noNUll_truth)``.
    ``DEPloss`` is a zero tensor when no word violates the margin.

    Side effects: overwrites ``self.hidden``, ``self.hidden_2`` and
    ``self.hidden_4`` with the final LSTM states.
    """
    seq_len = len(sentence[0])

    # construct input for DEP
    embeds_DEP = self.word_embeddings_DEP(sentence)
    embeds_DEP = embeds_DEP.view(self.batch_size, seq_len,
                                 self.word_emb_dim)
    pos_embeds = self.pos_embeddings(pos_tags)
    pos_embeds_DEP = self.pos_embeddings_DEP(pos_tags)
    region_marks = region_marks.view(self.batch_size, seq_len, 1)
    fixed_embeds_DEP = self.word_fixed_embeddings_DEP(p_sentence)
    fixed_embeds_DEP = fixed_embeds_DEP.view(self.batch_size, seq_len,
                                             self.word_emb_dim)

    embeds_forDEP = torch.cat(
        (embeds_DEP, fixed_embeds_DEP, pos_embeds_DEP, region_marks), 2)
    embeds_forDEP = self.DEP_input_dropout(embeds_forDEP)

    # first layer
    embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
        embeds_forDEP, lengths)
    # Lengths are handed to the packer as a CPU array in all three layers
    # (the packer rejects tensors that live on an accelerator).
    embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                           lengths_sort.cpu().numpy(),
                                           batch_first=True)
    hidden_states, self.hidden = self.BiLSTM_0(embeds_sort, self.hidden)
    hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                  batch_first=True)
    hidden_states_0 = hidden_states[unsort_idx]

    # second layer
    embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
        hidden_states_0, lengths)
    embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                           lengths_sort.cpu().numpy(),
                                           batch_first=True)
    hidden_states, self.hidden_2 = self.BiLSTM_1(embeds_sort,
                                                 self.hidden_2)
    hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                  batch_first=True)
    hidden_states_1 = hidden_states[unsort_idx]

    # Pair every word state with the state of its sentence's predicate.
    Label_composer = hidden_states_1
    predicate_embeds = Label_composer[
        np.arange(0, Label_composer.size()[0]), target_idx_in]
    # T * B * H: adding to zeros broadcasts the predicate over time
    added_embeds = torch.zeros(Label_composer.size()[1],
                               Label_composer.size()[0],
                               Label_composer.size()[2]).to(device)
    concat_embeds = (added_embeds + predicate_embeds).transpose(0, 1)
    Label_features = torch.cat((Label_composer, concat_embeds), 2)
    # torch.tanh replaces the deprecated F.tanh.
    dep_tag_space = self.MLP(
        self.label_dropout(torch.tanh(
            self.hidden2tag(Label_features)))).view(
                seq_len * self.batch_size, -1)
    dep_labels = torch.argmax(dep_tag_space, dim=1)

    if test:
        # At test time feed the predicted label distribution, detached so
        # the SRL loss cannot reach the dependency scorer through it.
        TagProbs_use = F.softmax(dep_tag_space,
                                 dim=1).view(self.batch_size, seq_len, -1)
        TagProbs_noGrad = TagProbs_use.detach()
        h1 = F.relu(self.tag2hidden(TagProbs_noGrad))
    else:
        h1 = self.dep_embeddings(dep_tags).view(self.batch_size, seq_len,
                                                -1)

    fixed_embeds = self.word_fixed_embeddings(p_sentence)
    fixed_embeds = fixed_embeds.view(self.batch_size, seq_len,
                                     self.word_emb_dim)
    sent_pred_lemmas_embeds = self.p_lemma_embeddings(sent_pred_lemmas_idx)
    embeds_SRL = self.word_embeddings_SRL(sentence)
    embeds_SRL = embeds_SRL.view(self.batch_size, seq_len,
                                 self.word_emb_dim)

    SRL_hidden_states = torch.cat(
        (embeds_SRL, fixed_embeds, sent_pred_lemmas_embeds, pos_embeds,
         region_marks, h1), 2)
    SRL_hidden_states = self.SRL_input_dropout(SRL_hidden_states)

    # SRL layer
    embeds_sort, lengths_sort, unsort_idx = self.sort_batch(
        SRL_hidden_states, lengths)
    embeds_sort = rnn.pack_padded_sequence(embeds_sort,
                                           lengths_sort.cpu().numpy(),
                                           batch_first=True)
    hidden_states, self.hidden_4 = self.BiLSTM_SRL(embeds_sort,
                                                   self.hidden_4)
    hidden_states, lens = rnn.pad_packed_sequence(hidden_states,
                                                  batch_first=True)
    hidden_states = hidden_states[unsort_idx]
    hidden_states = self.hidden_state_dropout(hidden_states)

    # B * H
    hidden_states_3 = hidden_states
    predicate_embeds = hidden_states_3[
        np.arange(0, hidden_states_3.size()[0]), target_idx_in]
    # T * B * H
    added_embeds = torch.zeros(hidden_states_3.size()[1],
                               hidden_states_3.size()[0],
                               hidden_states_3.size()[2]).to(device)
    predicate_embeds = added_embeds + predicate_embeds
    # B * T * H
    predicate_embeds = predicate_embeds.transpose(0, 1)
    hidden_states = torch.cat((hidden_states_3, predicate_embeds), 2)

    # non-linear map and rectify the roles' embeddings
    # B * roles * h
    role_embeds = self.role_embeddings(local_roles_voc)
    frame_embeds = self.frame_embeddings(frames)
    role_embeds = torch.cat((role_embeds, frame_embeds), 2)
    mapped_roles = F.relu(self.role_map(role_embeds))
    mapped_roles = torch.transpose(mapped_roles, 1, 2)

    # b, times, roles
    tag_space = torch.matmul(hidden_states, mapped_roles)

    # Roles outside the local vocabulary (mask == 0) get a large negative
    # offset so the softmax ignores them.
    sub = torch.add(local_roles_mask, -1.0) * _BIG_NUMBER
    sub = torch.FloatTensor(sub.cpu().numpy()).to(device)
    # Swap batch and time so the per-sentence offsets broadcast over time.
    tag_space = torch.transpose(tag_space, 0, 1)
    tag_space += sub
    # b, T, roles
    tag_space = torch.transpose(tag_space, 0, 1)
    tag_space = tag_space.view(seq_len * self.batch_size, -1)
    SRLprobs = F.softmax(tag_space, dim=1)

    # Margin loss on dependency labels: penalise words whose gold score
    # does not beat the best wrong label by at least 1.
    goldLabelInd = dep_tags.view(-1).cpu().data.numpy()
    rscores = dep_tag_space.view(self.batch_size * seq_len,
                                 -1).cpu().data.numpy()
    rexprs = dep_tag_space.view(self.batch_size * seq_len, -1)
    lerrs = []
    # for every word in the batch
    for i in range(len(rscores)):
        if goldLabelInd[i] == 0:
            continue  # label 0 is skipped here and ignored by the SRL loss
        wrongLabelInd = \
            max(((l, scr) for l, scr in enumerate(rscores[i])
                 if l != goldLabelInd[i]), key=itemgetter(1))[0]
        if rscores[i][goldLabelInd[i]] < rscores[i][wrongLabelInd] + 1:
            lerrs += [
                rexprs[i][wrongLabelInd] - rexprs[i][goldLabelInd[i]]
            ]

    # Dependency-label counts.
    wrong_l_nums = 0.0
    all_l_nums = 0.0
    right_noNull_predict = 0.0
    noNull_predict = 0.0
    noNUll_truth = 0.0
    # Compare as numpy scalars instead of iterating a (possibly GPU) tensor
    # element by element.
    for predict_l, gold_l in zip(dep_labels.cpu().numpy(),
                                 dep_tags.cpu().view(-1).data.numpy()):
        if predict_l > 1:
            noNull_predict += 1
        if gold_l != 0:
            all_l_nums += 1
            if gold_l != 1:
                noNUll_truth += 1
                if gold_l == predict_l:
                    right_noNull_predict += 1
        if predict_l != gold_l and gold_l != 0:
            wrong_l_nums += 1

    targets = targets.view(-1)
    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    SRLloss = loss_function(tag_space, targets)

    if len(lerrs) > 0:
        DEPloss = sum(lerrs)
        loss = SRLloss + DEPloss
    else:
        # DEPloss used to stay unbound here, so the return statement raised
        # UnboundLocalError whenever no word violated the margin.
        DEPloss = SRLloss.new_zeros(())
        loss = SRLloss
    return SRLloss, DEPloss, DEPloss, loss, SRLprobs, wrong_l_nums, all_l_nums, wrong_l_nums, all_l_nums, \
        right_noNull_predict, noNull_predict, noNUll_truth,\
        right_noNull_predict, noNull_predict, noNUll_truth
def forward(self, features, labels=None, valid_lengths=None):
    """Classify (or regress) one label per utterance from frame features.

    features from mockingjay: (batch_size, layer, seq_len, feature)
    features from baseline: (batch_size, seq_len, feature)
    labels: (batch_size,), one utterance to one label
    valid_lengths: (batch_size, ), required, sorted in decreasing order
        (the sequences are packed with ``enforce_sorted=True``)

    Returns ``result`` when ``labels`` is None, otherwise
    ``(loss, result.detach().cpu(), correct, valid)``.

    Raises NotImplementedError for an unknown ``select_hidden`` or ``mode``.
    """
    assert (valid_lengths is not None), 'Valid_lengths is required.'

    has_layers = len(features.shape) == 4
    batch_size = features.size(0)
    layer_num = features.size(1) if has_layers else None
    seq_len = features.size(2) if has_layers else features.size(1)
    feature_dim = features.size(3) if has_layers else features.size(2)

    select_hidden = self.config['select_hidden']
    if has_layers:
        # collapse the layer axis of the mockingjay representations
        if select_hidden == 'last':
            features = features[:, -1, :, :]
        elif select_hidden == 'first':
            features = features[:, 0, :, :]
        elif select_hidden == 'average':
            # simple average over all layers, (batch_size, seq_len, feature)
            features = features.mean(dim=1)
        elif select_hidden == 'weighted_sum':
            features = features.transpose(0, 1).reshape(layer_num, -1)
            features = torch.matmul(self.weight[:layer_num],
                                    features).reshape(
                                        batch_size, seq_len, feature_dim)
        elif select_hidden == 'weighted_sum_norm':
            weights = nn.functional.softmax(self.weight[:layer_num], dim=-1)
            features = features.transpose(0, 1).reshape(layer_num, -1)
            features = torch.matmul(weights, features).reshape(
                batch_size, seq_len, feature_dim)
        else:
            raise NotImplementedError(
                'Feature selection mode not supported!')

    # Keep every sample_rate-th frame and scale the lengths to match.
    # Floor division on a fresh tensor: the former in-place `/=` mutated the
    # caller's lengths and is rejected for integer tensors by current
    # PyTorch.
    sample_rate = self.config['sample_rate']
    features = features[:, torch.arange(0, seq_len, sample_rate), :]
    valid_lengths = valid_lengths // sample_rate

    for linear in self.pre_linears:
        features = linear(features)
        features = self.act_fn(features)
        features = self.dropout(features)

    packed = pack_padded_sequence(features,
                                  valid_lengths,
                                  batch_first=True,
                                  enforce_sorted=True)
    _, h_n = self.rnn(packed)
    # h_n directly holds the final states, so the last entry serves as the
    # utterance embedding
    hidden = h_n[-1, :, :]

    for linear in self.post_linears:
        hidden = linear(hidden)
        hidden = self.act_fn(hidden)
        hidden = self.dropout(hidden)

    logits = self.out(hidden)

    mode = self.config['mode']
    if mode == 'classification':
        result = self.out_fn(logits)
        # result: (batch_size, class_num)
    elif mode == 'regression':
        result = logits.reshape(-1)
        # result: (batch_size, )
    else:
        # Previously an unknown mode surfaced as an unbound `result`.
        raise NotImplementedError('Mode not supported!')

    if labels is not None:
        loss = self.criterion(result, labels)

        # statistic for accuracy
        if mode == 'classification':
            correct, valid = self.statistic(result, labels)
        else:
            # correct and valid have no meaning in regression mode; they
            # only keep the outside wrapper working
            correct, valid = torch.LongTensor([1]), torch.LongTensor([1])

        return loss, result.detach().cpu(), correct, valid

    return result
def beam_search(self, src_sent: List[str], beam_size: int = 20,
                max_decoding_time_step: int = 70) -> List[Hypothesis]:
    """
    Given a single source sentence, perform beam search

    Args:
        src_sent: the source example; ``src_sent[1]`` is what gets converted
            to vocabulary indices and encoded
        beam_size: beam size
        max_decoding_time_step: maximum number of time steps to unroll the decoding RNN

    Returns:
        hypotheses: a one-element list holding the best hypothesis, with fields:
            value: List[str]: the decoded target words, cut before the first
                end-of-utterance id if one was produced
            score: float: the beam score of that hypothesis as accumulated
                in ``beam_probs`` below

    Note:
        Sets ``self.eou`` as a side effect and allocates every tensor on CUDA.
    """
    self.eou = 2
    top_k = 20
    batch_size = 1
    low_ind = [0]
    high_ind = []

    # ---- encode the single source sentence with both encoders ----
    src_ind = torch.cuda.LongTensor(self.low_src_vocab.words2indices(src_sent[1]))
    src_embed = self.low_src_embed(src_ind).unsqueeze(0)
    src_lengths = np.asarray([len(src_sent[1])])
    packed_input = pack_padded_sequence(src_embed, src_lengths, batch_first=True)
    src_output, src_last_hidden = self.encoder(packed_input)
    low_src_output, low_src_last_hidden = self.low_encoder(packed_input)
    src_hidden, _ = pad_packed_sequence(src_output, batch_first=True)
    low_src_hidden, _ = pad_packed_sequence(low_src_output, batch_first=True)

    decoder_hidden = self.init_hidden((low_src_last_hidden, src_last_hidden),
                                      batch_size, low_ind, high_ind)
    eos_filler = torch.zeros(beam_size).long().cuda().fill_(self.eou)
    decoder_input = self.tgt_embed(torch.cuda.LongTensor([1])).unsqueeze(1)
    length = src_hidden.size(1)
    src_lengths = torch.cuda.LongTensor(src_lengths)

    # ---- attention keys/values for the first (batch of one) step ----
    q_key = self.q_key(src_hidden)
    q_value = self.q_value(src_hidden)
    q_spec_key = torch.zeros(batch_size, length, self.key_size).cuda()
    q_spec_key[low_ind, :, :] = self.q_low_key(low_src_hidden[low_ind, :, :])
    q_spec_value = torch.zeros(batch_size, length, self.embed_size).cuda()
    q_spec_value[low_ind, :, :] = self.q_low_value(low_src_hidden[low_ind, :, :])
    q_mask = torch.arange(length).long().cuda().repeat(src_hidden.size(0), 1) < torch.cuda.LongTensor(
        src_lengths).repeat(length, 1).transpose(0, 1)

    # ---- first decoding step: seed the beam with the top beam_size words ----
    context = self.attention(decoder_hidden, q_key, q_value, q_spec_key, q_spec_value, q_mask)
    decoder_output, decoder_hidden = self.decoder(torch.cat((decoder_input, context), dim=2), decoder_hidden)
    decoder_output = torch.cat((decoder_output, context), dim=2)
    decoder_output = self.word_dist(torch.tanh(self.out(decoder_output.squeeze(1))))
    # id 0 must never be selected
    decoder_output[:, 0] = -np.inf
    logprobs, argtop = torch.topk(F.log_softmax(decoder_output, dim=1), beam_size, dim=1)

    beam = torch.zeros(beam_size, max_decoding_time_step).long().cuda()
    beam[:, 0] = argtop
    beam_probs = logprobs.clone().squeeze(0)
    beam_eos = argtop.squeeze(0) == self.eou

    # ---- replicate decoder state and encoder outputs across the beam ----
    decoder_hidden = (decoder_hidden[0].expand(1, beam_size, self.hidden_size).contiguous(),
                      decoder_hidden[1].expand(1, beam_size, self.hidden_size).contiguous())
    decoder_input = self.tgt_embed(argtop.squeeze(0)).unsqueeze(1)
    src_hidden = src_hidden.expand(beam_size, length, self.hidden_size * 2)
    low_src_hidden = low_src_hidden.expand(beam_size, length, self.hidden_size * 2)
    q_key = self.q_key(src_hidden)
    q_value = self.q_value(src_hidden)
    q_spec_key = self.q_low_key(low_src_hidden)
    q_spec_value = self.q_low_value(low_src_hidden)
    q_mask = torch.arange(length).long().cuda().repeat(src_hidden.size(0), 1) < torch.cuda.LongTensor(
        src_lengths).repeat(length, 1).transpose(0, 1)

    for t in range(max_decoding_time_step - 1):
        context = self.attention(decoder_hidden, q_key, q_value, q_spec_key, q_spec_value, q_mask)
        decoder_output, decoder_hidden = self.decoder(
            torch.cat((decoder_input, context), dim=2).transpose(0, 1), decoder_hidden)
        decoder_output = torch.cat((decoder_output.transpose(0, 1), context), dim=2)
        decoder_output = self.word_dist(torch.tanh(self.out(decoder_output)))
        logprobs, argtop = torch.topk(F.log_softmax(decoder_output.squeeze(1), dim=1), top_k, dim=1)

        # pick the beam_size best (beam entry, candidate word) pairs out of
        # the flattened beam_size * top_k score table
        best_probs, best_args = (beam_probs.expand(top_k, beam_size).transpose(0, 1) + logprobs).view(-1).topk(
            beam_size)
        # Integer division is required: `/` is a true division on integer
        # tensors and would produce float values that cannot index a tensor.
        last = best_args // top_k   # which beam entry each winner extends
        curr = best_args % top_k    # which of that entry's top_k candidates

        beam[:, :] = beam[last, :]
        beam_eos = beam_eos[last]
        beam_probs = beam_probs[last]
        # finished hypotheses keep emitting the end-of-utterance id
        beam[:, t + 1] = argtop[last, curr] * (~beam_eos).long() + eos_filler * beam_eos.long()
        mask = ~beam_eos
        # only unfinished hypotheses have their score updated
        beam_probs[mask] = (beam_probs[mask] * (t + 1) + best_probs[mask]) / (t + 2)
        decoder_hidden = (decoder_hidden[0][:, last, :], decoder_hidden[1][:, last, :])
        beam_eos = beam_eos | (beam[:, t + 1] == self.eou)
        decoder_input = self.tgt_embed(beam[:, t + 1]).unsqueeze(1)

        if beam_eos.all():
            break

    best, best_arg = beam_probs.max(0)
    translation = beam[best_arg].cpu().tolist()
    if self.eou in translation:
        translation = translation[:translation.index(self.eou)]
    translation = [self.tgt_vocab.id2word[w] for w in translation]
    return [Hypothesis(value=translation, score=best.item())]
def featurize(self, batch, load_mask=True, load_frames=True):
    '''
    Collect per-example features from `batch`, then tensorize and pad them.

    Returns a dict (defaultdict) keyed by feature name. 'lang_goal_instr'
    becomes a PackedSequence of word embeddings, 'action_low_mask' a list of
    float tensors, and every other key a padded batch-first tensor.
    '''
    if self.args.gpu:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    feat = collections.defaultdict(list)

    for ex in batch:
        num = ex['num']

        # ---- auxiliary supervision (training only) ----
        if not self.test_mode:
            # subgoal completion supervision
            if self.args.subgoal_aux_loss_wt > 0:
                feat['subgoals_completed'].append(
                    np.array(num['low_to_high_idx']) / self.max_subgoals)

            # progress monitor supervision
            if self.args.pm_aux_loss_wt > 0:
                num_actions = sum(1 for sg in num['action_low'] for _ in sg)
                feat['subgoal_progress'].append(
                    [step / float(num_actions)
                     for step in range(1, num_actions + 1)])

        # ---- inputs ----
        # serialize segments
        self.serialize_lang_action(ex)

        # goal and instr language, zeroed out if requested
        lang_goal = ex['num']['lang_goal']
        lang_instr = ex['num']['lang_instr']
        if self.args.zero_goal:
            lang_goal = self.zero_input(lang_goal)
        if self.args.zero_instr:
            lang_instr = self.zero_input(lang_instr)

        # append goal + instr
        feat['lang_goal_instr'].append(lang_goal + lang_instr)

        # load Resnet features from disk
        if load_frames and not self.test_mode:
            root = self.get_task_root(ex)
            im = torch.load(os.path.join(root, self.feat_pt))
            keep = [None] * len(ex['plan']['low_actions'])
            for frame_no, info in enumerate(ex['images']):
                # only keep the first frame linked with each low-level action
                # (i.e. skip filler frames like smooth rotations and dish washing)
                low_idx = info['low_idx']
                if keep[low_idx] is None:
                    keep[low_idx] = im[frame_no]
            keep.append(keep[-1])  # stop frame
            feat['frames'].append(torch.stack(keep, dim=0))

        # ---- outputs (training only) ----
        if not self.test_mode:
            low_actions = ex['num']['action_low']

            # low-level action
            feat['action_low'].append([a['action'] for a in low_actions])

            # low-level action mask
            if load_mask:
                feat['action_low_mask'].append(
                    [self.decompress_mask(a['mask'])
                     for a in low_actions if a['mask'] is not None])

            # low-level valid interact
            feat['action_low_valid_interact'].append(
                [a['valid_interact'] for a in low_actions])

    # ---- tensorization and padding ----
    for key in list(feat):
        values = feat[key]
        if key == 'lang_goal_instr':
            # language embedding and padding
            seqs = [torch.tensor(item, device=device) for item in values]
            padded = pad_sequence(seqs,
                                  batch_first=True,
                                  padding_value=self.pad)
            seq_lengths = np.array([len(item) for item in values])
            embedded = self.emb_word(padded)
            feat[key] = pack_padded_sequence(embedded,
                                             seq_lengths,
                                             batch_first=True,
                                             enforce_sorted=False)
        elif key == 'action_low_mask':
            # masks stay a list of tensors (no padding)
            feat[key] = [
                torch.tensor(item, device=device, dtype=torch.float)
                for item in values
            ]
        elif key in ('subgoal_progress', 'subgoals_completed'):
            # auxiliary padding
            seqs = [
                torch.tensor(item, device=device, dtype=torch.float)
                for item in values
            ]
            feat[key] = pad_sequence(seqs,
                                     batch_first=True,
                                     padding_value=self.pad)
        else:
            # default: tensorize and pad sequence
            if 'frames' in key:
                dtype = torch.float
            else:
                dtype = torch.long
            seqs = [
                torch.tensor(item, device=device, dtype=dtype)
                for item in values
            ]
            feat[key] = pad_sequence(seqs,
                                     batch_first=True,
                                     padding_value=self.pad)

    return feat
# Seed Python's RNG so the random selection of emphasised sequences below
# is repeatable.
random.seed(44)
n_test = 1000
# Take the first batch produced by the test loader
# (presumably n_test is the batch size - verify against get_loader).
x_test = next(iter(get_loader(test_data, n_test)))
# x_test is a packed sequence; unpack it into a padded tensor plus the
# per-sequence lengths. batch_first is left at its default, so the batch
# is indexed on dim 1 below.
target_padded, sequence_lengths = pad_packed_sequence(x_test)

# Mean of each sequence over its first (length - 1) time steps.
target_averages = []
for i, length in enumerate(sequence_lengths):
    arr = target_padded[:,i][:length-1].numpy()
    target_averages.append(arr.mean())

batch_sizes = x_test.batch_sizes

# Pick num_emph sequence indices (sampled with replacement, then sorted)
# and re-pack just those sequences.
# NOTE(review): indices are drawn from range(len(test_data)) but are used
# to index the batch dimension of target_padded - confirm that
# len(test_data) never exceeds the number of sequences in x_test.
# NOTE(review): pack_padded_sequence is called with its default
# enforce_sorted=True, so this relies on the selected lengths being in
# decreasing order - confirm.
num_emph = 4
emph_index = sorted(random.choices(range(len(test_data)), k=num_emph))
emph_packed = pack_padded_sequence(
    target_padded[:,emph_index,:], sequence_lengths[emph_index])

# Load the trained models together with their training histories.
rae, t_info_rae = get_trained_model(rae, model_name="ToyRAE", training_info=True)
vrae, t_info_vrae = get_trained_model(vrae, model_name="ToyVRAE", training_info=True)
iaf, t_info_iaf = get_trained_model(iaf, model_name="ToyVRAEIAF", training_info=True)

# 3x3 grid of axes; not used further in the lines visible here.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(14, 6.5))

# %% Recurrent Autoencoder
# Plot training and validation loss curves on a fresh figure and save as PDF.
plt.figure()
plt.plot(t_info_rae["training_loss"])
plt.plot(t_info_rae["validation_loss"])
plt.savefig(figure_directory / "rae_toy_loss.pdf")

#%%
def _add_embeddings_internal(self, sentences: Union[List[Sentence], Sentence]):
    """Compute an RNN-based embedding for every sentence and store it on the sentence.

    The token embeddings of each sentence are zero-padded to the longest
    sentence in the batch, optionally passed through dropout and the word
    reprojection, and run through ``self.rnn``. The RNN output at the last
    real token (concatenated with the output at the first token when
    ``self.bidirectional`` is set) is stored under ``self.name`` via
    ``sentence.set_embedding``. The stored tensor is detached when
    ``self.static_embeddings`` is set.

    Args:
        sentences: a single Sentence (or subclass instance) or a list of
            sentences. An empty list is a no-op after the embed call.
    """
    # TODO: remove in future versions
    if not hasattr(self, "locked_dropout"):
        self.locked_dropout = None
    if not hasattr(self, "word_dropout"):
        self.word_dropout = None

    # isinstance (rather than an exact type check) so that Sentence
    # subclasses are wrapped too instead of being iterated token by token
    if isinstance(sentences, Sentence):
        sentences = [sentences]

    self.rnn.zero_grad()

    # embed words in the sentence
    self.embeddings.embed(sentences)

    # nothing to embed; max() below would raise on an empty batch
    if not sentences:
        return

    lengths: List[int] = [len(sentence.tokens) for sentence in sentences]
    longest_token_sequence_in_batch: int = max(lengths)

    # one shared zero buffer, sliced as needed to pad shorter sentences
    pre_allocated_zero_tensor = torch.zeros(
        self.embeddings.embedding_length * longest_token_sequence_in_batch,
        dtype=torch.float,
        device=flair.device,
    )

    all_embs: List[torch.Tensor] = list()
    for sentence in sentences:
        all_embs.extend(
            emb for token in sentence for emb in token.get_each_embedding())
        nb_padding_tokens = longest_token_sequence_in_batch - len(sentence)

        if nb_padding_tokens > 0:
            t = pre_allocated_zero_tensor[:self.embeddings.embedding_length *
                                          nb_padding_tokens]
            all_embs.append(t)

    sentence_tensor = torch.cat(all_embs).view([
        len(sentences),
        longest_token_sequence_in_batch,
        self.embeddings.embedding_length,
    ])

    # before-RNN dropout
    if self.dropout:
        sentence_tensor = self.dropout(sentence_tensor)
    if self.locked_dropout:
        sentence_tensor = self.locked_dropout(sentence_tensor)
    if self.word_dropout:
        sentence_tensor = self.word_dropout(sentence_tensor)

    # reproject if set
    if self.reproject_words:
        sentence_tensor = self.word_reprojection_map(sentence_tensor)

    # push through RNN
    packed = pack_padded_sequence(sentence_tensor,
                                  lengths,
                                  enforce_sorted=False,
                                  batch_first=True)  # type: ignore
    rnn_out, hidden = self.rnn(packed)
    outputs, output_lengths = pad_packed_sequence(rnn_out, batch_first=True)

    # after-RNN dropout
    if self.dropout:
        outputs = self.dropout(outputs)
    if self.locked_dropout:
        outputs = self.locked_dropout(outputs)

    # extract embeddings from RNN
    for sentence_no, length in enumerate(lengths):
        last_rep = outputs[sentence_no, length - 1]

        embedding = last_rep
        if self.bidirectional:
            first_rep = outputs[sentence_no, 0]
            embedding = torch.cat([first_rep, last_rep], 0)

        if self.static_embeddings:
            embedding = embedding.detach()

        sentence = sentences[sentence_no]
        sentence.set_embedding(self.name, embedding)
# Training loop: for every batch, extract image features, pair each image
# with its captions, and train the generator to predict the caption shifted
# by one position. The summed batch loss is reported once per epoch.
for epoch in range(epochs):
    epoch_loss = 0
    for i, (images, inputs, targets) in enumerate(dataloader, 0):
        # print(f"Batch = {i}, Time: {time.time() - start}, Loss: {epoch_loss}")
        # Tensors no longer need the deprecated Variable wrapper.
        images = images.cuda()
        # Call the module itself rather than .forward() so that any
        # registered hooks are run.
        images = extractor(images)
        k = images.shape[0]
        # Repeat every image feature captions_per_image times so that rows
        # line up one-to-one with the flattened captions below.
        images = torch.stack([images] * captions_per_image).permute(
            1, 0, 2).contiguous().view(-1, images.shape[-1])
        inputs = inputs.view(-1, max_length, inputs.shape[-1])
        targets = targets.view(-1, max_length)
        # Inputs drop the last position and targets drop the first.
        # NOTE(review): both slices have max_length - 1 time steps while the
        # lengths passed are max_length - confirm this is intended.
        inputs = pack_padded_sequence(inputs[:, :-1],
                                      [max_length] * captions_per_image * k,
                                      batch_first=True).cuda()
        # [0] takes the flat data tensor of the packed targets
        targets = pack_padded_sequence(targets[:, 1:],
                                       [max_length] * captions_per_image * k,
                                       batch_first=True).cuda()[0]
        optimizer.zero_grad()
        outputs = generator(images, inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    end = time.time()
    print(f"Epoch: {epoch}, Time: {end - start}, Loss: {epoch_loss}")
print(f'{n} sizes: {w.size()}') #print(lstm._flat_weights_names) #print(lstm._original_flat_names) optimizer = optim.SGD(lstm.parameters(), lr=0.0001, momentum=0.9) optimizer.zero_grad() print(lstm) input_size = 10 hidden_size = 6 num_layers = 2 seq_length = 7 batch = 6 x = torch.randn(batch, seq_length, input_size) # .to('cuda') lengths = [7, 5, 5, 2, 1, 1] x = rnn.pack_padded_sequence(x, lengths, batch_first=True) # x = torch.rand(10, 1, 256) #.to('cuda') # y = torch.rand(batch, seq_length, hidden_size * 2) # .to('cuda') # y = rnn.pack_padded_sequence(y, lengths, batch_first=True).data with torch.no_grad(): cpu_out, _ = lstm(x) lstm.to('cuda') x = x.to('cuda') with torch.no_grad(): gpu_out, _ = lstm(x)
def step(self, x: Dict[str, torch.Tensor], y: torch.Tensor, batch_idx: int, label="train", **kwargs):
    """
    Run the model on one batch, compute its loss and log the results.

    When training with monotone constraints configured, the loss is scaled
    by a penalty derived from the gradient of the prediction with respect
    to the continuous decoder features.

    Returns:
        Tuple of (log dict with "loss" and "n_samples", model output dict).
    """
    # pack y sequence if different decoder lengths exist
    if (x["decoder_lengths"] < x["decoder_lengths"].max()).any():
        y = rnn.pack_padded_sequence(y,
                                     lengths=x["decoder_lengths"],
                                     batch_first=True,
                                     enforce_sorted=False)

    constraints = self.hparams.monotone_constaints
    use_constraints = label == "train" and len(constraints) > 0

    if use_constraints:
        # gradients with respect to the continuous decoder features are
        # needed below, so tracking must be on before the forward pass
        x["decoder_cont"].requires_grad_(True)
        assert not torch._C._get_cudnn_enabled(), (
            "To use monotone constraints, wrap model and training in context "
            "`torch.backends.cudnn.flags(enable=False)`")

    out = self(x, **kwargs)
    out["prediction"] = self.transform_output(out)
    prediction = out["prediction"]

    if use_constraints:
        gradient = torch.autograd.grad(
            outputs=prediction,
            inputs=x["decoder_cont"],
            grad_outputs=torch.ones_like(prediction),
            create_graph=True,  # allows usage in graph
            allow_unused=True,
        )[0]

        # select relevant features
        constrained_names = constraints.keys()
        indices = torch.tensor(
            [self.hparams.x_reals.index(name) for name in constrained_names])
        monotonicity = torch.tensor(list(constraints.values()),
                                    dtype=gradient.dtype,
                                    device=gradient.device)
        # additional loss if the gradient points in the wrong direction
        gradient = gradient[..., indices] * monotonicity[None, None]
        penalty = gradient.clamp_max(0).mean()
        # multiply by a large number to ensure relevance and square it
        # for smoothness of the loss function
        penalty = 10 * torch.pow(penalty, 2)

    # calculate loss
    if isinstance(self.loss, MASE):
        loss = self.loss(prediction,
                         y,
                         encoder_target=x["encoder_target"],
                         encoder_lengths=x["encoder_lengths"])
    else:
        loss = self.loss(prediction, y)

    if use_constraints:
        loss = loss * (1 + penalty)

    # log
    self._log_metrics(x, y, out, label=label)
    if self.log_interval(label == "train") > 0:
        self._log_prediction(x, out, batch_idx, label=label)

    log = {"loss": loss, "n_samples": x["decoder_lengths"].size(0)}
    return log, out