def encode(self, source_batch) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
    input_embedded = self.source_embedder(source_batch)
    enc_masks = source_batch.sum(dim=2) != 0
    lengths = enc_masks.sum(dim=1)
    input_embedded_sorted, new_lengths, rest_idxs, _ = sort_batch_by_length(
        input_embedded, lengths)
    # TODO: remove assertion
    assert torch.equal(input_embedded, input_embedded_sorted[rest_idxs])
    enc_hiddens, (last_hidden, last_cell) = self.encoder(
        pack_padded_sequence(input_embedded_sorted, new_lengths, batch_first=True))
    enc_hiddens, _ = pad_packed_sequence(enc_hiddens, padding_value=0, batch_first=True)
    # restore original ordering
    enc_hiddens = enc_hiddens[rest_idxs]
    last_hidden = last_hidden[:, rest_idxs, :]
    last_cell = last_cell[:, rest_idxs, :]
    init_decoder_hidden = self.h_projection(
        torch.cat([last_hidden[0], last_hidden[1]], dim=1))
    init_decoder_cell = self.c_projection(
        torch.cat([last_cell[0], last_cell[1]], dim=1))
    dec_init_state = (init_decoder_hidden, init_decoder_cell)
    return enc_hiddens, dec_init_state, enc_masks
def test_forward_pulls_out_correct_tensor_with_unsorted_batches(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)
    input_tensor = torch.rand([5, 7, 3])
    input_tensor[0, 3:, :] = 0
    input_tensor[1, 4:, :] = 0
    input_tensor[2, 2:, :] = 0
    input_tensor[3, 6:, :] = 0
    mask = torch.ones(5, 7)
    mask[0, 3:] = 0
    mask[1, 4:] = 0
    mask[2, 2:] = 0
    mask[3, 6:] = 0
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices, _ = sort_batch_by_length(
        input_tensor, sequence_lengths)
    packed_sequence = pack_padded_sequence(sorted_inputs,
                                           sorted_sequence_lengths.tolist(),
                                           batch_first=True)
    _, state = lstm(packed_sequence)
    # Transpose output state, extract the last forward and backward states and
    # reshape to be of dimension (batch_size, 2 * hidden_size).
    sorted_transposed_state = state[0].transpose(0, 1).index_select(0, restoration_indices)
    reshaped_state = sorted_transposed_state[:, -2:, :].contiguous()
    explicitly_concatenated_state = torch.cat([reshaped_state[:, 0, :].squeeze(1),
                                               reshaped_state[:, 1, :].squeeze(1)], -1)
    encoder_output = encoder(input_tensor, mask)
    assert_almost_equal(encoder_output.data.numpy(),
                        explicitly_concatenated_state.data.numpy())
def sort_and_run_forward(self,
                         module: Callable[[PackedSequence, Optional[RnnState]],
                                          Tuple[Union[PackedSequence, torch.Tensor], RnnState]],
                         inputs: torch.Tensor,
                         mask: torch.Tensor,
                         hidden_states: Optional[RnnState] = None,
                         reset_hidden_state=False):
    # First count how many sequences are empty.
    batch_size = mask.size(0)
    num_valid = torch.sum(mask[:, 0]).int().item()
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices = \
        sort_batch_by_length(inputs, sequence_lengths)
    # Now create a PackedSequence with only the non-empty, sorted sequences.
    packed_sequence_input = pack_padded_sequence(
        sorted_inputs[:num_valid, :, :],
        sorted_sequence_lengths[:num_valid].data.tolist(),
        batch_first=True)
    # Prepare the initial states.
    initial_states, hidden_states = self._get_initial_states(
        batch_size, num_valid, sorting_indices, hidden_states)
    if reset_hidden_state:
        initial_states = None
    # Actually call the module on the sorted PackedSequence.
    module_output, final_states = module(packed_sequence_input, initial_states)
    return module_output, final_states, restoration_indices, hidden_states
def test_sort_tensor_by_length(self):
    tensor = torch.rand([5, 7, 9])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 1:, :] = 0
    tensor[3, 5:, :] = 0
    tensor = Variable(tensor)
    sequence_lengths = Variable(torch.LongTensor([3, 4, 1, 5, 7]))
    sorted_tensor, sorted_lengths, reverse_indices, _ = util.sort_batch_by_length(
        tensor, sequence_lengths)
    # Test sorted indices are padded correctly.
    numpy.testing.assert_array_equal(sorted_tensor[1, 5:, :].data.numpy(), 0.0)
    numpy.testing.assert_array_equal(sorted_tensor[2, 4:, :].data.numpy(), 0.0)
    numpy.testing.assert_array_equal(sorted_tensor[3, 3:, :].data.numpy(), 0.0)
    numpy.testing.assert_array_equal(sorted_tensor[4, 1:, :].data.numpy(), 0.0)
    assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1]))
    # Test restoration indices correctly recover the original tensor.
    assert sorted_tensor.index_select(0, reverse_indices).data.equal(tensor.data)
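# A minimal sketch of the behaviour that the snippets in this collection assume of
# `sort_batch_by_length` (the real helper lives in allennlp.nn.util): sort the batch
# by decreasing length and also return the indices needed to undo the sort, exactly
# the round-trip property the test above checks. The function name with the
# `_sketch` suffix is ours, not part of any library.
import torch

def sort_batch_by_length_sketch(tensor: torch.Tensor, lengths: torch.Tensor):
    sorted_lengths, permutation_index = lengths.sort(0, descending=True)
    sorted_tensor = tensor.index_select(0, permutation_index)
    # restoration_indices[i] is the row of the sorted batch that holds original row i,
    # so sorted_tensor.index_select(0, restoration_indices) recovers the original order.
    _, restoration_indices = permutation_index.sort(0, descending=False)
    return sorted_tensor, sorted_lengths, restoration_indices, permutation_index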
def test_forward_pulls_out_correct_tensor_for_unsorted_batches(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2SeqWrapper(lstm)
    input_tensor = torch.rand([5, 7, 3])
    input_tensor[0, 3:, :] = 0
    input_tensor[1, 4:, :] = 0
    input_tensor[2, 2:, :] = 0
    input_tensor[3, 6:, :] = 0
    mask = torch.ones(5, 7)
    mask[0, 3:] = 0
    mask[1, 4:] = 0
    mask[2, 2:] = 0
    mask[3, 6:] = 0
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices, _ = sort_batch_by_length(
        input_tensor, sequence_lengths)
    packed_sequence = pack_padded_sequence(sorted_inputs,
                                           sorted_sequence_lengths.data.tolist(),
                                           batch_first=True)
    lstm_output, _ = lstm(packed_sequence)
    encoder_output = encoder(input_tensor, mask)
    lstm_tensor, _ = pad_packed_sequence(lstm_output, batch_first=True)
    assert_almost_equal(encoder_output.data.numpy(),
                        lstm_tensor.index_select(0, restoration_indices).data.numpy())
def setUp(self):
    super(TestEncoderBase, self).setUp()
    self.lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    self.encoder_base = _EncoderBase(stateful=True)
    tensor = Variable(torch.rand([5, 7, 3]))
    tensor[1, 6:, :] = 0
    tensor[3, 2:, :] = 0
    self.tensor = tensor
    mask = Variable(torch.ones(5, 7))
    mask[1, 6:] = 0
    mask[2, :] = 0  # <= completely masked
    mask[3, 2:] = 0
    mask[4, :] = 0  # <= completely masked
    self.mask = mask
    self.batch_size = 5
    self.num_valid = 3
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    _, _, restoration_indices, sorting_indices = sort_batch_by_length(tensor, sequence_lengths)
    self.sorting_indices = sorting_indices
    self.restoration_indices = restoration_indices
def forward(self, seq, seq_lens):
    if not self.batch_first:
        raise ConfigurationError("Our encoder semantics assumes batch is always first!")
    non_zero_length_mask = seq_lens.ne(0).float()
    # make zero lengths into length=1
    seq_lens = seq_lens + seq_lens.eq(0).float()
    sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices = \
        sort_batch_by_length(seq, seq_lens)
    packed_input = pack(sorted_inputs,
                        sorted_sequence_lengths.data.long().tolist(),
                        batch_first=True)
    outputs, final_states = super(ProperLSTM, self).forward(packed_input)
    unpacked_sequence, _ = unpack(outputs, batch_first=True)
    outputs = unpacked_sequence.index_select(0, restoration_indices)
    new_unsorted_states = [self.fix_hidden(state.index_select(1, restoration_indices))
                           for state in final_states]
    # To deal with zero length inputs
    outputs = outputs * non_zero_length_mask.view(-1, 1, 1).expand_as(outputs)
    new_unsorted_states[0] = new_unsorted_states[0] * \
        non_zero_length_mask.view(1, -1, 1).expand_as(new_unsorted_states[0])
    new_unsorted_states[1] = new_unsorted_states[1] * \
        non_zero_length_mask.view(1, -1, 1).expand_as(new_unsorted_states[1])
    return outputs, new_unsorted_states
def test_augmented_lstm_computes_same_function_as_pytorch_lstm(self):
    augmented_lstm = AugmentedLstm(10, 11)
    pytorch_lstm = LSTM(10, 11, num_layers=1, batch_first=True)
    # Initialize all weights to be == 1.
    initializer = InitializerApplicator([(".*", lambda tensor: torch.nn.init.constant_(tensor, 1.))])
    initializer(augmented_lstm)
    initializer(pytorch_lstm)
    initial_state = torch.zeros([1, 5, 11])
    initial_memory = torch.zeros([1, 5, 11])
    # Use bigger numbers to avoid floating point instability.
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor * 5.,
                                                                self.sequence_lengths)
    lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
    augmented_output, augmented_state = augmented_lstm(lstm_input, (initial_state, initial_memory))
    pytorch_output, pytorch_state = pytorch_lstm(lstm_input, (initial_state, initial_memory))
    pytorch_output_sequence, _ = pad_packed_sequence(pytorch_output, batch_first=True)
    augmented_output_sequence, _ = pad_packed_sequence(augmented_output, batch_first=True)
    numpy.testing.assert_array_almost_equal(pytorch_output_sequence.data.numpy(),
                                            augmented_output_sequence.data.numpy(), decimal=4)
    numpy.testing.assert_array_almost_equal(pytorch_state[0].data.numpy(),
                                            augmented_state[0].data.numpy(), decimal=4)
    numpy.testing.assert_array_almost_equal(pytorch_state[1].data.numpy(),
                                            augmented_state[1].data.numpy(), decimal=4)
def test_forward_pulls_out_correct_tensor_with_unsorted_batches(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2VecWrapper(lstm)
    tensor = torch.rand([5, 7, 3])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 2:, :] = 0
    tensor[3, 6:, :] = 0
    mask = torch.ones(5, 7)
    mask[0, 3:] = 0
    mask[1, 4:] = 0
    mask[2, 2:] = 0
    mask[3, 6:] = 0
    input_tensor = Variable(tensor)
    mask = Variable(mask)
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices = sort_batch_by_length(
        input_tensor, sequence_lengths)
    packed_sequence = pack_padded_sequence(sorted_inputs,
                                           sorted_sequence_lengths.data.tolist(),
                                           batch_first=True)
    _, state = lstm(packed_sequence)
    # Transpose output state, extract the last forward and backward states and
    # reshape to be of dimension (batch_size, 2 * hidden_size).
    sorted_transposed_state = state[0].transpose(0, 1).index_select(0, restoration_indices)
    reshaped_state = sorted_transposed_state[:, -2:, :].contiguous()
    explicitly_concatenated_state = torch.cat([reshaped_state[:, 0, :].squeeze(1),
                                               reshaped_state[:, 1, :].squeeze(1)], -1)
    encoder_output = encoder(input_tensor, mask)
    assert_almost_equal(encoder_output.data.numpy(),
                        explicitly_concatenated_state.data.numpy())
def test_forward_pulls_out_correct_tensor_for_unsorted_batches(self):
    lstm = LSTM(bidirectional=True, num_layers=3, input_size=3, hidden_size=7, batch_first=True)
    encoder = PytorchSeq2SeqWrapper(lstm)
    tensor = torch.rand([5, 7, 3])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 2:, :] = 0
    tensor[3, 6:, :] = 0
    mask = torch.ones(5, 7)
    mask[0, 3:] = 0
    mask[1, 4:] = 0
    mask[2, 2:] = 0
    mask[3, 6:] = 0
    input_tensor = Variable(tensor)
    mask = Variable(mask)
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices, _ = sort_batch_by_length(
        input_tensor, sequence_lengths)
    packed_sequence = pack_padded_sequence(sorted_inputs,
                                           sorted_sequence_lengths.data.tolist(),
                                           batch_first=True)
    lstm_output, _ = lstm(packed_sequence)
    encoder_output = encoder(input_tensor, mask)
    lstm_tensor, _ = pad_packed_sequence(lstm_output, batch_first=True)
    assert_almost_equal(encoder_output.data.numpy(),
                        lstm_tensor.index_select(0, restoration_indices).data.numpy())
def forward(self, inputs, lengths):
    # 1. Run the LSTM.
    # Apply dropout to the input.
    # Shape of inputs: (batch_size, sequence_length, embedding_dim)
    embedded_input = self.dropout_on_input_to_LSTM(inputs)
    # Sort the embedded inputs by decreasing order of input length (required for packing).
    # sorted_input shape: (batch_size, sequence_length, embedding_dim)
    (sorted_input, sorted_lengths, input_unsort_indices, _) = sort_batch_by_length(embedded_input, lengths)
    # Pack the sorted inputs with pack_padded_sequence.
    packed_input = pack_padded_sequence(sorted_input, sorted_lengths.data.tolist(), batch_first=True)
    # Run the input through the RNN.
    packed_sorted_output, _ = self.rnn(packed_input)
    # Unpack (pad) the output with pad_packed_sequence.
    # Shape: (batch_size, sequence_length, hidden_size)
    sorted_output, _ = pad_packed_sequence(packed_sorted_output, batch_first=True)
    # Re-sort the output to restore the initial ordering.
    # Shape: (batch_size, sequence_length, hidden_size)
    output = sorted_output[input_unsort_indices]

    # 2. Run the linear layer.
    # Apply dropout to the input to the linear layer.
    # Shape: (batch_size, sequence_length, hidden_size)
    input_encoding = self.dropout_on_input_to_linear_layer(output)
    # Run the RNN encoding of the input through the output projection
    # to get scores for each of the classes.
    # Shape: (batch_size, sequence_length, 2)
    unnormalized_output = self.output_projection(input_encoding)
    # Normalize with log softmax.
    output_distribution = F.log_softmax(unnormalized_output, dim=-1)
    return output_distribution
def forward(self,  # pylint: disable=arguments-differ
            inputs: torch.Tensor,
            mask: torch.Tensor,
            hidden_state: torch.Tensor = None) -> torch.Tensor:
    if mask is None:
        return self._module(inputs, hidden_state)[0]
    # In some circumstances you may have sequences of zero length.
    # ``pack_padded_sequence`` requires all sequence lengths to be > 0, so here we
    # adjust the ``mask`` so that every sequence has length at least 1. Then after
    # running the RNN we zero out the corresponding rows in the result.
    # First count how many sequences are empty.
    batch_size, total_sequence_length = mask.size()
    num_valid = torch.sum(mask[:, 0]).int().data[0]
    # Force every sequence to be length at least one. Need to `.clone()` the mask
    # to avoid a RuntimeError from shared storage.
    if num_valid < batch_size:
        mask = mask.clone()
        mask[:, 0] = 1
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices = sort_batch_by_length(
        inputs, sequence_lengths)
    packed_sequence_input = pack_padded_sequence(sorted_inputs,
                                                 sorted_sequence_lengths.data.tolist(),
                                                 batch_first=True)
    # Actually call the module on the sorted PackedSequence.
    packed_sequence_output, _ = self._module(packed_sequence_input, hidden_state)
    unpacked_sequence_tensor, _ = pad_packed_sequence(packed_sequence_output, batch_first=True)
    # We sorted by length, so if there are invalid rows that need to be zeroed out
    # they will be at the end.
    if num_valid < batch_size:
        unpacked_sequence_tensor[num_valid:, :, :] = 0.
    # It's possible to need to pass sequences which are padded to longer than the
    # max length of the sequence to a Seq2SeqEncoder. However, packing and unpacking
    # the sequences mean that the returned tensor won't include these dimensions, because
    # the RNN did not need to process them. We add them back on in the form of zeros here.
    sequence_length_difference = total_sequence_length - unpacked_sequence_tensor.size(1)
    if sequence_length_difference > 0:
        zeros = unpacked_sequence_tensor.data.new(batch_size,
                                                  sequence_length_difference,
                                                  unpacked_sequence_tensor.size(-1)).fill_(0)
        zeros = torch.autograd.Variable(zeros)
        unpacked_sequence_tensor = torch.cat([unpacked_sequence_tensor, zeros], 1)
    # Restore the original indices and return the sequence.
    return unpacked_sequence_tensor.index_select(0, restoration_indices)
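# A small sketch (not from the snippet above) of why the zero-padding at the end of
# that forward() is needed: pad_packed_sequence only pads up to the longest *packed*
# length, which can be shorter than the padding of the original batch. The tensor
# shapes here are illustrative only.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

batch = torch.rand(2, 10, 4)   # padded to length 10
lengths = [6, 3]               # but the longest real sequence is 6
packed = pack_padded_sequence(batch, lengths, batch_first=True)
unpacked, _ = pad_packed_sequence(packed, batch_first=True)
assert unpacked.size(1) == 6   # 4 timesteps shorter than the input padding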
def test_augmented_lstm_works_with_highway_connections(self):
    augmented_lstm = AugmentedLstm(10, 11, use_highway=True)
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor,
                                                                self.sequence_lengths)
    lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
    augmented_lstm(lstm_input)
def test_variable_length_sequences_run_backward_return_correctly_padded_outputs(self):
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor,
                                                                self.sequence_lengths)
    tensor = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
    lstm = AugmentedLstm(10, 11, go_forward=False)
    output, _ = lstm(tensor)
    output_sequence, _ = pad_packed_sequence(output, batch_first=True)
    numpy.testing.assert_array_equal(output_sequence.data[1, 6:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[2, 4:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[3, 3:, :].numpy(), 0.0)
    numpy.testing.assert_array_equal(output_sequence.data[4, 2:, :].numpy(), 0.0)
def test_biaugmented_lstm(self):
    for bidirectional in [True, False]:
        bi_augmented_lstm = BiAugmentedLstm(10, 11, 3,
                                            recurrent_dropout_probability=0.1,
                                            bidirectional=bidirectional)
        sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor,
                                                                    self.sequence_lengths)
        lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
        bi_augmented_lstm(lstm_input)
def document_representation(self, document_tensor):
    """
    Compute the document representation, D.

    :param document_tensor: Stacked tensors of the sentences given throughout the document.
        Assumes document_tensor is wrapped with Variable.
    :return: D: The average-pooled representation of the document.
    """
    # 1. Handle variable-length sentences so the model does not learn from the padding.
    # Collect lengths for sorting and packing.
    # Shape: (batch_size,)
    document_mask = (document_tensor != 0)
    sentence_lengths = Variable(document_mask.sum(dim=1))
    # Shape: (batch_size, max sentence length, embedding size)
    embedded_sentences = self.embedding(Variable(document_tensor))
    sorted_embeddings, sorted_lengths, restore_index, permute_index = \
        sort_batch_by_length(embedded_sentences, sentence_lengths)
    sorted_lengths = list(sorted_lengths.data.long())
    packed_sentences = nn.utils.rnn.pack_padded_sequence(sorted_embeddings,
                                                         sorted_lengths,
                                                         batch_first=True)
    # 2. Encode the sentences at the word level.
    # Shapes: (batch_size, max sentence length, bidirectional hidden)
    #         (batch_size, bidirectional hidden)
    sentences_out, sentences_hidden = self.word_rnn(packed_sentences)
    padded_sentences, padded_sentences_lengths = \
        nn.utils.rnn.pad_packed_sequence(sentences_out, batch_first=True)
    # Restore order for predictions.
    encoded_sentences_restored = padded_sentences[restore_index]
    # 3. Pool along the length dimension.
    sentence_representations = torch.mean(encoded_sentences_restored, 1)
    # 4. Encode the document at the sentence level.
    doc_out, doc_hiddens = self.sentence_rnn(sentence_representations.unsqueeze(0))
    # 5. Average the sentence representations and push through an affine layer.
    pooled_doc_out = torch.mean(doc_out.squeeze(), 0)
    doc_rep = self.encode_document(pooled_doc_out)
    return sentence_representations, doc_rep
def forward(self, inputs, lengths):
    # 1. Run the LSTM.
    # Apply dropout to the input.
    # Shape of inputs: (batch_size, sequence_length, embedding_dim)
    embedded_input = self.dropout_on_input_to_LSTM(inputs)
    # Sort the embedded inputs by decreasing order of input length.
    # sorted_input shape: (batch_size, sequence_length, embedding_dim)
    (sorted_input, sorted_lengths, input_unsort_indices, _) = sort_batch_by_length(embedded_input, lengths)
    # Pack the sorted inputs with pack_padded_sequence.
    packed_input = pack_padded_sequence(sorted_input, sorted_lengths.data.tolist(), batch_first=True)
    # Run the input through the RNN.
    packed_sorted_output, _ = self.rnn(packed_input)
    # Unpack (pad) the output with pad_packed_sequence.
    # Shape: (batch_size, sequence_length, hidden_size)
    sorted_output, _ = pad_packed_sequence(packed_sorted_output, batch_first=True)
    # Re-sort the output to restore the initial ordering.
    # Shape: (batch_size, sequence_length, hidden_size)
    output = sorted_output[input_unsort_indices]

    # 2. Use attention.
    # Shape: (batch_size, sequence_length, 1), then (batch_size, sequence_length) after squeeze.
    attention_logits = self.attention_weights(output).squeeze(dim=-1)
    mask_attention_logits = (attention_logits != 0).type(
        torch.cuda.FloatTensor if inputs.is_cuda else torch.FloatTensor)
    # Shape: (batch_size, sequence_length)
    softmax_attention_logits = masked_softmax(attention_logits, mask_attention_logits)
    # Shape: (batch_size, 1, sequence_length)
    softmax_attention_logits = softmax_attention_logits.unsqueeze(dim=1)
    # Weighted sum of the RNN states.
    # output: (batch_size, sequence_length, hidden_size)
    # softmax_attention_logits: (batch_size, 1, sequence_length)
    # input_encoding: (batch_size, 1, hidden_size)
    input_encoding = torch.bmm(softmax_attention_logits, output)
    # Shape: (batch_size, hidden_size)
    input_encoding = input_encoding.squeeze(dim=1)

    # 3. Run the linear layer.
    # Apply dropout to the input to the linear layer.
    input_encoding = self.dropout_on_input_to_linear_layer(input_encoding)
    # Run the attended encoding of the input through the output projection
    # to get scores for each of the classes.
    unnormalized_output = self.output_projection(input_encoding)
    # Normalize with log softmax.
    output_distribution = F.log_softmax(unnormalized_output, dim=-1)
    return output_distribution
def forward(self, frames: torch.FloatTensor, frame_lens: torch.LongTensor):
    """
    frames: (batch_size, seq_len, num_lmks, lmk_dim)
    frame_lens: (batch_size,)
    """
    if self.frame_processing == 'flatten':
        frames = frames.reshape(frames.shape[0], frames.shape[1], -1)
    # Sort the batch by decreasing unpadded seq_len.
    (sorted_frames, sorted_frame_lens, restoration_indices, _) = sort_batch_by_length(frames, frame_lens)
    # Returns a PackedSequence.
    packed_frames = nn.utils.rnn.pack_padded_sequence(
        sorted_frames,
        sorted_frame_lens.data.cpu().numpy() if sorted_frame_lens.is_cuda else sorted_frame_lens.data.numpy(),
        batch_first=True)
    # Encoder: feed frames to the model, output hidden states.
    # final_state: (num_layers * num_dir, batch_size, hidden_size) (*2 if LSTM)
    packed_hidden_states, final_state = self.rnn(packed_frames)
    # Unpack the encoded hidden states into a Tensor.
    # (batch_size, seq_len, num_dir * hidden_size)
    hidden_states, _ = nn.utils.rnn.pad_packed_sequence(packed_hidden_states, batch_first=True)
    # (num_layers, batch_size, hidden_size * num_dir) (*2 if LSTM)
    if self.bidirectional:
        final_state = self._cat_directions(final_state)
    # Restore the original batch order for both the outputs and the final states.
    hidden_states = hidden_states.index_select(0, restoration_indices)
    if isinstance(final_state, tuple):  # LSTM
        final_state = (final_state[0].index_select(1, restoration_indices),
                       final_state[1].index_select(1, restoration_indices))
    else:
        final_state = final_state.index_select(1, restoration_indices)
    if self.enable_ctc:
        output_logits = self.output_proj(hidden_states)
        output_log_probs = masked_log_softmax(
            output_logits,
            self.output_mask.expand(output_logits.shape[0], self.adj_vocab_size),
            dim=-1)
        return output_log_probs, hidden_states, final_state
    return hidden_states, final_state
def forward(self,  # pylint: disable=arguments-differ
            inputs: torch.Tensor,
            mask: torch.Tensor,
            hidden_state: torch.Tensor = None) -> torch.Tensor:
    if mask is None:
        # If a mask isn't passed, there is no padding in the batch of instances, so we can just
        # return the last sequence output as the state. This doesn't work in the case of
        # variable length sequences, as the last state for each element of the batch won't be
        # at the end of the max sequence length, so we have to use the state of the RNN below.
        return self._module(inputs, hidden_state)[0][:, -1, :]
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices = sort_batch_by_length(
        inputs, sequence_lengths)
    packed_sequence_input = pack_padded_sequence(sorted_inputs,
                                                 sorted_sequence_lengths.data.tolist(),
                                                 batch_first=True)
    # Actually call the module on the sorted PackedSequence.
    _, state = self._module(packed_sequence_input, hidden_state)
    # Deal with the fact the LSTM state is a tuple of (state, memory).
    if isinstance(state, tuple):
        state = state[0]
    # Restore the original indices and return the final state of the top layer.
    # Pytorch's recurrent layers return state in the form
    # (num_layers * num_directions, batch_size, hidden_size) regardless of the
    # 'batch_first' flag, so we transpose, extract the relevant layer state
    # (both forward and backward if using bidirectional layers) and return them
    # as a single (batch_size, self.get_output_dim()) tensor.
    # Now of shape: (batch_size, num_layers * num_directions, hidden_size).
    unsorted_state = state.transpose(0, 1).index_select(0, restoration_indices)
    # Extract the last hidden vector, including both forward and backward states
    # if the cell is bidirectional. Then reshape by concatenation (in the case
    # we have bidirectional states) or just squash the 1st dimension in the non-
    # bidirectional case. Return tensor has shape (batch_size, hidden_size * num_directions).
    try:
        last_state_index = 2 if self._module.bidirectional else 1
    except AttributeError:
        last_state_index = 1
    last_layer_state = unsorted_state[:, -last_state_index:, :]
    return last_layer_state.contiguous().view([-1, self.get_output_dim()])
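# A hedged shape check (not part of the wrapper above) of the state bookkeeping it
# relies on: for a 3-layer bidirectional LSTM, h_n has shape
# (num_layers * 2, batch, hidden), and the last layer's forward/backward states are
# the final two rows after transposing to batch-first.
import torch
from torch.nn import LSTM

lstm = LSTM(input_size=3, hidden_size=7, num_layers=3, bidirectional=True, batch_first=True)
_, (h_n, _) = lstm(torch.rand(5, 9, 3))
assert h_n.shape == (6, 5, 7)
last_layer = h_n.transpose(0, 1)[:, -2:, :].contiguous().view(5, 14)
assert last_layer.shape == (5, 14)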
def test_dropout_version_is_different_to_no_dropout(self):
    augmented_lstm = AugmentedLstm(10, 11)
    dropped_augmented_lstm = AugmentedLstm(10, 11, recurrent_dropout_probability=0.9)
    # Initialize all weights to be == 0.5.
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 0.5}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(augmented_lstm)
    initializer(dropped_augmented_lstm)
    initial_state = torch.randn([1, 5, 11])
    initial_memory = torch.randn([1, 5, 11])
    # If we use too big a number, as in the PyTorch test, the dropout has no effect.
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor,
                                                                self.sequence_lengths)
    lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
    augmented_output, augmented_state = augmented_lstm(lstm_input, (initial_state, initial_memory))
    dropped_output, dropped_state = dropped_augmented_lstm(lstm_input, (initial_state, initial_memory))
    dropped_output_sequence, _ = pad_packed_sequence(dropped_output, batch_first=True)
    augmented_output_sequence, _ = pad_packed_sequence(augmented_output, batch_first=True)
    with pytest.raises(AssertionError):
        numpy.testing.assert_array_almost_equal(dropped_output_sequence.data.numpy(),
                                                augmented_output_sequence.data.numpy(), decimal=4)
    with pytest.raises(AssertionError):
        numpy.testing.assert_array_almost_equal(dropped_state[0].data.numpy(),
                                                augmented_state[0].data.numpy(), decimal=4)
    with pytest.raises(AssertionError):
        numpy.testing.assert_array_almost_equal(dropped_state[1].data.numpy(),
                                                augmented_state[1].data.numpy(), decimal=4)
def run_lstm(lstm, inputs, lengths):
    """
    Run inputs through an LSTM.

    Args:
        lstm (LSTM): LSTM to use
        inputs (FloatTensor): word embeddings
        lengths (LongTensor): vector with sentence lengths
    """
    inputs, lengths, unsort_idx, _ = sort_batch_by_length(inputs, lengths)
    inputs = pack_padded_sequence(inputs, lengths.data.tolist(), batch_first=True)
    lstm.flatten_parameters()
    packed_sorted_output, _ = lstm(inputs)
    sorted_output, _ = pad_packed_sequence(packed_sorted_output, batch_first=True)
    # Restore the original (unsorted) batch order.
    return sorted_output[unsort_idx]
def forward(self,  # pylint: disable=arguments-differ
            inputs: torch.Tensor,
            sequence_lengths: torch.LongTensor = None,
            hidden_state: torch.Tensor = None) -> torch.Tensor:
    if sequence_lengths is None:
        return self._module(inputs, hidden_state)[0]
    sorted_inputs, sorted_sequence_lengths, restoration_indices = sort_batch_by_length(
        inputs, sequence_lengths)
    packed_sequence_input = pack_padded_sequence(sorted_inputs,
                                                 sorted_sequence_lengths.data.tolist(),
                                                 batch_first=True)
    # Actually call the module on the sorted PackedSequence.
    packed_sequence_output, _ = self._module(packed_sequence_input, hidden_state)
    unpacked_sequence_tensor, _ = pad_packed_sequence(packed_sequence_output, batch_first=True)
    # Restore the original indices and return the sequence.
    return unpacked_sequence_tensor.index_select(0, restoration_indices)
def forward(self, inputs, lengths):
    embedded_input = self.dropout_on_input_to_LSTM(inputs)
    # Sort, pack, encode, unpack, and restore the original batch order.
    (sorted_input, sorted_lengths, input_unsort_indices, _) = sort_batch_by_length(embedded_input, lengths)
    packed_input = pack_padded_sequence(sorted_input, sorted_lengths.data.tolist(), batch_first=True)
    packed_sorted_output, _ = self.rnn(packed_input)
    sorted_output, _ = pad_packed_sequence(packed_sorted_output, batch_first=True)
    output = sorted_output[input_unsort_indices]
    # Split the bidirectional output into its forward (l2r) and backward (r2l) halves.
    att_vec_dim = int(output.size(2) / 2)
    query_l2r = self.get_query(output[:, :, :att_vec_dim])
    query_r2l = self.get_query(output[:, :, att_vec_dim:])
    context_l2r, context_r2l = self.get_context(output[:, :, :att_vec_dim],
                                                output[:, :, att_vec_dim:],
                                                window=3)
    att_l2r = self.multiheadcontextattention(query_l2r, context_l2r, context_l2r).view(
        output.size(0), output.size(1), -1)
    att_r2l = self.multiheadcontextattention(query_r2l, context_r2l, context_r2l).view(
        output.size(0), output.size(1), -1)
    att = torch.cat([att_l2r, att_r2l], -1)
    output_cat = torch.cat([output, att], -1)
    input_encoding = self.dropout_on_input_to_linear_layer(output_cat)
    unnormalized_output = self.output_to_label(input_encoding)
    output_distribution = F.log_softmax(unnormalized_output, dim=-1)
    return output_distribution
def test_stacked_bidirectional_lstm_dropout_version_is_different(self, dropout_name: str):
    stacked_lstm = StackedBidirectionalLstm(input_size=10, hidden_size=11, num_layers=3)
    if dropout_name == 'layer_dropout_probability':
        dropped_stacked_lstm = StackedBidirectionalLstm(input_size=10, hidden_size=11, num_layers=3,
                                                        layer_dropout_probability=0.9)
    elif dropout_name == 'recurrent_dropout_probability':
        dropped_stacked_lstm = StackedBidirectionalLstm(input_size=10, hidden_size=11, num_layers=3,
                                                        recurrent_dropout_probability=0.9)
    else:
        raise ValueError(f'Do not recognise the dropout name {dropout_name}')
    # Initialize all weights to be == 0.5.
    constant_init = Initializer.from_params(Params({"type": "constant", "val": 0.5}))
    initializer = InitializerApplicator([(".*", constant_init)])
    initializer(stacked_lstm)
    initializer(dropped_stacked_lstm)
    initial_state = torch.randn([3, 5, 11])
    initial_memory = torch.randn([3, 5, 11])
    tensor = torch.rand([5, 7, 10])
    sequence_lengths = torch.LongTensor([7, 7, 7, 7, 7])
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(tensor, sequence_lengths)
    lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
    stacked_output, stacked_state = stacked_lstm(lstm_input, (initial_state, initial_memory))
    dropped_output, dropped_state = dropped_stacked_lstm(lstm_input, (initial_state, initial_memory))
    dropped_output_sequence, _ = pad_packed_sequence(dropped_output, batch_first=True)
    stacked_output_sequence, _ = pad_packed_sequence(stacked_output, batch_first=True)
    if dropout_name == 'layer_dropout_probability':
        with pytest.raises(AssertionError):
            numpy.testing.assert_array_almost_equal(dropped_output_sequence.data.numpy(),
                                                    stacked_output_sequence.data.numpy(), decimal=4)
    if dropout_name == 'recurrent_dropout_probability':
        with pytest.raises(AssertionError):
            numpy.testing.assert_array_almost_equal(dropped_state[0].data.numpy(),
                                                    stacked_state[0].data.numpy(), decimal=4)
        with pytest.raises(AssertionError):
            numpy.testing.assert_array_almost_equal(dropped_state[1].data.numpy(),
                                                    stacked_state[1].data.numpy(), decimal=4)
def forward(self, inputs, lengths, char_seqs):
    char_emb_seq = self.char_emb(char_seqs)
    # Split the pre-concatenated input features into GloVe, ELMo, and POS parts.
    glove_part = inputs[:, :, :300]
    elmo_part = inputs[:, :, 300:1324]
    pos_part = inputs[:, :, 1324:]
    inputs = torch.cat((glove_part, char_emb_seq), dim=-1)
    inputs = self.highway(inputs)
    inputs = torch.cat([inputs, elmo_part, pos_part], dim=-1)
    embedded_input = self.dropout_on_input_to_LSTM(inputs)
    # Sort the embedded inputs by decreasing order of input length.
    # sorted_input shape: (batch_size, sequence_length, embedding_dim)
    (sorted_input, sorted_lengths, input_unsort_indices, _) = sort_batch_by_length(embedded_input, lengths)
    # Pack the sorted inputs with pack_padded_sequence.
    packed_input = pack_padded_sequence(sorted_input, sorted_lengths.data.tolist(), batch_first=True)
    # Run the input through the RNN.
    packed_sorted_output, _ = self.rnn(packed_input)
    # Unpack (pad) the output with pad_packed_sequence.
    # Shape: (batch_size, sequence_length, hidden_size)
    sorted_output, _ = pad_packed_sequence(packed_sorted_output, batch_first=True)
    # Re-sort the output to restore the initial ordering.
    # Shape: (batch_size, sequence_length, hidden_size)
    output = sorted_output[input_unsort_indices]
    input_encoding = self.dropout_on_input_to_linear_layer(output)
    if self.name == 'vua':
        projected_output = self.transform(inputs)
        multiplied_output = projected_output * input_encoding
        features = self.features(multiplied_output)
        unnormalized_output = self.output_projection(features)
    else:
        unnormalized_output = self.output_projection(input_encoding)
    output_distribution = F.log_softmax(unnormalized_output, dim=-1)
    return output_distribution, input_encoding, unnormalized_output
def run_rnn(self, embedded_input, batch, rnn):
    """
    Run embeddings through the RNN and return the output.

    Args:
        embedded_input (torch.FloatTensor): batch x seq x dim
        batch (Batch): batch object containing a .lengths tensor
        rnn (torch.nn.LSTM): LSTM to run the embeddings through

    Returns:
        torch.FloatTensor: hidden state output of the LSTM, batch x seq x dim
    """
    (sorted_input, sorted_lengths, input_unsort_indices, _) = \
        sort_batch_by_length(embedded_input, batch.lengths)
    packed_input = pack(sorted_input, sorted_lengths.data.tolist(), batch_first=True)
    rnn.flatten_parameters()
    packed_sorted_output, _ = rnn(packed_input)
    sorted_output, _ = pad(packed_sorted_output, batch_first=True)
    return sorted_output[input_unsort_indices]
def test_sort_tensor_by_length(self):
    tensor = torch.rand([5, 7, 9])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 1:, :] = 0
    tensor[3, 5:, :] = 0
    sequence_lengths = torch.LongTensor([3, 4, 1, 5, 7])
    sorted_tensor, sorted_lengths, reverse_indices, _ = util.sort_batch_by_length(tensor, sequence_lengths)
    # Test sorted indices are padded correctly.
    numpy.testing.assert_array_equal(sorted_tensor[1, 5:, :].data.numpy(), 0.0)
    numpy.testing.assert_array_equal(sorted_tensor[2, 4:, :].data.numpy(), 0.0)
    numpy.testing.assert_array_equal(sorted_tensor[3, 3:, :].data.numpy(), 0.0)
    numpy.testing.assert_array_equal(sorted_tensor[4, 1:, :].data.numpy(), 0.0)
    assert sorted_lengths.data.equal(torch.LongTensor([7, 5, 4, 3, 1]))
    # Test restoration indices correctly recover the original tensor.
    assert sorted_tensor.index_select(0, reverse_indices).data.equal(tensor.data)
def forward(self,  # pylint: disable=arguments-differ
            inputs: torch.Tensor,
            mask: torch.Tensor,
            hidden_state: torch.Tensor = None) -> torch.Tensor:
    if mask is None:
        return self._module(inputs, hidden_state)[0]
    # In some circumstances you may have sequences of zero length.
    # ``pack_padded_sequence`` requires all sequence lengths to be > 0, so here we
    # adjust the ``mask`` so that every sequence has length at least 1. Then after
    # running the RNN we zero out the corresponding rows in the result.
    # First count how many sequences are empty.
    batch_size = mask.size()[0]
    num_valid = torch.sum(mask[:, 0]).int().data[0]
    # Force every sequence to be length at least one. Need to `.clone()` the mask
    # to avoid a RuntimeError from shared storage.
    if num_valid < batch_size:
        mask = mask.clone()
        mask[:, 0] = 1
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices = sort_batch_by_length(
        inputs, sequence_lengths)
    packed_sequence_input = pack_padded_sequence(sorted_inputs,
                                                 sorted_sequence_lengths.data.tolist(),
                                                 batch_first=True)
    # Actually call the module on the sorted PackedSequence.
    packed_sequence_output, _ = self._module(packed_sequence_input, hidden_state)
    unpacked_sequence_tensor, _ = pad_packed_sequence(packed_sequence_output, batch_first=True)
    # We sorted by length, so if there are invalid rows that need to be zeroed out
    # they will be at the end.
    if num_valid < batch_size:
        unpacked_sequence_tensor[num_valid:, :, :] = 0.
    # Restore the original indices and return the sequence.
    return unpacked_sequence_tensor.index_select(0, restoration_indices)
def test_dropout_is_not_applied_to_output_or_returned_hidden_states(self):
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor,
                                                                self.sequence_lengths)
    tensor = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)
    lstm = AugmentedLstm(10, 11, recurrent_dropout_probability=0.5)

    output, (hidden_state, _) = lstm(tensor)
    output_sequence, _ = pad_packed_sequence(output, batch_first=True)

    # Test the returned output sequence.
    num_hidden_dims_zero_across_timesteps = ((output_sequence.sum(1) == 0).sum()).item()
    # If this is not zero then dropout has been applied to the output of the LSTM.
    assert not num_hidden_dims_zero_across_timesteps
    # Dropout should not be applied to the last hidden state, as it is not used
    # within the LSTM. This keeps the behaviour consistent with `torch.nn.LSTM`,
    # where dropout is not applied to any of its output, and with the Keras LSTM
    # implementation as well.
    hidden_state = hidden_state.squeeze()
    num_hidden_dims_zero_across_timesteps = ((hidden_state == 0).sum()).item()
    assert not num_hidden_dims_zero_across_timesteps
def forward(self,
            feature: torch.Tensor,
            feature_length: torch.Tensor,
            txt_label: Dict[str, torch.Tensor] = None,
            txt_length: torch.Tensor = None,
            meta_data: List[Dict[str, Any]] = None,
            **args: Any) -> Dict[str, torch.Tensor]:
    """
    Parameters:
        feature: (batch, T, feature)
        feature_length: (batch)
        txt_label: {"character": (batch, max_label_length)}
        txt_length: (batch)
    """
    if txt_label is not None:
        txt_label = txt_label['character'].view(-1)
    sorted_feature, sorted_feature_length, restore_idx, _ = sort_batch_by_length(
        feature, feature_length)
    sorted_feature = sorted_feature.transpose(-2, -1).unsqueeze(1)  # (batch, 1, feature, T)
    logits, output_lengths = self.speech_model(sorted_feature, sorted_feature_length)
    logits = logits.index_select(0, restore_idx)  # (batch, T, num_class)
    output_lengths = output_lengths.index_select(0, restore_idx)  # (batch)
    prob = F.log_softmax(logits, dim=-1)  # (batch, T, num_class)
    output_dict = {}
    if txt_label is not None and txt_length is not None:
        # Drop padding positions from the flattened labels before computing the CTC loss.
        txt_label = txt_label[txt_label.nonzero().squeeze(dim=-1)]  # (sum(txt_length))
        loss = F.ctc_loss(log_probs=prob.transpose(0, 1),
                          targets=txt_label.int(),
                          input_lengths=output_lengths.int(),
                          target_lengths=txt_length.int())
        output_dict['loss'] = loss
    return output_dict
def sort_and_run_forward(self,
                         module: Callable[[PackedSequence, Optional[RnnState]],
                                          Tuple[Union[PackedSequence, torch.Tensor], RnnState]],
                         inputs: torch.Tensor,
                         mask: torch.Tensor,
                         hidden_state: Optional[RnnState] = None):
    """
    This function exists because Pytorch RNNs require that their inputs be sorted before
    being passed as input. As all of our Seq2xxxEncoders use this functionality, it is
    provided in a base class. This method can be called on any module which takes as input
    a ``PackedSequence`` and some ``hidden_state``, which can either be a tuple of tensors
    or a tensor.

    As all of our Seq2xxxEncoders have different return types, we return `sorted` outputs
    from the module, which is called directly. Additionally, we return the indices into the
    batch dimension required to restore the tensor to its correct, unsorted order and the
    number of valid batch elements (i.e. the number of elements in the batch which are not
    completely masked). This un-sorting and re-padding of the module outputs is left to the
    subclasses because their outputs have different types and handling them smoothly here
    is difficult.

    Parameters
    ----------
    module : ``Callable[[PackedSequence, Optional[RnnState]],
                        Tuple[Union[PackedSequence, torch.Tensor], RnnState]]``, required.
        A function to run on the inputs. In most cases, this is a ``torch.nn.Module``.
    inputs : ``torch.Tensor``, required.
        A tensor of shape ``(batch_size, sequence_length, embedding_size)`` representing
        the inputs to the Encoder.
    mask : ``torch.Tensor``, required.
        A tensor of shape ``(batch_size, sequence_length)``, representing masked and
        non-masked elements of the sequence for each element in the batch.
    hidden_state : ``Optional[RnnState]``, (default = None).
        A single tensor of shape (num_layers, batch_size, hidden_size) representing the
        state of an RNN, or a tuple of tensors of shapes (num_layers, batch_size, hidden_size)
        and (num_layers, batch_size, memory_size), representing the hidden state and memory
        state of an LSTM-like RNN.

    Returns
    -------
    module_output : ``Union[torch.Tensor, PackedSequence]``.
        A Tensor or PackedSequence representing the output of the Pytorch Module.
        The batch size dimension will be equal to ``num_valid``, as sequences of zero
        length are clipped off before the module is called, as Pytorch cannot handle
        zero length sequences.
    final_states : ``Optional[RnnState]``
        A Tensor representing the hidden state of the Pytorch Module. This can either be a
        single tensor of shape (num_layers, num_valid, hidden_size), for instance in the case
        of a GRU, or a tuple of tensors, such as those required for an LSTM.
    restoration_indices : ``torch.LongTensor``
        A tensor of shape ``(batch_size,)``, describing the re-indexing required to transform
        the outputs back to their original batch order.
    """
    # In some circumstances you may have sequences of zero length. ``pack_padded_sequence``
    # requires all sequence lengths to be > 0, so remove sequences of zero length before
    # calling self._module, then fill with zeros.
    # First count how many sequences are empty.
    batch_size = mask.size(0)
    num_valid = torch.sum(mask[:, 0]).int().item()
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices = \
        sort_batch_by_length(inputs, sequence_lengths)
    # Now create a PackedSequence with only the non-empty, sorted sequences.
    packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :],
                                                 sorted_sequence_lengths[:num_valid].data.tolist(),
                                                 batch_first=True)
    # Prepare the initial states.
    if not self.stateful:
        if hidden_state is None:
            initial_states = hidden_state
        elif isinstance(hidden_state, tuple):
            initial_states = [state.index_select(1, sorting_indices)[:, :num_valid, :].contiguous()
                              for state in hidden_state]
        else:
            initial_states = hidden_state.index_select(1, sorting_indices)[:, :num_valid, :].contiguous()
    else:
        initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices)
    # Actually call the module on the sorted PackedSequence.
    module_output, final_states = module(packed_sequence_input, initial_states)
    return module_output, final_states, restoration_indices
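# A hedged usage sketch (not from the original code) of how a Seq2Seq-style subclass
# of the base encoder above might call sort_and_run_forward and then undo the
# length-based sort. `forward_sketch`, `self._module`, and the zero-padding of the
# clipped rows are our illustration; time-dimension re-padding is omitted for brevity.
import torch
from torch.nn.utils.rnn import pad_packed_sequence

def forward_sketch(self, inputs, mask, hidden_state=None):
    batch_size = mask.size(0)
    packed_output, _, restoration_indices = self.sort_and_run_forward(
        self._module, inputs, mask, hidden_state)
    unpacked, _ = pad_packed_sequence(packed_output, batch_first=True)
    num_valid, length, dim = unpacked.size()
    # Completely masked rows were clipped off before the RNN ran, so pad them
    # back with zeros before restoring the original batch order.
    if num_valid < batch_size:
        zeros = unpacked.new_zeros(batch_size - num_valid, length, dim)
        unpacked = torch.cat([unpacked, zeros], dim=0)
    return unpacked.index_select(0, restoration_indices)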
def forward(self, passage, question):
    """
    The forward pass of the RNN-based model.

    Parameters
    ----------
    passage: Variable(LongTensor)
        A Variable(LongTensor) of shape (batch_size, passage_length) representing
        the words in the passage for each batch.
    question: Variable(LongTensor)
        A Variable(LongTensor) of shape (batch_size, question_length) representing
        the words in the question for each batch.

    Returns
    -------
    An output dictionary consisting of:
    start_logits: Variable(FloatTensor)
        Variable(FloatTensor) of shape (batch_size, max_passage_size). Each value is the
        score assigned to a given token. Masked indices are assigned very small scores (-1e7).
    end_logits: Variable(FloatTensor)
        Variable(FloatTensor) of shape (batch_size, max_passage_size). Each value is the
        score assigned to a given token. Masked indices are assigned very small scores (-1e7).
    softmax_start_logits: Variable(FloatTensor)
        Variable(FloatTensor) of shape (batch_size, max_passage_size). Exactly the same as
        start_logits, but with a masked log softmax applied. Represents a probability
        distribution over the passage, indicating the probability that any given token is
        where the answer begins. Masked indices have probability mass of -inf.
    softmax_end_logits: Variable(FloatTensor)
        Variable(FloatTensor) of shape (batch_size, max_passage_size). Exactly the same as
        end_logits, but with a masked log softmax applied. Represents a probability
        distribution over the passage, indicating the probability that any given token is
        where the answer ends. Masked indices have probability mass of -inf.
    """
    # Masks: FloatTensors with 0 in positions that are padding (word index 0)
    # and 1 in positions with actual words.
    # Shape: (batch_size, passage_length) and (batch_size, question_length)
    passage_mask = (passage != 0).type(
        torch.cuda.FloatTensor if passage.is_cuda else torch.FloatTensor)
    question_mask = (question != 0).type(
        torch.cuda.FloatTensor if question.is_cuda else torch.FloatTensor)
    # Number of non-padding words in each passage / question, kept as float tensors
    # for use in later computations.
    # Shape: (batch_size,)
    passageLengths = passage_mask.sum(dim=1)
    questionLengths = question_mask.sum(dim=1)

    # Part 1: Embed the passages and the questions.
    # Shapes: (batch_size, passage_length, embedding_dim) and
    #         (batch_size, question_length, embedding_dim)
    embedded_passage = self.embedding(passage)
    embedded_question = self.embedding(question)

    # Part 2: Encode the embedded passages with the RNN.
    # 2.1. Sort embedded passages by decreasing order of passage lengths.
    sorted_passage, sorted_passage_lengths, passage_restoration, _ = sort_batch_by_length(
        embedded_passage, passageLengths)
    # 2.2. Pack the passages; packing removes the padding positions so the RNN
    # does not process them.
    packed_passage = pack_padded_sequence(sorted_passage, sorted_passage_lengths, batch_first=True)
    # 2.3. Encode the packed passages with the RNN. The output is a packed sequence.
    passageEncoding, passageHidden = self.gruPassage(packed_passage)
    # 2.4. Unpack (pad) the encoded passages.
    # Shape: (batch_size, passage_length, hidden_size)
    passage_unpacked, lens_unpacked = pad_packed_sequence(passageEncoding, batch_first=True)
    # 2.5. Unsort the unpacked, encoded passages to restore the initial ordering.
    unsorted_passage = passage_unpacked.index_select(0, passage_restoration)

    # Part 3: Encode the embedded questions with the RNN.
    # 3.1. Sort the embedded questions by decreasing order of question lengths.
    sorted_question, sorted_question_lengths, question_restoration, _ = sort_batch_by_length(
        embedded_question, questionLengths)
    # 3.2. Pack the questions.
    packed_question = pack_padded_sequence(sorted_question, sorted_question_lengths, batch_first=True)
    # 3.3. Encode the questions with the bidirectional GRU. The output is a packed sequence.
    questionEncoding, questionHidden = self.gruQuestion(packed_question)
    # 3.4. Unpack (pad) the encoded questions.
    # Shape: (batch_size, question_length, hidden_size)
    question_unpacked, lens_unpacked = pad_packed_sequence(questionEncoding, batch_first=True)
    # 3.5. Unsort the unpacked, encoded questions to restore the initial ordering.
    unsorted_question = question_unpacked.index_select(0, question_restoration)
    # 3.6. Take the average of the GRU hidden states, being careful with padding:
    # zero out padded positions, then divide the sum by the true question lengths.
    questionProduct = question_mask.unsqueeze(-1) * unsorted_question
    questionRepresent = torch.sum(questionProduct, dim=1) / questionLengths.unsqueeze(1)

    # Part 4: Combine the passage and question representations by concatenating
    # them with their elementwise product.
    # 4.1. Reshape the question encoding to make it amenable to concatenation.
    # Shape: (batch_size, max_passage_length, hidden_size)
    tiled_encoded_q = questionRepresent.unsqueeze(dim=1).expand_as(unsorted_passage)
    # 4.2. Concatenate to make the combined representation.
    # Shape: (batch_size, max_passage_size, 6 * embedding_dim)
    combined_x_q = torch.cat([unsorted_passage,
                              tiled_encoded_q,
                              unsorted_passage * tiled_encoded_q], dim=-1)

    # Part 5: Compute logits for the answer start index.
    # 5.1. Apply the affine transformation and drop the trailing dimension.
    start_logits = self.start_output_projection(combined_x_q).squeeze(-1)
    # 5.2. Replace the masked values so they have a very low score (-1e7).
    start_logits = replace_masked_values(start_logits, passage_mask, -1e7)
    # 5.3. Apply a padding-aware log-softmax to normalize.
    softmax_start_logits = masked_log_softmax(start_logits, passage_mask)

    # Part 6: Compute logits for the answer end index, in the same way.
    end_logits = self.end_output_projection(combined_x_q).squeeze(-1)
    end_logits = replace_masked_values(end_logits, passage_mask, -1e7)
    softmax_end_logits = masked_log_softmax(end_logits, passage_mask)

    # Part 7: Output a dictionary with the start and end logits and their
    # masked log-softmax versions.
    return {
        "start_logits": start_logits,
        "end_logits": end_logits,
        "softmax_start_logits": softmax_start_logits,
        "softmax_end_logits": softmax_end_logits,
    }
def sort_and_run_forward(self,
                         module: Callable[[PackedSequence, Optional[RnnState]],
                                          Tuple[Union[PackedSequence, torch.Tensor], RnnState]],
                         inputs: torch.Tensor,
                         mask: torch.Tensor,
                         hidden_state: Optional[RnnState] = None,
                         prevs=None,
                         rev_prevs=None):
    # In some circumstances you may have sequences of zero length. ``pack_padded_sequence``
    # requires all sequence lengths to be > 0, so remove sequences of zero length before
    # calling self._module, then fill with zeros.
    # First count how many sequences are empty.
    batch_size = mask.size(0)
    num_valid = torch.sum(mask[:, 0]).int().item()
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices = sort_batch_by_length(
        inputs, sequence_lengths)
    # Re-order the auxiliary per-sequence inputs to match the sorted batch and drop
    # the completely masked sequences.
    prevs = [prevs[i] for i in sorting_indices][:num_valid]
    rev_prevs = [rev_prevs[i] for i in sorting_indices][:num_valid]
    # Now create a PackedSequence with only the non-empty, sorted sequences.
    packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :],
                                                 sorted_sequence_lengths[:num_valid].data.tolist(),
                                                 batch_first=True)
    # Prepare the initial states.
    if not self.stateful:
        if hidden_state is None:
            initial_states: Any = hidden_state
        elif isinstance(hidden_state, tuple):
            initial_states = [state.index_select(1, sorting_indices)[:, :num_valid, :].contiguous()
                              for state in hidden_state]
        else:
            initial_states = hidden_state.index_select(1, sorting_indices)[:, :num_valid, :].contiguous()
    else:
        initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices)
    # Actually call the module on the sorted PackedSequence.
    module_output, final_states = module(packed_sequence_input, initial_states, prevs, rev_prevs)
    return module_output, final_states, restoration_indices
def sort_and_run_forward(self,
                         module: Callable[[PackedSequence, Optional[RnnState]],
                                          Tuple[Union[PackedSequence, torch.Tensor], RnnState]],
                         inputs: torch.Tensor,
                         mask: torch.Tensor,
                         hidden_state: Optional[RnnState] = None):
    """
    This function exists because Pytorch RNNs require that their inputs be sorted before
    being passed as input. As all of our Seq2xxxEncoders use this functionality, it is
    provided in a base class. This method can be called on any module which takes as input
    a `PackedSequence` and some `hidden_state`, which can either be a tuple of tensors or
    a tensor.

    As all of our Seq2xxxEncoders have different return types, we return `sorted` outputs
    from the module, which is called directly. Additionally, we return the indices into the
    batch dimension required to restore the tensor to its correct, unsorted order and the
    number of valid batch elements (i.e. the number of elements in the batch which are not
    completely masked). This un-sorting and re-padding of the module outputs is left to the
    subclasses because their outputs have different types and handling them smoothly here
    is difficult.

    # Parameters

    module : `Callable[[PackedSequence, Optional[RnnState]],
                       Tuple[Union[PackedSequence, torch.Tensor], RnnState]]`, required.
        A function to run on the inputs. In most cases, this is a `torch.nn.Module`.
    inputs : `torch.Tensor`, required.
        A tensor of shape `(batch_size, sequence_length, embedding_size)` representing
        the inputs to the Encoder.
    mask : `torch.Tensor`, required.
        A tensor of shape `(batch_size, sequence_length)`, representing masked and
        non-masked elements of the sequence for each element in the batch.
    hidden_state : `Optional[RnnState]`, (default = None).
        A single tensor of shape (num_layers, batch_size, hidden_size) representing the
        state of an RNN, or a tuple of tensors of shapes (num_layers, batch_size, hidden_size)
        and (num_layers, batch_size, memory_size), representing the hidden state and memory
        state of an LSTM-like RNN.

    # Returns

    module_output : `Union[torch.Tensor, PackedSequence]`.
        A Tensor or PackedSequence representing the output of the Pytorch Module.
        The batch size dimension will be equal to `num_valid`, as sequences of zero
        length are clipped off before the module is called, as Pytorch cannot handle
        zero length sequences.
    final_states : `Optional[RnnState]`
        A Tensor representing the hidden state of the Pytorch Module. This can either be a
        single tensor of shape (num_layers, num_valid, hidden_size), for instance in the case
        of a GRU, or a tuple of tensors, such as those required for an LSTM.
    restoration_indices : `torch.LongTensor`
        A tensor of shape `(batch_size,)`, describing the re-indexing required to transform
        the outputs back to their original batch order.
    """
    # In some circumstances you may have sequences of zero length. `pack_padded_sequence`
    # requires all sequence lengths to be > 0, so remove sequences of zero length before
    # calling self._module, then fill with zeros.
    # First count how many sequences are empty.
    batch_size = mask.size(0)
    num_valid = torch.sum(mask[:, 0]).int().item()
    sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
    (
        sorted_inputs,
        sorted_sequence_lengths,
        restoration_indices,
        sorting_indices,
    ) = sort_batch_by_length(inputs, sequence_lengths)
    # Now create a PackedSequence with only the non-empty, sorted sequences.
    packed_sequence_input = pack_padded_sequence(
        sorted_inputs[:num_valid, :, :],
        sorted_sequence_lengths[:num_valid].data.tolist(),
        batch_first=True,
    )
    # Prepare the initial states.
    if not self.stateful:
        if hidden_state is None:
            initial_states: Any = hidden_state
        elif isinstance(hidden_state, tuple):
            initial_states = [
                state.index_select(1, sorting_indices)[:, :num_valid, :].contiguous()
                for state in hidden_state
            ]
        else:
            initial_states = hidden_state.index_select(1, sorting_indices)[
                :, :num_valid, :
            ].contiguous()
    else:
        initial_states = self._get_initial_states(batch_size, num_valid, sorting_indices)
    # Actually call the module on the sorted PackedSequence.
    module_output, final_states = module(packed_sequence_input, initial_states)
    return module_output, final_states, restoration_indices