def test_add_sentence_boundary_token_ids_handles_2D_input(self):
    tensor = torch.from_numpy(numpy.array([[1, 2, 3], [4, 5, 0]]))
    mask = (tensor > 0).long()
    bos = 9
    eos = 10
    new_tensor, new_mask = util.add_sentence_boundary_token_ids(tensor, mask, bos, eos)
    expected_new_tensor = numpy.array([[9, 1, 2, 3, 10],
                                       [9, 4, 5, 10, 0]])
    assert (new_tensor.data.numpy() == expected_new_tensor).all()
    assert (new_mask.data.numpy() == (expected_new_tensor > 0)).all()
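# The 2D and 3D boundary tests in this section exercise AllenNLP's
# `add_sentence_boundary_token_ids`. For reference, a minimal sketch of the 2D
# behavior the test above expects, assuming zero is the padding index (an
# illustration, not the library's exact implementation):
import torch

def add_sentence_boundary_token_ids_2d_sketch(tensor, mask, bos, eos):
    sequence_lengths = mask.sum(dim=1)
    batch_size, timesteps = tensor.size()
    # Two extra timesteps make room for the boundary tokens.
    new_tensor = tensor.new_zeros(batch_size, timesteps + 2)
    new_tensor[:, 1:-1] = tensor
    new_tensor[:, 0] = bos
    for i, length in enumerate(sequence_lengths):
        # EOS goes right after the last unmasked token; padding stays zero.
        new_tensor[i, length + 1] = eos
    new_mask = (new_tensor != 0).long()
    return new_tensor, new_mask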
def forward(self,  # type: ignore
            inputs: torch.Tensor) -> Dict[str, torch.Tensor]:
    """
    Parameters
    ----------
    inputs : ``torch.Tensor``
        Shape ``(batch_size, timesteps, ...)`` of token ids representing the current
        batch. These must have been produced using the same indexer the LM was trained on.

    Returns
    -------
    The bidirectional language model representations for the input sequence, shape
    ``(batch_size, timesteps, embedding_dim)``
    """
    # pylint: disable=arguments-differ
    if self._bos_indices is not None:
        mask = get_text_field_mask({"": inputs})
        inputs, mask = add_sentence_boundary_token_ids(
            inputs, mask, self._bos_indices, self._eos_indices
        )

    source = {self._token_name: inputs}
    result_dict = self._lm(source)

    # shape (batch_size, timesteps, embedding_size)
    noncontextual_token_embeddings = result_dict["noncontextual_token_embeddings"]
    contextual_embeddings = result_dict["lm_embeddings"]

    # Typically the non-contextual embeddings are smaller than the contextualized
    # embeddings. Since we're averaging all the layers we need to make their dimensions
    # match. Simply repeating the non-contextual embeddings is a crude, but effective,
    # way to do this.
    duplicated_character_embeddings = torch.cat(
        [noncontextual_token_embeddings] * self._character_embedding_duplication_count, -1
    )
    averaged_embeddings = self._scalar_mix(
        [duplicated_character_embeddings] + contextual_embeddings
    )

    # Add dropout
    averaged_embeddings = self._dropout(averaged_embeddings)

    if self._remove_bos_eos:
        averaged_embeddings, _ = remove_sentence_boundaries(
            averaged_embeddings, result_dict["mask"]
        )

    return averaged_embeddings
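# A hypothetical illustration (sizes assumed, not from the source) of why the
# duplication count above exists: repeating the non-contextual embeddings makes
# their width match the contextual layers so the scalar mix can average them.
import torch

noncontextual_dim = 512    # assumed character-CNN output size
contextual_dim = 1024      # assumed bidirectional LM hidden size
duplication_count = contextual_dim // noncontextual_dim  # -> 2

noncontextual = torch.randn(2, 7, noncontextual_dim)
duplicated = torch.cat([noncontextual] * duplication_count, dim=-1)
assert duplicated.shape == (2, 7, contextual_dim)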
def test_add_sentence_boundary_token_ids_handles_3D_input(self):
    tensor = torch.from_numpy(
        numpy.array([[[1, 2, 3, 4], [5, 5, 5, 5], [6, 8, 1, 2]],
                     [[4, 3, 2, 1], [8, 7, 6, 5], [0, 0, 0, 0]]]))
    mask = ((tensor > 0).sum(dim=-1) > 0).type(torch.LongTensor)
    bos = torch.from_numpy(numpy.array([9, 9, 9, 9]))
    eos = torch.from_numpy(numpy.array([10, 10, 10, 10]))
    new_tensor, new_mask = util.add_sentence_boundary_token_ids(tensor, mask, bos, eos)
    expected_new_tensor = numpy.array([[[9, 9, 9, 9], [1, 2, 3, 4], [5, 5, 5, 5],
                                        [6, 8, 1, 2], [10, 10, 10, 10]],
                                       [[9, 9, 9, 9], [4, 3, 2, 1], [8, 7, 6, 5],
                                        [10, 10, 10, 10], [0, 0, 0, 0]]])
    assert (new_tensor.data.numpy() == expected_new_tensor).all()
    assert (new_mask.data.numpy() == ((expected_new_tensor > 0).sum(axis=-1) > 0)).all()
def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]:  # pylint: disable=arguments-differ
    """
    Compute context insensitive token embeddings for ELMo representations.

    Parameters
    ----------
    inputs : ``torch.Tensor``
        Shape ``(batch_size, sequence_length, 50)`` of character ids representing the
        current batch.

    Returns
    -------
    Dict with keys:
    ``'token_embedding'`` : ``torch.Tensor``
        Shape ``(batch_size, sequence_length + 2, embedding_dim)`` tensor with context
        insensitive token representations.
    ``'mask'`` : ``torch.Tensor``
        Shape ``(batch_size, sequence_length + 2)`` long tensor with sequence mask.
    """
    # Add BOS/EOS
    mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
    character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        inputs,
        mask,
        self._beginning_of_sentence_characters,
        self._end_of_sentence_characters
    )

    # the character id embedding
    max_chars_per_token = self._options['char_cnn']['max_characters_per_token']
    # (batch_size * sequence_length, max_chars_per_token, embed_dim)
    character_embedding = torch.nn.functional.embedding(
        character_ids_with_bos_eos.view(-1, max_chars_per_token),
        self._char_embedding_weights
    )

    # run convolutions
    cnn_options = self._options['char_cnn']
    if cnn_options['activation'] == 'tanh':
        activation = torch.nn.functional.tanh
    elif cnn_options['activation'] == 'relu':
        activation = torch.nn.functional.relu
    else:
        raise ConfigurationError("Unknown activation")

    # (batch_size * sequence_length, embed_dim, max_chars_per_token)
    character_embedding = torch.transpose(character_embedding, 1, 2)
    convs = []
    for i in range(len(self._convolutions)):
        conv = getattr(self, 'char_conv_{}'.format(i))
        convolved = conv(character_embedding)
        # (batch_size * sequence_length, n_filters for this width)
        convolved, _ = torch.max(convolved, dim=-1)
        convolved = activation(convolved)
        convs.append(convolved)

    # (batch_size * sequence_length, n_filters)
    token_embedding = torch.cat(convs, dim=-1)

    # apply the highway layers (batch_size * sequence_length, n_filters)
    token_embedding = self._highways(token_embedding)

    # final projection (batch_size * sequence_length, embedding_dim)
    token_embedding = self._projection(token_embedding)

    # reshape to (batch_size, sequence_length, embedding_dim)
    batch_size, sequence_length, _ = character_ids_with_bos_eos.size()
    return {
        'mask': mask_with_bos_eos,
        'token_embedding': token_embedding.view(batch_size, sequence_length, -1)
    }
def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]:
    """
    Compute context insensitive token embeddings for ELMo representations.

    # Parameters

    inputs : `torch.Tensor`
        Shape `(batch_size, sequence_length, 50)` of character ids representing the
        current batch.

    # Returns

    Dict with keys:
    `'token_embedding'` : `torch.Tensor`
        Shape `(batch_size, sequence_length + 2, embedding_dim)` tensor with context
        insensitive token representations.
    `'mask'` : `torch.BoolTensor`
        Shape `(batch_size, sequence_length + 2)` boolean tensor with sequence mask.
    """
    # Add BOS/EOS
    mask = (inputs > 0).sum(dim=-1) > 0
    character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        inputs, mask, self._beginning_of_sentence_characters, self._end_of_sentence_characters
    )

    # the character id embedding
    max_chars_per_token = self._options["char_cnn"]["max_characters_per_token"]
    # (batch_size * sequence_length, max_chars_per_token, embed_dim)
    character_embedding = torch.nn.functional.embedding(
        character_ids_with_bos_eos.view(-1, max_chars_per_token),
        self._char_embedding_weights
    )

    # run convolutions
    cnn_options = self._options["char_cnn"]
    if cnn_options["activation"] == "tanh":
        activation = torch.tanh
    elif cnn_options["activation"] == "relu":
        activation = torch.nn.functional.relu
    else:
        raise ConfigurationError("Unknown activation")

    # (batch_size * sequence_length, embed_dim, max_chars_per_token)
    character_embedding = torch.transpose(character_embedding, 1, 2)
    convs = []
    for i in range(len(self._convolutions)):
        conv = getattr(self, "char_conv_{}".format(i))
        convolved = conv(character_embedding)
        # (batch_size * sequence_length, n_filters for this width)
        convolved, _ = torch.max(convolved, dim=-1)
        convolved = activation(convolved)
        convs.append(convolved)

    # (batch_size * sequence_length, n_filters)
    token_embedding = torch.cat(convs, dim=-1)

    # apply the highway layers (batch_size * sequence_length, n_filters)
    token_embedding = self._highways(token_embedding)

    # final projection (batch_size * sequence_length, embedding_dim)
    token_embedding = self._projection(token_embedding)

    # reshape to (batch_size, sequence_length, embedding_dim)
    batch_size, sequence_length, _ = character_ids_with_bos_eos.size()
    return {
        "mask": mask_with_bos_eos,
        "token_embedding": token_embedding.view(batch_size, sequence_length, -1),
    }
def forward(self,  # pylint: disable=arguments-differ
            inputs: torch.Tensor,
            word_inputs: torch.Tensor = None
            ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
    """
    Parameters
    ----------
    inputs : ``torch.Tensor``, required.
        Shape ``(batch_size, timesteps, 50)`` of character ids representing the
        current batch.
    word_inputs : ``torch.Tensor``, optional.
        If you passed a cached vocab, you can in addition pass a tensor of shape
        ``(batch_size, timesteps)``, which represents word ids that have been pre-cached.

    Returns
    -------
    Dict with keys:

    ``'activations'`` : ``List[torch.Tensor]``
        A list of activations at each layer of the network, each of shape
        ``(batch_size, timesteps + 2, embedding_dim)``
    ``'mask'`` : ``torch.Tensor``
        Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.

    Note that the output tensors all include additional special begin and end of sequence
    markers.
    """
    if self._word_embedding is not None and word_inputs is not None:
        try:
            mask_without_bos_eos = (word_inputs > 0).long()
            # The character cnn part is cached - just look it up.
            embedded_inputs = self._word_embedding(word_inputs)  # type: ignore
            # shape (batch_size, timesteps + 2, embedding_dim)
            type_representation, mask = add_sentence_boundary_token_ids(
                embedded_inputs,
                mask_without_bos_eos,
                self._bos_embedding,
                self._eos_embedding
            )
        except RuntimeError:
            # Back off to running the character convolutions,
            # as we might not have the words in the cache.
            token_embedding = self._token_embedder(inputs)
            mask = token_embedding['mask']
            type_representation = token_embedding['token_embedding']
    else:
        token_embedding = self._token_embedder(inputs)
        mask = token_embedding['mask']
        type_representation = token_embedding['token_embedding']
    lstm_outputs = self._elmo_lstm(type_representation, mask)

    # Prepare the output. The first layer is duplicated.
    # Because of minor differences in how masking is applied depending
    # on whether the char cnn layers are cached, we'll be defensive and
    # multiply by the mask here. It's not strictly necessary, as the
    # mask passed on is correct, but the values in the padded areas
    # of the char cnn representations can change.
    output_tensors = [
        torch.cat([type_representation, type_representation], dim=-1)
        * mask.float().unsqueeze(-1)
    ]
    for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0):
        output_tensors.append(layer_activations.squeeze(0))

    return {
        'activations': output_tensors,
        'mask': mask,
    }
def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]:  # pylint: disable=arguments-differ
    """
    Compute context insensitive token embeddings for ELMo representations.

    Parameters
    ----------
    inputs : ``torch.Tensor``
        Shape ``(batch_size, sequence_length, 50)`` of character ids representing the
        current batch.

    Returns
    -------
    Dict with keys:
    ``'token_embedding'`` : ``torch.Tensor``
        Shape ``(batch_size, sequence_length + 2, embedding_dim)`` tensor with context
        insensitive token representations.
    ``'mask'`` : ``torch.Tensor``
        Shape ``(batch_size, sequence_length + 2)`` long tensor with sequence mask.
    """
    # Add BOS/EOS
    mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
    character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        inputs, mask, self._beginning_of_sentence_characters, self._end_of_sentence_characters
    )

    # the character id embedding
    max_chars_per_token = self._options['char_cnn']['max_characters_per_token']
    # (batch_size * sequence_length, max_chars_per_token, embed_dim)
    character_embedding = torch.nn.functional.embedding(
        character_ids_with_bos_eos.view(-1, max_chars_per_token),
        self._char_embedding_weights
    )

    # run convolutions
    cnn_options = self._options['char_cnn']
    if cnn_options['activation'] == 'tanh':
        activation = torch.tanh
    elif cnn_options['activation'] == 'relu':
        activation = torch.nn.functional.relu
    else:
        raise ConfigurationError("Unknown activation")

    # (batch_size * sequence_length, embed_dim, max_chars_per_token)
    character_embedding = torch.transpose(character_embedding, 1, 2)
    convs = []
    for i in range(len(self._convolutions)):
        conv = getattr(self, 'char_conv_{}'.format(i))
        convolved = conv(character_embedding)
        # (batch_size * sequence_length, n_filters for this width)
        convolved, _ = torch.max(convolved, dim=-1)
        convolved = activation(convolved)
        convs.append(convolved)

    # (batch_size * sequence_length, n_filters)
    token_embedding = torch.cat(convs, dim=-1)

    # apply the highway layers (batch_size * sequence_length, n_filters)
    token_embedding = self._highways(token_embedding)

    # final projection (batch_size * sequence_length, embedding_dim)
    token_embedding = self._projection(token_embedding)

    # reshape to (batch_size, sequence_length, embedding_dim)
    batch_size, sequence_length, _ = character_ids_with_bos_eos.size()
    return {
        'mask': mask_with_bos_eos,
        'token_embedding': token_embedding.view(batch_size, sequence_length, -1)
    }
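# The char-CNN forwards above look up their convolutions via
# getattr(self, 'char_conv_{}'.format(i)). A minimal sketch of how such a stack
# could be registered; the filter widths and counts here are assumed for
# illustration, not taken from any particular options file:
import torch

class CharConvStack(torch.nn.Module):
    def __init__(self, char_embed_dim=16, filters=((1, 32), (2, 32), (3, 64))):
        super().__init__()
        self._convolutions = filters
        for i, (width, num_filters) in enumerate(filters):
            conv = torch.nn.Conv1d(
                in_channels=char_embed_dim,
                out_channels=num_filters,
                kernel_size=width,
                bias=True,
            )
            # Register under the attribute name the forward passes expect.
            setattr(self, 'char_conv_{}'.format(i), conv)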
def forward(  # type: ignore
    self,
    image_features: torch.Tensor,
    penultimate_features: torch.Tensor,
    caption_tokens: Optional[torch.Tensor] = None,
    fsm: Optional[torch.Tensor] = None,
    num_constraints: Optional[torch.Tensor] = None,
) -> Dict[str, torch.Tensor]:
    r"""
    Given bottom-up image features, maximize the likelihood of paired captions during
    training. During evaluation, decode captions given image features using beam search.

    Parameters
    ----------
    image_features : torch.Tensor
        A tensor of shape ``(batch_size, num_boxes, image_feature_size)``. ``num_boxes``
        for each instance in a batch might be different; instances with fewer boxes are
        padded with zeros up to ``num_boxes``.
    penultimate_features : torch.Tensor
        A tensor of shape ``(batch_size, channel, height, width)``, extracted from the
        penultimate layers of a saliency attentive model.
    caption_tokens : torch.Tensor, optional (default = None)
        A tensor of shape ``(batch_size, max_caption_length)`` of tokenized captions.
        This tensor does not contain ``@@BOUNDARY@@`` tokens yet. Captions are not
        provided during evaluation.
    fsm : torch.Tensor, optional (default = None)
        A tensor of shape ``(batch_size, num_states, num_states, vocab_size)``: finite
        state machines per instance, represented as adjacency matrices. For a particular
        instance, ``[_, s1, s2, v] = 1`` denotes a transition from state ``s1`` to ``s2``
        on decoding token ``v`` (a constraint). Would be ``None`` for regular beam
        search decoding.
    num_constraints : torch.Tensor, optional (default = None)
        A tensor of shape ``(batch_size, )`` containing the total number of given
        constraints for CBS. Would be ``None`` for regular beam search decoding.

    Returns
    -------
    Dict[str, torch.Tensor]
        Decoded captions and/or per-instance cross entropy loss, a dict with keys either
        ``{"predictions"}`` or ``{"loss"}``.
    """
    batch_size, num_boxes, image_feature_size = image_features.size()
    batch_size, channel, height, width = penultimate_features.size()
    penultimate_features = penultimate_features.view(
        batch_size, channel, -1).transpose(1, 2).contiguous()

    # Initialize states at zero-th timestep.
    states = None

    if self.training and caption_tokens is not None:
        # Add "@@BOUNDARY@@" tokens to caption sequences.
        caption_tokens, _ = add_sentence_boundary_token_ids(
            caption_tokens,
            (caption_tokens != self._pad_index),
            self._boundary_index,
            self._boundary_index,
        )
        batch_size, max_caption_length = caption_tokens.size()

        # shape: (batch_size, max_caption_length)
        tokens_mask = caption_tokens != self._pad_index

        # The last input from the target is either padding or the boundary token.
        # Either way, we don't have to process it.
        num_decoding_steps = max_caption_length - 1

        step_logits: List[torch.Tensor] = []
        step_logits_saliency: List[torch.Tensor] = []

        for timestep in range(num_decoding_steps):
            # shape: (batch_size,)
            input_tokens = caption_tokens[:, timestep]

            # shape: (batch_size, num_classes)
            output_logits, output_logits_saliency, states = self._decode_step(
                image_features, penultimate_features, input_tokens, states)

            # list of tensors, shape: (batch_size, 1, vocab_size)
            step_logits.append(output_logits.unsqueeze(1))
            step_logits_saliency.append(output_logits_saliency.unsqueeze(1))

        # shape: (batch_size, num_decoding_steps, vocab_size)
        logits = torch.cat(step_logits, 1)
        logits_saliency = torch.cat(step_logits_saliency, 1)

        # Skip first time-step from targets for calculating loss.
        output_dict = {
            "loss": self._get_loss(
                logits,
                caption_tokens[:, 1:].contiguous(),
                tokens_mask[:, 1:].contiguous(),
            ),
            "loss_saliency": self._get_loss(
                logits_saliency,
                caption_tokens[:, 1:].contiguous(),
                tokens_mask[:, 1:].contiguous(),
            ),
        }
    else:
        num_decoding_steps = self._max_caption_length

        start_predictions = image_features.new_full(
            (batch_size, ), self._boundary_index).long()

        # Add image features as a default argument to match callable signature
        # acceptable by beam search class (previous predictions and states only).
        beam_decode_step = functools.partial(
            self._decode_step, image_features, penultimate_features)

        # shape (all_top_k_predictions): (batch_size, net_beam_size, num_decoding_steps)
        # shape (log_probabilities): (batch_size, net_beam_size)
        if self._use_cbs:
            all_top_k_predictions, log_probabilities = self._beam_search.search(
                start_predictions, states, beam_decode_step, fsm)
            if self._is_val:
                best_beam = select_best_beam_with_constraints(
                    all_top_k_predictions,
                    log_probabilities,
                    num_constraints,
                    self._min_constraints_to_satisfy,
                )
            else:
                valid_beam, valid_log_probabilities, valid_num = select_valid_beam_with_constraints(
                    all_top_k_predictions,
                    log_probabilities,
                    num_constraints,
                    self._min_constraints_to_satisfy,
                )
        else:
            all_top_k_predictions, log_probabilities = self._beam_search.search(
                start_predictions, states, beam_decode_step)
            best_beam = select_best_beam(all_top_k_predictions, log_probabilities)

        if self._is_val:
            # shape: (batch_size, num_decoding_steps)
            output_dict = {"predictions": best_beam}
        else:
            # Note: this branch assumes CBS was used, since ``valid_beam`` and the
            # other ``valid_*`` values are only produced in the constrained path above.
            # shape: (batch_size * beam_size, num_decoding_steps)
            output_dict = {
                "predictions": valid_beam,
                "log_probabilities": valid_log_probabilities,
                "valid_numbers": valid_num,
            }

    return output_dict
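# ``_get_loss`` is not shown in this snippet. A plausible sketch, assuming it
# wraps AllenNLP's sequence_cross_entropy_with_logits and rescales the
# per-instance mean loss to a per-instance sum over decoding steps:
import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

def _get_loss_sketch(logits: torch.Tensor,
                     targets: torch.Tensor,
                     target_mask: torch.Tensor) -> torch.Tensor:
    # logits: (batch_size, num_decoding_steps, vocab_size)
    # targets, target_mask: (batch_size, num_decoding_steps)
    target_lengths = torch.sum(target_mask, dim=-1).float()
    # average=None keeps one loss value per instance.
    return target_lengths * sequence_cross_entropy_with_logits(
        logits, targets, target_mask, average=None
    )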
def test_fast_elmo_with_allennlp_do_layer_norm():
    fast = FastElmo(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
        num_output_representations=1,
        scalar_mix_parameters=[1.0, 1.0, 1.0],
        do_layer_norm=True,
    )
    allennlp = Elmo(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
        num_output_representations=1,
        dropout=0.0,
        scalar_mix_parameters=[1.0, 1.0, 1.0],
        do_layer_norm=True,
    )

    sentences = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    character_ids = _sentences_to_ids(sentences)

    fast_out = fast(character_ids)
    allennlp_out = allennlp(character_ids)

    # Since we don't include the BOS/EOS reprs during layer normalization,
    # the result will be different from AllenNLP's implementation.
    np.testing.assert_raises(
        AssertionError,
        np.testing.assert_array_almost_equal,
        fast_out['elmo_representations'][0],
        allennlp_out['elmo_representations'][0],
    )

    # We can pack BOS/EOS to inputs manually.
    _beginning_of_sentence_characters = torch.from_numpy(
        np.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
    _end_of_sentence_characters = torch.from_numpy(
        np.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)

    mask = ((character_ids > 0).long().sum(dim=-1) > 0).long()
    character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        character_ids,
        mask,
        _beginning_of_sentence_characters,
        _end_of_sentence_characters,
    )

    # And disable the mock BOS/EOS actions in FastElmo.
    fast.exec_managed_lstm_bos_eos = False

    fast_out_2 = fast(character_ids_with_bos_eos)
    fast_mixed_repr_2, _ = remove_sentence_boundaries(
        fast_out_2['elmo_representations'][0],
        fast_out_2['mask'],
    )

    allennlp_out_2 = allennlp(character_ids)

    np.testing.assert_array_almost_equal(
        fast_mixed_repr_2,
        allennlp_out_2['elmo_representations'][0],
    )
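# ``_sentences_to_ids`` is a test helper not shown here. A plausible sketch,
# assuming it behaves like AllenNLP's batch_to_ids, which converts tokenized
# sentences into the ``(batch_size, timesteps, 50)`` character-id tensor that
# both ELMo implementations consume:
from allennlp.modules.elmo import batch_to_ids

def _sentences_to_ids(sentences):
    return batch_to_ids(sentences)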
def test_elmo_lstm_factory_simple():
    allennlp_elmo_bilm = _ElmoBiLm(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    )

    embedder = ElmoCharacterEncoderFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create()
    fwd_lstm, bwd_lstm = ElmoLstmFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create(enable_forward=True, enable_backward=True)

    sentences_1 = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    sentences_2 = [
        ["This", "is", "a", "sentence"],
        ["Here", "'s", "one"],
        ["Another", "one"],
    ]

    # Internal states should be updated.
    for sentences in [sentences_1, sentences_2] * 10:
        # `(2, 7, 50)`
        character_ids = _sentences_to_ids(sentences)

        # AllenNLP.
        allennlp_out = allennlp_elmo_bilm(character_ids)

        # Ours.
        inputs = character_ids
        _beginning_of_sentence_characters = torch.from_numpy(
            np.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
        _end_of_sentence_characters = torch.from_numpy(
            np.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)

        # Add BOS/EOS.
        mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
        character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
            inputs,
            mask,
            _beginning_of_sentence_characters,
            _end_of_sentence_characters,
        )

        # Pack input.
        lengths = mask_with_bos_eos.sum(dim=-1)
        inputs = pack_padded_sequence(character_ids_with_bos_eos, lengths, batch_first=True)

        char_repr = embedder(inputs.data)
        fwd_lstm_hiddens, _ = fwd_lstm(char_repr, inputs.batch_sizes)
        bwd_lstm_hiddens, _ = bwd_lstm(char_repr, inputs.batch_sizes)
        lstm_hiddens = [
            torch.cat([fwd, bwd], dim=-1)
            for fwd, bwd in zip(fwd_lstm_hiddens, bwd_lstm_hiddens)
        ]

        # Unpack output.
        char_repr = _unpack(char_repr, inputs.batch_sizes)
        duplicated_char_repr = torch.cat(
            [char_repr, char_repr],
            dim=-1,
        ) * mask_with_bos_eos.float().unsqueeze(-1)
        lstm_hiddens = [_unpack(hx, inputs.batch_sizes) for hx in lstm_hiddens]

        # TODO: Investigate the numerical stability issue.
        # np.testing.assert_array_almost_equal(
        #     duplicated_char_repr.data.numpy(),
        #     allennlp_out['activations'][0].data.numpy(),
        # )
        # np.testing.assert_array_almost_equal(
        #     lstm_hiddens[0].data.numpy(),
        #     allennlp_out['activations'][1].data.numpy(),
        # )
        np.testing.assert_array_almost_equal(
            lstm_hiddens[1].data.numpy(),
            allennlp_out['activations'][2].data.numpy(),
        )
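# ``_unpack`` is another helper not shown here. A plausible sketch (an
# assumption, not the repository's definition): rebuild a padded batch from
# packed data using the recorded batch sizes.
from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence

def _unpack(data, batch_sizes):
    padded, _ = pad_packed_sequence(
        PackedSequence(data, batch_sizes), batch_first=True)
    return padded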
def forward(self,
            image_features,
            caption_tokens: Optional[torch.Tensor] = None,
            device: Optional[int] = 0):
    states = None
    batch_size, num_boxes, image_feature_size = image_features.size()

    if self.training and caption_tokens is not None:
        # Add "@@BOUNDARY@@" tokens to caption sequences.
        caption_tokens, _ = add_sentence_boundary_token_ids(
            caption_tokens,
            (caption_tokens != self._pad_index),
            self._boundary_index,
            self._boundary_index,
        )
        batch_size, max_caption_length = caption_tokens.size()

        # shape: (batch_size, max_caption_length)
        tokens_mask = caption_tokens != self._pad_index

        # The last input from the target is either padding or the boundary token.
        # Either way, we don't have to process it.
        num_decoding_steps = max_caption_length - 1

        image_features = self.adapt_image_features(image_features)
        image_features = self.encoder(image_features)
        image_features = self.adapt_again(image_features)

        step_logits: List[torch.Tensor] = []
        for timestep in range(num_decoding_steps):
            # shape: (batch_size,)
            input_tokens = caption_tokens[:, timestep]

            # shape: (batch_size, num_classes)
            output_logits, states = self._decode_step(
                image_features, input_tokens, states)

            # list of tensors, shape: (batch_size, 1, vocab_size)
            step_logits.append(output_logits.unsqueeze(1))

        # shape: (batch_size, num_decoding_steps, vocab_size)
        logits = torch.cat(step_logits, 1)

        # Skip first time-step from targets for calculating loss.
        output_dict = {
            "loss": self._get_loss(logits,
                                   caption_tokens[:, 1:].contiguous(),
                                   tokens_mask[:, 1:].contiguous())
        }
    else:
        num_decoding_steps = self.max_caption_length

        image_features = self.adapt_image_features(image_features)
        image_features = self.encoder(image_features)
        image_features = self.adapt_again(image_features)

        start_predictions = image_features.new_full(
            (batch_size, ), self._boundary_index).long()

        # Add image features as a default argument to match callable signature
        # acceptable by beam search class (previous predictions and states only).
        beam_decode_step = functools.partial(self._decode_step, image_features)

        # shape (all_top_k_predictions): (batch_size, net_beam_size, num_decoding_steps)
        # shape (log_probabilities): (batch_size, net_beam_size)
        # if self._use_cbs:
        #     all_top_k_predictions, log_probabilities = self._beam_search.search(
        #         start_predictions, states, beam_decode_step, fsm
        #     )
        #     best_beam = select_best_beam_with_constraints(
        #         all_top_k_predictions,
        #         log_probabilities,
        #         num_constraints,
        #         self._min_constraints_to_satisfy,
        #     )
        # else:
        all_top_k_predictions, log_probabilities = self._beam_search.search(
            start_predictions, states, beam_decode_step)
        best_beam = select_best_beam(all_top_k_predictions, log_probabilities)

        # shape: (batch_size, num_decoding_steps)
        output_dict = {"predictions": best_beam}

    return output_dict
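# ``select_best_beam`` used above is not defined in these snippets. A minimal
# sketch (an assumption about its contract): pick, per instance, the beam with
# the highest log-probability.
import torch

def select_best_beam_sketch(beams: torch.Tensor,
                            log_probabilities: torch.Tensor) -> torch.Tensor:
    # beams: (batch_size, beam_size, num_decoding_steps)
    # log_probabilities: (batch_size, beam_size)
    best_indices = log_probabilities.argmax(dim=-1)  # (batch_size,)
    return beams[torch.arange(beams.size(0)), best_indices]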
def forward(self,
            image_ids: torch.Tensor,
            image_features: torch.Tensor,
            caption_tokens: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
    r"""
    Given bottom-up image features, maximize the likelihood of paired captions during
    training. During evaluation, decode captions given image features using beam search.

    Parameters
    ----------
    image_ids : torch.Tensor
        A tensor of shape ``(batch_size, )`` with the ids of the images in the batch,
        used here to look up each image's finite state machine matrix during decoding.
    image_features : torch.Tensor
        A tensor of shape ``(batch_size, num_boxes * image_feature_size)``. ``num_boxes``
        for each instance in a batch might be different; instances with fewer boxes are
        padded with zeros up to ``num_boxes``.
    caption_tokens : torch.Tensor, optional (default = None)
        A tensor of shape ``(batch_size, max_caption_length)`` of tokenized captions.
        This tensor does not contain ``@@BOUNDARY@@`` tokens yet. Captions are not
        provided during evaluation.

    Returns
    -------
    Dict[str, torch.Tensor]
        Decoded captions and/or per-instance cross entropy loss, a dict with keys either
        ``{"predictions"}`` or ``{"loss"}``.
    """
    # shape: (batch_size, num_boxes * image_feature_size) for adaptive features.
    # shape: (batch_size, num_boxes, image_feature_size) for fixed features.
    batch_size = image_features.size(0)

    # shape: (batch_size, num_boxes, image_feature_size)
    image_features = image_features.view(batch_size, -1, self.image_feature_size)

    # Initialize states at zero-th timestep.
    states = None

    if self.training and caption_tokens is not None:
        # Add "@@BOUNDARY@@" tokens to caption sequences.
        caption_tokens, _ = add_sentence_boundary_token_ids(
            caption_tokens,
            (caption_tokens != self._pad_index),
            self._boundary_index,
            self._boundary_index,
        )
        _, max_caption_length = caption_tokens.size()

        # shape: (batch_size, max_caption_length)
        tokens_mask = caption_tokens != self._pad_index

        # The last input from the target is either padding or the boundary token.
        # Either way, we don't have to process it.
        num_decoding_steps = max_caption_length - 1

        step_logits: List[torch.Tensor] = []
        for timestep in range(num_decoding_steps):
            # shape: (batch_size,)
            input_tokens = caption_tokens[:, timestep]

            # shape: (batch_size, num_classes)
            output_logits, states = self._decode_step(
                image_features, input_tokens, states)

            # list of tensors, shape: (batch_size, 1, vocab_size)
            step_logits.append(output_logits.unsqueeze(1))

        # shape: (batch_size, num_decoding_steps, vocab_size)
        logits = torch.cat(step_logits, 1)

        # Skip first time-step from targets for calculating loss.
        output_dict = {
            "loss": self._get_loss(logits,
                                   caption_tokens[:, 1:].contiguous(),
                                   tokens_mask[:, 1:].contiguous())
        }
    else:
        num_decoding_steps = self._max_caption_length

        start_predictions = image_features.new_full(
            (batch_size, ), fill_value=self._boundary_index).long()

        state_transform_list = []
        state_size_list = []
        for image_id in image_ids:
            state_transform, state_size = self._fc.get_state_matrix(image_id)
            state_transform_list.append(state_transform)
            state_size_list.append(state_size)
        max_state = max(state_size_list)
        state_transform_list = [
            s[:, :max_state, :max_state, :] for s in state_transform_list
        ]
        state_transform = torch.from_numpy(
            np.concatenate(state_transform_list, axis=0)).to(start_predictions.device)

        # shape (best_predictions): (batch_size, num_decoding_steps)
        best_predictions = self._beam_search.search(
            self._decode_step, image_features, start_predictions, states,
            state_transform, image_ids)

        output_dict = {"predictions": best_predictions}

    return output_dict
def forward(self, program_tokens: torch.Tensor):
    r"""
    Given tokenized program sequences padded up to maximum length, predict the sequence
    at the next time-step and calculate the cross entropy loss of this predicted
    sequence.

    Parameters
    ----------
    program_tokens : torch.Tensor
        Tokenized program sequences padded with zeros up to maximum length.
        shape: (batch_size, max_sequence_length)

    Returns
    -------
    Dict[str, torch.Tensor]
        Predictions of next time-step and cross entropy loss (by teacher forcing), a
        dict with structure::

            {
                "predictions": torch.Tensor (shape: (batch_size, max_sequence_length - 1)),
                "loss": torch.Tensor (shape: (batch_size, ))
            }
    """
    batch_size = program_tokens.size(0)

    # Add "@start@" and "@end@" tokens to program sequences.
    program_tokens, _ = add_sentence_boundary_token_ids(
        program_tokens, (program_tokens != self._pad_index),
        self._start_index, self._end_index)
    program_tokens_mask = (program_tokens != self._pad_index).long()
    # Excluding @start@ token, because this is used with output of LSTM (next time-step).
    program_lengths = program_tokens_mask[:, 1:].sum(-1).float()

    # shape: (batch_size, max_sequence_length, input_size)
    embedded_programs = self._embedder({"programs": program_tokens})

    # shape: (batch_size, max_sequence_length, hidden_size)
    encoded_programs = self._encoder(embedded_programs, program_tokens_mask)

    # shape: (batch_size, max_sequence_length, input_size)
    output_projection = self._projection_layer(encoded_programs)
    # shape: (batch_size, max_sequence_length, vocab_size)
    output_logits = self._output_layer(output_projection)

    output_class_probabilities = F.softmax(output_logits, dim=-1)
    # Don't sample @start@, @@PADDING@@ and @@UNKNOWN@@.
    output_class_probabilities[:, :, self._start_index] = 0
    output_class_probabilities[:, :, self._pad_index] = 0
    output_class_probabilities[:, :, self._unk_index] = 0

    batch_predictions: List[torch.Tensor] = []
    for batch_index in range(output_class_probabilities.size(0)):
        # Perform ancestral sampling instead of greedy decoding.
        # shape: (max_sequence_length, )
        batch_index_predictions = torch.multinomial(
            output_class_probabilities[batch_index], 1).squeeze()
        batch_predictions.append(batch_index_predictions)

    # shape: (batch_size, max_sequence_length)
    predictions = torch.stack(batch_predictions, 0)
    # Multiply with mask just to be sure.
    predictions = predictions[:, :-1] * program_tokens_mask[:, 1:]

    # shape: (batch_size, )
    sequence_cross_entropy = sequence_cross_entropy_with_logits(
        output_logits[:, :-1, :].contiguous(),
        program_tokens[:, 1:].contiguous(),
        weights=program_tokens_mask[:, 1:],
        average=None,
    )

    # Record metrics aggregated over current batch during evaluation.
    if not self.training:
        self._log2_perplexity(sequence_cross_entropy.mean().item())

    return {"predictions": predictions, "loss": sequence_cross_entropy}