Example 1
    def test_add_sentence_boundary_token_ids_handles_2D_input(self):
        tensor = torch.from_numpy(numpy.array([[1, 2, 3], [4, 5, 0]]))
        mask = (tensor > 0).long()
        bos = 9
        eos = 10
        new_tensor, new_mask = util.add_sentence_boundary_token_ids(tensor, mask, bos, eos)
        expected_new_tensor = numpy.array([[9, 1, 2, 3, 10], [9, 4, 5, 10, 0]])
        assert (new_tensor.data.numpy() == expected_new_tensor).all()
        assert (new_mask.data.numpy() == (expected_new_tensor > 0)).all()
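For reference, here is a minimal, self-contained version of the call being tested. The token ids and the BOS/EOS ids are arbitrary, and depending on the AllenNLP version the mask may need to be a long tensor rather than a boolean one.

import torch
from allennlp.nn import util

tensor = torch.tensor([[1, 2, 3], [4, 5, 0]])   # 0 is the padding id
mask = tensor > 0                               # token-level mask
new_tensor, new_mask = util.add_sentence_boundary_token_ids(tensor, mask, 9, 10)
# new_tensor == [[9, 1, 2, 3, 10], [9, 4, 5, 10, 0]]: BOS/EOS are inserted inside the valid
# region of each row, padding stays at the end, and new_mask covers the two extra positions.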
Example 2
    def forward(self,  # type: ignore
                inputs: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Parameters
        ----------
        inputs: ``torch.Tensor``
            Shape ``(batch_size, timesteps, ...)`` of token ids representing the current batch.
            These must have been produced using the same indexer the LM was trained on.

        Returns
        -------
        The bidirectional language model representations for the input sequence, shape
        ``(batch_size, timesteps, embedding_dim)``
        """
        # pylint: disable=arguments-differ
        if self._bos_indices is not None:
            mask = get_text_field_mask({"": inputs})
            inputs, mask = add_sentence_boundary_token_ids(
                    inputs, mask, self._bos_indices, self._eos_indices
            )

        source = {self._token_name: inputs}
        result_dict = self._lm(source)

        # shape (batch_size, timesteps, embedding_size)
        noncontextual_token_embeddings = result_dict["noncontextual_token_embeddings"]
        contextual_embeddings = result_dict["lm_embeddings"]

        # Typically the non-contextual embeddings are smaller than the contextualized embeddings.
        # Since we're averaging all the layers we need to make their dimensions match. Simply
        # repeating the non-contextual embeddings is a crude, but effective, way to do this.
        duplicated_character_embeddings = torch.cat(
                [noncontextual_token_embeddings] * self._character_embedding_duplication_count, -1
        )
        averaged_embeddings = self._scalar_mix(
                [duplicated_character_embeddings] + contextual_embeddings
        )

        # Add dropout
        averaged_embeddings = self._dropout(averaged_embeddings)
        if self._remove_bos_eos:
            averaged_embeddings, _ = remove_sentence_boundaries(
                    averaged_embeddings, result_dict["mask"]
            )

        return averaged_embeddings
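The dimension-matching trick above is plain tensor repetition along the last axis. A sketch with made-up sizes; the uniform mean at the end stands in for the learned ScalarMix used in the snippet.

import torch

noncontextual = torch.randn(2, 7, 512)                      # (batch, timesteps, embedding_dim)
contextual = [torch.randn(2, 7, 1024) for _ in range(2)]    # contextual layers, twice as wide

duplication_count = contextual[0].size(-1) // noncontextual.size(-1)   # 2
duplicated = torch.cat([noncontextual] * duplication_count, dim=-1)    # (2, 7, 1024)

# The model above mixes these with a learned ScalarMix; a uniform mean is the simplest stand-in.
averaged = torch.stack([duplicated] + contextual, dim=0).mean(dim=0)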
Example 3
    def test_add_sentence_boundary_token_ids_handles_3D_input(self):
        tensor = torch.from_numpy(numpy.array(
            [[[1, 2, 3, 4],
              [5, 5, 5, 5],
              [6, 8, 1, 2]],
             [[4, 3, 2, 1],
              [8, 7, 6, 5],
              [0, 0, 0, 0]]]))
        mask = ((tensor > 0).sum(dim=-1) > 0).type(torch.LongTensor)
        bos = torch.from_numpy(numpy.array([9, 9, 9, 9]))
        eos = torch.from_numpy(numpy.array([10, 10, 10, 10]))
        new_tensor, new_mask = util.add_sentence_boundary_token_ids(tensor, mask, bos, eos)
        expected_new_tensor = numpy.array(
            [[[9, 9, 9, 9],
              [1, 2, 3, 4],
              [5, 5, 5, 5],
              [6, 8, 1, 2],
              [10, 10, 10, 10]],
             [[9, 9, 9, 9],
              [4, 3, 2, 1],
              [8, 7, 6, 5],
              [10, 10, 10, 10],
              [0, 0, 0, 0]]])
        assert (new_tensor.data.numpy() == expected_new_tensor).all()
        assert (new_mask.data.numpy() == ((expected_new_tensor > 0).sum(axis=-1) > 0)).all()
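`remove_sentence_boundaries` (used elsewhere in these examples) is the inverse operation. A quick round-trip sketch, assuming a recent AllenNLP where boolean masks are accepted:

import torch
from allennlp.nn import util

tensor = torch.tensor([[[1, 2, 3, 4], [5, 5, 5, 5], [0, 0, 0, 0]]])   # (1, 3, 4), last row padded
mask = (tensor > 0).any(dim=-1)
bos = torch.tensor([9, 9, 9, 9])
eos = torch.tensor([10, 10, 10, 10])

with_bounds, bound_mask = util.add_sentence_boundary_token_ids(tensor, mask, bos, eos)
recovered, recovered_mask = util.remove_sentence_boundaries(with_bounds, bound_mask)
# recovered contains the original valid rows again; padded positions are zeroed out.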
Example 4
    def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]:  # pylint: disable=arguments-differ
        """
        Compute context insensitive token embeddings for ELMo representations.

        Parameters
        ----------
        inputs: ``torch.autograd.Variable``
            Shape ``(batch_size, sequence_length, 50)`` of character ids representing the
            current batch.

        Returns
        -------
        Dict with keys:
        ``'token_embedding'``: ``torch.autograd.Variable``
            Shape ``(batch_size, sequence_length + 2, embedding_dim)`` tensor with context
            insensitive token representations.
        ``'mask'``:  ``torch.autograd.Variable``
            Shape ``(batch_size, sequence_length + 2)`` long tensor with sequence mask.
        """
        # Add BOS/EOS
        mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
        character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
                inputs,
                mask,
                self._beginning_of_sentence_characters,
                self._end_of_sentence_characters
        )

        # the character id embedding
        max_chars_per_token = self._options['char_cnn']['max_characters_per_token']
        # (batch_size * sequence_length, max_chars_per_token, embed_dim)
        character_embedding = torch.nn.functional.embedding(
                character_ids_with_bos_eos.view(-1, max_chars_per_token),
                self._char_embedding_weights
        )

        # run convolutions
        cnn_options = self._options['char_cnn']
        if cnn_options['activation'] == 'tanh':
            activation = torch.nn.functional.tanh
        elif cnn_options['activation'] == 'relu':
            activation = torch.nn.functional.relu
        else:
            raise ConfigurationError("Unknown activation")

        # (batch_size * sequence_length, embed_dim, max_chars_per_token)
        character_embedding = torch.transpose(character_embedding, 1, 2)
        convs = []
        for i in range(len(self._convolutions)):
            conv = getattr(self, 'char_conv_{}'.format(i))
            convolved = conv(character_embedding)
            # (batch_size * sequence_length, n_filters for this width)
            convolved, _ = torch.max(convolved, dim=-1)
            convolved = activation(convolved)
            convs.append(convolved)

        # (batch_size * sequence_length, n_filters)
        token_embedding = torch.cat(convs, dim=-1)

        # apply the highway layers (batch_size * sequence_length, n_filters)
        token_embedding = self._highways(token_embedding)

        # final projection  (batch_size * sequence_length, embedding_dim)
        token_embedding = self._projection(token_embedding)

        # reshape to (batch_size, sequence_length, embedding_dim)
        batch_size, sequence_length, _ = character_ids_with_bos_eos.size()

        return {
                'mask': mask_with_bos_eos,
                'token_embedding': token_embedding.view(batch_size, sequence_length, -1)
        }
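The convolution loop above is the usual character-CNN max-pool-over-time pattern. A stripped-down sketch with made-up filter widths and sizes:

import torch

n_tokens, char_dim, max_chars = 32, 16, 50
char_embedding = torch.randn(n_tokens, char_dim, max_chars)   # (batch * seq_len, embed_dim, chars)

convs = []
for width, n_filters in [(1, 32), (2, 32), (3, 64)]:
    conv = torch.nn.Conv1d(char_dim, n_filters, kernel_size=width)
    convolved, _ = torch.max(conv(char_embedding), dim=-1)    # max over character positions
    convs.append(torch.relu(convolved))

token_embedding = torch.cat(convs, dim=-1)   # one (32 + 32 + 64)-dim vector per token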
Example 5
    def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Compute context insensitive token embeddings for ELMo representations.
        # Parameters
        inputs : `torch.Tensor`
            Shape `(batch_size, sequence_length, 50)` of character ids representing the
            current batch.
        # Returns
        Dict with keys:
        `'token_embedding'` : `torch.Tensor`
            Shape `(batch_size, sequence_length + 2, embedding_dim)` tensor with context
            insensitive token representations.
        `'mask'` : `torch.BoolTensor`
            Shape `(batch_size, sequence_length + 2)` boolean tensor with sequence mask.
        """
        # Add BOS/EOS
        mask = (inputs > 0).sum(dim=-1) > 0
        character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
            inputs, mask, self._beginning_of_sentence_characters,
            self._end_of_sentence_characters)

        # the character id embedding
        max_chars_per_token = self._options["char_cnn"][
            "max_characters_per_token"]
        # (batch_size * sequence_length, max_chars_per_token, embed_dim)
        character_embedding = torch.nn.functional.embedding(
            character_ids_with_bos_eos.view(-1, max_chars_per_token),
            self._char_embedding_weights)

        # run convolutions
        cnn_options = self._options["char_cnn"]
        if cnn_options["activation"] == "tanh":
            activation = torch.tanh
        elif cnn_options["activation"] == "relu":
            activation = torch.nn.functional.relu
        else:
            raise ConfigurationError("Unknown activation")

        # (batch_size * sequence_length, embed_dim, max_chars_per_token)
        character_embedding = torch.transpose(character_embedding, 1, 2)
        convs = []
        for i in range(len(self._convolutions)):
            conv = getattr(self, "char_conv_{}".format(i))
            convolved = conv(character_embedding)
            # (batch_size * sequence_length, n_filters for this width)
            convolved, _ = torch.max(convolved, dim=-1)
            convolved = activation(convolved)
            convs.append(convolved)

        # (batch_size * sequence_length, n_filters)
        token_embedding = torch.cat(convs, dim=-1)

        # apply the highway layers (batch_size * sequence_length, n_filters)
        token_embedding = self._highways(token_embedding)

        # final projection  (batch_size * sequence_length, embedding_dim)
        token_embedding = self._projection(token_embedding)

        # reshape to (batch_size, sequence_length, embedding_dim)
        batch_size, sequence_length, _ = character_ids_with_bos_eos.size()

        return {
            "mask": mask_with_bos_eos,
            "token_embedding": token_embedding.view(batch_size, sequence_length, -1),
        }
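The only functional change from the previous example is that the mask stays boolean. The token-level mask is derived from the character ids like this (character ids are illustrative; 0 marks an empty slot):

import torch

character_ids = torch.tensor([[[259, 98, 260, 0], [0, 0, 0, 0]]])   # one real token, one padded slot
mask = (character_ids > 0).sum(dim=-1) > 0                          # tensor([[ True, False]])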
Example 6
    def forward(
        self,  # pylint: disable=arguments-differ
        inputs: torch.Tensor,
        word_inputs: torch.Tensor = None
    ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs: ``torch.Tensor``, required.
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
        word_inputs : ``torch.Tensor``, optional (default = None).
            If you passed a cached vocab, you can in addition pass a tensor of shape
            ``(batch_size, timesteps)``, which represents word ids that have been pre-cached.

        Returns
        -------
        Dict with keys:

        ``'activations'``: ``List[torch.Tensor]``
            A list of activations at each layer of the network, each of shape
            ``(batch_size, timesteps + 2, embedding_dim)``
        ``'mask'``:  ``torch.Tensor``
            Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.

        Note that the output tensors all include additional special begin and end of sequence
        markers.
        """
        if self._word_embedding is not None and word_inputs is not None:
            try:
                mask_without_bos_eos = (word_inputs > 0).long()
                # The character cnn part is cached - just look it up.
                embedded_inputs = self._word_embedding(
                    word_inputs)  # type: ignore
                # shape (batch_size, timesteps + 2, embedding_dim)
                type_representation, mask = add_sentence_boundary_token_ids(
                    embedded_inputs, mask_without_bos_eos, self._bos_embedding,
                    self._eos_embedding)
            except RuntimeError:
                # Back off to running the character convolutions,
                # as we might not have the words in the cache.
                token_embedding = self._token_embedder(inputs)
                mask = token_embedding['mask']
                type_representation = token_embedding['token_embedding']
        else:
            token_embedding = self._token_embedder(inputs)
            mask = token_embedding['mask']
            type_representation = token_embedding['token_embedding']
        lstm_outputs = self._elmo_lstm(type_representation, mask)

        # Prepare the output.  The first layer is duplicated.
        # Because of minor differences in how masking is applied depending
        # on whether the char cnn layers are cached, we'll be defensive and
        # multiply by the mask here. It's not strictly necessary, as the
        # mask passed on is correct, but the values in the padded areas
        # of the char cnn representations can change.
        output_tensors = [
            torch.cat([type_representation, type_representation], dim=-1) *
            mask.float().unsqueeze(-1)
        ]
        for layer_activations in torch.chunk(lstm_outputs,
                                             lstm_outputs.size(0),
                                             dim=0):
            output_tensors.append(layer_activations.squeeze(0))

        return {
            'activations': output_tensors,
            'mask': mask,
        }
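The per-layer split at the end is just `torch.chunk` along the stacked-layer dimension. A tiny sketch with made-up sizes:

import torch

lstm_outputs = torch.randn(2, 4, 7, 1024)   # (num_layers, batch, timesteps + 2, dim)
per_layer = [layer.squeeze(0)
             for layer in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0)]
# per_layer is a list of two (4, 7, 1024) tensors, one per LSTM layer.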
Example 7
    def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]:  # pylint: disable=arguments-differ
        """
        Compute context insensitive token embeddings for ELMo representations.

        Parameters
        ----------
        inputs: ``torch.Tensor``
            Shape ``(batch_size, sequence_length, 50)`` of character ids representing the
            current batch.

        Returns
        -------
        Dict with keys:
        ``'token_embedding'``: ``torch.Tensor``
            Shape ``(batch_size, sequence_length + 2, embedding_dim)`` tensor with context
            insensitive token representations.
        ``'mask'``:  ``torch.Tensor``
            Shape ``(batch_size, sequence_length + 2)`` long tensor with sequence mask.
        """
        # Add BOS/EOS
        mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
        character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
            inputs, mask, self._beginning_of_sentence_characters,
            self._end_of_sentence_characters)

        # the character id embedding
        max_chars_per_token = self._options['char_cnn'][
            'max_characters_per_token']
        # (batch_size * sequence_length, max_chars_per_token, embed_dim)
        character_embedding = torch.nn.functional.embedding(
            character_ids_with_bos_eos.view(-1, max_chars_per_token),
            self._char_embedding_weights)

        # run convolutions
        cnn_options = self._options['char_cnn']
        if cnn_options['activation'] == 'tanh':
            activation = torch.tanh
        elif cnn_options['activation'] == 'relu':
            activation = torch.nn.functional.relu
        else:
            raise ConfigurationError("Unknown activation")

        # (batch_size * sequence_length, embed_dim, max_chars_per_token)
        character_embedding = torch.transpose(character_embedding, 1, 2)
        convs = []
        for i in range(len(self._convolutions)):
            conv = getattr(self, 'char_conv_{}'.format(i))
            convolved = conv(character_embedding)
            # (batch_size * sequence_length, n_filters for this width)
            convolved, _ = torch.max(convolved, dim=-1)
            convolved = activation(convolved)
            convs.append(convolved)

        # (batch_size * sequence_length, n_filters)
        token_embedding = torch.cat(convs, dim=-1)

        # apply the highway layers (batch_size * sequence_length, n_filters)
        token_embedding = self._highways(token_embedding)

        # final projection  (batch_size * sequence_length, embedding_dim)
        token_embedding = self._projection(token_embedding)

        # reshape to (batch_size, sequence_length, embedding_dim)
        batch_size, sequence_length, _ = character_ids_with_bos_eos.size()

        return {
            'mask': mask_with_bos_eos,
            'token_embedding': token_embedding.view(batch_size, sequence_length, -1)
        }
Example 8
    def forward(  # type: ignore
        self,
        image_features: torch.Tensor,
        penultimate_features: torch.Tensor,
        caption_tokens: Optional[torch.Tensor] = None,
        fsm: torch.Tensor = None,
        num_constraints: torch.Tensor = None,
    ) -> Dict[str, torch.Tensor]:
        r"""
        Given bottom-up image features, maximize the likelihood of paired captions during
        training. During evaluation, decode captions given image features using beam search.

        Parameters
        ----------
        image_features: torch.Tensor
            A tensor of shape ``(batch_size, num_boxes * image_feature_size)``. ``num_boxes`` for
            each instance in a batch might be different. Instances with fewer boxes are padded
            with zeros up to ``num_boxes``.
        penultimate_features: torch.Tensor
            A tensor of shape ``(batch_size, channel, height, width)``, extracted from the
            penultimate layer of the saliency attentive model.
        caption_tokens: torch.Tensor, optional (default = None)
            A tensor of shape ``(batch_size, max_caption_length)`` of tokenized captions. This
            tensor does not contain ``@@BOUNDARY@@`` tokens yet. Captions are not provided
            during evaluation.
        fsm: torch.Tensor, optional (default = None)
            A tensor of shape ``(batch_size, num_states, num_states, vocab_size)``: finite state
            machines per instance, represented as adjacency matrices. For a particular instance,
            ``[_, s1, s2, v] = 1`` indicates a transition from state ``s1`` to ``s2`` on decoding
            token ``v`` (a constraint). Would be ``None`` for regular beam search decoding.
        num_constraints: torch.Tensor, optional (default = None)
            A tensor of shape ``(batch_size, )`` containing the total number of given constraints
            for CBS. Would be ``None`` for regular beam search decoding.

        Returns
        -------
        Dict[str, torch.Tensor]
            Decoded captions and/or per-instance cross entropy loss, dict with keys either
            ``{"predictions"}`` or ``{"loss"}``.
        """
        batch_size, num_boxes, image_feature_size = image_features.size()
        batch_size, channel, height, width = penultimate_features.size()
        penultimate_features = penultimate_features.view(
            batch_size, channel, -1).transpose(1, 2).contiguous()

        # Initialize states at zero-th timestep.
        states = None

        if self.training and caption_tokens is not None:
            # Add "@@BOUNDARY@@" tokens to caption sequences.
            caption_tokens, _ = add_sentence_boundary_token_ids(
                caption_tokens,
                (caption_tokens != self._pad_index),
                self._boundary_index,
                self._boundary_index,
            )
            batch_size, max_caption_length = caption_tokens.size()

            # shape: (batch_size, max_caption_length)
            tokens_mask = caption_tokens != self._pad_index

            # The last input from the target is either padding or the boundary token.
            # Either way, we don't have to process it.
            num_decoding_steps = max_caption_length - 1

            step_logits: List[torch.Tensor] = []
            step_logits_saliency: List[torch.Tensor] = []
            for timestep in range(num_decoding_steps):
                # shape: (batch_size,)
                input_tokens = caption_tokens[:, timestep]

                # shape: (batch_size, num_classes)
                output_logits, output_logits_saliency, states = \
                    self._decode_step(image_features, penultimate_features, input_tokens, states)

                # list of tensors, shape: (batch_size, 1, vocab_size)
                step_logits.append(output_logits.unsqueeze(1))

                # list of tensors, shape: (batch_size, 1, vocab_size)
                step_logits_saliency.append(
                    output_logits_saliency.unsqueeze(1))

            # shape: (batch_size, num_decoding_steps)
            logits = torch.cat(step_logits, 1)

            # shape: (batch_size, num_decoding_steps)
            logits_saliency = torch.cat(step_logits_saliency, 1)

            # Skip first time-step from targets for calculating loss.
            output_dict = {
                "loss":
                self._get_loss(
                    logits,
                    caption_tokens[:, 1:].contiguous(),
                    tokens_mask[:, 1:].contiguous(),
                ),
                "loss_saliency":
                self._get_loss(
                    logits_saliency,
                    caption_tokens[:, 1:].contiguous(),
                    tokens_mask[:, 1:].contiguous(),
                )
            }
        else:
            num_decoding_steps = self._max_caption_length
            start_predictions = image_features.new_full(
                (batch_size, ), self._boundary_index).long()

            # Add image features as a default argument to match callable signature acceptable by
            # beam search class (previous predictions and states only).
            beam_decode_step = functools.partial(self._decode_step,
                                                 image_features,
                                                 penultimate_features)

            # shape (all_top_k_predictions): (batch_size, net_beam_size, num_decoding_steps)
            # shape (log_probabilities): (batch_size, net_beam_size)
            if self._use_cbs:
                all_top_k_predictions, log_probabilities = self._beam_search.search(
                    start_predictions, states, beam_decode_step, fsm)
                if self._is_val:
                    best_beam = select_best_beam_with_constraints(
                        all_top_k_predictions,
                        log_probabilities,
                        num_constraints,
                        self._min_constraints_to_satisfy,
                    )
                else:
                    valid_beam, valid_log_probabilities, valid_num = select_valid_beam_with_constraints(
                        all_top_k_predictions,
                        log_probabilities,
                        num_constraints,
                        self._min_constraints_to_satisfy,
                    )
            else:
                all_top_k_predictions, log_probabilities = self._beam_search.search(
                    start_predictions, states, beam_decode_step)
                best_beam = select_best_beam(all_top_k_predictions,
                                             log_probabilities)

            if self._is_val:
                # shape: (batch_size, num_decoding_steps)
                output_dict = {"predictions": best_beam}
            else:
                # shape: (batch_size * beam_size, num_decoding_steps)
                output_dict = {
                    "predictions": valid_beam,
                    "log_probabilities": valid_log_probabilities,
                    "valid_numbers": valid_num
                }
        return output_dict
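The `functools.partial` call pre-binds the per-batch features so the remaining callable only takes `(previous_predictions, states)`, which is what a beam search step function typically expects. A sketch with a dummy step function:

import functools
import torch

def decode_step(image_features, previous_predictions, states):
    # Dummy step: the real model runs its attention LSTM here and returns vocabulary logits.
    return torch.randn(previous_predictions.size(0), 10), states

image_features = torch.randn(4, 36, 2048)                 # (batch_size, num_boxes, feature_size)
beam_decode_step = functools.partial(decode_step, image_features)

logits, states = beam_decode_step(torch.zeros(4, dtype=torch.long), None)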
Example 9
    def forward(self,  # pylint: disable=arguments-differ
                inputs: torch.Tensor,
                word_inputs: torch.Tensor = None) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs: ``torch.Tensor``, required.
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
        word_inputs : ``torch.Tensor``, optional (default = None).
            If you passed a cached vocab, you can in addition pass a tensor of shape
            ``(batch_size, timesteps)``, which represents word ids that have been pre-cached.

        Returns
        -------
        Dict with keys:

        ``'activations'``: ``List[torch.Tensor]``
            A list of activations at each layer of the network, each of shape
            ``(batch_size, timesteps + 2, embedding_dim)``
        ``'mask'``:  ``torch.Tensor``
            Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.

        Note that the output tensors all include additional special begin and end of sequence
        markers.
        """
        if self._word_embedding is not None and word_inputs is not None:
            try:
                mask_without_bos_eos = (word_inputs > 0).long()
                # The character cnn part is cached - just look it up.
                embedded_inputs = self._word_embedding(word_inputs) # type: ignore
                # shape (batch_size, timesteps + 2, embedding_dim)
                type_representation, mask = add_sentence_boundary_token_ids(
                        embedded_inputs,
                        mask_without_bos_eos,
                        self._bos_embedding,
                        self._eos_embedding
                )
            except RuntimeError:
                # Back off to running the character convolutions,
                # as we might not have the words in the cache.
                token_embedding = self._token_embedder(inputs)
                mask = token_embedding['mask']
                type_representation = token_embedding['token_embedding']
        else:
            token_embedding = self._token_embedder(inputs)
            mask = token_embedding['mask']
            type_representation = token_embedding['token_embedding']
        lstm_outputs = self._elmo_lstm(type_representation, mask)

        # Prepare the output.  The first layer is duplicated.
        # Because of minor differences in how masking is applied depending
        # on whether the char cnn layers are cached, we'll be defensive and
        # multiply by the mask here. It's not strictly necessary, as the
        # mask passed on is correct, but the values in the padded areas
        # of the char cnn representations can change.
        output_tensors = [
                torch.cat([type_representation, type_representation], dim=-1) * mask.float().unsqueeze(-1)
        ]
        for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0):
            output_tensors.append(layer_activations.squeeze(0))

        return {
                'activations': output_tensors,
                'mask': mask,
        }
Example 10
def test_fast_elmo_with_allennlp_do_layer_norm():
    fast = FastElmo(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
        num_output_representations=1,
        scalar_mix_parameters=[1.0, 1.0, 1.0],
        do_layer_norm=True,
    )

    allennlp = Elmo(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
        num_output_representations=1,
        dropout=0.0,
        scalar_mix_parameters=[1.0, 1.0, 1.0],
        do_layer_norm=True,
    )

    sentences = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    character_ids = _sentences_to_ids(sentences)

    fast_out = fast(character_ids)
    allennlp_out = allennlp(character_ids)

    # Since we don't include the BOS/EOS reprs during layer normalization,
    # the result will be different from AllenNLP's implementation.
    np.testing.assert_raises(
        AssertionError,
        np.testing.assert_array_almost_equal,
        fast_out['elmo_representations'][0],
        allennlp_out['elmo_representations'][0],
    )

    # We can add BOS/EOS to the inputs manually
    _beginning_of_sentence_characters = torch.from_numpy(
        np.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
    _end_of_sentence_characters = torch.from_numpy(
        np.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)

    mask = ((character_ids > 0).long().sum(dim=-1) > 0).long()
    character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        character_ids,
        mask,
        _beginning_of_sentence_characters,
        _end_of_sentence_characters,
    )

    # And disable the mock BOS/EOS actions in FastElmo.
    fast.exec_managed_lstm_bos_eos = False
    fast_out_2 = fast(character_ids_with_bos_eos)
    fast_mixed_repr_2, _ = remove_sentence_boundaries(
        fast_out_2['elmo_representations'][0],
        fast_out_2['mask'],
    )

    allennlp_out_2 = allennlp(character_ids)

    np.testing.assert_array_almost_equal(
        fast_mixed_repr_2,
        allennlp_out_2['elmo_representations'][0],
    )
Example 11
def test_elmo_lstm_factory_simple():
    allennlp_elmo_bilm = _ElmoBiLm(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    )

    embedder = ElmoCharacterEncoderFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create()
    fwd_lstm, bwd_lstm = ElmoLstmFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create(enable_forward=True, enable_backward=True)

    sentences_1 = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    sentences_2 = [
        ["This", "is", "a", "sentence"],
        ["Here", "'s", "one"],
        ["Another", "one"],
    ]

    # Internal states should be updated.
    for sentences in [sentences_1, sentences_2] * 10:
        # `(2, 7, 50)`
        character_ids = _sentences_to_ids(sentences)

        # AllenNLP.
        allennlp_out = allennlp_elmo_bilm(character_ids)

        # Ours.
        inputs = character_ids
        _beginning_of_sentence_characters = torch.from_numpy(
            np.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
        _end_of_sentence_characters = torch.from_numpy(
            np.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)
        # Add BOS/EOS
        mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
        character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
            inputs,
            mask,
            _beginning_of_sentence_characters,
            _end_of_sentence_characters,
        )
        # Pack input.
        lengths = mask_with_bos_eos.sum(dim=-1)
        inputs = pack_padded_sequence(character_ids_with_bos_eos,
                                      lengths,
                                      batch_first=True)
        char_repr = embedder(inputs.data)
        fwd_lstm_hiddens, _ = fwd_lstm(char_repr, inputs.batch_sizes)
        bwd_lstm_hiddens, _ = bwd_lstm(char_repr, inputs.batch_sizes)
        lstm_hiddens = [
            torch.cat([fwd, bwd], dim=-1)
            for fwd, bwd in zip(fwd_lstm_hiddens, bwd_lstm_hiddens)
        ]
        # Unpack output.
        char_repr = _unpack(char_repr, inputs.batch_sizes)
        duplicated_char_repr = torch.cat(
            [char_repr, char_repr],
            dim=-1,
        ) * mask_with_bos_eos.float().unsqueeze(-1)
        lstm_hiddens = [_unpack(hx, inputs.batch_sizes) for hx in lstm_hiddens]

        # TODO: Investigate the numerical stability issue.
        # np.testing.assert_array_almost_equal(
        #         duplicated_char_repr.data.numpy(),
        #         allennlp_out['activations'][0].data.numpy(),
        # )
        # np.testing.assert_array_almost_equal(
        #         lstm_hiddens[0].data.numpy(),
        #         allennlp_out['activations'][1].data.numpy(),
        # )
        np.testing.assert_array_almost_equal(
            lstm_hiddens[1].data.numpy(),
            allennlp_out['activations'][2].data.numpy(),
        )
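The packing step above takes its lengths from the boundary-extended mask. A minimal sketch; the `.cpu()` call and `enforce_sorted=False` are precautions for recent PyTorch versions, not part of the test above:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

padded = torch.randn(2, 5, 8)                             # (batch, timesteps, dim)
mask = torch.tensor([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])
lengths = mask.sum(dim=-1)

packed = pack_padded_sequence(padded, lengths.cpu(), batch_first=True, enforce_sorted=False)
# packed.data is (sum of lengths, dim); packed.batch_sizes drives the step-wise LSTM above.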
Example 12
    def forward(self,
                image_features,
                caption_tokens: Optional[torch.Tensor] = None,
                device: Optional[int] = 0):
        states = None
        batch_size, num_boxes, image_feature_size = image_features.size()
        if self.training and caption_tokens is not None:
            # Add "@@BOUNDARY@@" tokens to caption sequences.
            caption_tokens, _ = add_sentence_boundary_token_ids(
                caption_tokens,
                (caption_tokens != self._pad_index),
                self._boundary_index,
                self._boundary_index,
            )

            batch_size, max_caption_length = caption_tokens.size()

            # shape: (batch_size, max_caption_length)
            tokens_mask = caption_tokens != self._pad_index

            # The last input from the target is either padding or the boundary token.
            # Either way, we don't have to process it.
            num_decoding_steps = max_caption_length - 1

            image_features = self.adapt_image_features(image_features)
            image_features = self.encoder(image_features)
            image_features = self.adapt_again(image_features)

            step_logits: List[torch.Tensor] = []
            for timestep in range(num_decoding_steps):
                # shape: (batch_size,)
                input_tokens = caption_tokens[:, timestep]

                # shape: (batch_size, num_classes)
                output_logits, states = self._decode_step(
                    image_features, input_tokens, states)

                # list of tensors, shape: (batch_size, 1, vocab_size)
                step_logits.append(output_logits.unsqueeze(1))

            # shape: (batch_size, num_decoding_steps)
            logits = torch.cat(step_logits, 1)

            # Skip first time-step from targets for calculating loss.
            output_dict = {
                "loss":
                self._get_loss(logits, caption_tokens[:, 1:].contiguous(),
                               tokens_mask[:, 1:].contiguous())
            }

        else:
            num_decoding_steps = self.max_caption_length

            image_features = self.adapt_image_features(image_features)
            image_features = self.encoder(image_features)
            image_features = self.adapt_again(image_features)

            start_predictions = image_features.new_full(
                (batch_size, ), self._boundary_index).long()

            # Add image features as a default argument to match callable signature acceptable by
            # beam search class (previous predictions and states only).
            beam_decode_step = functools.partial(self._decode_step,
                                                 image_features)

            # shape (all_top_k_predictions): (batch_size, net_beam_size, num_decoding_steps)
            # shape (log_probabilities): (batch_size, net_beam_size)
            # if self._use_cbs:
            #     all_top_k_predictions, log_probabilities = self._beam_search.search(
            #         start_predictions, states, beam_decode_step, fsm
            #     )
            #     best_beam = select_best_beam_with_constraints(
            #         all_top_k_predictions,
            #         log_probabilities,
            #         num_constraints,
            #         self._min_constraints_to_satisfy,
            #     )
            # else:

            all_top_k_predictions, log_probabilities = self._beam_search.search(
                start_predictions, states, beam_decode_step)
            best_beam = select_best_beam(all_top_k_predictions,
                                         log_probabilities)

            # shape: (batch_size, num_decoding_steps)
            output_dict = {"predictions": best_beam}

        return output_dict
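The boundary handling in these captioning models is always the same shape bookkeeping: one `@@BOUNDARY@@` index serves as both BOS and EOS, the decoder consumes positions `0..T-2`, and it is scored against positions `1..T-1`. A sketch with arbitrary index values:

import torch
from allennlp.nn.util import add_sentence_boundary_token_ids

pad_index, boundary_index = 0, 3
caption_tokens = torch.tensor([[7, 8, 9], [5, 6, 0]])

caption_tokens, _ = add_sentence_boundary_token_ids(
    caption_tokens, caption_tokens != pad_index, boundary_index, boundary_index
)
tokens_mask = caption_tokens != pad_index

decoder_inputs = caption_tokens[:, :-1]   # fed one step at a time during teacher forcing
targets = caption_tokens[:, 1:]           # what the step logits are scored against
target_mask = tokens_mask[:, 1:]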
Example 13
    def forward(
        self,
        image_ids: torch.Tensor,
        image_features: torch.Tensor,
        caption_tokens: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:
        r"""
        Given bottom-up image features, maximize the likelihood of paired captions during
        training. During evaluation, decode captions given image features using beam search.

        Parameters
        ----------
        image_ids: torch.Tensor
            Ids of the images in the current batch, used to look up per-image state transition
            matrices for constrained beam search decoding.
        image_features: torch.Tensor
            A tensor of shape ``(batch_size, num_boxes * image_feature_size)``. ``num_boxes`` for
            each instance in a batch might be different. Instances with fewer boxes are padded
            with zeros up to ``num_boxes``.
        caption_tokens: torch.Tensor, optional (default = None)
            A tensor of shape ``(batch_size, max_caption_length)`` of tokenized captions. This
            tensor does not contain ``@@BOUNDARY@@`` tokens yet. Captions are not provided
            during evaluation.

        Returns
        -------
        Dict[str, torch.Tensor]
            Decoded captions and/or per-instance cross entropy loss, dict with keys either
            ``{"predictions"}`` or ``{"loss"}``.
        """

        # shape: (batch_size, num_boxes * image_feature_size) for adaptive features.
        # shape: (batch_size, num_boxes, image_feature_size) for fixed features.
        batch_size = image_features.size(0)

        # shape: (batch_size, num_boxes, image_feature_size)
        image_features = image_features.view(batch_size, -1,
                                             self.image_feature_size)

        # Initialize states at zero-th timestep.
        states = None

        if self.training and caption_tokens is not None:
            # Add "@@BOUNDARY@@" tokens to caption sequences.
            caption_tokens, _ = add_sentence_boundary_token_ids(
                caption_tokens,
                (caption_tokens != self._pad_index),
                self._boundary_index,
                self._boundary_index,
            )

            _, max_caption_length = caption_tokens.size()

            # shape: (batch_size, max_caption_length)
            tokens_mask = caption_tokens != self._pad_index

            # The last input from the target is either padding or the boundary token.
            # Either way, we don't have to process it.
            num_decoding_steps = max_caption_length - 1

            step_logits: List[torch.Tensor] = []
            for timestep in range(num_decoding_steps):
                # shape: (batch_size,)
                input_tokens = caption_tokens[:, timestep]

                # shape: (batch_size, num_classes)
                output_logits, states = self._decode_step(
                    image_features, input_tokens, states)

                # list of tensors, shape: (batch_size, 1, vocab_size)
                step_logits.append(output_logits.unsqueeze(1))

            # shape: (batch_size, num_decoding_steps)
            logits = torch.cat(step_logits, 1)

            # Skip first time-step from targets for calculating loss.
            output_dict = {
                "loss":
                self._get_loss(logits, caption_tokens[:, 1:].contiguous(),
                               tokens_mask[:, 1:].contiguous())
            }
        else:
            num_decoding_steps = self._max_caption_length

            start_predictions = image_features.new_full(
                (batch_size, ), fill_value=self._boundary_index).long()

            state_transform_list = []
            state_size_list = []
            for image_id in image_ids:
                state_transform, state_size = self._fc.get_state_matrix(
                    image_id)
                state_transform_list.append(state_transform)
                state_size_list.append(state_size)
            max_state = max(state_size_list)
            state_transform_list = [
                s[:, :max_state, :max_state, :] for s in state_transform_list
            ]
            state_transform = torch.from_numpy(
                np.concatenate(state_transform_list,
                               axis=0)).to(start_predictions.device)
            # shape (log_probabilities): (batch_size, beam_size)
            best_predictions = self._beam_search.search(
                self._decode_step, image_features, start_predictions, states,
                state_transform, image_ids)

            output_dict = {"predictions": best_predictions}

        return output_dict
Example 14
    def forward(self, program_tokens: torch.Tensor):
        r"""
        Given tokenized program sequences padded up to the maximum length, predict the sequence at
        the next time-step and calculate the cross entropy loss of this prediction.

        Parameters
        ----------
        program_tokens: torch.Tensor
            Tokenized program sequences padded with zeros up to the maximum length.
            shape: (batch_size, max_sequence_length)

        Returns
        -------
        Dict[str, torch.Tensor]
            Predictions of next time-step and cross entropy loss (by teacher forcing), a dict
            with structure::

                {
                    "predictions": torch.Tensor (shape: (batch_size, max_sequence_length - 1)),
                    "loss": torch.Tensor (shape: (batch_size, ))
                }
        """

        batch_size = program_tokens.size(0)

        # Add "@start@" and "@end@" tokens to program sequences.
        program_tokens, _ = add_sentence_boundary_token_ids(
            program_tokens, (program_tokens != self._pad_index),
            self._start_index, self._end_index)
        program_tokens_mask = (program_tokens != self._pad_index).long()
        # Excluding @start@ token, because this is used with output of LSTM (next time-step).
        program_lengths = program_tokens_mask[:, 1:].sum(-1).float()

        # shape: (batch_size, max_sequence_length, input_size)
        embedded_programs = self._embedder({"programs": program_tokens})

        # shape: (batch_size, max_sequence_length, hidden_size)
        encoded_programs = self._encoder(embedded_programs,
                                         program_tokens_mask)

        # shape: (batch_size, max_sequence_length, input_size)
        output_projection = self._projection_layer(encoded_programs)
        # shape: (batch_size, max_sequence_length, vocab_size)
        output_logits = self._output_layer(output_projection)

        output_class_probabilities = F.softmax(output_logits, dim=-1)
        # Don't sample @start@, @@PADDING@@ and @@UNKNOWN@@.
        output_class_probabilities[:, :, self._start_index] = 0
        output_class_probabilities[:, :, self._pad_index] = 0
        output_class_probabilities[:, :, self._unk_index] = 0

        batch_predictions: List[torch.Tensor] = []
        for batch_index in range(output_class_probabilities.size(0)):
            # Perform ancestral sampling instead of greedy decoding.
            # shape: (batch_size, )
            batch_index_predictions = torch.multinomial(
                output_class_probabilities[batch_index], 1).squeeze()
            batch_predictions.append(batch_index_predictions)

        # shape: (batch_size, max_sequence_length)
        predictions = torch.stack(batch_predictions, 0)

        # Multiply with mask just to be sure.
        predictions = predictions[:, :-1] * program_tokens_mask[:, 1:]

        # shape: (batch_size, )
        sequence_cross_entropy = sequence_cross_entropy_with_logits(
            output_logits[:, :-1, :].contiguous(),
            program_tokens[:, 1:].contiguous(),
            weights=program_tokens_mask[:, 1:],
            average=None,
        )
        # Record metrics aggregated over current batch during evaluation.
        if not self.training:
            self._log2_perplexity(sequence_cross_entropy.mean().item())
        return {"predictions": predictions, "loss": sequence_cross_entropy}