Example #1
    def batch_to_embeddings(self, batch):
        u"""
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        Returns
        -------
            A tuple of tensors, the first representing activations (batch_size, 3, num_timesteps, 1024) and
        the second a mask (batch_size, num_timesteps).
        """
        character_ids = batch_to_ids(batch)
        if self.cuda_device >= 0:
            character_ids = character_ids.cuda(device=self.cuda_device)

        bilm_output = self.elmo_bilm(character_ids)
        layer_activations = bilm_output[u'activations']
        mask_with_bos_eos = bilm_output[u'mask']

        # without_bos_eos is a 3 element list of (activation, mask) tensor pairs,
        # each with size (batch_size, num_timesteps, dim) and (batch_size, num_timesteps)
        # respectively.
        without_bos_eos = [
            remove_sentence_boundaries(layer, mask_with_bos_eos)
            for layer in layer_activations
        ]
        # Converts a list of pairs (activation, mask) tensors to a single tensor of activations.
        activations = torch.cat(
            [ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1)
        # The mask is the same for each ELMo vector, so just take the first.
        mask = without_bos_eos[0][1]

        return activations, mask
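A minimal usage sketch for the method above (not part of the original snippet). It assumes the method is hosted on allennlp's ElmoEmbedder (allennlp.commands.elmo); the options and weight file paths are placeholders.

from allennlp.commands.elmo import ElmoEmbedder

# Placeholder paths; ElmoEmbedder also falls back to default pretrained files.
embedder = ElmoEmbedder(options_file="options.json", weight_file="weights.hdf5", cuda_device=-1)
batch = [["The", "cat", "sat", "."], ["Hello", "world"]]
activations, mask = embedder.batch_to_embeddings(batch)
# activations: (2, 3, max_timesteps, 1024), one slice per biLM layer.
# mask: (2, max_timesteps), 1 over real tokens and 0 over padding.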
Example #2
    def embed_batch(
            self, batch: List[List[str]],
            batch_metas) -> List[Tuple[Dict[str, Any], torch.Tensor]]:
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.
        batch_metas : ``List[Dict]``, required
            A list of metadata:
              sentence_id: str
              verb_indices: List[int]
        Returns
        -------
        A list of ``(verb_id, embedding)`` pairs, where ``verb_id`` identifies the
        sentence id and verb index and ``embedding`` is the top-layer ELMo vector
        for that verb token.
        """
        character_ids = batch_to_ids(batch)
        if self.cuda_device >= 0:
            character_ids = character_ids.cuda(device=self.cuda_device)

        bilm_output = self.elmo_bilm(character_ids)
        layer_activations_with_bos_eos = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        top_activations = remove_sentence_boundaries(
            layer_activations_with_bos_eos[2], mask_with_bos_eos)[0]

        results = []
        for i, meta in enumerate(batch_metas):
            sid = meta["sentence_id"]
            for vi in meta["verb_indices"]:
                verb_id = {"sentenceId": sid, "verbIndex": vi}
                results.append((verb_id, top_activations[i, vi]))

        return results
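A hedged usage sketch for embed_batch (not from the original source); embedder stands for an instance of the class that defines the method above, which is not shown here.

batch = [["John", "gave", "Mary", "a", "book", "."]]
batch_metas = [{"sentence_id": "sent-0", "verb_indices": [1]}]
results = embedder.embed_batch(batch, batch_metas)
verb_id, embedding = results[0]
# verb_id == {"sentenceId": "sent-0", "verbIndex": 1}
# embedding is the top-layer ELMo vector for "gave", shape (1024,)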
Example #3
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.fixtures_path, 'vocab_test.txt'), 'r') as fin:
            tokens = fin.read().strip().split('\n')

        indexer = ELMoTokenCharactersIndexer()
        indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens]
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            sentences.append(
                    indexer.pad_token_sequence(
                            indices[(k * 50):((k + 1) * 50)], desired_num_tokens=50, padding_lengths={}
                    )
            )
        batch = Variable(torch.from_numpy(numpy.array(sentences)))

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
                elmo_token_embedder_output['token_embedding'],
                elmo_token_embedder_output['mask']
        )[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.fixtures_path, 'elmo_token_embeddings.hdf5')
        with h5py.File(embedding_file, 'r') as fin:
            expected_embeddings = fin['embedding'][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)
Example #4
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.elmo_fixtures_path, 'vocab_test.txt'), 'r') as fin:
            tokens = fin.read().strip().split('\n')

        indexer = ELMoTokenCharactersIndexer()
        indices = [indexer.token_to_indices(Token(token), Vocabulary()) for token in tokens]
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            sentences.append(
                    indexer.pad_token_sequence(
                            indices[(k * 50):((k + 1) * 50)], desired_num_tokens=50, padding_lengths={}
                    )
            )
        batch = torch.from_numpy(numpy.array(sentences))

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
                elmo_token_embedder_output['token_embedding'],
                elmo_token_embedder_output['mask']
        )[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.elmo_fixtures_path, 'elmo_token_embeddings.hdf5')
        with h5py.File(embedding_file, 'r') as fin:
            expected_embeddings = fin['embedding'][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)
Example #5
    def batch_to_embeddings(self, batch: List[List[str]]) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Parameters
        ----------
        batch : ``List[List[str]]``, required
            A list of tokenized sentences.

        Returns
        -------
            A tuple of tensors, the first representing activations (batch_size, 3, num_timesteps, 1024) and
        the second a mask (batch_size, num_timesteps).
        """
        character_ids = batch_to_ids(batch)
        if self.cuda_device >= 0:
            character_ids = character_ids.cuda(device=self.cuda_device)

        bilm_output = self.elmo_bilm(character_ids)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        # without_bos_eos is a 3 element list of (activation, mask) tensor pairs,
        # each with size (batch_size, num_timesteps, dim) and (batch_size, num_timesteps)
        # respectively.
        without_bos_eos = [remove_sentence_boundaries(layer, mask_with_bos_eos)
                           for layer in layer_activations]
        # Converts a list of pairs (activation, mask) tensors to a single tensor of activations.
        activations = torch.cat([ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1)
        # The mask is the same for each ELMo vector, so just take the first.
        mask = without_bos_eos[0][1]

        return activations, mask
Example #6
    def create_cached_cnn_embeddings(self, tokens: List[str]) -> None:
        """
        Given a list of tokens, this method precomputes word representations
        by running just the character convolutions and highway layers of elmo,
        essentially creating uncontextual word vectors. On subsequent forward passes,
        the word ids are looked up from an embedding, rather than being computed on
        the fly via the CNN encoder.

        This function sets 3 attributes:

        _word_embedding : ``torch.Tensor``
            The word embedding for each word in the tokens passed to this method.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        timesteps = 32
        batch_size = 32
        chunked_tokens = lazy_groups_of(iter(tokens), timesteps)

        all_embeddings = []
        device = get_device_of(next(self.parameters()))
        for batch in lazy_groups_of(chunked_tokens, batch_size):
            # Shape (batch_size, timesteps, 50)
            batched_tensor = batch_to_ids(batch)
            # NOTE: This device check is for when a user calls this method having
            # already placed the model on a device. If this is called in the
            # constructor, it will probably happen on the CPU. This isn't too bad,
            # because it's only a few convolutions and will likely be very fast.
            if device >= 0:
                batched_tensor = batched_tensor.cuda(device)
            output = self._token_embedder(batched_tensor)
            token_embedding = output["token_embedding"]
            mask = output["mask"]
            token_embedding, _ = remove_sentence_boundaries(token_embedding, mask)
            all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1)))
        full_embedding = torch.cat(all_embeddings, 0)

        # We might have some trailing embeddings from padding in the batch, so
        # we clip the embedding and lookup to the right size.
        full_embedding = full_embedding[:len(tokens), :]
        embedding = full_embedding[2:len(tokens), :]
        vocab_size, embedding_dim = list(embedding.size())

        from allennlp.modules.token_embedders import Embedding # type: ignore
        self._bos_embedding = full_embedding[0, :]
        self._eos_embedding = full_embedding[1, :]
        self._word_embedding = Embedding(vocab_size, # type: ignore
                                         embedding_dim,
                                         weight=embedding.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)
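A short sketch of how this precomputation might be invoked, assuming the method belongs to allennlp's _ElmoBiLm (the attributes it sets match that class); the file paths are placeholders.

from allennlp.modules.elmo import _ElmoBiLm

bilm = _ElmoBiLm(options_file="options.json", weight_file="weights.hdf5")
bilm.create_cached_cnn_embeddings(["the", "cat", "sat", "on", "a", "mat"])
# After the call, bilm._word_embedding holds one row per token passed in,
# and bilm._bos_embedding / bilm._eos_embedding hold the boundary vectors.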
Example #7
    def create_cached_cnn_embeddings(self, tokens):
        u"""
        Given a list of tokens, this method precomputes word representations
        by running just the character convolutions and highway layers of elmo,
        essentially creating uncontextual word vectors. On subsequent forward passes,
        the word ids are looked up from an embedding, rather than being computed on
        the fly via the CNN encoder.

        This function sets 3 attributes:

        _word_embedding : ``torch.Tensor``
            The word embedding for each word in the tokens passed to this method.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        timesteps = 32
        batch_size = 32
        chunked_tokens = lazy_groups_of(iter(tokens), timesteps)

        all_embeddings = []
        device = get_device_of(next(self.parameters()))
        for batch in lazy_groups_of(chunked_tokens, batch_size):
            # Shape (batch_size, timesteps, 50)
            batched_tensor = batch_to_ids(batch)
            # NOTE: This device check is for when a user calls this method having
            # already placed the model on a device. If this is called in the
            # constructor, it will probably happen on the CPU. This isn't too bad,
            # because it's only a few convolutions and will likely be very fast.
            if device >= 0:
                batched_tensor = batched_tensor.cuda(device)
            output = self._token_embedder(batched_tensor)
            token_embedding = output[u"token_embedding"]
            mask = output[u"mask"]
            token_embedding, _ = remove_sentence_boundaries(token_embedding, mask)
            all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1)))
        full_embedding = torch.cat(all_embeddings, 0)

        # We might have some trailing embeddings from padding in the batch, so
        # we clip the embedding and lookup to the right size.
        full_embedding = full_embedding[:len(tokens), :]
        embedding = full_embedding[2:len(tokens), :]
        vocab_size, embedding_dim = list(embedding.size())

        from allennlp.modules.token_embedders import Embedding # type: ignore
        self._bos_embedding = full_embedding[0, :]
        self._eos_embedding = full_embedding[1, :]
        self._word_embedding = Embedding(vocab_size, # type: ignore
                                         embedding_dim,
                                         weight=embedding.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)
Example #8
    def forward(
            self,
            inputs: torch.Tensor,
            word_inputs: torch.Tensor = None,
            prevs=None,
            rev_prevs=None
    ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        # reshape the input if needed
        original_shape = inputs.size()
        if len(original_shape) > 3:
            timesteps, num_characters = original_shape[-2:]
            reshaped_inputs = inputs.view(-1, timesteps, num_characters)
        else:
            reshaped_inputs = inputs

        # Remember the original word_inputs shape so the output can be reshaped back below.
        original_word_size = word_inputs.size() if word_inputs is not None else None
        reshaped_word_inputs = word_inputs

        # run the biLM
        bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs,
                                      prevs, rev_prevs)
        layer_activations = bilm_output["activations"]
        mask_with_bos_eos = bilm_output["mask"]

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, "scalar_mix_{}".format(i))
            representation_with_bos_eos = scalar_mix(layer_activations,
                                                     mask_with_bos_eos)
            if self._keep_sentence_boundaries:
                processed_representation = representation_with_bos_eos
                processed_mask = mask_with_bos_eos
            else:
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                processed_representation = representation_without_bos_eos
                processed_mask = mask_without_bos_eos
            representations.append(self._dropout(processed_representation))

        # reshape if necessary
        if word_inputs is not None and len(original_word_size) > 2:
            mask = processed_mask.view(original_word_size)
            elmo_representations = [
                representation.view(original_word_size + (-1, ))
                for representation in representations
            ]
        elif len(original_shape) > 3:
            mask = processed_mask.view(original_shape[:-1])
            elmo_representations = [
                representation.view(original_shape[:-1] + (-1, ))
                for representation in representations
            ]
        else:
            mask = processed_mask
            elmo_representations = representations

        return {"elmo_representations": elmo_representations, "mask": mask}
Example #9
    def forward(
        self,  # pylint: disable=arguments-differ
        inputs: torch.Tensor
    ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs : ``torch.autograd.Variable``
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
            We also accept tensors with additional optional dimensions:
            ``(batch_size, dim0, dim1, ..., dimn, timesteps, 50)``

        Returns
        -------
        Dict with keys:
        ``'elmo_representations'``: ``List[torch.autograd.Variable]``
            A ``num_output_representations`` list of ELMo representations for the input sequence.
            Each representation is shape ``(batch_size, timesteps, embedding_dim)``
        ``'mask'``:  ``torch.autograd.Variable``
            Shape ``(batch_size, timesteps)`` long tensor with sequence mask.
        """
        # reshape the input if needed
        original_shape = inputs.size()
        timesteps, num_characters = original_shape[-2:]
        if len(original_shape) > 3:
            reshaped_inputs = inputs.view(-1, timesteps, num_characters)
        else:
            reshaped_inputs = inputs

        # run the biLM
        bilm_output = self._elmo_lstm(reshaped_inputs)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
            representation_with_bos_eos = scalar_mix(layer_activations,
                                                     mask_with_bos_eos)
            representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                representation_with_bos_eos, mask_with_bos_eos)
            representations.append(
                self._dropout(representation_without_bos_eos))

        # reshape if necessary
        if len(original_shape) > 3:
            mask = mask_without_bos_eos.view(original_shape[:-1])
            elmo_representations = [
                representation.view(original_shape[:-1] + (-1, ))
                for representation in representations
            ]
        else:
            mask = mask_without_bos_eos
            elmo_representations = representations

        return {'elmo_representations': elmo_representations, 'mask': mask}
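A minimal end-to-end sketch of this forward pass (not from the original source), assuming the class is allennlp's Elmo wrapper; the file paths are placeholders.

from allennlp.modules.elmo import Elmo, batch_to_ids

elmo = Elmo("options.json", "weights.hdf5", num_output_representations=2, dropout=0.0)
character_ids = batch_to_ids([["First", "sentence", "."], ["Another", "one"]])
output = elmo(character_ids)
# output['elmo_representations'] is a list of 2 tensors, each of shape (2, 3, 1024)
# (batch of 2 sentences, longest one 3 tokens); output['mask'] has shape (2, 3).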
Example #10
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        elmo_bilm = _ElmoBiLm(options_file, weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        dataset = Dataset(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        for i, batch in enumerate(iterator(dataset, num_epochs=1, shuffle=False)):
            batch_tensor = Variable(torch.from_numpy(batch['elmo']['character_ids']))
            lm_embeddings = elmo_bilm(batch_tensor)
            top_layer_embeddings, mask = remove_sentence_boundaries(
                    lm_embeddings['activations'][2],
                    lm_embeddings['mask']
            )

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                    len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                        numpy.allclose(
                                top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                                expected_top_layer[k],
                                atol=1.0e-6
                        )
                )
Example #11
    def forward(self,    # pylint: disable=arguments-differ
                inputs: torch.Tensor) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs : ``torch.autograd.Variable``
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
            We also accept tensors with additional optional dimensions:
            ``(batch_size, dim0, dim1, ..., dimn, timesteps, 50)``

        Returns
        -------
        Dict with keys:
        ``'elmo_representations'``: ``List[torch.autograd.Variable]``
            A ``num_output_representations`` list of ELMo representations for the input sequence.
            Each representation is shape ``(batch_size, timesteps, embedding_dim)``
        ``'mask'``:  ``torch.autograd.Variable``
            Shape ``(batch_size, timesteps)`` long tensor with sequence mask.
        """
        # reshape the input if needed
        original_shape = inputs.size()
        timesteps, num_characters = original_shape[-2:]
        if len(original_shape) > 3:
            reshaped_inputs = inputs.view(-1, timesteps, num_characters)
        else:
            reshaped_inputs = inputs

        # run the biLM
        bilm_output = self._elmo_lstm(reshaped_inputs)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
            representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos)
            representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos
            )
            representations.append(self._dropout(representation_without_bos_eos))

        # reshape if necessary
        if len(original_shape) > 3:
            mask = mask_without_bos_eos.view(original_shape[:-1])
            elmo_representations = [representation.view(original_shape[:-1] + (-1, ))
                                    for representation in representations]
        else:
            mask = mask_without_bos_eos
            elmo_representations = representations

        return {'elmo_representations': elmo_representations, 'mask': mask}
Example #12
    def forward(self, inputs):
        """
        Parameters
        ----------
        inputs: ``torch.Tensor``, required.
        Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
        word_inputs : ``torch.Tensor``, required.
            If you passed a cached vocab, you can in addition pass a tensor of shape
            ``(batch_size, timesteps)``, which represent word ids which have been pre-cached.
        Returns
        -------
        Dict with keys:
        """
        # reshape the input if needed
        original_shape = inputs.size()
        if len(original_shape) > 3:
            timesteps, num_characters = original_shape[-2:]
            reshaped_inputs = inputs.view(-1, timesteps, num_characters)
        else:
            reshaped_inputs = inputs

        # run the biLM
        bilm_output = self._elmo_lstm(reshaped_inputs, None)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        word_embedding_and_hiddens = torch.cat(layer_activations, dim=-1)
        assert self.output_dim * len(
            layer_activations) == word_embedding_and_hiddens.size(-1)

        # compute the elmo representations
        representation_with_bos_eos = word_embedding_and_hiddens
        representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
            representation_with_bos_eos, mask_with_bos_eos)
        processed_representation = representation_without_bos_eos
        processed_mask = mask_without_bos_eos

        # reshape if necessary
        out_representations = []
        out_representations.append(
            processed_representation[:, :, :self.output_dim])
        if len(layer_activations) > 1:
            for i in range(1, len(layer_activations)):
                out_representations.append(
                    processed_representation[:, :, self.output_dim *
                                             i:self.output_dim * (i + 1)])

        return {
            'elmo_representations': out_representations,
            'mask': processed_mask
        }
Example #13
    def forward(
        self,  # type: ignore
        tokens: torch.Tensor,
    ) -> torch.Tensor:
        """
        # Parameters

        tokens : `torch.Tensor`
            Shape `(batch_size, timesteps, ...)` of token ids representing the current batch.
            These must have been produced using the same indexer the LM was trained on.

        # Returns

        The bidirectional language model representations for the input sequence, shape
        `(batch_size, timesteps, embedding_dim)`
        """

        if self._bos_indices is not None:
            num_wrapping_dims = max(tokens.dim() - 2, 0)
            mask = get_text_field_mask({"": {
                "": tokens
            }},
                                       num_wrapping_dims=num_wrapping_dims)
            tokens, mask = add_sentence_boundary_token_ids(
                tokens, mask, self._bos_indices, self._eos_indices)

        source = {self._token_name: {"token_characters": tokens}}
        result_dict = self._lm(source)

        # shape (batch_size, timesteps, embedding_size)
        noncontextual_token_embeddings = result_dict[
            "noncontextual_token_embeddings"]
        contextual_embeddings = result_dict["lm_embeddings"]

        # Typically the non-contextual embeddings are smaller than the contextualized embeddings.
        # Since we're averaging all the layers we need to make their dimensions match. Simply
        # repeating the non-contextual embeddings is a crude, but effective, way to do this.
        duplicated_character_embeddings = torch.cat(
            [noncontextual_token_embeddings] *
            self._character_embedding_duplication_count, -1)
        averaged_embeddings = self._scalar_mix(
            [duplicated_character_embeddings] + contextual_embeddings)

        # Add dropout
        averaged_embeddings = self._dropout(averaged_embeddings)
        if self._remove_bos_eos:
            averaged_embeddings, _ = remove_sentence_boundaries(
                averaged_embeddings, result_dict["mask"])

        return averaged_embeddings
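The comment about repeating the non-contextual embeddings is easiest to see as a toy shape check. This illustration uses hypothetical dimensions (a 128-dim character embedding and 512-dim LM layers), not values from the source.

import torch

noncontextual = torch.randn(2, 7, 128)   # (batch, timesteps, char-CNN dim)
duplication_count = 4                    # 128 * 4 == 512, the assumed LM layer size
duplicated = torch.cat([noncontextual] * duplication_count, dim=-1)
print(duplicated.shape)                  # torch.Size([2, 7, 512]); now mixable with the LM layers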
Example #14
    def embed(self, tokens):
        character_ids = batch_to_ids([tokens])
        bilm_out = self._elmo_bilm(character_ids)

        wo_bos_eos = [
            remove_sentence_boundaries(layer, bilm_out['mask'])
            for layer in bilm_out['activations']
        ]

        emb = torch.cat([ele[0][:, None] for ele in wo_bos_eos], dim=1)
        sep = int(wo_bos_eos[0][1][0, :].sum())

        emb = emb[0, :, :sep, :].detach()[self._level]
        return emb
Example #15
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        iterator.index_with(vocab)
        for i, batch in enumerate(iterator(instances, num_epochs=1, shuffle=False)):
            lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                    lm_embeddings['activations'][2],
                    lm_embeddings['mask']
            )

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                    len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                        numpy.allclose(
                                top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                                expected_top_layer[k],
                                atol=1.0e-6
                        )
                )
Example #16
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {"character_ids": indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        iterator.index_with(vocab)
        for i, batch in enumerate(
                iterator(instances, num_epochs=1, shuffle=False)):
            lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["tokens"])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings["activations"][2], lm_embeddings["mask"])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                self.assertTrue(
                    numpy.allclose(
                        top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                        expected_top_layer[k],
                        atol=1.0e-6,
                    ))
Example #17
def character_ids_to_embeddings(character_ids, elmo_bilm, device):
    # returns (batch_size, 3, num_times, 1024) embeddings and (batch_size, num_times) mask
    if device >= 0:
        character_ids = character_ids.cuda(device=device)
    bilm_output = elmo_bilm(character_ids)
    layer_activations = bilm_output['activations']
    mask_with_bos_eos = bilm_output['mask']
    without_bos_eos = [
        remove_sentence_boundaries(layer, mask_with_bos_eos)
        for layer in layer_activations
    ]
    # without_bos_eos is a 3 element list of (batch_size, num_times, dim) arrays
    activations = torch.cat([ele[0].unsqueeze(1) for ele in without_bos_eos],
                            dim=1)
    mask = without_bos_eos[0][1]
    return activations, mask
Example #18
    def forward(self,  # type: ignore
                inputs: torch.Tensor) -> torch.Tensor:
        """
        Parameters
        ----------
        inputs: ``torch.Tensor``
            Shape ``(batch_size, timesteps, ...)`` of token ids representing the current batch.
            These must have been produced using the same indexer the LM was trained on.

        Returns
        -------
        The bidirectional language model representations for the input sequence, shape
        ``(batch_size, timesteps, embedding_dim)``
        """
        # pylint: disable=arguments-differ
        if self._bos_indices is not None:
            mask = get_text_field_mask({"": inputs})
            inputs, mask = add_sentence_boundary_token_ids(
                    inputs, mask, self._bos_indices, self._eos_indices
            )

        source = {self._token_name: inputs}
        result_dict = self._lm(source)

        # shape (batch_size, timesteps, embedding_size)
        noncontextual_token_embeddings = result_dict["noncontextual_token_embeddings"]
        contextual_embeddings = result_dict["lm_embeddings"]

        # Typically the non-contextual embeddings are smaller than the contextualized embeddings.
        # Since we're averaging all the layers we need to make their dimensions match. Simply
        # repeating the non-contextual embeddings is a crude, but effective, way to do this.
        duplicated_character_embeddings = torch.cat(
                [noncontextual_token_embeddings] * self._character_embedding_duplication_count, -1
        )
        averaged_embeddings = self._scalar_mix(
                [duplicated_character_embeddings] + contextual_embeddings
        )

        # Add dropout
        averaged_embeddings = self._dropout(averaged_embeddings)
        if self._remove_bos_eos:
            averaged_embeddings, _ = remove_sentence_boundaries(
                    averaged_embeddings, result_dict["mask"]
            )

        return averaged_embeddings
Example #19
    def forward(self, inputs, elmo_lstm_output):
        texts = self.inputs_to_texts(inputs)
        instances = self.texts_to_instances(texts)
        dataset = Batch(instances)
        dataset.index_instances(self.model.vocab)
        cp_inputs = util.move_to_device(dataset.as_tensor_dict(),
                                        self.cuda_device)
        words, pos_tags = cp_inputs['tokens'], cp_inputs['pos_tags']

        mask = get_text_field_mask(words)

        layer_activations = elmo_lstm_output['activations']
        mask_with_bos_eos = elmo_lstm_output['mask']

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
            representation_with_bos_eos = scalar_mix(layer_activations,
                                                     mask_with_bos_eos)
            if self._keep_sentence_boundaries:
                processed_representation = representation_with_bos_eos
                processed_mask = mask_with_bos_eos
            else:
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                processed_representation = representation_without_bos_eos
                processed_mask = mask_without_bos_eos
            representations.append(self._dropout(processed_representation))

        # reshape if necessary
        mask = processed_mask
        elmo_representations = representations

        embedded_text_input = elmo_representations[0]

        if pos_tags is not None and self.model.pos_tag_embedding is not None:
            embedded_pos_tags = self.model.pos_tag_embedding(pos_tags)
            embedded_text_input = torch.cat(
                [embedded_text_input, embedded_pos_tags], -1)
        elif self.model.pos_tag_embedding is not None:
            raise ConfigurationError(
                "Model uses a POS embedding, but no POS tags were passed.")

        encoded_text = self.model.encoder(embedded_text_input, mask)
        return encoded_text.detach()
Example #20
    def forward(self, inputs):
        bilm_output = self.elmo(inputs)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        representations = []
        for representation in layer_activations:
            r, mask = remove_sentence_boundaries(representation,
                                                 mask_with_bos_eos)
            representations.append(r)

        repr_forward, repr_backward = representations[-1].split(
            self.output_dim_half, dim=2)
        logits_forward = self.decoder(repr_forward)
        logits_backward = self.decoder(repr_backward)

        return logits_forward, logits_backward, representations, mask
Example #21
    def test_remove_sentence_boundaries(self):
        tensor = Variable(torch.from_numpy(numpy.random.rand(3, 5, 7)))
        mask = Variable(
            torch.from_numpy(
                numpy.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1,
                                                                0]]))).long()
        new_tensor, new_mask = util.remove_sentence_boundaries(tensor, mask)

        expected_new_tensor = Variable(torch.zeros(3, 3, 7))
        expected_new_tensor[0, 0, :] = tensor[0, 1, :]
        expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :]
        expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :]
        assert_array_almost_equal(new_tensor.data.numpy(),
                                  expected_new_tensor.data.numpy())

        expected_new_mask = Variable(
            torch.from_numpy(numpy.array([[1, 0, 0], [1, 1, 1], [1, 1,
                                                                 0]]))).long()
        assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all()
Example #22
    def test_remove_sentence_boundaries(self):
        tensor = torch.from_numpy(numpy.random.rand(3, 5, 7))
        mask = torch.from_numpy(
            # The mask with two elements is to test the corner case
            # of an empty sequence, so here we are removing boundaries
            # from  "<S> </S>"
            numpy.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1,
                                                            0]])).long()
        new_tensor, new_mask = util.remove_sentence_boundaries(tensor, mask)

        expected_new_tensor = torch.zeros(3, 3, 7)
        expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :]
        expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :]
        assert_array_almost_equal(new_tensor.data.numpy(),
                                  expected_new_tensor.data.numpy())

        expected_new_mask = torch.from_numpy(
            numpy.array([[0, 0, 0], [1, 1, 1], [1, 1, 0]])).long()
        assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all()
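To make the contract being tested explicit, here is a small standalone sketch (not part of the test) of remove_sentence_boundaries: it drops the leading <S> position and the last unmasked (</S>) position of every sequence. Depending on the AllenNLP version, the returned mask is an integer or a boolean tensor.

import torch
from allennlp.nn.util import remove_sentence_boundaries

embeddings = torch.randn(1, 5, 4)              # positions: <S> w1 w2 w3 </S>
mask = torch.tensor([[1, 1, 1, 1, 1]]).long()
trimmed, new_mask = remove_sentence_boundaries(embeddings, mask)
# trimmed has shape (1, 3, 4) and holds only w1, w2, w3;
# new_mask has shape (1, 3) with all three positions marked valid.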
Example #23
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.elmo_fixtures_path, "vocab_test.txt"),
                  "r") as fin:
            words = fin.read().strip().split("\n")

        vocab = Vocabulary()
        indexer = ELMoTokenCharactersIndexer()
        tokens = [Token(word) for word in words]

        indices = indexer.tokens_to_indices(tokens, vocab)
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            char_indices = indices["elmo_tokens"][(k * 50):((k + 1) * 50)]
            sentences.append(
                indexer.as_padded_tensor_dict(
                    {"elmo_tokens": char_indices},
                    padding_lengths={"elmo_tokens": 50})["elmo_tokens"])
        batch = torch.stack(sentences)

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file,
                                                    self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
            elmo_token_embedder_output["token_embedding"],
            elmo_token_embedder_output["mask"])[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(
            -1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.elmo_fixtures_path,
                                      "elmo_token_embeddings.hdf5")
        with h5py.File(embedding_file, "r") as fin:
            expected_embeddings = fin["embedding"][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)],
                              expected_embeddings,
                              atol=1e-6)
Example #24
    def test_remove_sentence_boundaries(self):
        tensor = torch.from_numpy(numpy.random.rand(3, 5, 7))
        mask = torch.from_numpy(
                # The mask with two elements is to test the corner case
                # of an empty sequence, so here we are removing boundaries
                # from  "<S> </S>"
                numpy.array([[1, 1, 0, 0, 0],
                             [1, 1, 1, 1, 1],
                             [1, 1, 1, 1, 0]])).long()
        new_tensor, new_mask = util.remove_sentence_boundaries(tensor, mask)

        expected_new_tensor = torch.zeros(3, 3, 7)
        expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :]
        expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :]
        assert_array_almost_equal(new_tensor.data.numpy(), expected_new_tensor.data.numpy())

        expected_new_mask = torch.from_numpy(
                numpy.array([[0, 0, 0],
                             [1, 1, 1],
                             [1, 1, 0]])).long()
        assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all()
Example #25
    def __call__(self, docs: List[Document], infos: List[InfoPb],
                 input_fields: List[str], output_field: str,
                 max_tokens_count: int):
        from allennlp.modules.elmo import batch_to_ids
        from allennlp.nn.util import remove_sentence_boundaries
        batch = []
        for doc_num, doc in enumerate(docs):
            sample = " ".join(
                [getattr(doc, input_field) for input_field in input_fields])
            tokens = self.preprocess(sample)[:max_tokens_count]
            batch.append(tokens)
        character_ids = batch_to_ids(batch)
        if self.cuda_device >= 0:
            character_ids = character_ids.cuda(device=self.cuda_device)
        bilm_output = self.elmo_bilm(character_ids)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']
        without_bos_eos = [
            remove_sentence_boundaries(layer, mask_with_bos_eos)
            for layer in layer_activations
        ]
        embeddings = torch.cat(
            [pair[0].unsqueeze(1) for pair in without_bos_eos], dim=1)
        mask = without_bos_eos[0][1]
        for doc_num, info in enumerate(infos):
            length = int(mask[doc_num, :].sum())
            doc_embeddings = np.zeros((3, 0, 1024))
            if length != 0:
                doc_embeddings = embeddings[
                    doc_num, :, :length, :].detach().cpu().numpy()
            doc_embeddings = doc_embeddings.swapaxes(0, 1).reshape(
                doc_embeddings.shape[0], -1)
            mean_embeddings = doc_embeddings.mean(axis=0)
            max_embeddings = doc_embeddings.max(axis=0)
            final_embedding = np.concatenate((mean_embeddings, max_embeddings),
                                             axis=0)
            getattr(info, output_field).extend(final_embedding)
Example #26
    def forward(self, inputs, elmo_lstm_output):
        texts = self.inputs_to_texts(inputs)
        instances = self.texts_to_instances(texts)
        dataset = Batch(instances)
        dataset.index_instances(self.model.vocab)
        cp_inputs = util.move_to_device(dataset.as_tensor_dict(),
                                        self.cuda_device)
        tokens = cp_inputs['tokens']
        mask = get_text_field_mask(tokens)

        layer_activations = elmo_lstm_output['activations']
        mask_with_bos_eos = elmo_lstm_output['mask']

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
            representation_with_bos_eos = scalar_mix(layer_activations,
                                                     mask_with_bos_eos)
            if self._keep_sentence_boundaries:
                processed_representation = representation_with_bos_eos
                processed_mask = mask_with_bos_eos
            else:
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                processed_representation = representation_without_bos_eos
                processed_mask = mask_without_bos_eos
            representations.append(self._dropout(processed_representation))

        # reshape if necessary
        mask = processed_mask
        elmo_representations = representations

        embedded_text_input = elmo_representations[0]

        encoded_text = self.model.encoder(embedded_text_input, mask)
        return encoded_text.detach()
Example #27
    def forward(
        self,  # pylint: disable=arguments-differ
        inputs: torch.Tensor
    ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs: ``torch.autograd.Variable``
            Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.

        Returns
        -------
        Dict with keys:

        ``'elmo_representations'``: ``List[torch.autograd.Variable]``
            A ``num_output_representations`` list of ELMo representations for the input sequence.
            Each representation is shape ``(batch_size, timesteps, embedding_dim)``
        ``'mask'``:  ``torch.autograd.Variable``
            Shape ``(batch_size, timesteps)`` long tensor with sequence mask.
        """
        bilm_output = self._elmo_lstm(inputs)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        elmo_representations = []
        for scalar_mix in self._scalar_mixes:
            representation_with_bos_eos = scalar_mix.forward(
                layer_activations, mask_with_bos_eos)
            representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                representation_with_bos_eos, mask_with_bos_eos)
            elmo_representations.append(representation_without_bos_eos)

        return {
            'elmo_representations': elmo_representations,
            'mask': mask_without_bos_eos
        }
Example #28
    def forward(self,
                x1,
                x1_c,
                x1_f,
                x1_pos,
                x1_ner,
                x1_mask,
                x2_full,
                x2_c,
                x2_full_mask,
                x3=None):
        """Inputs:
        x1 = document word indices             [batch * len_d]
        x1_c = document char indices           [batch * len_d * len_w] or [1]
        x1_f = document word features indices  [batch * q_num * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_c = question char indices           [(batch * q_num) * len_q * len_w]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        x3 = answer word indices [batch * q_num * len_a]
        """

        # precomputing ELMo is only for context (to speedup computation)
        if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt[
                'batch_size']:  # precomputing ELMo is used
            if x1_c.dim() != 1:  # precomputation is needed
                precomputed_bilm_output = self.elmo._elmo_lstm(x1_c)
                self.precomputed_layer_activations = [
                    t.detach().cpu()
                    for t in precomputed_bilm_output['activations']
                ]
                self.precomputed_mask_with_bos_eos = precomputed_bilm_output[
                    'mask'].detach().cpu()
                self.precomputed_cnt = 0

            # get precomputed ELMo
            layer_activations = [
                t[x1.size(0) * self.precomputed_cnt:x1.size(0) *
                  (self.precomputed_cnt + 1), :, :]
                for t in self.precomputed_layer_activations
            ]
            mask_with_bos_eos = self.precomputed_mask_with_bos_eos[
                x1.size(0) * self.precomputed_cnt:x1.size(0) *
                (self.precomputed_cnt + 1), :]
            if x1.is_cuda:
                layer_activations = [t.cuda() for t in layer_activations]
                mask_with_bos_eos = mask_with_bos_eos.cuda()

            representations = []
            for i in range(len(self.elmo._scalar_mixes)):
                scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i))
                representation_with_bos_eos = scalar_mix(
                    layer_activations, mask_with_bos_eos)
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                representations.append(
                    self.elmo._dropout(representation_without_bos_eos))

            x1_elmo = representations[0][:, :x1.size(1), :]
            self.precomputed_cnt += 1

            precomputed_elmo = True
        else:
            precomputed_elmo = False
        """
        x1_full = document word indices        [batch * q_num * len_d]
        x1_full_mask = document padding mask   [batch * q_num * len_d]
        """
        x1_full = x1.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1),
                                         x1.size(1)).contiguous()
        x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0),
                                                   x2_full.size(1),
                                                   x1.size(1)).contiguous()

        drnn_input_list, qrnn_input_list = [], []

        x2 = x2_full.view(-1, x2_full.size(-1))
        x2_mask = x2_full_mask.view(-1, x2_full.size(-1))

        if self.opt['use_wemb']:
            # Word embedding for both document and question
            emb = self.embedding if self.training else self.eval_embed
            x1_emb = emb(x1)
            x2_emb = emb(x2)
            # Dropout on embeddings
            if self.opt['dropout_emb'] > 0:
                x1_emb = layers.dropout(x1_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)
                x2_emb = layers.dropout(x2_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)

            drnn_input_list.append(x1_emb)
            qrnn_input_list.append(x2_emb)

        if self.opt['CoVe_opt'] > 0:
            x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask)
            x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask)
            # Dropout on contexualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_cove_mid = layers.dropout(x1_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x1_cove_high = layers.dropout(x1_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)
                x2_cove_mid = layers.dropout(x2_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x2_cove_high = layers.dropout(x2_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)

            drnn_input_list.append(x1_cove_mid)
            qrnn_input_list.append(x2_cove_mid)

        if self.opt['use_elmo']:
            if not precomputed_elmo:
                x1_elmo = self.elmo(x1_c)['elmo_representations'][
                    0]  #torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device)
            x2_elmo = self.elmo(x2_c)['elmo_representations'][
                0]  #torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device)
            # Dropout on contexualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_elmo = layers.dropout(x1_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)
                x2_elmo = layers.dropout(x2_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)

            drnn_input_list.append(x1_elmo)
            qrnn_input_list.append(x2_elmo)

        if self.opt['use_pos']:
            x1_pos_emb = self.pos_embedding(x1_pos)
            drnn_input_list.append(x1_pos_emb)

        if self.opt['use_ner']:
            x1_ner_emb = self.ner_embedding(x1_ner)
            drnn_input_list.append(x1_ner_emb)

        x1_input = torch.cat(drnn_input_list, dim=2)
        x2_input = torch.cat(qrnn_input_list, dim=2)

        def expansion_for_doc(z):
            return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1),
                                         z.size(2)).contiguous().view(
                                             -1, z.size(1), z.size(2))

        x1_emb_expand = expansion_for_doc(x1_emb)
        x1_cove_high_expand = expansion_for_doc(x1_cove_high)
        #x1_elmo_expand = expansion_for_doc(x1_elmo)
        if self.opt['no_em']:
            x1_f = x1_f[:, :, :, 3:]

        x1_input = torch.cat([
            expansion_for_doc(x1_input),
            x1_f.view(-1, x1_f.size(-2), x1_f.size(-1))
        ],
                             dim=2)
        x1_mask = x1_full_mask.view(-1, x1_full_mask.size(-1))

        if self.opt['do_prealign']:
            x1_atten = self.pre_align(x1_emb_expand, x2_emb, x2_mask)
            x1_input = torch.cat([x1_input, x1_atten], dim=2)

        # === Start processing the dialog ===
        # cur_h: [batch_size * max_qa_pair, context_length, hidden_state]
        # flow : fn (rnn)
        def flow_operation(cur_h, flow):
            flow_in = cur_h.transpose(0, 1).view(x1_full.size(2),
                                                 x1_full.size(0),
                                                 x1_full.size(1), -1)
            flow_in = flow_in.transpose(0, 2).contiguous().view(
                x1_full.size(1),
                x1_full.size(0) * x1_full.size(2), -1).transpose(0, 1)
            # [bsz * context_length, max_qa_pair, hidden_state]
            if self.opt['residual_step']:
                flow_out, residual_out = flow(flow_in)
            else:
                flow_out = flow(flow_in)
            # [bsz * context_length, max_qa_pair, flow_hidden_state_dim (hidden_state/2)]
            if self.opt['no_dialog_flow']:
                flow_out = flow_out * 0

            flow_out = flow_out.transpose(0, 1).view(x1_full.size(1),
                                                     x1_full.size(0),
                                                     x1_full.size(2),
                                                     -1).transpose(
                                                         0, 2).contiguous()
            # [bsz * max_qa_pair, context_length, flow_hidden_state_dim]
            flow_out = flow_out.view(x1_full.size(2),
                                     x1_full.size(0) * x1_full.size(1),
                                     -1).transpose(0, 1)
            if self.opt['residual_step']:
                residual_out = residual_out.transpose(0, 1).view(
                    x1_full.size(1), x1_full.size(0), x1_full.size(2),
                    -1).transpose(0, 2).contiguous()
                residual_out = residual_out.view(
                    x1_full.size(2),
                    x1_full.size(0) * x1_full.size(1), -1).transpose(0, 1)
                return flow_out, residual_out
            else:
                return flow_out, None

        # Encode document with RNN
        doc_abstr_ls = []

        doc_hiddens = self.doc_rnn1(x1_input, x1_mask)
        doc_hiddens_flow, residual_flow = flow_operation(
            doc_hiddens, self.dialog_flow1)

        doc_abstr_ls.append(doc_hiddens)
        #doc_hiddens_flow = torch.cat((doc_hiddens_flow, residual_flow), dim=2)

        doc_hiddens = self.doc_rnn2(
            torch.cat((doc_hiddens, doc_hiddens_flow, x1_cove_high_expand),
                      dim=2), x1_mask)
        doc_hiddens_flow, residual_flow = flow_operation(
            doc_hiddens, self.dialog_flow2)

        doc_abstr_ls.append(doc_hiddens)
        #doc_hiddens_flow = torch.cat((doc_hiddens_flow, residual_flow), dim=2)

        # Encode question with RNN
        _, que_abstr_ls = self.question_rnn(x2_input,
                                            x2_mask,
                                            return_list=True,
                                            additional_x=x2_cove_high)

        # Final question layer
        question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2),
                                              x2_mask)
        que_abstr_ls += [question_hiddens]

        # Main Attention Fusion Layer
        doc_info = self.deep_attn(
            [torch.cat([x1_emb_expand, x1_cove_high_expand], 2)], doc_abstr_ls,
            [torch.cat([x2_emb, x2_cove_high], 2)], que_abstr_ls, x1_mask,
            x2_mask)

        doc_hiddens = self.deep_attn_rnn(
            torch.cat((doc_info, doc_hiddens_flow), dim=2), x1_mask)
        doc_hiddens_flow, residual_flow = flow_operation(
            doc_hiddens, self.dialog_flow3)

        doc_abstr_ls += [doc_hiddens]
        #doc_hiddens_flow = torch.cat((doc_hiddens_flow, residual_flow), dim=2)
        #if self.opt['residual_step']:
        #doc_abstr_ls.append(residual_flow)

        # Self Attention Fusion Layer
        if self.opt['use_hoc']:
            # handle history of context, considering batch=1
            x1_att = torch.cat(doc_abstr_ls, 2)
            hoc = torch.cat(
                (doc_hiddens[0, :, :].unsqueeze(0), doc_hiddens[:-1, :, :]),
                dim=0)
            x1_att = torch.cat((x1_att, hoc), dim=2)
        else:
            x1_att = torch.cat(doc_abstr_ls, 2)

        if self.opt['self_attention_opt'] > 0:
            highlvl_self_attn_hiddens = self.highlvl_self_att(
                x1_att, x1_att, x1_mask, x3=doc_hiddens, drop_diagonal=True)
            doc_hiddens = self.high_lvl_crnn(
                torch.cat(
                    [doc_hiddens, highlvl_self_attn_hiddens, doc_hiddens_flow],
                    dim=2), x1_mask)
        elif self.opt['self_attention_opt'] == 0:
            doc_hiddens = self.high_lvl_crnn(
                torch.cat([doc_hiddens, doc_hiddens_flow], dim=2), x1_mask)

        doc_abstr_ls += [doc_hiddens]

        # Merge the question hidden vectors
        q_merge_weights = self.self_attn(question_hiddens, x2_mask)
        question_avg_hidden = layers.weighted_avg(question_hiddens,
                                                  q_merge_weights)
        if self.opt['do_hierarchical_query']:
            question_avg_hidden = self.hier_query_rnn(
                question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1))
            question_avg_hidden = question_avg_hidden.contiguous().view(
                -1, question_avg_hidden.size(-1))

        # Get Start, End span
        start_scores, end_scores = self.get_answer(doc_hiddens,
                                                   question_avg_hidden,
                                                   x1_mask)
        all_start_scores = start_scores.view_as(
            x1_full)  # batch x q_num x len_d
        all_end_scores = end_scores.view_as(x1_full)  # batch x q_num x len_d

        # Get whether there is an answer
        # doc_hiddens: [bsz * max_qa_pair, context_length, hidden_size]
        doc_avg_hidden = torch.cat(
            (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)),
            dim=1)
        class_scores = self.ans_type_prediction(doc_avg_hidden,
                                                question_avg_hidden)
        all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1),
                                             -1)  # batch x q_num x class_num
        all_class_scores = all_class_scores.squeeze(-1)  # when class_num = 1

        return all_start_scores, all_end_scores, all_class_scores
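
Before the answer-type classifier above, each document is summarized by max- and mean-pooling its hidden states over time and concatenating the two results. A minimal standalone sketch of that pooling step, with purely illustrative tensor sizes:

import torch

# Illustrative sizes: 6 (dialog, question) pairs, 40 context tokens, hidden size 250.
doc_hiddens = torch.randn(6, 40, 250)

# Max-pool and mean-pool over the time dimension, then concatenate,
# giving one (2 * hidden_size) summary vector per (dialog, question) pair.
doc_avg_hidden = torch.cat(
    (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)),
    dim=1)
assert doc_avg_hidden.shape == (6, 500)
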
    def forward(self, x1, x1_c, x1_f, x1_pos, x1_ner, x1_mask, x2_full, x2_c,
                x2_full_mask, node_id, node_mask, edge_id):
        """Inputs:
        x1 = document word indices             [batch * len_d]
        x1_c = document char indices           [batch * len_d * len_w] or [1]
        x1_f = document word features indices  [batch * q_num * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_c = question char indices           [(batch * q_num) * len_q * len_w]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        node_id     [batch * max_node_num * max_node_length]
        node_mask   [batch * max_node_num * max_node_length]
        edge_id     [batch * max_node_num * max_node_num ]
        """
        # print('node_id{}'.format(node_id))
        # print('x1{}'.format(x1))
        # precomputing ELMo is only for context (to speedup computation)
        # print('startembeddingweight{}'.format(self.embedding.weight))
        if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt[
                'batch_size']:  # precomputing ELMo is used
            if x1_c.dim() != 1:  # precomputation is needed
                precomputed_bilm_output = self.elmo._elmo_lstm(x1_c)
                self.precomputed_layer_activations = [
                    t.detach().cpu()
                    for t in precomputed_bilm_output['activations']
                ]
                self.precomputed_mask_with_bos_eos = precomputed_bilm_output[
                    'mask'].detach().cpu()
                self.precomputed_cnt = 0

            # get precomputed ELMo
            layer_activations = [
                t[x1.size(0) * self.precomputed_cnt:x1.size(0) *
                  (self.precomputed_cnt + 1), :, :]
                for t in self.precomputed_layer_activations
            ]
            mask_with_bos_eos = self.precomputed_mask_with_bos_eos[
                x1.size(0) * self.precomputed_cnt:x1.size(0) *
                (self.precomputed_cnt + 1), :]
            if x1.is_cuda:
                layer_activations = [t.cuda() for t in layer_activations]
                mask_with_bos_eos = mask_with_bos_eos.cuda()

            representations = []
            for i in range(len(self.elmo._scalar_mixes)):
                scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i))
                representation_with_bos_eos = scalar_mix(
                    layer_activations, mask_with_bos_eos)
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                representations.append(
                    self.elmo._dropout(representation_without_bos_eos))

            x1_elmo = representations[0][:, :x1.size(1), :]
            self.precomputed_cnt += 1

            precomputed_elmo = True
        else:
            precomputed_elmo = False
        """
        x1 = document word indices             [batch * len_d]
        x1_c = document char indices           [batch * len_d * len_w] or [1]
        x1_f = document word features indices  [batch * q_num * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_c = question char indices           [(batch * q_num) * len_q * len_w]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        
        x1_full = document word indices        [batch * q_num * len_d]
        x1_full_mask = document padding mask   [batch * q_num * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        node_id     [batch * max_node_num * max_node_length]
        node_mask   [batch * max_node_num * max_node_length]
        edge_id     [batch * max_node_num * max_node_num ]
        """
        # x1_full [batch * 1 * len_d] -> batch, q_num, len_d
        # x1_full_mask batch, q_num, len_d
        x1_full = x1.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1),
                                         x1.size(1)).contiguous()
        x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0),
                                                   x2_full.size(1),
                                                   x1.size(1)).contiguous()

        #[batch * max_node_num * max_node_length]-> batch, 1, max_node_num, max_node_length -> batch, q_num , max_node_num, max_node_length
        # node=node_id.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), node_id.size(1), node_id.size(2)).contiguous()
        # node_full_mask=node_mask.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), node_mask.size(1), node_mask.size(2)).contiguous()
        # edge=edge_id.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), edge_id.size(1), edge_id.size(2)).contiguous()
        node = node_id.view(-1, node_id.size(
            -1)).contiguous()  #(batch*max_node_num), max_node_length
        # print('node{}'.format(node))
        node_full_mask = node_mask.view(
            -1, node_mask.size(-1))  ##(batch*max_node_num), max_node_length

        drnn_input_list, qrnn_input_list, grnn_input_list = [], [], []
        # x2  [(batch * q_num) * len_q]
        # x2_mask [(batch * q_num) * len_q]
        x2 = x2_full.view(-1, x2_full.size(-1))  #((batch*q_num),len_q)
        x2_mask = x2_full_mask.view(-1,
                                    x2_full.size(-1))  #((batch*q_num),len_q)
        # print('embeddingweight{}'.format(self.embedding.weight))

        if self.opt['use_wemb']:
            # Word embedding for both document and question
            emb = self.embedding if self.training else self.eval_embed
            x1_emb = emb(x1)  #batch, len_d, emb_size
            x2_emb = emb(x2)  #(batch * q_num), q_length, emb_size
            node_emb = emb(
                node)  #(batch*max_node_num), max_node_length, emb_size

            # print('node_emb{}'.format(node_emb[0, 0, :]))
            # Dropout on embeddings
            if self.opt['dropout_emb'] > 0:
                x1_emb = layers.dropout(x1_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)
                x2_emb = layers.dropout(x2_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)
                node_emb = layers.dropout(node_emb,
                                          p=self.opt['dropout_emb'],
                                          training=self.training)
            drnn_input_list.append(x1_emb)
            qrnn_input_list.append(x2_emb)
            grnn_input_list.append(node_emb)

        if self.opt['CoVe_opt'] > 0:
            x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask)  #MTLSTM
            x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask)
            # node_cove_mid, node_cove_high = self.CoVe(node, node_full_mask)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_cove_mid = layers.dropout(x1_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x1_cove_high = layers.dropout(x1_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)
                x2_cove_mid = layers.dropout(x2_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x2_cove_high = layers.dropout(x2_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)
                # node_cove_mid = layers.dropout(node_cove_mid, p=self.opt['dropout_emb'], training=self.training)
                # node_cove_high = layers.dropout(node_cove_high, p=self.opt['dropout_emb'], training=self.training)

            drnn_input_list.append(x1_cove_mid)
            qrnn_input_list.append(x2_cove_mid)

        if self.opt['use_elmo']:
            if not precomputed_elmo:
                x1_elmo = self.elmo(x1_c)['elmo_representations'][
                    0]  #torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device)
            x2_elmo = self.elmo(x2_c)['elmo_representations'][
                0]  #torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_elmo = layers.dropout(x1_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)
                x2_elmo = layers.dropout(x2_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)

            drnn_input_list.append(x1_elmo)
            qrnn_input_list.append(x2_elmo)

        if self.opt['use_pos']:
            x1_pos_emb = self.pos_embedding(x1_pos)
            drnn_input_list.append(x1_pos_emb)

        if self.opt['use_ner']:
            x1_ner_emb = self.ner_embedding(x1_ner)
            drnn_input_list.append(x1_ner_emb)

        x1_input = torch.cat(drnn_input_list, dim=2)  # batch, len_d, ?
        x2_input = torch.cat(qrnn_input_list, dim=2)  # (batch*q_num), len_q, ?
        node_input = torch.cat(
            grnn_input_list,
            dim=2)  #(batch*max_node_num), max_node_length, emb_size

        def expansion_for_doc(z):
            #x2_full = question word indices        [batch * q_num * len_q]
            return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1),
                                         z.size(2)).contiguous().view(
                                             -1, z.size(1), z.size(2))

        # x1_emb batch, len_d, emb_size  x1_emb_expand
        x1_emb_expand = expansion_for_doc(
            x1_emb)  #(batch*q_num),len_d,emb_size
        x1_cove_high_expand = expansion_for_doc(x1_cove_high)

        #node_id     [batch * max_node_num * max_node_length]
        # node_emb  (batch*max_node_num), max_node_length, emb_size ->batch, max_node_num, max_node_length, emb_size
        #batch, 1,  max_node_num, max_node_length, emb_size->batch, q_num,  max_node_num, max_node_length, emb_size
        pre_node_emb = node_emb.view(node_id.size(0), node_id.size(1),
                                     node_emb.size(1),
                                     node_emb.size(2)).contiguous()
        pre_node_emb_expand = pre_node_emb.unsqueeze(1).expand(
            pre_node_emb.size(0), x2_full.size(1), pre_node_emb.size(1),
            pre_node_emb.size(2), pre_node_emb.size(3)).contiguous()
        #(batch*q_num), max_node_num, max_node_length, emb_size
        # this is the node embedding tensor actually used below
        node_emb_expand = pre_node_emb_expand.view(-1,
                                                   pre_node_emb_expand.size(2),
                                                   pre_node_emb_expand.size(3),
                                                   pre_node_emb_expand.size(4))

        #node_mask [batch * max_node_num * max_node_length]->[batch * q_num * max_node_num * max_node_length]
        pre_node_emb_expand_mask = node_mask.unsqueeze(1).expand(
            node_mask.size(0), x2_full.size(1), node_mask.size(1),
            node_mask.size(2)).contiguous()
        # (batch*q_num), max_node_num, max_node_length
        # this is the node mask tensor actually used below
        node_emb_expand_mask = pre_node_emb_expand_mask.view(
            -1, node_mask.size(1), node_mask.size(2))

        #edge_id     [batch * max_node_num * max_node_num ]  [batch *q_num * max_node_num * max_node_num ]
        pre_edge_expand = edge_id.unsqueeze(1).expand(
            edge_id.size(0), x2_full.size(1), edge_id.size(1),
            edge_id.size(2)).contiguous()
        edge_expand = pre_edge_expand.view(-1, edge_id.size(1),
                                           edge_id.size(2))

        #x1_elmo_expand = expansion_for_doc(x1_elmo)
        if self.opt['no_em']:
            x1_f = x1_f[:, :, :, 3:]

        x1_input = torch.cat([
            expansion_for_doc(x1_input),
            x1_f.view(-1, x1_f.size(-2), x1_f.size(-1))
        ],
                             dim=2)
        x1_mask = x1_full_mask.view(
            -1, x1_full_mask.size(-1))  # (batch*q_num, len_d)

        if self.opt['do_prealign']:
            x1_atten = self.pre_align(
                x1_emb_expand, x2_emb,
                x2_mask)  # batch*q_num * len_d * xq_input_size
            x1_input = torch.cat([x1_input, x1_atten], dim=2)

        # === Start processing the dialog ===
        # cur_h: [batch_size * max_qa_pair, context_length, hidden_state]
        # flow : fn (rnn)
        # x1_full: [batch_size, max_qa_pair, context_length]   x1_full = document word indices [batch * q_num * len_d]
        def flow_operation(cur_h, flow):
            # (len_d, batch*q_num, hidden_size) -> (len_d, batch, q_num, hidden_size)
            # Note: view() does not allocate new memory; the reshaped tensor shares storage with the original.
            # contiguous() allocates fresh memory and lays the data out in the transformed order.
            flow_in = cur_h.transpose(0, 1).view(x1_full.size(2),
                                                 x1_full.size(0),
                                                 x1_full.size(1), -1)
            # -> (q_num, batch, len_d, hidden_size) -> (q_num, batch*len_d, hidden)
            flow_in = flow_in.transpose(0, 2).contiguous().view(
                x1_full.size(1),
                x1_full.size(0) * x1_full.size(2), -1).transpose(0, 1)
            # [bsz * context_length, max_qa_pair, hidden_state]
            flow_out = flow(flow_in)
            # [bsz * context_length, max_qa_pair, flow_hidden_state_dim (hidden_state/2)]
            if self.opt['no_dialog_flow']:
                flow_out = flow_out * 0

            flow_out = flow_out.transpose(0, 1).view(x1_full.size(1),
                                                     x1_full.size(0),
                                                     x1_full.size(2),
                                                     -1).transpose(
                                                         0, 2).contiguous()
            flow_out = flow_out.view(x1_full.size(2),
                                     x1_full.size(0) * x1_full.size(1),
                                     -1).transpose(0, 1)
            # [bsz * max_qa_pair, context_length, flow_hidden_state_dim]
            return flow_out

        # Encode document with RNN
        doc_abstr_ls = []

        doc_hiddens = self.doc_rnn1(
            x1_input, x1_mask)  # (batch*q_num, len_d, hidden_size)
        graph_output = self.graph_encoder(
            doc_hiddens, x1_mask, node_emb_expand, node_emb_expand_mask,
            edge_expand)  # bsz', max_node_num,  hidden
        # doc_hiddens=graph_output
        doc_hiddens_flow = flow_operation(
            doc_hiddens,
            self.dialog_flow1)  # [bsz * q_num, len_d, flow_hidden_state_dim]

        doc_abstr_ls.append(graph_output)

        doc_hiddens = self.doc_rnn2(
            torch.cat((doc_hiddens, doc_hiddens_flow, x1_cove_high_expand),
                      dim=2),
            x1_mask)  #opt['hidden_size'] * 2 + flow_size + CoVe_size
        doc_hiddens_flow = flow_operation(
            doc_hiddens,
            self.dialog_flow2)  # [bsz * q_num, len_d, flow_hidden_state_dim]
        doc_abstr_ls.append(doc_hiddens)

        # Encode question with RNN  x2_input (batch*q_num),len_q,x2_input_size
        _, que_abstr_ls = self.question_rnn(
            x2_input, x2_mask, return_list=True,
            additional_x=x2_cove_high)  # [((batch*q_num), len_q, hidden_size)]

        # Final question layer
        question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2),
                                              x2_mask)
        que_abstr_ls += [question_hiddens]

        # Main Attention Fusion Layer
        # x1_emb_expand x1_cove_high_expand (batch*q_num),len_d,emb_size   doc_abstr_ls  [(batch*q_num, len_d, hidden_size),(batch*q_num, len_d, hidden_size)]
        # x2_em (batch*q_num),len_q,embsize)  que_abstr_ls [(batch*q_num), len_q, hidden_size),(batch*q_num), len_q, hidden_size)]
        doc_info = self.deep_attn(
            [torch.cat([x1_emb_expand, x1_cove_high_expand], 2)], doc_abstr_ls,
            [torch.cat([x2_emb, x2_cove_high], 2)], que_abstr_ls, x1_mask,
            x2_mask)  # # batch*q_num * len1 * x2_input_size
        #doc_hiddens_flow [bsz * q_num, len_d, flow_hidden_state_dim]
        doc_hiddens = self.deep_attn_rnn(
            torch.cat((doc_info, doc_hiddens_flow), dim=2), x1_mask)
        doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow3)

        doc_abstr_ls += [
            doc_hiddens
        ]  #[(batch*q_num, len_d, hidden_size),(batch*q_num, len_d, hidden_size),(batch*q_num, len_d, hidden_size)]

        # Self Attention Fusion Layer
        x1_att = torch.cat(doc_abstr_ls, 2)

        if self.opt['self_attention_opt'] > 0:
            #x1_att  c1,c2,c3
            highlvl_self_attn_hiddens = self.highlvl_self_att(
                x1_att, x1_att, x1_mask, x3=doc_hiddens, drop_diagonal=True)
            # highlvl_self_attn_hiddens: batch * len1 * x2_input_size; fully-aware attention with c3 (doc_hiddens) as the value
            doc_hiddens = self.high_lvl_crnn(
                torch.cat(
                    [doc_hiddens, highlvl_self_attn_hiddens, doc_hiddens_flow],
                    dim=2), x1_mask)
        elif self.opt['self_attention_opt'] == 0:
            doc_hiddens = self.high_lvl_crnn(
                torch.cat([doc_hiddens, doc_hiddens_flow], dim=2),
                x1_mask)  # (batch*q_num, seq_len, hidden_size)

        doc_abstr_ls += [doc_hiddens]

        # Merge the question hidden vectors
        q_merge_weights = self.self_attn(question_hiddens, x2_mask)
        question_avg_hidden = layers.weighted_avg(
            question_hiddens, q_merge_weights)  #(batch*q_num )* hidden
        if self.opt['do_hierarchical_query']:
            #x1_full: [batch_size, q_num context_length]
            #question_avg_hidden  (bsz, q_num, hidden)
            question_avg_hidden = self.hier_query_rnn(
                question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1))
            question_avg_hidden = question_avg_hidden.contiguous().view(
                -1, question_avg_hidden.size(-1))  # (batch*q_num ), hidden

        # Get Start, End span
        # question_avg_hidden (batch*q_num ), hidden   doc_hiddens doc_hiddens_flow [bsz * q_num, len_d, flow_hidden_state_dim]
        start_scores, end_scores = self.get_answer(doc_hiddens,
                                                   question_avg_hidden,
                                                   x1_mask)
        all_start_scores = start_scores.view_as(
            x1_full)  # batch x q_num x len_d
        all_end_scores = end_scores.view_as(x1_full)  # batch x q_num x len_d

        # Get whether there is an answer
        #doc_hiddens doc_hiddens_flow [bsz * q_num, len_d, flow_hidden_state_dim]
        # torch.max returns two tensors: the max values along the given dim and their indices
        doc_avg_hidden = torch.cat(
            (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)),
            dim=1)  #batch, hidden
        class_scores = self.ans_type_prediction(doc_avg_hidden,
                                                question_avg_hidden)
        all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1),
                                             -1)  # batch x q_num x class_num
        all_class_scores = all_class_scores.squeeze(-1)  # when class_num = 1

        return all_start_scores, all_end_scores, all_class_scores
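
The flow_operation above shuttles the document hiddens between a per-question layout (batch*q_num, len_d, h) and a per-token layout (batch*len_d, q_num, h) so that an RNN can run along the dialog-turn axis. A shape-only sketch of that reshape round-trip, with made-up sizes and the RNN itself omitted:

import torch

bsz, q_num, len_d, h = 2, 3, 4, 8
cur_h = torch.randn(bsz * q_num, len_d, h)  # per-question document hiddens

# (bsz*q_num, len_d, h) -> (len_d, bsz, q_num, h) -> (q_num, bsz*len_d, h) -> (bsz*len_d, q_num, h)
flow_in = cur_h.transpose(0, 1).view(len_d, bsz, q_num, h)
flow_in = flow_in.transpose(0, 2).contiguous().view(q_num, bsz * len_d, h).transpose(0, 1)
assert flow_in.shape == (bsz * len_d, q_num, h)

# ...the flow RNN would run here over dim=1, i.e. across dialog turns...

# Inverse reshape back to the per-question layout.
flow_out = flow_in.transpose(0, 1).view(q_num, bsz, len_d, h).transpose(0, 2).contiguous()
flow_out = flow_out.view(len_d, bsz * q_num, h).transpose(0, 1)
assert flow_out.shape == (bsz * q_num, len_d, h)
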
Exemple #30
0
    def forward(self,  # pylint: disable=arguments-differ
                inputs,
                word_inputs=None):
        u"""
        Parameters
        ----------
        inputs: ``torch.Tensor``, required.
        Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
        word_inputs : ``torch.Tensor``, required.
            If you passed a cached vocab, you can in addition pass a tensor of shape
            ``(batch_size, timesteps)``, which represent word ids which have been pre-cached.

        Returns
        -------
        Dict with keys:
        ``'elmo_representations'``: ``List[torch.Tensor]``
            A ``num_output_representations`` list of ELMo representations for the input sequence.
            Each representation is shape ``(batch_size, timesteps, embedding_dim)``
        ``'mask'``:  ``torch.Tensor``
            Shape ``(batch_size, timesteps)`` long tensor with sequence mask.
        """
        # reshape the input if needed
        original_shape = inputs.size()
        if len(original_shape) > 3:
            timesteps, num_characters = original_shape[-2:]
            reshaped_inputs = inputs.view(-1, timesteps, num_characters)
        else:
            reshaped_inputs = inputs

        if word_inputs is not None:
            original_word_size = word_inputs.size()
            if self._has_cached_vocab and len(original_word_size) > 2:
                reshaped_word_inputs = word_inputs.view(-1, original_word_size[-1])
            elif not self._has_cached_vocab:
                logger.warning(u"Word inputs were passed to ELMo but it does not have a cached vocab.")
                reshaped_word_inputs = None
            else:
                reshaped_word_inputs = word_inputs
        else:
            reshaped_word_inputs = word_inputs

        # run the biLM
        bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs)
        layer_activations = bilm_output[u'activations']
        mask_with_bos_eos = bilm_output[u'mask']

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, u'scalar_mix_{}'.format(i))
            representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos)
            representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos
            )
            representations.append(self._dropout(representation_without_bos_eos))

        # reshape if necessary
        if word_inputs is not None and len(original_word_size) > 2:
            mask = mask_without_bos_eos.view(original_word_size)
            elmo_representations = [representation.view(original_word_size + (-1, ))
                                    for representation in representations]
        elif len(original_shape) > 3:
            mask = mask_without_bos_eos.view(original_shape[:-1])
            elmo_representations = [representation.view(original_shape[:-1] + (-1, ))
                                    for representation in representations]
        else:
            mask = mask_without_bos_eos
            elmo_representations = representations

        return {u'elmo_representations': elmo_representations, u'mask': mask}
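
For reference, a minimal usage sketch of this forward pass through AllenNLP's public Elmo module (the options/weights paths below are placeholders):

from allennlp.modules.elmo import Elmo, batch_to_ids

# Placeholder paths; point these at real ELMo options/weights files.
options_file = "elmo_options.json"
weight_file = "elmo_weights.hdf5"

elmo = Elmo(options_file, weight_file, num_output_representations=1, dropout=0.0)

sentences = [['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
             ['The', 'sentence', '.']]
character_ids = batch_to_ids(sentences)          # (batch_size, timesteps, 50)

output = elmo(character_ids)
embeddings = output['elmo_representations'][0]   # (batch_size, timesteps, embedding_dim)
mask = output['mask']                            # (batch_size, timesteps)
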
Exemple #31
0
    def forward(
        self,
        inputs: torch.Tensor,
        word_inputs: torch.Tensor = None
    ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        # Parameters

        inputs : `torch.Tensor`, required.
        Shape `(batch_size, timesteps, 50)` of character ids representing the current batch.
        word_inputs : `torch.Tensor`, required.
            If you passed a cached vocab, you can in addition pass a tensor of shape
            `(batch_size, timesteps)`, which represent word ids which have been pre-cached.

        # Returns

        Dict with keys:
        `'elmo_representations'` : `List[torch.Tensor]`
            A `num_output_representations` list of ELMo representations for the input sequence.
            Each representation is shape `(batch_size, timesteps, embedding_dim)`
        `'mask'`:  `torch.Tensor`
            Shape `(batch_size, timesteps)` long tensor with sequence mask.
        """
        # reshape the input if needed
        original_shape = inputs.size()
        if len(original_shape) > 3:
            timesteps, num_characters = original_shape[-2:]
            reshaped_inputs = inputs.view(-1, timesteps, num_characters)
        else:
            reshaped_inputs = inputs

        if word_inputs is not None:
            original_word_size = word_inputs.size()
            if self._has_cached_vocab and len(original_word_size) > 2:
                reshaped_word_inputs = word_inputs.view(
                    -1, original_word_size[-1])
            elif not self._has_cached_vocab:
                logger.warning(
                    "Word inputs were passed to ELMo but it does not have a cached vocab."
                )
                reshaped_word_inputs = None
            else:
                reshaped_word_inputs = word_inputs
        else:
            reshaped_word_inputs = word_inputs

        # run the biLM
        bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs)
        layer_activations = bilm_output["activations"]
        mask_with_bos_eos = bilm_output["mask"]

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, "scalar_mix_{}".format(i))
            representation_with_bos_eos = scalar_mix(layer_activations,
                                                     mask_with_bos_eos)
            if self._keep_sentence_boundaries:
                processed_representation = representation_with_bos_eos
                processed_mask = mask_with_bos_eos
            else:
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                processed_representation = representation_without_bos_eos
                processed_mask = mask_without_bos_eos
            representations.append(self._dropout(processed_representation))

        # reshape if necessary
        if word_inputs is not None and len(original_word_size) > 2:
            mask = processed_mask.view(original_word_size)
            elmo_representations = [
                representation.view(original_word_size + (-1, ))
                for representation in representations
            ]
        elif len(original_shape) > 3:
            mask = processed_mask.view(original_shape[:-1])
            elmo_representations = [
                representation.view(original_shape[:-1] + (-1, ))
                for representation in representations
            ]
        else:
            mask = processed_mask
            elmo_representations = representations

        return {"elmo_representations": elmo_representations, "mask": mask}
Exemple #32
0
def test_fast_elmo_with_allennlp_do_layer_norm():
    fast = FastElmo(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
        num_output_representations=1,
        scalar_mix_parameters=[1.0, 1.0, 1.0],
        do_layer_norm=True,
    )

    allennlp = Elmo(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
        num_output_representations=1,
        dropout=0.0,
        scalar_mix_parameters=[1.0, 1.0, 1.0],
        do_layer_norm=True,
    )

    sentences = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    character_ids = _sentences_to_ids(sentences)

    fast_out = fast(character_ids)
    allennlp_out = allennlp(character_ids)

    # Since we don't include the BOS/EOS reprs during layer normalization,
    # the result will be different from AllenNLP's implementation.
    np.testing.assert_raises(
        AssertionError,
        np.testing.assert_array_almost_equal,
        fast_out['elmo_representations'][0],
        allennlp_out['elmo_representations'][0],
    )

    # We can pack BOS/EOS to inputs manually
    _beginning_of_sentence_characters = torch.from_numpy(
        np.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
    _end_of_sentence_characters = torch.from_numpy(
        np.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)

    mask = ((character_ids > 0).long().sum(dim=-1) > 0).long()
    character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
        character_ids,
        mask,
        _beginning_of_sentence_characters,
        _end_of_sentence_characters,
    )

    # And disable the mock BOS/EOS actions in FastElmo.
    fast.exec_managed_lstm_bos_eos = False
    fast_out_2 = fast(character_ids_with_bos_eos)
    fast_mixed_repr_2, _ = remove_sentence_boundaries(
        fast_out_2['elmo_representations'][0],
        fast_out_2['mask'],
    )

    allennlp_out_2 = allennlp(character_ids)

    np.testing.assert_array_almost_equal(
        fast_mixed_repr_2,
        allennlp_out_2['elmo_representations'][0],
    )
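
The boundary helpers used above live in allennlp.nn.util; a small shape-only sketch of what remove_sentence_boundaries does, assuming its documented behavior (tensor contents are random and purely illustrative):

import torch
from allennlp.nn.util import remove_sentence_boundaries

# Two sequences with <s>/</s> already attached: valid lengths 9 and 5 out of 9 timesteps.
tensor_with_bos_eos = torch.randn(2, 9, 16)
mask_with_bos_eos = torch.tensor([[1] * 9,
                                  [1] * 5 + [0] * 4])

tensor, mask = remove_sentence_boundaries(tensor_with_bos_eos, mask_with_bos_eos)
# The first and last valid timestep of every sequence are dropped,
# so the timestep dimension shrinks by two.
assert tensor.shape == (2, 7, 16)
assert mask.shape == (2, 7)
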
Exemple #33
0
    def forward(
        self,  # type: ignore
        source: Dict[str, torch.LongTensor]
    ) -> Dict[str, torch.Tensor]:
        """
        Computes the averaged forward and backward LM loss from the batch.

        By convention, the input dict is required to have at least a ``"tokens"``
        entry that's the output of a ``SingleIdTokenIndexer``, which is used
        to compute the language model targets.

        If the model was instantiated with ``remove_bos_eos=True``,
        then it is expected that each of the input sentences was augmented with
        begin-sentence and end-sentence tokens.

        Parameters
        ----------
        source : ``Dict[str, torch.LongTensor]``, required.
            The output of ``Batch.as_tensor_dict()`` for a batch of sentences.

        Returns
        -------
        Dict with keys:

        ``'loss'``: ``torch.Tensor``
            averaged forward/backward negative log likelihood
        ``'forward_loss'``: ``torch.Tensor``
            forward direction negative log likelihood
        ``'backward_loss'``: ``torch.Tensor``
            backward direction negative log likelihood
        ``'lm_embeddings'``: ``torch.Tensor``
            (batch_size, timesteps, embed_dim) tensor of top layer contextual representations
        ``'mask'``: ``torch.Tensor``
            (batch_size, timesteps) mask for the embeddings
        """
        # pylint: disable=arguments-differ
        mask = get_text_field_mask(source)

        # We must have token_ids so that we can compute targets
        token_ids = source.get("tokens")
        if token_ids is None:
            raise ConfigurationError(
                "Your data must have a 'tokens': SingleIdTokenIndexer() "
                "in order to use the BidirectionalLM")

        # Use token_ids to compute targets
        forward_targets = torch.zeros_like(token_ids)
        backward_targets = torch.zeros_like(token_ids)
        forward_targets[:, 0:-1] = token_ids[:, 1:]
        backward_targets[:, 1:] = token_ids[:, 0:-1]

        # shape (batch_size, timesteps + 2, embedding_size)
        embeddings = self._text_field_embedder(source)

        contextual_embeddings = self._contextualizer(embeddings, mask)

        # add dropout
        contextual_embeddings = self._dropout(contextual_embeddings)

        # compute softmax loss
        forward_loss, backward_loss = self._compute_loss(
            contextual_embeddings, embeddings, forward_targets,
            backward_targets)

        num_targets = torch.sum((forward_targets > 0).long())
        if num_targets > 0:
            average_loss = 0.5 * (forward_loss +
                                  backward_loss) / num_targets.float()
        else:
            average_loss = torch.tensor(0.0).to(forward_targets.device)  # pylint: disable=not-callable
        # this is stored to compute perplexity if needed
        self._last_average_loss[0] = average_loss.detach().item()

        if num_targets > 0:
            # loss is directly minimized
            if self._loss_scale == 'n_samples':
                scale_factor = num_targets.float()
            else:
                scale_factor = self._loss_scale

            return_dict = {
                'loss': average_loss * scale_factor,
                'forward_loss':
                forward_loss * scale_factor / num_targets.float(),
                'backward_loss':
                backward_loss * scale_factor / num_targets.float()
            }
        else:
            # average_loss zero tensor, return it for all
            return_dict = {
                'loss': average_loss,
                'forward_loss': average_loss,
                'backward_loss': average_loss
            }

        if self._remove_bos_eos:
            contextual_embeddings, mask = remove_sentence_boundaries(
                contextual_embeddings, mask)

        return_dict.update({
            'lm_embeddings': contextual_embeddings,
            'mask': mask
        })

        return return_dict
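
The forward/backward targets above are just the token ids shifted one position in each direction, with zeros (padding) wherever no target exists. A toy illustration with made-up ids:

import torch

token_ids = torch.tensor([[101, 7, 8, 9, 102, 0]])  # one padded sequence; ids are made up

forward_targets = torch.zeros_like(token_ids)
backward_targets = torch.zeros_like(token_ids)
forward_targets[:, 0:-1] = token_ids[:, 1:]    # predict the next token
backward_targets[:, 1:] = token_ids[:, 0:-1]   # predict the previous token

# forward_targets  -> [[  7,   8,   9, 102,   0,   0]]
# backward_targets -> [[  0, 101,   7,   8,   9, 102]]
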
Exemple #34
0
def test_elmo_character_encoder_with_allennlp():
    allennlp_embedder = _ElmoCharacterEncoder(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    )
    embedder = ElmoCharacterEncoderFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create()

    allennlp_parameters = [
        '_char_embedding_weights',
        'char_conv_0.bias',
        'char_conv_0.weight',
        'char_conv_1.bias',
        'char_conv_1.weight',
        'char_conv_2.bias',
        'char_conv_2.weight',
        'char_conv_3.bias',
        'char_conv_3.weight',
        'char_conv_4.bias',
        'char_conv_4.weight',
        '_projection.bias',
        '_projection.weight',
    ]
    embedder_parameters = [
        'char_embedding.weight',
        'char_conv_0.bias',
        'char_conv_0.weight',
        'char_conv_1.bias',
        'char_conv_1.weight',
        'char_conv_2.bias',
        'char_conv_2.weight',
        'char_conv_3.bias',
        'char_conv_3.weight',
        'char_conv_4.bias',
        'char_conv_4.weight',
        'output_proj.bias',
        'output_proj.weight',
    ]
    allennlp_parameters_diff = [
        '_highways._layers.0.bias',
        '_highways._layers.0.weight',
        '_highways._layers.1.bias',
        '_highways._layers.1.weight',
    ]
    embedder_parameters_diff = [
        'highway.layers_0.bias',
        'highway.layers_0.weight',
        'highway.layers_1.bias',
        'highway.layers_1.weight',
    ]
    assert len(allennlp_parameters) == len(embedder_parameters)
    assert len(allennlp_parameters_diff) == len(embedder_parameters_diff)

    allennlp_embedder_named_parameters = dict(
        allennlp_embedder.named_parameters())
    # Same.
    for allennlp_param, embedder_param in zip(allennlp_parameters,
                                              embedder_parameters):
        allennlp_w = allennlp_embedder_named_parameters[allennlp_param].data
        embedder_w = embedder.named_parameters()[embedder_param].data

        np.testing.assert_array_equal(embedder_w.numpy(), allennlp_w.numpy())
        assert embedder_w.dtype == allennlp_w.dtype
    # Diff on highway.
    for allennlp_param, embedder_param in zip(allennlp_parameters_diff,
                                              embedder_parameters_diff):
        allennlp_w = allennlp_embedder_named_parameters[allennlp_param].data
        embedder_w = embedder.named_parameters()[embedder_param].data

        assert embedder_w.dtype == allennlp_w.dtype
        np.testing.assert_raises(
            AssertionError,
            np.testing.assert_array_equal,
            embedder_w.numpy(),
            allennlp_w.numpy(),
        )

    sentences = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    # `(2, 7, 50)`
    character_ids = _sentences_to_ids(sentences)

    # AllenNLP.
    out = allennlp_embedder(character_ids)
    allennlp_token_embedding, _ = remove_sentence_boundaries(
        out['token_embedding'], out['mask'])
    assert list(allennlp_token_embedding.shape) == [2, 7, 16]

    # Ours.
    inputs = pack_padded_sequence(character_ids, [7, 3], batch_first=True)
    out = embedder(inputs.data)
    ours_token_embedding = _unpack(out, inputs.batch_sizes)
    assert list(ours_token_embedding.shape) == [2, 7, 16]

    np.testing.assert_array_almost_equal(
        ours_token_embedding.data.numpy(),
        allennlp_token_embedding.data.numpy(),
    )
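
The parameter-by-parameter check above follows a generic pattern: walk named_parameters() of both modules and compare the arrays by name. A minimal sketch of that pattern with stand-in Linear modules:

import numpy as np
import torch.nn as nn

a = nn.Linear(4, 4)
b = nn.Linear(4, 4)
b.load_state_dict(a.state_dict())  # make the two modules identical for the demo

params_a = dict(a.named_parameters())
for name, param_b in b.named_parameters():
    np.testing.assert_array_equal(param_b.data.numpy(),
                                  params_a[name].data.numpy())
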
Exemple #35
0
    def forward(self,    # pylint: disable=arguments-differ
                inputs: torch.Tensor,
                word_inputs: torch.Tensor = None) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
        """
        Parameters
        ----------
        inputs: ``torch.Tensor``, required.
        Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
        word_inputs : ``torch.Tensor``, required.
            If you passed a cached vocab, you can in addition pass a tensor of shape
            ``(batch_size, timesteps)``, which represent word ids which have been pre-cached.

        Returns
        -------
        Dict with keys:
        ``'elmo_representations'``: ``List[torch.Tensor]``
            A ``num_output_representations`` list of ELMo representations for the input sequence.
            Each representation is shape ``(batch_size, timesteps, embedding_dim)``
        ``'mask'``:  ``torch.Tensor``
            Shape ``(batch_size, timesteps)`` long tensor with sequence mask.
        """
        # reshape the input if needed
        original_shape = inputs.size()
        if len(original_shape) > 3:
            timesteps, num_characters = original_shape[-2:]
            reshaped_inputs = inputs.view(-1, timesteps, num_characters)
        else:
            reshaped_inputs = inputs

        if word_inputs is not None:
            original_word_size = word_inputs.size()
            if self._has_cached_vocab and len(original_word_size) > 2:
                reshaped_word_inputs = word_inputs.view(-1, original_word_size[-1])
            elif not self._has_cached_vocab:
                logger.warning("Word inputs were passed to ELMo but it does not have a cached vocab.")
                reshaped_word_inputs = None
            else:
                reshaped_word_inputs = word_inputs
        else:
            reshaped_word_inputs = word_inputs

        # run the biLM
        bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs)
        layer_activations = bilm_output['activations']
        mask_with_bos_eos = bilm_output['mask']

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
            representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos)
            representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos
            )
            representations.append(self._dropout(representation_without_bos_eos))

        # reshape if necessary
        if word_inputs is not None and len(original_word_size) > 2:
            mask = mask_without_bos_eos.view(original_word_size)
            elmo_representations = [representation.view(original_word_size + (-1, ))
                                    for representation in representations]
        elif len(original_shape) > 3:
            mask = mask_without_bos_eos.view(original_shape[:-1])
            elmo_representations = [representation.view(original_shape[:-1] + (-1, ))
                                    for representation in representations]
        else:
            mask = mask_without_bos_eos
            elmo_representations = representations

        return {'elmo_representations': elmo_representations, 'mask': mask}
Exemple #36
0
    def forward(self, x1, x1_c, x1_f, x1_pos, x1_ner, x1_mask, x2_full, x2_c,
                x2_full_mask):
        # Called from QA_model in model_CoQA.py.
        # The 9 inputs are:
        #context_id, context_cid, context_feature, context_tag, context_ent, context_mask,
        #           question_id, question_cid, question_mask,
        """Inputs:
        x1 = document word indices             [batch * len_d] len_d:len_document
        x1_c = document char indices           [batch * len_d * len_w] or [1]
        x1_c may hold several batches of examples at once (elmo_batch_size // batch_size of them) when ELMo is precomputed
        x1_f = document word features indices  [batch * q_num * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_c = question char indices           [(batch * q_num) * len_q * len_w]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        """
        '''
        Caller passes, in order: context_id, context_cid, context_feature, context_tag,
        context_ent, context_mask, question_id, question_cid (-> x2_full), question_mask,
        overall_mask
        '''

        # precomputing ELMo is only for context (to speedup computation)
        if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt[
                'batch_size']:  # precomputing ELMo is used
            if x1_c.dim() != 1:  # precomputation is needed
                precomputed_bilm_output = self.elmo._elmo_lstm(
                    x1_c
                )  # _elmo_lstm() adds <s> and </s> around every sentence, so the timestep dimension is 2 longer than what batch_to_ids produces
                self.precomputed_layer_activations = [
                    t.detach().cpu()
                    for t in precomputed_bilm_output['activations']
                ]
                # detach() cuts the tensor out of the autograd graph; .cpu() moves it to CPU memory
                self.precomputed_mask_with_bos_eos = precomputed_bilm_output[
                    'mask'].detach().cpu()
                # First pull out ELMo activations for several batches' worth of data in a single pass
                self.precomputed_cnt = 0
                # precomputed_cnt is incremented below. The context ELMo embeddings are computed ahead of time
                # and cached in self.precomputed_layer_activations and self.precomputed_mask_with_bos_eos.
                # Each call still consumes one normal batch; precomputed_cnt cycles from 0 up to
                # elmo_batch_size // batch_size.

            # get precomputed ELMo
            layer_activations = [
                t[x1.size(0) * self.precomputed_cnt:x1.size(0) *
                  (self.precomputed_cnt + 1), :, :]
                for t in self.precomputed_layer_activations
            ]
            mask_with_bos_eos = self.precomputed_mask_with_bos_eos[
                x1.size(0) * self.precomputed_cnt:x1.size(0) *
                (self.precomputed_cnt + 1), :]
            # Use the offset precomputed_cnt * x1.size(0) to slice out this batch's precomputed data
            if x1.is_cuda:
                layer_activations = [t.cuda() for t in layer_activations]
                mask_with_bos_eos = mask_with_bos_eos.cuda()

            representations = []
            for i in range(len(
                    self.elmo._scalar_mixes)):  # len(elmo._scalar_mixes) equals 2 here
                '''
                elmo._scalar_mixes =  [ScalarMix(
                  (scalar_parameters): ParameterList(
                      (0): Parameter containing: [torch.FloatTensor of size 1]
                      (1): Parameter containing: [torch.FloatTensor of size 1]
                      (2): Parameter containing: [torch.FloatTensor of size 1]
                  )
                ), ScalarMix(
                  (scalar_parameters): ParameterList(
                      (0): Parameter containing: [torch.FloatTensor of size 1]
                      (1): Parameter containing: [torch.FloatTensor of size 1]
                      (2): Parameter containing: [torch.FloatTensor of size 1]
                  )
                )]
                '''
                scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i))
                representation_with_bos_eos = scalar_mix(
                    layer_activations, mask_with_bos_eos)
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                representations.append(
                    self.elmo._dropout(representation_without_bos_eos))
                # The loop runs twice, so representations ends up with two elements, each of shape
                # [num_sentences, sentence_len, 1024], where sentence_len excludes the boundary tokens.
                # In this example both mixes produced identical values, which makes the second pass look redundant.

            x1_elmo = representations[0][:, :x1.size(
                1), :]  # slice with x1.size(1) to keep only vectors within the maximum document length
            self.precomputed_cnt += 1

            precomputed_elmo = True
        else:
            precomputed_elmo = False
        """
        x1_full = document word indices        [batch * q_num * len_d]
        x1_full_mask = document padding mask   [batch * q_num * len_d]
        x2_full question word indices          [batch * q_num * len_q]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        """
        # x1 [batch , len_d]-->unsqueeze(1)-->[batch , 1 , len_d] -->expand-->[batch , num_q , len_d]
        x1_full = x1.unsqueeze(1).expand(
            x2_full.size(0), x2_full.size(1),
            x1.size(1)).contiguous()  # expand the second dimension to the number of questions
        # x1_mask [batch , len_d] --> [batch ,1 , len_d] -->[batch , num_q , len_d]
        x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0),
                                                   x2_full.size(1),
                                                   x1.size(1)).contiguous()

        drnn_input_list, qrnn_input_list = [], []  # inputs for the document RNN and the question RNN

        x2 = x2_full.view(-1, x2_full.size(
            -1))  #[batch , q_num , len_q] -> [batch * q_num , len_q]
        x2_mask = x2_full_mask.view(-1, x2_full.size(-1))

        if self.opt['use_wemb']:
            # Word embedding for both document and question
            emb = self.embedding if self.training else self.eval_embed
            x1_emb = emb(x1)
            x2_emb = emb(x2)
            # Dropout on embeddings
            if self.opt['dropout_emb'] > 0:
                x1_emb = layers.dropout(x1_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)
                x2_emb = layers.dropout(x2_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)

            drnn_input_list.append(x1_emb)
            qrnn_input_list.append(x2_emb)

        if self.opt['CoVe_opt'] > 0:
            x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask)
            x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_cove_mid = layers.dropout(x1_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x1_cove_high = layers.dropout(x1_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)
                x2_cove_mid = layers.dropout(x2_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x2_cove_high = layers.dropout(x2_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)

            drnn_input_list.append(x1_cove_mid)
            qrnn_input_list.append(x2_cove_mid)

        if self.opt['use_elmo']:
            if not precomputed_elmo:
                x1_elmo = self.elmo(x1_c)['elmo_representations'][
                    0]  #torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device)
            x2_elmo = self.elmo(x2_c)['elmo_representations'][
                0]  #torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_elmo = layers.dropout(x1_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)
                x2_elmo = layers.dropout(x2_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)

            drnn_input_list.append(x1_elmo)
            qrnn_input_list.append(x2_elmo)

        if self.opt['use_pos']:
            x1_pos_emb = self.pos_embedding(x1_pos)
            drnn_input_list.append(x1_pos_emb)

        if self.opt['use_ner']:
            x1_ner_emb = self.ner_embedding(x1_ner)
            drnn_input_list.append(x1_ner_emb)

        x1_input = torch.cat(drnn_input_list, dim=2)
        x2_input = torch.cat(qrnn_input_list, dim=2)

        def expansion_for_doc(z):
            return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1),
                                         z.size(2)).contiguous().view(
                                             -1, z.size(1), z.size(2))
            #[batch * num_q , len_d , emb_dim]

        x1_emb_expand = expansion_for_doc(x1_emb)
        x1_cove_high_expand = expansion_for_doc(x1_cove_high)
        #x1_elmo_expand = expansion_for_doc(x1_elmo)
        if self.opt[
                'no_em']:  #x1_f = document word features indices  [batch * q_num * len_d * nfeat]
            x1_f = x1_f[:, :, :, 3:]

        x1_input = torch.cat([
            expansion_for_doc(x1_input),
            x1_f.view(-1, x1_f.size(-2), x1_f.size(-1))
        ],
                             dim=2)
        x1_mask = x1_full_mask.view(-1, x1_full_mask.size(-1))

        # Interaction layer (1. flow, 2. integration; the two interact)
        if self.opt[
                'do_prealign']:  # x1_emb_expand [batch * num_q, len_d, emb_dim]; emb_dim here is the plain word embedding, not ELMo or CoVe
            # x2_emb [batch * num_q , len_q , emb_dim]
            x1_atten = self.pre_align(
                x1_emb_expand, x2_emb, x2_mask
            )  #self.pre_align = layers.GetAttentionHiddens(embedding_dim, opt['prealign_hidden'], similarity_attention=True)

            x1_input = torch.cat([x1_input, x1_atten], dim=2)  # passage representation now weighted with question information

        # === Start processing the dialog ===
        # cur_h: [batch_size * max_qa_pair, context_length, hidden_state]
        # flow : fn (rnn)
        # x1_full: [batch_size, max_qa_pair, context_length]
        def flow_operation(cur_h, flow):
            # The flow operation reshapes the input so that the RNN runs along the qa_pairs dimension.
            # cur_h [batch * max_qa_pair, len_d , hidden * 2] --> [len_d , batch * num_q , hidden * 2] -> [len_d , batch , num_q , hidden * 2]
            flow_in = cur_h.transpose(0, 1).view(x1_full.size(2),
                                                 x1_full.size(0),
                                                 x1_full.size(1), -1)
            #         [len_d , batch , num_q , hidden * 2] -> [num_q ,batch * len_d , hidden * 2] ->[batch * len_d , num_q , hidden * 2]
            flow_in = flow_in.transpose(0, 2).contiguous().view(
                x1_full.size(1),
                x1_full.size(0) * x1_full.size(2), -1).transpose(0, 1)
            # [bsz * context_length, max_qa_pair, hidden_state]
            flow_out = flow(flow_in)
            # [bsz * context_length, max_qa_pair, flow_hidden_state_dim (hidden_state/2)]
            if self.opt['no_dialog_flow']:
                flow_out = flow_out * 0

            flow_out = flow_out.transpose(0, 1).view(x1_full.size(1),
                                                     x1_full.size(0),
                                                     x1_full.size(2),
                                                     -1).transpose(
                                                         0, 2).contiguous()
            flow_out = flow_out.view(x1_full.size(2),
                                     x1_full.size(0) * x1_full.size(1),
                                     -1).transpose(0, 1)
            # [bsz * max_qa_pair, context_length, flow_hidden_state_dim]
            return flow_out
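        # Shape walkthrough for flow_operation (hypothetical sizes): with batch=2, max_qa_pair=3,
        # len_d=4 and hidden*2=10, cur_h is [6, 4, 10]; flow_in becomes [8, 3, 10]
        # (one length-3 sequence over the QA turns for each of the batch * len_d positions),
        # so the flow RNN runs across dialog turns; flow_out is reshaped back to [6, 4, flow_hidden].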

        # Encode document with RNN; Passage and Question Interaction
        doc_abstr_ls = []

        doc_hiddens = self.doc_rnn1(x1_input,
                                    x1_mask)  # [batch * num_q, len_d, hidden * 2]
        doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow1)
        doc_abstr_ls.append(doc_hiddens)

        doc_hiddens = self.doc_rnn2(
            torch.cat((doc_hiddens, doc_hiddens_flow, x1_cove_high_expand),
                      dim=2), x1_mask)
        doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow2)
        doc_abstr_ls.append(doc_hiddens)

        # Encode question with RNN
        _, que_abstr_ls = self.question_rnn(x2_input,
                                            x2_mask,
                                            return_list=True,
                                            additional_x=x2_cove_high)
        # que_abstr_ls holds both question RNN layers, each [batch * q_num, len_q, hidden * 2]

        # Final question layer
        question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2),
                                              x2_mask)
        #[batch * num_q , len_q , hidden * 2]
        que_abstr_ls += [question_hiddens]

        # Main Attention Fusion Layer
        doc_info = self.deep_attn(
            [torch.cat([x1_emb_expand, x1_cove_high_expand], 2)], doc_abstr_ls,
            [torch.cat([x2_emb, x2_cove_high], 2)], que_abstr_ls, x1_mask,
            x2_mask)
        # History-aware attention: when updating a given question layer, all passage layers and
        # all question layers are concatenated to serve as query and key.
        # query: all passage layers concatenated, key: all question layers concatenated,
        # value: question_layer[i] when computing the i-th question layer embedding

        # After the question layers are updated, an attention-weighted average produces a tensor
        # aligned with the doc along len_d, which is concatenated onto the doc representation
        # from the second flow layer.
        doc_hiddens = self.deep_attn_rnn(
            torch.cat((doc_info, doc_hiddens_flow), dim=2), x1_mask)  # output after the RNN
        doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow3)
        doc_abstr_ls += [doc_hiddens]

        # Self Attention Fusion Layer
        # For Passage do self attention
        # x1_att concatenates, along the hidden dimension, every previous passage layer after it
        # has been fused with question information.
        x1_att = torch.cat(doc_abstr_ls, 2)
        if self.opt['self_attention_opt'] > 0:
            highlvl_self_attn_hiddens = self.highlvl_self_att(
                x1_att, x1_att, x1_mask, x3=doc_hiddens, drop_diagonal=True)
            # At the third flow: doc_hiddens is the passage run through an RNN along len_d, while
            # doc_hiddens_flow is the passage run through an RNN along max_qa_pairs, i.e. the
            # output of the third (and last) flow. After concatenation, run an RNN along len_d.
            doc_hiddens = self.high_lvl_crnn(
                torch.cat(
                    [doc_hiddens, highlvl_self_attn_hiddens, doc_hiddens_flow],
                    dim=2), x1_mask)
        elif self.opt['self_attention_opt'] == 0:
            doc_hiddens = self.high_lvl_crnn(
                torch.cat([doc_hiddens, doc_hiddens_flow], dim=2), x1_mask)
        doc_abstr_ls += [doc_hiddens]

        # Merge the question hidden vectors
        q_merge_weights = self.self_attn(
            question_hiddens, x2_mask
        )  #question_hiddens is the final question hidden layer [batch * num_q , len_q , hidden * 2]
        # Compute the self-attention weights. This is not true self-attention: an extra vector z is
        # dotted with each hidden state to produce the attention scores.
        question_avg_hidden = layers.weighted_avg(
            question_hiddens, q_merge_weights)  # weighted average under these attention weights
        #[batch , hid]
        if self.opt['do_hierarchical_query']:  # default True
            # Input/output: [batch, max_qa_pair, hid]. The RNN is unidirectional, so the hidden
            # size stays hid. (Open question in the original comment: is the last sentence-level
            # hidden state taken, or is there attention/pooling?)
            question_avg_hidden = self.hier_query_rnn(
                question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1))
            question_avg_hidden = question_avg_hidden.contiguous().view(
                -1, question_avg_hidden.size(-1))  #[batch * max_qa_pair , hid]

        # Prediction Layer
        # Get Start, End span
        start_scores, end_scores = self.get_answer(doc_hiddens,
                                                   question_avg_hidden,
                                                   x1_mask)
        # both are [batch * q_num, len_d]
        all_start_scores = start_scores.view_as(
            x1_full)  # batch x q_num x len_d
        all_end_scores = end_scores.view_as(x1_full)  # batch x q_num x len_d

        # Get whether there is an answer
        #                           torch.cat( [batch , hidden] ,[batch , hidden]  , dim = 1) -> [batch , 2 * hidden]
        doc_avg_hidden = torch.cat(
            (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)),
            dim=1)
        # Predict the answer type
        class_scores = self.ans_type_prediction(doc_avg_hidden,
                                                question_avg_hidden)
        all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1),
                                             -1)  # batch x q_num x class_num
        all_class_scores = all_class_scores.squeeze(-1)  # when class_num = 1
        # all_class_scores is not softmax-normalized over the final class_num dimension, presumably
        # to handle the class_num = 1 case: with a single class, softmax would return 1 no matter
        # what the actual score is.

        return all_start_scores, all_end_scores, all_class_scores
    def forward(
            self,  # pylint: disable=arguments-differ
            character_ids: torch.Tensor,
            mask: torch.Tensor,
            mask_with_bos_eos: torch.Tensor,
            seg_ends: torch.Tensor,
            seg_map: torch.Tensor,
            seg_starts: torch.Tensor,
            tags: torch.Tensor) -> torch.Tensor:
        """
        Parameters
        ----------
        """
        # TODO(Swabha/Matt): detach tensors??? - Matt
        args_dict = {
            "mask": mask_with_bos_eos,
            "seg_ends": seg_ends,
            "seg_map": seg_map,
            "seg_starts": seg_starts,
            "tags": tags
        }
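        # The wrapped LM determines the expected input key: a generic LanguageModel takes a token
        # dict keyed by the indexer name ("elmo"), while the segmental LM variant consumes raw
        # character ids directly.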
        if isinstance(self.seglm, LanguageModel):
            args_dict["tokens"] = {"elmo": character_ids}
        else:
            args_dict["character_ids"] = character_ids

        lm_output_dict = self.seglm(**args_dict)

        sequential_embeddings = lm_output_dict["sequential"]
        segmental_embeddings = lm_output_dict["segmental"]
        projection_embeddings = lm_output_dict["projection"]

        embeddings_list = []
        if self.use_all_base_layers:
            if isinstance(self.seglm, LanguageModel):
                raise NotImplementedError
            base_layer_embeddings = [
                emb.squeeze(1) for emb in lm_output_dict["activations"]
            ]
            # Flatten the per-layer tensors into the list so the scalar mix sees plain tensors.
            embeddings_list.extend(base_layer_embeddings)
        else:
            embeddings_list.append(sequential_embeddings)

        # Always include segmental layer.
        embeddings_list.append(segmental_embeddings)

        if self.use_projection_layer:
            embeddings_list.append(projection_embeddings)

        if self._scalar_mix is None:
            averaged_embeddings = segmental_embeddings
        elif self.concat_segmental:
            averaged_embeddings = torch.cat(
                (sequential_embeddings, segmental_embeddings), dim=-1)
        else:
            averaged_embeddings = self._dropout(
                self._scalar_mix(embeddings_list))

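        # Strip the BOS/EOS positions added by the bidirectional LM so the output aligns with the
        # original tokens: [batch, num_tokens, dim].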
        averaged_embeddings_no_bos_eos, _ = remove_sentence_boundaries(
            averaged_embeddings, mask_with_bos_eos)
        return averaged_embeddings_no_bos_eos
Exemple #38
0
import torch
import torch.nn as nn
import torch.nn.functional as F

from allennlp.commands.elmo import ElmoEmbedder
from allennlp.modules.elmo import batch_to_ids
from allennlp.nn.util import remove_sentence_boundaries
# url to the pre-trained model
options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
# the ELMo class
elmo_bilm = ElmoEmbedder(options_file, weight_file).elmo_bilm
elmo_bilm.cuda()
sentences = [['Today', 'is', 'sunny', '.'], ['Hello', '!']]
# obtain character ids for each word. Size: batch_size × max_sentence_len × word_len
character_ids = batch_to_ids(sentences).cuda()
# ELMo's output
bilm_output = elmo_bilm(character_ids)
# ELMo embeddings for each layer
layer_activations = bilm_output['activations']
# indicate whether there is a word at each position
mask_with_bos_eos = bilm_output['mask']
# remove the special sentence start and end symbols added by ELMo
without_bos_eos = [remove_sentence_boundaries(layer, mask_with_bos_eos) for layer in layer_activations]
# three layers of 1024D ELMo embeddings. Size: 3 × batch_size × max_sentence_len × 1024
all_layers = torch.cat([ele[0].unsqueeze(0) for ele in without_bos_eos], dim=0)
# parameters for the weighted sum
s = nn.Parameter(torch.Tensor([1., 1., 1.]), requires_grad=True).cuda()
# normalize the weights
s = F.softmax(s, dim=0)
# the multiplier γ (initialized to one so the demo output is deterministic)
gamma = nn.Parameter(torch.ones(1), requires_grad=True).cuda()
# ELMo embedding. Size: batch_size × max_sentence_len × 1024
res = (all_layers[0]*s[0] + all_layers[1]*s[1] + all_layers[2]*s[2]) * gamma
print(res.shape)
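# The weighted sum above can also be expressed with AllenNLP's ScalarMix module, which learns the
# same softmax-normalized layer weights plus a gamma multiplier; a minimal sketch:
from allennlp.modules.scalar_mix import ScalarMix

scalar_mix = ScalarMix(mixture_size=3).cuda()
# list(all_layers) splits the 3 × batch_size × max_sentence_len × 1024 tensor into three
# [batch_size, max_sentence_len, 1024] tensors, one per ELMo layer.
mixed = scalar_mix(list(all_layers))  # batch_size × max_sentence_len × 1024
print(mixed.shape)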