Example #1
    def test_elmo_char_cnn_cache_does_not_raise_error_for_uncached_words(self):
        sentences = [["This", "is", "OOV"], ["so", "is", "this"]]
        in_vocab_sentences = [["here", "is"], ["a", "vocab"]]
        oov_tensor = self.get_vocab_and_both_elmo_indexed_ids(sentences)[1]
        vocab, in_vocab_tensor = self.get_vocab_and_both_elmo_indexed_ids(in_vocab_sentences)
        words_to_cache = list(vocab.get_token_to_index_vocabulary("tokens").keys())
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file, vocab_to_cache=words_to_cache)

        elmo_bilm(in_vocab_tensor["character_ids"], in_vocab_tensor["tokens"])
        elmo_bilm(oov_tensor["character_ids"], oov_tensor["tokens"])
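
Both this test and Example #3 rely on a `get_vocab_and_both_elmo_indexed_ids` helper that is not shown here. A rough sketch of what such a helper might look like, indexing each sentence both as ELMo character ids and as single word ids (note that the exact import path of `Batch` varies across AllenNLP versions), is:

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer

def get_vocab_and_both_elmo_indexed_ids(batch):
    # Index every sentence twice: once as ELMo character ids, once as single word ids.
    indexers = {"character_ids": ELMoTokenCharactersIndexer(),
                "tokens": SingleIdTokenIndexer()}
    instances = []
    for sentence in batch:
        field = TextField([Token(token) for token in sentence], indexers)
        instances.append(Instance({"elmo": field}))

    vocab = Vocabulary.from_instances(instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    # The returned tensor dict contains both "character_ids" and "tokens".
    return vocab, dataset.as_tensor_dict()["elmo"]
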
Example #2
    def __init__(self, options_file, weight_file, device=None):
        self._elmo_lstm = _ElmoBiLm(options_file,
                                    weight_file,
                                    requires_grad=False,
                                    vocab_to_cache=None)

        if device is not None:
            self._elmo_lstm = self._elmo_lstm.to(device)

        self.output_dim = self._elmo_lstm.get_output_dim()
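
The constructor above only wires up the biLM and moves it to a device. A minimal sketch of how the wrapped `_ElmoBiLm` is then driven, assuming AllenNLP's `batch_to_ids` helper and the standard `activations`/`mask` output keys (the file paths and sentence below are placeholders), could look like this:

import torch
from allennlp.modules.elmo import _ElmoBiLm, batch_to_ids

# Placeholder paths; substitute real ELMo options/weight files.
elmo_bilm = _ElmoBiLm("elmo_options.json", "elmo_weights.hdf5")
elmo_bilm.eval()

# batch_to_ids turns tokenised sentences into character ids of shape
# (batch_size, max_sentence_length, 50).
character_ids = batch_to_ids([["A", "short", "sentence", "."]])

with torch.no_grad():
    output = elmo_bilm(character_ids)

activations = output["activations"]  # list of three (batch, max_len + 2, dim) tensors
mask = output["mask"]                # mask covering the added BOS/EOS positions as well
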
Example #3
    def test_elmo_bilm_can_cache_char_cnn_embeddings(self):
        sentences = [["This", "is", "a", "sentence"],
                     ["Here", "'s", "one"],
                     ["Another", "one"]]
        vocab, tensor = self.get_vocab_and_both_elmo_indexed_ids(sentences)
        words_to_cache = list(vocab.get_token_to_index_vocabulary("tokens").keys())
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)
        elmo_bilm.eval()
        no_cache = elmo_bilm(tensor["character_ids"], tensor["character_ids"])

        # ELMo is stateful, so we need to actually re-initialise it for this comparison to work.
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file, vocab_to_cache=words_to_cache)
        elmo_bilm.eval()
        cached = elmo_bilm(tensor["character_ids"], tensor["tokens"])

        numpy.testing.assert_array_almost_equal(no_cache["mask"].data.cpu().numpy(),
                                                cached["mask"].data.cpu().numpy())
        for activation_cached, activation in zip(cached["activations"], no_cache["activations"]):
            numpy.testing.assert_array_almost_equal(activation_cached.data.cpu().numpy(),
                                                    activation.data.cpu().numpy(), decimal=6)
Example #4
    def test_elmo_with_module(self):
        # Create an _ElmoBiLm instance and pass it to Elmo as the ``module`` argument.
        sentences = [['The', 'sentence', '.'],
                     ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.']]

        character_ids = self._sentences_to_ids(sentences)
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)
        elmo = Elmo(None, None, 2, dropout=0.0, module=elmo_bilm)
        output = elmo(character_ids)
        elmo_representations = output['elmo_representations']

        assert len(elmo_representations) == 2
        for k in range(2):
            assert list(elmo_representations[k].size()) == [2, 7, 32]
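
The `self._sentences_to_ids` helper is not included in this snippet. A rough equivalent using AllenNLP's `batch_to_ids` convenience function (which wraps `ELMoTokenCharactersIndexer`) might be:

from allennlp.modules.elmo import batch_to_ids

def _sentences_to_ids(sentences):
    # Pads the batch and maps every token to its 50-dim character-id row,
    # giving a tensor of shape (batch_size, max_sentence_length, 50).
    return batch_to_ids(sentences)
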
Example #5
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        iterator.index_with(vocab)
        for i, batch in enumerate(iterator(instances, num_epochs=1, shuffle=False)):
            lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                    lm_embeddings['activations'][2],
                    lm_embeddings['mask']
            )

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                    len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                        numpy.allclose(
                                top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                                expected_top_layer[k],
                                atol=1.0e-6
                        )
                )
Example #6
    def __init__(self,
                 options_file: str = DEFAULT_OPTIONS_FILE,
                 weight_file: str = DEFAULT_WEIGHT_FILE,
                 cuda_device: int = -1) -> None:
        """
        Parameters
        ----------
        options_file : ``str``, optional
            A path or URL to an ELMo options file.
        weight_file : ``str``, optional
            A path or URL to an ELMo weights file.
        cuda_device : ``int``, optional, (default=-1)
            The GPU device to run on.
        """
        self.indexer = ELMoTokenCharactersIndexer()

        logger.info("Initializing ELMo.")
        self.elmo_bilm = _ElmoBiLm(options_file, weight_file)
        if cuda_device >= 0:
            self.elmo_bilm = self.elmo_bilm.cuda(device=cuda_device)

        self.cuda_device = cuda_device
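
Only the constructor is shown above. If this is the `ElmoEmbedder` from `allennlp.commands.elmo`, a typical call afterwards looks roughly like the following (the example sentence is illustrative):

from allennlp.commands.elmo import ElmoEmbedder

embedder = ElmoEmbedder()  # uses the default options/weight files

# embed_sentence returns a numpy array of shape (3, num_tokens, 1024):
# one row of activations per ELMo layer (char CNN plus two biLSTM layers).
vectors = embedder.embed_sentence(["I", "ate", "an", "apple", "for", "breakfast"])
assert vectors.shape == (3, 6, 1024)
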
Example #7
def test_elmo_lstm_factory_simple():
    allennlp_elmo_bilm = _ElmoBiLm(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    )

    embedder = ElmoCharacterEncoderFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create()
    fwd_lstm, bwd_lstm = ElmoLstmFactory(
        ELMO_OPTIONS_FILE,
        ELMO_WEIGHT_FILE,
    ).create(enable_forward=True, enable_backward=True)

    sentences_1 = [
        ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
        ['The', 'sentence', '.'],
    ]
    sentences_2 = [
        ["This", "is", "a", "sentence"],
        ["Here", "'s", "one"],
        ["Another", "one"],
    ]

    # Internal states should be updated.
    for sentences in [sentences_1, sentences_2] * 10:
        # character_ids: (batch_size, max_sentence_length, 50), e.g. (2, 7, 50) for sentences_1.
        character_ids = _sentences_to_ids(sentences)

        # AllenNLP.
        allennlp_out = allennlp_elmo_bilm(character_ids)

        # Ours.
        inputs = character_ids
        _beginning_of_sentence_characters = torch.from_numpy(
            np.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
        _end_of_sentence_characters = torch.from_numpy(
            np.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)
        # Add BOS/EOS
        mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
        character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
            inputs,
            mask,
            _beginning_of_sentence_characters,
            _end_of_sentence_characters,
        )
        # Pack input.
        lengths = mask_with_bos_eos.sum(dim=-1)
        inputs = pack_padded_sequence(character_ids_with_bos_eos,
                                      lengths,
                                      batch_first=True)
        char_repr = embedder(inputs.data)
        fwd_lstm_hiddens, _ = fwd_lstm(char_repr, inputs.batch_sizes)
        bwd_lstm_hiddens, _ = bwd_lstm(char_repr, inputs.batch_sizes)
        lstm_hiddens = [
            torch.cat([fwd, bwd], dim=-1)
            for fwd, bwd in zip(fwd_lstm_hiddens, bwd_lstm_hiddens)
        ]
        # Unpack output.
        char_repr = _unpack(char_repr, inputs.batch_sizes)
        duplicated_char_repr = torch.cat(
            [char_repr, char_repr],
            dim=-1,
        ) * mask_with_bos_eos.float().unsqueeze(-1)
        lstm_hiddens = [_unpack(hx, inputs.batch_sizes) for hx in lstm_hiddens]

        # TODO: Investigate the numerical stability issue.
        # np.testing.assert_array_almost_equal(
        #         duplicated_char_repr.data.numpy(),
        #         allennlp_out['activations'][0].data.numpy(),
        # )
        # np.testing.assert_array_almost_equal(
        #         lstm_hiddens[0].data.numpy(),
        #         allennlp_out['activations'][1].data.numpy(),
        # )
        np.testing.assert_array_almost_equal(
            lstm_hiddens[1].data.numpy(),
            allennlp_out['activations'][2].data.numpy(),
        )
Example #8
import h5py

import numpy as np

import torch
from torch.nn import Module

from allennlp.data.dataset import Dataset
from allennlp.data import Token, Vocabulary, Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
from allennlp.nn.util import remove_sentence_boundaries
from allennlp.modules.elmo import _ElmoBiLm

from chunking.data import variableFromSentence

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
elmo_bilm = _ElmoBiLm(options_file, weight_file).cuda()

indexer = ELMoTokenCharactersIndexer()

__all__ = [
    'elmo_bilm', 'embed_sentence', 'ElmoEmbedder', 'variablesFromPairElmo',
    'elmo_variable_from_sentence'
]

use_cuda = torch.cuda.is_available()


class ElmoEmbedder(Module):
    def __init__(self, elmo_bilm, special_tokens, device):
        super(ElmoEmbedder, self).__init__()
        self.elmo_bilm = elmo_bilm