def test_elmo_char_cnn_cache_does_not_raise_error_for_uncached_words(self):
    sentences = [["This", "is", "OOV"], ["so", "is", "this"]]
    in_vocab_sentences = [["here", "is"], ["a", "vocab"]]
    oov_tensor = self.get_vocab_and_both_elmo_indexed_ids(sentences)[1]
    vocab, in_vocab_tensor = self.get_vocab_and_both_elmo_indexed_ids(in_vocab_sentences)
    words_to_cache = list(vocab.get_token_to_index_vocabulary("tokens").keys())
    # Only the in-vocabulary words are cached; the OOV sentences must still run
    # through the char CNN without raising.
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file, vocab_to_cache=words_to_cache)
    elmo_bilm(in_vocab_tensor["character_ids"], in_vocab_tensor["tokens"])
    elmo_bilm(oov_tensor["character_ids"], oov_tensor["tokens"])
def __init__(self, options_file, weight_file, device=None):
    self._elmo_lstm = _ElmoBiLm(options_file,
                                weight_file,
                                requires_grad=False,
                                vocab_to_cache=None)
    if device is not None:
        self._elmo_lstm = self._elmo_lstm.to(device)
    self.output_dim = self._elmo_lstm.get_output_dim()
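# Illustrative sketch (added for context, not from the original source): how a biLM
# built as in the `__init__` above can be exercised. `batch_to_ids` is AllenNLP's
# public helper that turns tokenized sentences into the (batch, num_tokens, 50)
# character-id tensor `_ElmoBiLm` expects; the forward call returns a dict with
# 'activations' (three layers, sentence boundaries included) and 'mask'. The
# option/weight URLs are the ones used elsewhere in this file; substitute your own.
import torch
from allennlp.modules.elmo import _ElmoBiLm, batch_to_ids

_options = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
_weights = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

_bilm = _ElmoBiLm(_options, _weights, requires_grad=False)
_character_ids = batch_to_ids([["A", "sentence", "."]])
with torch.no_grad():
    _outputs = _bilm(_character_ids)
_top_layer = _outputs['activations'][-1]  # (1, num_tokens + 2, output_dim)
_mask = _outputs['mask']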
def test_elmo_bilm_can_cache_char_cnn_embeddings(self):
    sentences = [["This", "is", "a", "sentence"],
                 ["Here", "'s", "one"],
                 ["Another", "one"]]
    vocab, tensor = self.get_vocab_and_both_elmo_indexed_ids(sentences)
    words_to_cache = list(vocab.get_token_to_index_vocabulary("tokens").keys())
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)
    elmo_bilm.eval()
    no_cache = elmo_bilm(tensor["character_ids"], tensor["character_ids"])

    # ELMo is stateful, so we need to actually re-initialise it for this comparison to work.
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file, vocab_to_cache=words_to_cache)
    elmo_bilm.eval()
    cached = elmo_bilm(tensor["character_ids"], tensor["tokens"])

    numpy.testing.assert_array_almost_equal(no_cache["mask"].data.cpu().numpy(),
                                            cached["mask"].data.cpu().numpy())
    for activation_cached, activation in zip(cached["activations"], no_cache["activations"]):
        numpy.testing.assert_array_almost_equal(activation_cached.data.cpu().numpy(),
                                                activation.data.cpu().numpy(),
                                                decimal=6)
def test_elmo_with_module(self):
    # We will create the _ElmoBiLm class and pass it in as a module.
    sentences = [['The', 'sentence', '.'],
                 ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.']]
    character_ids = self._sentences_to_ids(sentences)

    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)
    elmo = Elmo(None, None, 2, dropout=0.0, module=elmo_bilm)
    output = elmo(character_ids)
    elmo_representations = output['elmo_representations']

    assert len(elmo_representations) == 2
    for k in range(2):
        assert list(elmo_representations[k].size()) == [2, 7, 32]
def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()

    # Now finally we can iterate through batches.
    iterator = BasicIterator(3)
    iterator.index_with(vocab)
    for i, batch in enumerate(iterator(instances, num_epochs=1, shuffle=False)):
        lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
        top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings['activations'][2],
                lm_embeddings['mask']
        )

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            self.assertTrue(
                    numpy.allclose(
                            top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                            expected_top_layer[k],
                            atol=1.0e-6
                    )
            )
def __init__(self,
             options_file: str = DEFAULT_OPTIONS_FILE,
             weight_file: str = DEFAULT_WEIGHT_FILE,
             cuda_device: int = -1) -> None:
    """
    Parameters
    ----------
    options_file : ``str``, optional
        A path or URL to an ELMo options file.
    weight_file : ``str``, optional
        A path or URL to an ELMo weights file.
    cuda_device : ``int``, optional, (default=-1)
        The GPU device to run on.
    """
    self.indexer = ELMoTokenCharactersIndexer()

    logger.info("Initializing ELMo.")
    self.elmo_bilm = _ElmoBiLm(options_file, weight_file)
    if cuda_device >= 0:
        self.elmo_bilm = self.elmo_bilm.cuda(device=cuda_device)

    self.cuda_device = cuda_device
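# Illustrative usage sketch (added; not from the original source). This `__init__`
# appears to belong to AllenNLP's `allennlp.commands.elmo.ElmoEmbedder`; assuming
# that is the surrounding class, it can be driven roughly as below. The reported
# shape refers to the default 1024-dimensional model.
from allennlp.commands.elmo import ElmoEmbedder

_embedder = ElmoEmbedder()  # default options/weights, CPU (cuda_device=-1)
_vectors = _embedder.embed_sentence(["ELMo", "helps", "disambiguate", "."])
# `_vectors` is a numpy array of shape (3, num_tokens, 1024): one row per biLM layer.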
def test_elmo_lstm_factory_simple():
    allennlp_elmo_bilm = _ElmoBiLm(
            ELMO_OPTIONS_FILE,
            ELMO_WEIGHT_FILE,
    )

    embedder = ElmoCharacterEncoderFactory(
            ELMO_OPTIONS_FILE,
            ELMO_WEIGHT_FILE,
    ).create()
    fwd_lstm, bwd_lstm = ElmoLstmFactory(
            ELMO_OPTIONS_FILE,
            ELMO_WEIGHT_FILE,
    ).create(enable_forward=True, enable_backward=True)

    sentences_1 = [
            ['ELMo', 'helps', 'disambiguate', 'ELMo', 'from', 'Elmo', '.'],
            ['The', 'sentence', '.'],
    ]
    sentences_2 = [
            ["This", "is", "a", "sentence"],
            ["Here", "'s", "one"],
            ["Another", "one"],
    ]

    # Internal states should be updated.
    for sentences in [sentences_1, sentences_2] * 10:
        # `(2, 7, 50)`
        character_ids = _sentences_to_ids(sentences)

        # AllenNLP.
        allennlp_out = allennlp_elmo_bilm(character_ids)

        # Ours.
        inputs = character_ids

        _beginning_of_sentence_characters = torch.from_numpy(
                np.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1)
        _end_of_sentence_characters = torch.from_numpy(
                np.array(ELMoCharacterMapper.end_of_sentence_characters) + 1)

        # Add BOS/EOS.
        mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
        character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
                inputs,
                mask,
                _beginning_of_sentence_characters,
                _end_of_sentence_characters,
        )

        # Pack input.
        lengths = mask_with_bos_eos.sum(dim=-1)
        inputs = pack_padded_sequence(character_ids_with_bos_eos, lengths, batch_first=True)

        char_repr = embedder(inputs.data)
        fwd_lstm_hiddens, _ = fwd_lstm(char_repr, inputs.batch_sizes)
        bwd_lstm_hiddens, _ = bwd_lstm(char_repr, inputs.batch_sizes)
        lstm_hiddens = [
                torch.cat([fwd, bwd], dim=-1)
                for fwd, bwd in zip(fwd_lstm_hiddens, bwd_lstm_hiddens)
        ]

        # Unpack output.
        char_repr = _unpack(char_repr, inputs.batch_sizes)
        duplicated_char_repr = torch.cat(
                [char_repr, char_repr],
                dim=-1,
        ) * mask_with_bos_eos.float().unsqueeze(-1)
        lstm_hiddens = [_unpack(hx, inputs.batch_sizes) for hx in lstm_hiddens]

        # TODO: Investigate the numerical stability issue.
        # np.testing.assert_array_almost_equal(
        #         duplicated_char_repr.data.numpy(),
        #         allennlp_out['activations'][0].data.numpy(),
        # )
        # np.testing.assert_array_almost_equal(
        #         lstm_hiddens[0].data.numpy(),
        #         allennlp_out['activations'][1].data.numpy(),
        # )
        np.testing.assert_array_almost_equal(
                lstm_hiddens[1].data.numpy(),
                allennlp_out['activations'][2].data.numpy(),
        )
import h5py
import numpy as np
import torch
from torch.nn import Module

from allennlp.data.dataset import Dataset
from allennlp.data import Token, Vocabulary, Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
from allennlp.nn.util import remove_sentence_boundaries
from allennlp.modules.elmo import _ElmoBiLm

from chunking.data import variableFromSentence

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo_bilm = _ElmoBiLm(options_file, weight_file).cuda()
indexer = ELMoTokenCharactersIndexer()

__all__ = [
    'elmo_bilm',
    'embed_sentence',
    'ElmoEmbedder',
    'variablesFromPairElmo',
    'elmo_variable_from_sentence'
]

use_cuda = torch.cuda.is_available()


class ElmoEmbedder(Module):
    def __init__(self, elmo_bilm, special_tokens, device):
        super(ElmoEmbedder, self).__init__()
        self.elmo_bilm = elmo_bilm