def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    """Check that ``Embedding.from_params`` copies pretrained vectors into its weights.

    A gzipped embeddings file is written with vectors for ``"word"`` and for a
    token made of eight ``\\xa0`` characters.  ``"word2"`` is in the vocabulary
    but not in the file, so its row must differ from the vector of ``"word"``.
    """
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab = Vocabulary()
    for token in ("word", "word2", unicode_space):
        vocab.add_token_to_namespace(token)

    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    file_lines = [
        "word 1.0 2.3 -1.0\n",
        f"{unicode_space} 3.4 3.3 5.0\n",
    ]
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        for file_line in file_lines:
            embeddings_file.write(file_line.encode('utf-8'))

    params = Params({
        'pretrained_file': embeddings_filename,
        'embedding_dim': 3,
    })
    embedding_layer = Embedding.from_params(vocab, params)

    def vector_of(token):
        # Row of the weight matrix at the vocabulary index of ``token``.
        return embedding_layer.weight.data[vocab.get_token_index(token)].numpy()

    # Tokens present in the file get exactly the vectors written above.
    assert numpy.allclose(vector_of("word"), numpy.array([1.0, 2.3, -1.0]))
    assert numpy.allclose(vector_of(unicode_space), numpy.array([3.4, 3.3, 5.0]))
    # The token missing from the file must not have been given "word"'s vector.
    assert not numpy.allclose(vector_of("word2"), numpy.array([1.0, 2.3, -1.0]))
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences
    with encoded characters (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    indexer = ELMoTokenCharactersIndexer()
    # One Instance per sentence, each holding a single TextField under "elmo".
    instances = [
        Instance({
            "elmo": TextField([Token(token) for token in sentence],
                              {'character_ids': indexer})
        })
        for sentence in batch
    ]

    dataset = Batch(instances)
    # The batch is indexed against a freshly created, empty vocabulary.
    dataset.index_instances(Vocabulary())
    return dataset.as_tensor_dict()['elmo']['character_ids']
def test_make_vocab_without_extension(self):
    """With ``extend: False`` the saved ``tokens`` namespace is written out unchanged.

    A vocabulary is saved to ``existing/vocabulary``, ``make_vocab_from_params``
    is pointed at it with extension disabled, and the resulting ``tokens.txt``
    under ``extended/vocabulary`` must contain exactly the original three entries.
    """
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    for token in ('some_weird_token_1', 'some_weird_token_2'):
        vocab.add_token_to_namespace(token, namespace='tokens')
    # When extend is False it is the user's responsibility to make sure that the
    # dataset instances can be indexed by the provided vocabulary.  At least
    # @@UNKNOWN@@ should be present in any namespace that may see OOV entries
    # during indexing.  The `tokens` namespace will see new words but has an
    # @@UNKNOWN@@ token; the `labels` namespace has no @@UNKNOWN@@, so 'N' and
    # 'V' have to be added up front.
    for label in ('N', 'V'):
        vocab.add_token_to_namespace(label, namespace='labels')

    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {
        'directory_path': existing_vocab_path,
        'extend': False,
    }
    make_vocab_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    assert tokens == ['@@UNKNOWN@@', 'some_weird_token_1', 'some_weird_token_2']
def setUp(self):
    """Build the shared vocabulary, five instances and a re-iterable wrapper around them.

    Sets ``self.token_indexers``, ``self.vocab``, one ``self.<word>_index``
    attribute per vocabulary word, ``self.instances`` (a list) and
    ``self.lazy_instances`` (an object whose ``__iter__`` yields those instances).
    """
    super(IteratorTest, self).setUp()
    self.token_indexers = {"tokens": SingleIdTokenIndexer()}
    self.vocab = Vocabulary()

    # Insertion order is kept as-is because each call returns the new token's index.
    add_token = self.vocab.add_token_to_namespace
    self.this_index = add_token('this')
    self.is_index = add_token('is')
    self.a_index = add_token('a')
    self.sentence_index = add_token('sentence')
    self.another_index = add_token('another')
    self.yet_index = add_token('yet')
    self.very_index = add_token('very')
    self.long_index = add_token('long')

    sentences = [
        ["this", "is", "a", "sentence"],
        ["this", "is", "another", "sentence"],
        ["yet", "another", "sentence"],
        ["this", "is", "a", "very", "very", "very", "very", "long", "sentence"],
        ["sentence"],
    ]
    instances = [self.create_instance(words) for words in sentences]

    class LazyIterable:
        """Iterable that hands out a fresh generator over ``instances`` on every ``iter()``."""

        def __iter__(self):
            yield from instances

    self.instances = instances
    self.lazy_instances = LazyIterable()
def test_index_converts_field_correctly(self):
    """``TextField.index`` fills ``_indexed_tokens`` per indexer with the vocabulary's ids.

    Three fields are checked over the tokens ``["A", "sentence"]``: word ids
    only, character ids only, and both indexers together.
    """
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
    capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
    # Characters are added in the order s, e, n, t, c; remember each returned index.
    char_index = {
        character: vocab.add_token_to_namespace(character, namespace='characters')
        for character in "sentc"
    }

    expected_words = [capital_a_index, sentence_index]
    expected_characters = [
        [capital_a_char_index],
        [char_index[character] for character in "sentence"],
    ]

    def make_tokens():
        return [Token(t) for t in ["A", "sentence"]]

    # pylint: disable=protected-access
    word_field = TextField(make_tokens(),
                           {"words": SingleIdTokenIndexer(namespace="words")})
    word_field.index(vocab)
    assert word_field._indexed_tokens["words"] == expected_words

    char_field = TextField(make_tokens(),
                           {"characters": TokenCharactersIndexer(namespace="characters")})
    char_field.index(vocab)
    assert char_field._indexed_tokens["characters"] == expected_characters

    combined_field = TextField(
        make_tokens(),
        token_indexers={
            "words": SingleIdTokenIndexer(namespace="words"),
            "characters": TokenCharactersIndexer(namespace="characters"),
        })
    combined_field.index(vocab)
    assert combined_field._indexed_tokens["words"] == expected_words
    assert combined_field._indexed_tokens["characters"] == expected_characters
def test_read_embedding_file_inside_archive(self):
    """Embeddings can be read from a named file inside a multi-file archive.

    First checks that pointing ``pretrained_file`` at a multi-file archive
    without selecting a member raises ``ValueError``.  Then, for both the
    ``.zip`` and ``.tar.gz`` fixtures, builds a URI that selects
    ``folder/fake_embeddings.5d.txt`` and verifies every expected vector.
    """
    token2vec = {
        "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
        "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
        "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
        "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
        'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
        'embedding_dim': 5
    })
    # A multi-file archive given without a member path must be rejected.
    # NOTE: the ``message=`` keyword of ``pytest.raises`` was deprecated in
    # pytest 4.1 and removed in 5.0 (passing it raises TypeError), so it is not
    # used here; pytest's own "DID NOT RAISE" failure text is sufficient.
    with pytest.raises(ValueError):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({'pretrained_file': file_uri, 'embedding_dim': 5})
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
def setUp(self):
    """Create a ``TokenCharactersEncoder`` plus a separately built embedding and encoder.

    All three modules are built from the same configuration and then have
    every parameter set to the constant 1 via ``InitializerApplicator``.
    """
    super(TestTokenCharactersEncoder, self).setUp()
    self.vocab = Vocabulary()
    for character in ("1", "2", "3", "4"):
        self.vocab.add_token_to_namespace(character, "token_characters")

    embedding_config = {
        "embedding_dim": 2,
        "vocab_namespace": "token_characters"
    }
    encoder_config = {
        "type": "cnn",
        "embedding_dim": 2,
        "num_filters": 4,
        "ngram_filter_sizes": [1, 2],
        "output_dim": 3
    }
    params = Params({"embedding": embedding_config, "encoder": encoder_config})

    # The combined encoder gets a deep copy so that ``params`` can still be
    # indexed for the two sub-configurations below (presumably ``from_params``
    # consumes the object it is given -- confirm against ``Params``).
    self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab,
                                                      params=deepcopy(params))
    self.embedding = Embedding.from_params(vocab=self.vocab,
                                           params=params["embedding"])
    self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])

    def constant_init(tensor):
        return torch.nn.init.constant_(tensor, 1.)

    initializer = InitializerApplicator([(".*", constant_init)])
    for module in (self.encoder, self.embedding, self.inner_encoder):
        initializer(module)
def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
    """``pad_token_sequence`` pads two indexed tokens out to three rows of 50 ids.

    The first two rows are the character ids of ``'Second'`` and ``'.'``; the
    third row, added by padding to ``desired_num_tokens=3``, is all zeros.
    """
    row_width = 50

    def char_row(leading_ids):
        # Every expected token row is ``leading_ids`` followed by 261s up to 50 entries.
        return leading_ids + [261] * (row_width - len(leading_ids))

    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token('Second'), Token('.')]
    indices = indexer.tokens_to_indices(tokens, Vocabulary(), "test-elmo")["test-elmo"]
    padded_tokens = indexer.pad_token_sequence({'test-elmo': indices},
                                               desired_num_tokens={'test-elmo': 3},
                                               padding_lengths={})

    expected_padded_tokens = [
        char_row([259, 84, 102, 100, 112, 111, 101, 260]),
        char_row([259, 47, 260]),
        [0] * row_width,
    ]
    assert padded_tokens['test-elmo'] == expected_padded_tokens
def setUp(self):
    """Create a four-token vocabulary, a three-way ``BasicTextFieldEmbedder`` and sample inputs.

    The embedder has one ``embedding`` sub-embedder per key (``words1``,
    ``words2``, ``words3``) with embedding dimensions 2, 5 and 3.
    """
    super(TestBasicTextFieldEmbedder, self).setUp()
    self.vocab = Vocabulary()
    for token in ("1", "2", "3", "4"):
        self.vocab.add_token_to_namespace(token)

    embedder_config = {
        "words1": {"type": "embedding", "embedding_dim": 2},
        "words2": {"type": "embedding", "embedding_dim": 5},
        "words3": {"type": "embedding", "embedding_dim": 3},
    }
    self.token_embedder = BasicTextFieldEmbedder.from_params(
        vocab=self.vocab, params=Params(embedder_config))

    # One row of four token ids for each sub-embedder key.
    self.inputs = {
        "words1": torch.LongTensor([[0, 2, 3, 5]]),
        "words2": torch.LongTensor([[1, 4, 3, 2]]),
        "words3": torch.LongTensor([[1, 5, 1, 2]]),
    }
def setUp(self):
    """Prepare the vocabulary, token indexer and instances used by the dataset tests.

    The parent ``setUp`` is invoked last, after the attributes are in place.
    """
    self.vocab = Vocabulary()
    for token in ("this", "is", "a", "sentence", "."):
        self.vocab.add_token_to_namespace(token)
    self.token_indexer = {"tokens": SingleIdTokenIndexer()}
    self.instances = self.get_instances()
    super(TestDataset, self).setUp()
def test_label_field_can_index_with_vocab(self):
    """A ``LabelField`` indexed with a vocabulary yields that label's id as a tensor.

    ``"entailment"`` is the first label added to the ``labels`` namespace and
    the resulting tensor is expected to hold 0.
    """
    vocab = Vocabulary()
    for label_name in ("entailment", "contradiction", "neutral"):
        vocab.add_token_to_namespace(label_name, namespace="labels")

    label = LabelField("entailment")
    label.index(vocab)
    padding_lengths = label.get_padding_lengths()
    tensor = label.as_tensor(padding_lengths)
    assert tensor.item() == 0
def setUp(self):
    """Populate ``self.vocab`` with a ``words`` and a ``characters`` namespace.

    The parent ``setUp`` is invoked last, after the vocabulary is filled.
    """
    self.vocab = Vocabulary()
    for word in ("sentence", "A"):
        self.vocab.add_token_to_namespace(word, namespace='words')
    for character in ("A", "s", "e", "n", "t", "c"):
        self.vocab.add_token_to_namespace(character, namespace='characters')
    super(TestTextField, self).setUp()
def test_bos_to_char_ids(self):
    """The ``<S>`` token is indexed to ``259, 257, 260`` followed by 261s, 50 ids in total."""
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token('<S>')], Vocabulary(), "test-elmo")

    leading_ids = [259, 257, 260]
    # The rest of the 50-entry row is filled with 261.
    expected_indices = leading_ids + [261] * (50 - len(leading_ids))
    assert indices == {"test-elmo": [expected_indices]}
def test_unicode_to_char_ids(self):
    """A token starting with ``chr(256)`` is indexed to a 50-entry row of ids.

    The expected row is ``259, 197, 129, 117, 260`` followed by 261s.
    """
    indexer = ELMoTokenCharactersIndexer()
    token_text = chr(256) + 't'
    indices = indexer.tokens_to_indices([Token(token_text)], Vocabulary(), "test-unicode")

    leading_ids = [259, 197, 129, 117, 260]
    # The rest of the 50-entry row is filled with 261.
    expected_indices = leading_ids + [261] * (50 - len(leading_ids))
    assert indices == {"test-unicode": [expected_indices]}
def test_tokens_to_indices_produces_correct_characters(self):
    """``TokenCharactersIndexer`` maps each character of a token to its vocabulary id.

    The word ``"sentential"`` ends in three characters (``i``, ``a``, ``l``)
    that were never added to the ``characters`` namespace; the expected output
    has id 1 in those positions.
    """
    vocab = Vocabulary()
    for character in ("A", "s", "e", "n", "t", "c"):
        vocab.add_token_to_namespace(character, namespace='characters')

    indexer = TokenCharactersIndexer("characters")
    indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")

    expected_character_ids = [3, 4, 5, 6, 4, 5, 6, 1, 1, 1]
    assert indices == {"char": [expected_character_ids]}
def test_get_embedding_layer_uses_correct_embedding_dim(self):
    """``_read_pretrained_embeddings_file`` honours, and validates, the embedding dimension.

    Reading a file of 3-dimensional vectors with ``embedding_dim=3`` yields a
    ``(4, 3)`` matrix; asking for dimension 4 raises ``ConfigurationError``.
    """
    vocab = Vocabulary()
    for token in ('word1', 'word2'):
        vocab.add_token_to_namespace(token)

    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    file_lines = ["word1 1.0 2.3 -1.0\n", "word2 0.1 0.4 -4.0\n"]
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        for file_line in file_lines:
            embeddings_file.write(file_line.encode('utf-8'))

    embedding_weights = _read_pretrained_embeddings_file(embeddings_filename, 3, vocab)
    # 4 rows because of padding and OOV in addition to the two words.
    expected_shape = (4, 3)
    assert tuple(embedding_weights.size()) == expected_shape

    with pytest.raises(ConfigurationError):
        _read_pretrained_embeddings_file(embeddings_filename, 4, vocab)
def test_tokens_to_indices_uses_ner_tags(self):
    """``NerTagIndexer`` returns the ``ner_tags`` id for a token's NER tag.

    The second token of the sentence is expected to map to ``PERSON`` and the
    manually appended ``</S>`` token to ``NONE``.
    """
    sentence_tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
    tokens = list(sentence_tokens) + [Token("</S>")]

    vocab = Vocabulary()
    person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
    vocab.add_token_to_namespace('ORG', namespace='ner_tags')

    indexer = NerTagIndexer()
    second_token_indices = indexer.tokens_to_indices([tokens[1]], vocab, "tokens1")
    assert second_token_indices == {"tokens1": [person_index]}

    last_token_indices = indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1")
    assert last_token_indices == {"tokens-1": [none_index]}
def test_tokens_to_indices_uses_pos_tags(self):
    """``DepLabelIndexer`` returns the ``dep_labels`` id for a token's dependency label.

    NOTE(review): despite the ``pos_tags`` in the method name, this test
    exercises ``DepLabelIndexer`` and the ``dep_labels`` namespace.

    The second token of the sentence is expected to map to ``ROOT`` and the
    manually appended ``</S>`` token to ``NONE``.
    """
    sentence_tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = list(sentence_tokens) + [Token("</S>")]

    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
    none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')

    indexer = DepLabelIndexer()
    second_token_indices = indexer.tokens_to_indices([tokens[1]], vocab, "tokens1")
    assert second_token_indices == {"tokens1": [root_index]}

    last_token_indices = indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1")
    assert last_token_indices == {"tokens-1": [none_index]}
def test_elmo_token_representation_bos_eos(self):
    """The additional <S> and </S> embeddings added by the embedder should be as expected.

    Each of ``'<S>'`` and ``'</S>'`` is encoded as a single-token sentence.
    Position 1 of the output holds the token itself and must match position 0
    (for ``<S>``) or position 2 (for ``</S>``).
    """
    indexer = ELMoTokenCharactersIndexer()
    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)

    for correct_index, token in ((0, '<S>'), (2, '</S>')):
        char_ids = indexer.tokens_to_indices([Token(token)], Vocabulary(), "correct")["correct"]
        # Shape the ids as (1, 1, -1): one sentence holding one token.
        char_tensor = torch.from_numpy(numpy.array(char_ids)).view(1, 1, -1)
        embeddings = elmo_token_embedder(char_tensor)['token_embedding']

        boundary_vector = embeddings[0, correct_index, :].data.numpy()
        token_vector = embeddings[0, 1, :].data.numpy()
        assert numpy.allclose(boundary_vector, token_vector)
def _sentences_to_ids(self, sentences):
    """Turn tokenized sentences into the batched ``character_ids`` tensor.

    Each sentence (an iterable of token strings) becomes an ``Instance`` with
    one ``TextField`` under ``'elmo'``.  The instances are batched, indexed
    against an empty ``Vocabulary`` and converted with ``as_tensor_dict``.
    """
    indexer = ELMoTokenCharactersIndexer()

    def to_instance(sentence):
        # For each sentence, first create a TextField, then create an instance.
        field = TextField([Token(token) for token in sentence],
                          {'character_ids': indexer})
        return Instance({'elmo': field})

    dataset = Batch([to_instance(sentence) for sentence in sentences])
    dataset.index_instances(Vocabulary())
    return dataset.as_tensor_dict()['elmo']['character_ids']
def test_elmo_bilm(self):
    """Compare the top-layer ``_ElmoBiLm`` activations with stored reference embeddings.

    Sentences from ``_load_sentences_embeddings`` are turned into instances,
    iterated in batches of three, run through the biLM, and the
    boundary-stripped top layer is checked for mask lengths and for values
    (``atol=1e-6``).
    """
    # Get the raw data.
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # Load the test model.
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data: one TextField wrapped in one Instance per sentence,
    # walking ``zip(*sentences)`` group by group.
    indexer = ELMoTokenCharactersIndexer()
    instances = [
        Instance({
            "elmo": TextField([Token(token) for token in sentence.split()],
                              {'character_ids': indexer})
        })
        for group in zip(*sentences)
        for sentence in group
    ]

    # Now finally we can iterate through batches.
    iterator = BasicIterator(3)
    iterator.index_with(Vocabulary())
    batches = iterator(instances, num_epochs=1, shuffle=False)

    for i, batch in enumerate(batches):
        lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings['activations'][2], lm_embeddings['mask'])

        # Check the mask lengths against the whitespace token counts.
        lengths = mask.data.numpy().sum(axis=1)
        expected_lengths = [len(sentences[k][i].split()) for k in range(3)]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # Get the expected embeddings and compare!
        for k in range(3):
            actual = top_layer_embeddings[k, :lengths[k], :].data.numpy()
            expected = expected_lm_embeddings[k][i]
            self.assertTrue(numpy.allclose(actual, expected, atol=1.0e-6))
def test_read_hdf5_raises_on_invalid_shape(self):
    """An HDF5 file with 10-dimensional vectors is rejected when ``embedding_dim`` is 5."""
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")

    embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
    # The stored matrix has 10 columns, which disagrees with embedding_dim below.
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
    with h5py.File(embeddings_filename, 'w') as fout:
        fout.create_dataset('embedding',
                            embeddings.shape,
                            dtype='float32',
                            data=embeddings)

    params = Params({
        'pretrained_file': embeddings_filename,
        'embedding_dim': 5,
    })
    with pytest.raises(ConfigurationError):
        Embedding.from_params(vocab, params)
def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
    """A vocabulary token missing from the pretrained file does not get an all-zero vector."""
    vocab = Vocabulary()
    for token in ("word", "word2"):
        vocab.add_token_to_namespace(token)

    # Only "word" appears in the pretrained file; "word2" is deliberately left out.
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))

    params = Params({
        'pretrained_file': embeddings_filename,
        'embedding_dim': 3,
    })
    embedding_layer = Embedding.from_params(vocab, params)

    unseen_index = vocab.get_token_index("word2")
    word_vector = embedding_layer.weight.data[unseen_index]
    zero_vector = numpy.array([0.0, 0.0, 0.0])
    assert not numpy.allclose(word_vector.numpy(), zero_vector)
def setUp(self):
    """Store a vocabulary whose ``tags`` namespace holds the span tags used by the tests."""
    super(SpanBasedF1Test, self).setUp()
    # Insertion order is kept exactly as in the original sequence of calls.
    tag_names = (
        "O",
        "B-ARG1", "I-ARG1",
        "B-ARG2", "I-ARG2",
        "B-V", "I-V",
        "U-ARG1", "U-ARG2",
        "B-C-ARG1", "I-C-ARG1",
        "B-ARGM-ADJ", "I-ARGM-ADJ",
    )
    vocab = Vocabulary()
    for tag_name in tag_names:
        vocab.add_token_to_namespace(tag_name, "tags")
    self.vocab = vocab
def test_read_hdf5_format_file(self):
    """Embeddings stored in an HDF5 ``embedding`` dataset are loaded verbatim as the weights."""
    vocab = Vocabulary()
    for token in ("word", "word2"):
        vocab.add_token_to_namespace(token)

    embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
    # One random 5-dimensional row per vocabulary entry.
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
    with h5py.File(embeddings_filename, 'w') as fout:
        fout.create_dataset('embedding',
                            embeddings.shape,
                            dtype='float32',
                            data=embeddings)

    params = Params({
        'pretrained_file': embeddings_filename,
        'embedding_dim': 5,
    })
    embedding_layer = Embedding.from_params(vocab, params)
    loaded_weights = embedding_layer.weight.data.numpy()
    assert numpy.allclose(loaded_weights, embeddings)
def setUp(self):
    """Build the tokenizer, knowledge graph, vocabulary and field shared by the tests.

    The table JSON has two columns and two rows of cells.  Every word added to
    the ``tokens`` namespace has its index stored on ``self``.  ``"edirne"`` is
    not added, so ``self.edirne_index`` is set to the index the vocabulary
    returns for an arbitrary unseen string.  The parent ``setUp`` runs last.
    """
    self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
    self.utterance = self.tokenizer.tokenize("where is mersin?")
    self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

    table_json = {
        'question': self.utterance,
        'columns': ['Name in English', 'Location in English'],
        'cells': [['Paradeniz', 'Mersin'], ['Lake Gala', 'Edirne']]
    }
    self.graph = TableQuestionKnowledgeGraph.read_from_json(table_json)

    self.vocab = Vocabulary()

    def add_token(text):
        # Register ``text`` in the 'tokens' namespace and hand back its index.
        return self.vocab.add_token_to_namespace(text, namespace='tokens')

    # Insertion order is kept as-is because each call returns the new token's index.
    self.name_index = add_token("name")
    self.in_index = add_token("in")
    self.english_index = add_token("english")
    self.location_index = add_token("location")
    self.paradeniz_index = add_token("paradeniz")
    self.mersin_index = add_token("mersin")
    self.lake_index = add_token("lake")
    self.gala_index = add_token("gala")
    self.negative_one_index = add_token("-1")
    self.zero_index = add_token("0")
    self.one_index = add_token("1")

    self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
    self.edirne_index = self.oov_index

    self.field = KnowledgeGraphField(self.graph,
                                     self.utterance,
                                     self.token_indexers,
                                     self.tokenizer)
    super(KnowledgeGraphFieldTest, self).setUp()
def test_forward_works_with_projection_layer(self):
    """With ``projection_dim=20`` the embedding output has a last dimension of 20.

    Checked for an input of shape ``(1, 4)`` and for one of shape ``(1, 1, 4)``.
    """
    vocab = Vocabulary()
    for token in ('the', 'a'):
        vocab.add_token_to_namespace(token)

    pretrained_path = self.FIXTURES_ROOT / 'embeddings/glove.6B.300d.sample.txt.gz'
    params = Params({
        'pretrained_file': str(pretrained_path),
        'embedding_dim': 300,
        'projection_dim': 20
    })
    embedding_layer = Embedding.from_params(vocab, params)

    token_ids = [3, 2, 1, 0]

    two_dim_input = torch.LongTensor([token_ids])
    embedded = embedding_layer(two_dim_input).data.numpy()
    assert embedded.shape == (1, 4, 20)

    three_dim_input = torch.LongTensor([[token_ids]])
    embedded = embedding_layer(three_dim_input).data.numpy()
    assert embedded.shape == (1, 1, 4, 20)
def test_get_vocab_index_mapping(self):
    """``_get_vocab_index_mapping`` pairs model-vocabulary indices with another vocabulary's.

    Against the archived model's vocabulary the expected mapping is the
    identity over 16 indices.  Against a new vocabulary holding only the
    tokens at indices 5, 7 and 10, those map to 2, 3 and 4.
    """
    # pylint: disable=line-too-long
    mml_model_archive_file = (self.FIXTURES_ROOT / "semantic_parsing" /
                              "nlvr_direct_semantic_parser" / "serialization" /
                              "model.tar.gz")
    archive = load_archive(mml_model_archive_file)
    mapping = self.model._get_vocab_index_mapping(archive.model.vocab)
    assert mapping == list(zip(range(16), range(16)))

    # Copy three tokens of the test vocabulary into a fresh one.
    new_vocab = Vocabulary()
    for index in (5, 7, 10):
        token = self.vocab.get_token_from_index(index, "tokens")
        new_vocab.add_token_to_namespace(token, "tokens")

    mapping = self.model._get_vocab_index_mapping(new_vocab)
    # Mapping of indices from model vocabulary to new vocabulary. 0 and 1 are
    # padding and unk tokens.
    assert mapping == [(0, 0), (1, 1), (5, 2), (7, 3), (10, 4)]
def test_make_vocab_with_extension(self):
    """With ``extend: True`` the saved vocabulary is kept and new dataset tokens are added.

    The original entries must stay at the front of ``tokens.txt`` in their
    original order.  The full set of tokens and the ``labels`` namespace are
    then checked after sorting.
    """
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    for token in ('some_weird_token_1', 'some_weird_token_2'):
        vocab.add_token_to_namespace(token, namespace='tokens')

    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {
        'directory_path': existing_vocab_path,
        'extend': True,
        'min_count': {"tokens": 3},
    }
    make_vocab_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]

    # Pre-existing entries keep their positions at the start of the file.
    assert tokens[:3] == ['@@UNKNOWN@@', 'some_weird_token_1', 'some_weird_token_2']
    assert sorted(tokens) == [
        '.', '@@UNKNOWN@@', 'animals', 'are',
        'some_weird_token_1', 'some_weird_token_2'
    ]

    with open(extended_vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]

    assert sorted(labels) == ['N', 'V']
def test_elmo_token_representation(self):
    """Compare ``_ElmoCharacterEncoder`` token embeddings with stored reference values.

    Words from ``vocab_test.txt`` are indexed, padded into 10 rows of 50
    tokens, encoded, stripped of sentence boundaries and flattened back to one
    row per word before being compared (``atol=1e-6``) with
    ``elmo_token_embeddings.hdf5``.
    """
    # Load the test words and convert to char ids.
    vocab_path = os.path.join(self.elmo_fixtures_path, 'vocab_test.txt')
    with open(vocab_path, 'r') as fin:
        words = fin.read().strip().split('\n')

    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token(word) for word in words]
    indices = indexer.tokens_to_indices(tokens, Vocabulary(), "elmo")

    # There are 457 tokens. Reshape into 10 batches of 50 tokens; slices that
    # run past the end are padded up to 50 by ``pad_token_sequence``.
    chunk_size = 50
    sentences = [
        indexer.pad_token_sequence(
            {'key': indices["elmo"][k * chunk_size:(k + 1) * chunk_size]},
            desired_num_tokens={'key': chunk_size},
            padding_lengths={})['key']
        for k in range(10)
    ]
    batch = torch.from_numpy(numpy.array(sentences))

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth. Need to
    # also remove <S>, </S>.
    stripped_embeddings, _ = remove_sentence_boundaries(
        elmo_token_embedder_output['token_embedding'],
        elmo_token_embedder_output['mask'])
    actual_embeddings = stripped_embeddings.data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(self.elmo_fixtures_path, 'elmo_token_embeddings.hdf5')
    with h5py.File(embedding_file, 'r') as fin:
        expected_embeddings = fin['embedding'][...]

    # Only the first len(tokens) rows are real words; the rest is padding.
    assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)