def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token('Second'), Token('.')]
    indices = indexer.tokens_to_indices(tokens, Vocabulary(), "test-elmo")["test-elmo"]
    padded_tokens = indexer.pad_token_sequence({'test-elmo': indices},
                                               desired_num_tokens={'test-elmo': 3},
                                               padding_lengths={})
    expected_padded_tokens = [
            # 'Second': begin-of-word, six character ids, end-of-word, then character padding.
            [259, 84, 102, 100, 112, 111, 101, 260] + [261] * 42,
            # '.': begin-of-word, one character id, end-of-word, then character padding.
            [259, 47, 260] + [261] * 47,
            # The third position is token-level padding.
            [0] * 50,
    ]
    assert padded_tokens['test-elmo'] == expected_padded_tokens
def test_count_vocab_items_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}

    indexer._coarse_tags = True  # pylint: disable=protected-access
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
    capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
    s_index = vocab.add_token_to_namespace("s", namespace='characters')
    e_index = vocab.add_token_to_namespace("e", namespace='characters')
    n_index = vocab.add_token_to_namespace("n", namespace='characters')
    t_index = vocab.add_token_to_namespace("t", namespace='characters')
    c_index = vocab.add_token_to_namespace("c", namespace='characters')

    field = TextField([Token(t) for t in ["A", "sentence"]],
                      {"words": SingleIdTokenIndexer(namespace="words")})
    field.index(vocab)
    # pylint: disable=protected-access
    assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

    field1 = TextField([Token(t) for t in ["A", "sentence"]],
                       {"characters": TokenCharactersIndexer(namespace="characters")})
    field1.index(vocab)
    assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                    [s_index, e_index, n_index, t_index,
                                                     e_index, n_index, c_index, e_index]]

    field2 = TextField([Token(t) for t in ["A", "sentence"]],
                       token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                       "characters": TokenCharactersIndexer(namespace="characters")})
    field2.index(vocab)
    assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
    assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                    [s_index, e_index, n_index, t_index,
                                                     e_index, n_index, c_index, e_index]]
def test_count_vocab_items_uses_ner_tags(self):
    tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    indexer = NerTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["ner_tags"] == {'PERSON': 2, 'ORG': 1, 'NONE': 6}
def test_count_vocab_items_respects_casing(self):
    indexer = TokenCharactersIndexer("characters")
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["characters"] == {"h": 1, "H": 1, "e": 2, "l": 4, "o": 2}

    indexer = TokenCharactersIndexer("characters", CharacterTokenizer(lowercase_characters=True))
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["characters"] == {"h": 2, "e": 2, "l": 4, "o": 2}
def test_count_vocab_items_respects_casing(self):
    indexer = SingleIdTokenIndexer("words")
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["words"] == {"hello": 1, "Hello": 1}

    indexer = SingleIdTokenIndexer("words", lowercase_tokens=True)
    counter = defaultdict(lambda: defaultdict(int))
    indexer.count_vocab_items(Token("Hello"), counter)
    indexer.count_vocab_items(Token("hello"), counter)
    assert counter["words"] == {"hello": 2}
def get_instances(self):
    field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                       self.token_indexer)
    field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                       self.token_indexer)
    field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                       self.token_indexer)
    field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                       self.token_indexer)
    instances = [Instance({"text1": field1, "text2": field2}),
                 Instance({"text1": field3, "text2": field4})]
    return instances
def test_invalid_vocab_extension(self):
    vocab_dir = self.TEST_DIR / 'vocab_save'
    original_vocab = Vocabulary(non_padded_namespaces=["tokens1"])
    original_vocab.add_token_to_namespace("a", namespace="tokens1")
    original_vocab.add_token_to_namespace("b", namespace="tokens1")
    original_vocab.add_token_to_namespace("p", namespace="tokens2")
    original_vocab.save_to_files(vocab_dir)
    text_field1 = TextField([Token(t) for t in ["a", "c"]],
                            {"tokens1": SingleIdTokenIndexer("tokens1")})
    text_field2 = TextField([Token(t) for t in ["p", "q", "r"]],
                            {"tokens2": SingleIdTokenIndexer("tokens2")})
    instances = Batch([Instance({"text1": text_field1, "text2": text_field2})])

    # The following should give an error: tokens1 is non-padded in original_vocab but would
    # be padded in the extension.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": []})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": []})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=[],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following should not give an error: the overlapping namespaces have the same
    # padding setting.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1"]})
    Vocabulary.from_params(params, instances)
    extended_vocab = copy.copy(original_vocab)
    params = Params({"non_padded_namespaces": ["tokens1"]})
    extended_vocab.extend_from_instances(params, instances)
    extended_vocab = copy.copy(original_vocab)
    extended_vocab._extend(non_padded_namespaces=["tokens1"],
                           tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})

    # The following should give an error: tokens2 would be non-padded in the extension but
    # is padded in original_vocab.
    params = Params({"directory_path": vocab_dir,
                     "extend": True,
                     "non_padded_namespaces": ["tokens1", "tokens2"]})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        params = Params({"non_padded_namespaces": ["tokens1", "tokens2"]})
        extended_vocab.extend_from_instances(params, instances)
    with pytest.raises(ConfigurationError):
        extended_vocab = copy.copy(original_vocab)
        extended_vocab._extend(non_padded_namespaces=["tokens1", "tokens2"],
                               tokens_to_add={"tokens1": ["a"], "tokens2": ["p"]})
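# All three failing blocks above exercise the same rule (my paraphrase of the test, not a
# documented Vocabulary API): a namespace shared between the original vocabulary and the
# extension may only be extended if it is non-padded on both sides or padded on both sides.
# The helper name below is hypothetical and only illustrates that check.
def _padding_settings_are_compatible(original_non_padded, extension_non_padded, shared_namespaces):
    return all((namespace in original_non_padded) == (namespace in extension_non_padded)
               for namespace in shared_namespaces)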
def test_field_counts_vocab_items_correctly(self):
    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"words": SingleIdTokenIndexer("words")})
    namespace_token_counts = defaultdict(lambda: defaultdict(int))
    field.count_vocab_items(namespace_token_counts)
    assert namespace_token_counts["words"]["This"] == 1
    assert namespace_token_counts["words"]["is"] == 1
    assert namespace_token_counts["words"]["a"] == 1
    assert namespace_token_counts["words"]["sentence"] == 1
    assert namespace_token_counts["words"]["."] == 1
    assert list(namespace_token_counts.keys()) == ["words"]

    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"characters": TokenCharactersIndexer("characters")})
    namespace_token_counts = defaultdict(lambda: defaultdict(int))
    field.count_vocab_items(namespace_token_counts)
    assert namespace_token_counts["characters"]["T"] == 1
    assert namespace_token_counts["characters"]["h"] == 1
    assert namespace_token_counts["characters"]["i"] == 2
    assert namespace_token_counts["characters"]["s"] == 3
    assert namespace_token_counts["characters"]["a"] == 1
    assert namespace_token_counts["characters"]["e"] == 3
    assert namespace_token_counts["characters"]["n"] == 2
    assert namespace_token_counts["characters"]["t"] == 1
    assert namespace_token_counts["characters"]["c"] == 1
    assert namespace_token_counts["characters"]["."] == 1
    assert list(namespace_token_counts.keys()) == ["characters"]

    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"words": SingleIdTokenIndexer("words"),
                                      "characters": TokenCharactersIndexer("characters")})
    namespace_token_counts = defaultdict(lambda: defaultdict(int))
    field.count_vocab_items(namespace_token_counts)
    assert namespace_token_counts["characters"]["T"] == 1
    assert namespace_token_counts["characters"]["h"] == 1
    assert namespace_token_counts["characters"]["i"] == 2
    assert namespace_token_counts["characters"]["s"] == 3
    assert namespace_token_counts["characters"]["a"] == 1
    assert namespace_token_counts["characters"]["e"] == 3
    assert namespace_token_counts["characters"]["n"] == 2
    assert namespace_token_counts["characters"]["t"] == 1
    assert namespace_token_counts["characters"]["c"] == 1
    assert namespace_token_counts["characters"]["."] == 1
    assert namespace_token_counts["words"]["This"] == 1
    assert namespace_token_counts["words"]["is"] == 1
    assert namespace_token_counts["words"]["a"] == 1
    assert namespace_token_counts["words"]["sentence"] == 1
    assert namespace_token_counts["words"]["."] == 1
    assert set(namespace_token_counts.keys()) == {"words", "characters"}
def test_read_from_json_handles_simple_cases(self):
    json = {
            'question': [Token(x) for x in ['where', 'is', 'mersin', '?']],
            'columns': ['Name in English', 'Location'],
            'cells': [['Paradeniz', 'Mersin'],
                      ['Lake Gala', 'Edirne']]
    }
    graph = TableQuestionKnowledgeGraph.read_from_json(json)
    neighbors = set(graph.neighbors['fb:cell.mersin'])
    assert graph.entities == ['-1', '0', '1', 'fb:cell.edirne', 'fb:cell.lake_gala',
                              'fb:cell.mersin', 'fb:cell.paradeniz', 'fb:row.row.location',
                              'fb:row.row.name_in_english']
    assert neighbors == {'fb:row.row.location'}
    neighbors = set(graph.neighbors['fb:row.row.name_in_english'])
    assert neighbors == {'fb:cell.paradeniz', 'fb:cell.lake_gala'}
    assert graph.entity_text['fb:cell.edirne'] == 'Edirne'
    assert graph.entity_text['fb:cell.lake_gala'] == 'Lake Gala'
    assert graph.entity_text['fb:cell.mersin'] == 'Mersin'
    assert graph.entity_text['fb:cell.paradeniz'] == 'Paradeniz'
    assert graph.entity_text['fb:row.row.location'] == 'Location'
    assert graph.entity_text['fb:row.row.name_in_english'] == 'Name in English'

    # These are default numbers that should always be in the graph.
    assert graph.neighbors['-1'] == []
    assert graph.neighbors['0'] == []
    assert graph.neighbors['1'] == []
    assert graph.entity_text['-1'] == '-1'
    assert graph.entity_text['0'] == '0'
    assert graph.entity_text['1'] == '1'
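# The entity ids asserted above follow a visible pattern (inferred from the expected values,
# not from the class's documented behavior): cell and column text is lower-cased, spaces
# become underscores, and a 'fb:cell.' or 'fb:row.row.' prefix is added. The helper below is
# a hypothetical illustration of that normalization for cells.
def _cell_id_sketch(cell_text):
    return 'fb:cell.' + cell_text.lower().replace(' ', '_')

# _cell_id_sketch('Lake Gala') -> 'fb:cell.lake_gala'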
def test_count_vocab_items_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
    indexer = DepLabelIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["dep_labels"] == {"ROOT": 1, "nsubj": 1, "det": 1,
                                     "NONE": 2, "attr": 1, "punct": 1}
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded
    characters (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
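# A minimal usage sketch of batch_to_ids (illustrative only; the function name below is
# hypothetical, and it assumes the imports above plus ELMo's fixed budget of 50 character
# ids per word, which matches the padded sequences in the tests above). The longer sentence
# has three tokens, so the shorter one is padded up to three.
def _batch_to_ids_usage_example():
    character_ids = batch_to_ids([['First', 'sentence', '.'], ['Another', '.']])
    assert tuple(character_ids.shape) == (2, 3, 50)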
def setUp(self):
    token_indexer = SingleIdTokenIndexer("tokens")
    text_field = TextField([Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
                           {"tokens": token_indexer})
    self.instance = Instance({"text": text_field})
    self.dataset = Batch([self.instance])
    super(TestVocabulary, self).setUp()
def test_as_tensor_handles_words(self):
    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"words": SingleIdTokenIndexer("words")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                            numpy.array([1, 1, 1, 2, 1]))
def test_bpe(self):
    # [e, w, o, e</w>] -> best pair (e, w)
    # [ew, o, e</w>] -> best pair (o, e</w>)
    # [ew, oe</w>] -> done
    token = Token("ewoe")
    assert self.indexer.byte_pair_encode(token) == ['ew', 'oe</w>']

    # Prefer "ew" to "we"
    token = Token("ewe")
    assert self.indexer.byte_pair_encode(token) == ['ew', 'e</w>']

    # Prefer ending a word
    token = Token("eee")
    assert self.indexer.byte_pair_encode(token) == ['e', 'ee</w>']

    # Encodes up to a single symbol when appropriate
    token = Token("woe")
    assert self.indexer.byte_pair_encode(token) == ['woe</w>']
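# The traces in the comments above follow a standard greedy BPE merge loop. The sketch below
# is an assumption about what byte_pair_encode does, not the indexer's actual implementation:
# `merge_ranks` (a hypothetical name) maps an adjacent symbol pair to its merge priority, and
# lower values are merged first.
def _byte_pair_encode_sketch(word, merge_ranks):
    # The last character carries the end-of-word marker, as in the expected values above.
    symbols = list(word[:-1]) + [word[-1] + '</w>']
    while len(symbols) > 1:
        pairs = [(symbols[i], symbols[i + 1]) for i in range(len(symbols) - 1)]
        ranked = [pair for pair in pairs if pair in merge_ranks]
        if not ranked:
            break
        best = min(ranked, key=merge_ranks.get)
        position = pairs.index(best)  # merge the leftmost occurrence of the best pair
        symbols[position:position + 2] = [best[0] + best[1]]
    return symbols

# With merge_ranks = {('e', 'w'): 0, ('o', 'e</w>'): 1}, _byte_pair_encode_sketch('ewoe', merge_ranks)
# reproduces ['ew', 'oe</w>'] from the first assertion above.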
def test_padding_lengths_are_computed_correctly(self):
    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"words": SingleIdTokenIndexer("words")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {"num_tokens": 5}

    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"characters": TokenCharactersIndexer("characters")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"characters": TokenCharactersIndexer("characters"),
                                      "words": SingleIdTokenIndexer("words")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}
def test_get_linked_agenda_items(self):
    json = {
            'question': [Token(x) for x in ['where', 'is', 'mersin', '?']],
            'columns': ['Name in English', 'Location'],
            'cells': [['Paradeniz', 'Mersin'],
                      ['Lake Gala', 'Edirne']]
    }
    graph = TableQuestionKnowledgeGraph.read_from_json(json)
    assert graph.get_linked_agenda_items() == ['fb:cell.mersin', 'fb:row.row.location']
def test_unicode_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token(chr(256) + 't')], Vocabulary(), "test-unicode")

    # Begin-of-word, the two UTF-8 bytes of chr(256), 't', end-of-word, then character padding.
    expected_indices = [259, 197, 129, 117, 260] + [261] * 45
    assert indices == {"test-unicode": [expected_indices]}
def test_bos_to_char_ids(self):
    indexer = ELMoTokenCharactersIndexer()
    indices = indexer.tokens_to_indices([Token('<S>')], Vocabulary(), "test-elmo")

    # Begin-of-word, the reserved beginning-of-sentence id, end-of-word, then character padding.
    expected_indices = [259, 257, 260] + [261] * 47
    assert indices == {"test-elmo": [expected_indices]}
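# The id values in the ELMo tests above follow a consistent pattern (my reading of the
# expected arrays, not an official reference): every character id is a raw byte value plus
# one, so that 0 stays free for token-level padding; 259 marks the beginning of a word, 260
# the end, 261 is character padding, and special tokens such as '<S>' map to a single
# reserved id (257 above). A small sketch for ordinary words, assuming the 50-character
# budget per token seen in the padded sequences:
def _elmo_char_ids_sketch(word, max_word_length=50):
    beginning_of_word, end_of_word, padding = 259, 260, 261
    ids = [beginning_of_word] + [byte + 1 for byte in word.encode('utf-8')] + [end_of_word]
    return ids + [padding] * (max_word_length - len(ids))

# For example, _elmo_char_ids_sketch('Second') reproduces the first expected row in
# test_elmo_as_array_produces_token_sequence above.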
def test_get_longest_span_matching_entities(self):
    json = {
            'question': [Token(x) for x in ['where', 'is', 'lake', 'big', 'gala', '?']],
            'columns': ['Name in English', 'Location'],
            'cells': [['Paradeniz', 'Lake Big'],
                      ['Lake Big Gala', 'Edirne']]
    }
    graph = TableQuestionKnowledgeGraph.read_from_json(json)
    assert graph._get_longest_span_matching_entities() == ['fb:cell.lake_big_gala']
def test_tokens_to_indices_produces_correct_characters(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace='characters')
    vocab.add_token_to_namespace("s", namespace='characters')
    vocab.add_token_to_namespace("e", namespace='characters')
    vocab.add_token_to_namespace("n", namespace='characters')
    vocab.add_token_to_namespace("t", namespace='characters')
    vocab.add_token_to_namespace("c", namespace='characters')

    indexer = TokenCharactersIndexer("characters")
    indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
    assert indices == {"char": [[3, 4, 5, 6, 4, 5, 6, 1, 1, 1]]}
def test_as_tensor_handles_characters(self):
    field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                      token_indexers={"characters": TokenCharactersIndexer("characters")})
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                            [1, 3, 0, 0, 0, 0, 0, 0],
                                            [1, 0, 0, 0, 0, 0, 0, 0],
                                            [3, 4, 5, 6, 4, 5, 7, 4],
                                            [1, 0, 0, 0, 0, 0, 0, 0]])
    numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                            expected_character_array)
def test_tokens_to_indices(self):
    tokens = [Token('ewoe'), Token('woe'), Token('ewe'), Token('ee')]
    indices = self.indexer.tokens_to_indices(tokens, None, 'test')

    assert set(indices.keys()) == {"test", "test-offsets", "mask"}

    text_tokens = indices['test']
    offsets = indices['test-offsets']

    assert text_tokens[:6] == [
            self.indexer.encoder.get(symbol, 0)
            for symbol in ['ew', 'oe</w>'] + ['woe</w>'] + ['ew', 'e</w>'] + ['ee</w>']
    ]
    assert offsets == [
            1,  # end of first word
            2,  # end of second word
            4,  # end of third word
            5,  # end of last word
    ]
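# The expected offsets above read as zero-based indices of the last byte-pair piece of each
# word in the flattened piece sequence (my reading of the expected values, not a documented
# contract). A tiny sketch of that bookkeeping, with a hypothetical helper name:
def _last_piece_offsets(pieces_per_word):
    offsets, end = [], 0
    for count in pieces_per_word:
        end += count
        offsets.append(end - 1)  # index of the final piece of this word
    return offsets

# ['ew', 'oe</w>'], ['woe</w>'], ['ew', 'e</w>'], ['ee</w>'] -> 2, 1, 2, 1 pieces -> [1, 2, 4, 5]
assert _last_piece_offsets([2, 1, 2, 1]) == [1, 2, 4, 5]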
def test_read_from_json_handles_numbers_in_question(self):
    json = {
            'question': [Token(x) for x in ['one', '4']],
            'columns': [],
            'cells': []
    }
    graph = TableQuestionKnowledgeGraph.read_from_json(json)
    assert graph.neighbors['1'] == []
    assert graph.neighbors['4'] == []
    assert graph.entity_text['1'] == 'one'
    assert graph.entity_text['4'] == '4'
def setUp(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("this", "words")
    self.vocab.add_token_to_namespace("is", "words")
    self.vocab.add_token_to_namespace("a", "words")
    self.vocab.add_token_to_namespace("sentence", 'words')
    self.vocab.add_token_to_namespace("s", 'characters')
    self.vocab.add_token_to_namespace("e", 'characters')
    self.vocab.add_token_to_namespace("n", 'characters')
    self.vocab.add_token_to_namespace("t", 'characters')
    self.vocab.add_token_to_namespace("c", 'characters')
    for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
        self.vocab.add_token_to_namespace(label, 'labels')

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")}
    self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]],
                            self.word_indexer)
    self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]],
                            self.word_indexer)
    self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]],
                            self.word_indexer)

    self.empty_text_field = self.field1.empty_field()
    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    super(TestListField, self).setUp()
def _sentences_to_ids(self, sentences):
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance.
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({'elmo': field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
def test_max_vocab_size_partial_dict(self):
    indexers = {"tokens": SingleIdTokenIndexer(),
                "token_characters": TokenCharactersIndexer()}
    instance = Instance({
            'text': TextField([Token(w) for w in 'Abc def ghi jkl mno pqr stu vwx yz'.split(' ')],
                              indexers)
    })
    dataset = Batch([instance])
    params = Params({"max_vocab_size": {"tokens": 1}})

    vocab = Vocabulary.from_params(params=params, instances=dataset)
    # The capped namespace keeps 1 token plus the padding and OOV entries.
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
    # The uncapped namespace keeps all 26 distinct characters plus padding and OOV.
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2
def test_elmo_token_representation_bos_eos(self):
    # The additional <S> and </S> embeddings added by the embedder should be as expected.
    indexer = ELMoTokenCharactersIndexer()
    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)

    for correct_index, token in [[0, '<S>'], [2, '</S>']]:
        indices = indexer.tokens_to_indices([Token(token)], Vocabulary(), "correct")
        indices = torch.from_numpy(numpy.array(indices["correct"])).view(1, 1, -1)
        embeddings = elmo_token_embedder(indices)['token_embedding']
        assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(),
                              embeddings[0, 1, :].data.numpy())
def test_tokens_to_indices_uses_ner_tags(self):
    tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
    vocab.add_token_to_namespace('ORG', namespace='ner_tags')
    indexer = NerTagIndexer()
    assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [person_index]}
    assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
def test_tokens_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
    none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
    indexer = DepLabelIndexer()
    assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
    assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}