def test_tokens_to_indices_uses_ner_tags(self): tokens = self.tokenizer.split_words("Larry Page is CEO of Google.") tokens = [t for t in tokens] + [Token("</S>")] vocab = Vocabulary() person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags') none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags') vocab.add_token_to_namespace('ORG', namespace='ner_tags') indexer = NerTagIndexer(namespace='ner_tags') assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [person_index]} assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
def test_tokens_to_indices_uses_ner_tags(self): tokens = self.tokenizer.split_words("Larry Page is CEO of Google.") tokens = [t for t in tokens] + [Token("</S>")] vocab = Vocabulary() person_index = vocab.add_token_to_namespace("PERSON", namespace="ner_tags") none_index = vocab.add_token_to_namespace("NONE", namespace="ner_tags") vocab.add_token_to_namespace("ORG", namespace="ner_tags") indexer = NerTagIndexer(namespace="ner_tags") assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == { "tokens1": [person_index] } assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == { "tokens-1": [none_index] }
def test_tokens_to_indices_uses_ner_tags(self): tokens = self.tokenizer.split_words(u"Larry Page is CEO of Google.") tokens = [t for t in tokens] + [Token(u"</S>")] vocab = Vocabulary() person_index = vocab.add_token_to_namespace(u'PERSON', namespace=u'ner_tags') none_index = vocab.add_token_to_namespace(u'NONE', namespace=u'ner_tags') vocab.add_token_to_namespace(u'ORG', namespace=u'ner_tags') indexer = NerTagIndexer() assert indexer.tokens_to_indices([tokens[1]], vocab, u"tokens1") == { u"tokens1": [person_index] } assert indexer.tokens_to_indices([tokens[-1]], vocab, u"tokens-1") == { u"tokens-1": [none_index] }
def test_blank_ner_tag(self):
    tokens = [Token(token)._replace(ent_type_="") for token in "allennlp is awesome .".split(" ")]
    indexer = NerTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    # spaCy uses an empty string to indicate "no NER tag",
    # which the indexer converts to "NONE".
    assert counter["ner_tokens"]["NONE"] == 4
    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index("NONE", "ner_tokens")
    # should raise no exception
    indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
    assert {"ner": [none_index, none_index, none_index, none_index]} == indices
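# A minimal end-to-end sketch, added for illustration rather than taken from
# the original suite: it assumes the AllenNLP 0.x TextField API
# (from allennlp.data.fields import TextField) and an arbitrary field key
# "ner"; otherwise it exercises the same NerTagIndexer behavior as the
# tests above.
def test_ner_tag_indexer_in_text_field_sketch(self):
    tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
    vocab = Vocabulary()
    vocab.add_token_to_namespace("PERSON", namespace="ner_tags")
    vocab.add_token_to_namespace("ORG", namespace="ner_tags")
    vocab.add_token_to_namespace("NONE", namespace="ner_tags")
    field = TextField(tokens, {"ner": NerTagIndexer(namespace="ner_tags")})
    # index() fills in the integer ids from the "ner_tags" namespace; it
    # should not raise, since every tag produced above was added to the
    # vocabulary.
    field.index(vocab)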