def test_count_vocab_items_uses_pos_tags(self):
    """``count_vocab_items`` counts fine-grained POS tags by default and
    coarse (universal) tags when ``_coarse_tags`` is set; boundary tokens
    built from plain strings have no tag and are counted as 'NONE'."""
    tokens = self.tokenizer.split_words("This is a sentence.")
    # list(tokens), not a copy comprehension; <S>/</S> carry no POS tag.
    tokens = [Token("<S>")] + list(tokens) + [Token("</S>")]
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}
    # Re-count with coarse (universal) tags enabled.
    indexer._coarse_tags = True  # pylint: disable=protected-access
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
def test_count_vocab_items_uses_pos_tags(self):
    """``count_vocab_items`` counts fine-grained POS tags by default and
    coarse (universal) tags when ``_coarse_tags`` is set; boundary tokens
    built from plain strings have no tag and are counted as 'NONE'."""
    tokens = self.tokenizer.tokenize("This is a sentence.")
    # list(tokens), not a copy comprehension; <S>/</S> carry no POS tag.
    tokens = [Token("<S>")] + list(tokens) + [Token("</S>")]
    # Hard-coding this because spacy's POS tagger keeps changing on us, wanting to call this AUX
    # in some runs.
    tokens[2] = Token("is", tag_="VBZ", pos_="VERB")
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tokens"] == {"DT": 2, "VBZ": 1, ".": 1, "NN": 1, "NONE": 2}
    # Re-count with coarse (universal) tags enabled.
    indexer._coarse_tags = True
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tokens"] == {"VERB": 1, "PUNCT": 1, "DET": 2, "NOUN": 1, "NONE": 2}
def test_count_vocab_items_uses_pos_tags(self):
    """``count_vocab_items`` counts fine-grained POS tags by default and
    coarse (universal) tags when ``_coarse_tags`` is set; boundary tokens
    built from plain strings have no tag and are counted as 'NONE'."""
    tokens = self.tokenizer.split_words("This is a sentence.")
    # list(tokens), not a copy comprehension; <S>/</S> carry no POS tag.
    tokens = [Token("<S>")] + list(tokens) + [Token("</S>")]
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tokens"] == {"DT": 2, "VBZ": 1, ".": 1, "NN": 1, "NONE": 2}
    # Re-count with coarse (universal) tags enabled.
    indexer._coarse_tags = True
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tokens"] == {"VERB": 1, "PUNCT": 1, "DET": 2, "NOUN": 1, "NONE": 2}
def test_count_vocab_items_uses_pos_tags(self):
    """``count_vocab_items`` counts fine-grained POS tags by default and
    coarse (universal) tags when ``_coarse_tags`` is set; boundary tokens
    built from plain strings have no tag and are counted as 'NONE'."""
    tokens = self.tokenizer.split_words("This is a sentence.")
    # list(tokens), not a copy comprehension; <S>/</S> carry no POS tag.
    tokens = [Token("<S>")] + list(tokens) + [Token("</S>")]
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}
    # Re-count with coarse (universal) tags enabled.
    indexer._coarse_tags = True  # pylint: disable=protected-access
    counter = defaultdict(lambda: defaultdict(int))
    for token in tokens:
        indexer.count_vocab_items(token, counter)
    assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
def test_blank_pos_tag(self):
    """A blank ``pos_`` attribute (spacy's marker for "no POS tag") must be
    counted and indexed as 'NONE' without raising."""
    tokens = [Token(word) for word in "allennlp is awesome .".split(" ")]
    # Blank out every tag to simulate spacy producing no POS information.
    for tok in tokens:
        tok.pos_ = ""
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for tok in tokens:
        indexer.count_vocab_items(tok, counter)
    # spacy uses an empty string to indicate "no POS tag"; we convert it to "NONE".
    assert counter["pos_tokens"]["NONE"] == 4
    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index('NONE', 'pos_tokens')
    # Indexing the blank-tagged tokens should raise no exception.
    indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
    assert indices == {"pos": [none_index, none_index, none_index, none_index]}
def test_blank_pos_tag(self):
    """Tokens built without any POS information must be counted and indexed
    as 'NONE' without raising."""
    tokens = [Token(word) for word in "allennlp is awesome .".split(" ")]
    indexer = PosTagIndexer()
    counter = defaultdict(lambda: defaultdict(int))
    for tok in tokens:
        indexer.count_vocab_items(tok, counter)
    # spacy uses an empty string to indicate "no POS tag"; we convert it to "NONE".
    assert counter["pos_tokens"]["NONE"] == 4
    vocab = Vocabulary(counter)
    none_index = vocab.get_token_index("NONE", "pos_tokens")
    # Indexing the untagged tokens should raise no exception.
    indices = indexer.tokens_to_indices(tokens, vocab)
    assert indices == {"tokens": [none_index, none_index, none_index, none_index]}