Example #1
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {
            'DT': 2,
            'VBZ': 1,
            '.': 1,
            'NN': 1,
            'NONE': 2
        }

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {
            'VERB': 1,
            'PUNCT': 1,
            'DET': 2,
            'NOUN': 1,
            'NONE': 2
        }
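
The counts in Example #1 follow if count_vocab_items reads token.tag_ by default and token.pos_ once coarse_tags is set, with untagged tokens (the hand-built <S> and </S>) falling back to 'NONE'. A minimal standalone sketch of that logic, not the library's verbatim code (the name count_pos_tag is invented here; it works on any token exposing spacy-style tag_/pos_ attributes):

    def count_pos_tag(token, counter, namespace="pos_tags", coarse_tags=False):
        # Fine-grained treebank tag (token.tag_) by default; the universal
        # POS tag (token.pos_) when coarse_tags is requested.
        tag = token.pos_ if coarse_tags else token.tag_
        if not tag:
            tag = "NONE"  # hand-built tokens such as <S>/</S> carry no tag
        counter[namespace][tag] += 1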

Example #2
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.tokenize("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        # Hard-coding this because spacy's POS tagger keeps changing on us, wanting to call this AUX
        # in some runs.
        tokens[2] = Token("is", tag_="VBZ", pos_="VERB")
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "DT": 2,
            "VBZ": 1,
            ".": 1,
            "NN": 1,
            "NONE": 2
        }

        indexer._coarse_tags = True
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "VERB": 1,
            "PUNCT": 1,
            "DET": 2,
            "NOUN": 1,
            "NONE": 2
        }

Example #3
    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.tokenize("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        tokens[1] = Token("is", tag_="VBZ", pos_="VERB")
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace("VERB", namespace="pos_tags")
        cop_index = vocab.add_token_to_namespace("VBZ", namespace="pos_tags")
        none_index = vocab.add_token_to_namespace("NONE", namespace="pos_tags")
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace("DET", namespace="pos_tags")
        vocab.add_token_to_namespace("NOUN", namespace="pos_tags")
        vocab.add_token_to_namespace("PUNCT", namespace="pos_tags")

        indexer = PosTagIndexer(namespace="pos_tags", coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab)
        assert len(indices) == 1
        assert "tokens" in indices
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        indexer._coarse_tags = False
        assert indexer.tokens_to_indices([tokens[1]], vocab) == {
            "tokens": [cop_index]
        }
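
Example #3 pins down the newer two-argument tokens_to_indices contract: one index per token under a "tokens" key, with untagged tokens routed through the 'NONE' entry the test pre-registered. A rough, hedged equivalent (tags_to_indices is an invented name; Vocabulary.get_token_index is the real lookup):

    def tags_to_indices(tokens, vocab, namespace="pos_tags", coarse_tags=True):
        # Same fallback as in counting: an empty tag_/pos_ becomes 'NONE'.
        tags = [(t.pos_ if coarse_tags else t.tag_) or "NONE" for t in tokens]
        return {"tokens": [vocab.get_token_index(tag, namespace) for tag in tags]}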

Example #4
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "DT": 2,
            "VBZ": 1,
            ".": 1,
            "NN": 1,
            "NONE": 2
        }

        indexer._coarse_tags = True
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "VERB": 1,
            "PUNCT": 1,
            "DET": 2,
            "NOUN": 1,
            "NONE": 2
        }

Example #5
    def test_token_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        indexer = PosTagIndexer(coarse_tags=True)
        assert indexer.token_to_indices(tokens[1], vocab) == verb_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index
        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.token_to_indices(tokens[1], vocab) == cop_index
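
All of these tests lean on a self.tokenizer fixture that attaches POS tags. For the split_words variants a plausible setup, under the pre-1.0 AllenNLP layout, is a spacy-backed splitter built with pos_tags=True (the class below and its import paths are assumptions about that era's API, not part of the listing):

    from allennlp.common.testing import AllenNlpTestCase
    from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

    class TestPosTagIndexer(AllenNlpTestCase):
        def setUp(self):
            super().setUp()
            # pos_tags=True makes spacy fill in token.tag_ and token.pos_
            self.tokenizer = SpacyWordSplitter(pos_tags=True)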

Example #6
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}

Example #7
    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words(u"This is a sentence.")
        tokens = [t for t in tokens] + [Token(u"</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace(u'VERB', namespace=u'pos_tags')
        cop_index = vocab.add_token_to_namespace(u'VBZ', namespace=u'pos_tags')
        none_index = vocab.add_token_to_namespace(u'NONE', namespace=u'pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace(u'DET', namespace=u'pos_tags')
        vocab.add_token_to_namespace(u'NOUN', namespace=u'pos_tags')
        vocab.add_token_to_namespace(u'PUNCT', namespace=u'pos_tags')

        indexer = PosTagIndexer(coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, u"tokens")
        assert len(indices) == 1
        assert u"tokens" in indices
        assert indices[u"tokens"][1] == verb_index
        assert indices[u"tokens"][-1] == none_index

        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, u"coarse") == {u"coarse": [cop_index]}

Example #8
    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace('DET', namespace='pos_tags')
        vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
        vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')

        indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
        assert len(indices) == 1
        assert "tokens" in indices
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, "coarse") == {"coarse": [cop_index]}
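
Taken together, the examples span two generations of the indexer API: the older tokens_to_indices(tokens, vocab, index_name) form of Examples #7 and #8, and the newer two-argument form of Example #3. For quick experimentation you can skip spacy entirely and hand-build tagged tokens, as Examples #2 and #3 already do for "is". A minimal end-to-end run under the newer API might look like this (import paths assume a recent AllenNLP release):

    from allennlp.data import Vocabulary
    from allennlp.data.token_indexers import PosTagIndexer
    from allennlp.data.tokenizers import Token

    vocab = Vocabulary()
    verb = vocab.add_token_to_namespace("VERB", namespace="pos_tags")
    none = vocab.add_token_to_namespace("NONE", namespace="pos_tags")

    indexer = PosTagIndexer(namespace="pos_tags", coarse_tags=True)
    tokens = [Token("is", tag_="VBZ", pos_="VERB"), Token("</S>")]
    # Expect the VERB index for "is" and the NONE index for the bare marker.
    assert indexer.tokens_to_indices(tokens, vocab) == {"tokens": [verb, none]}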