def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.tokenize("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        tokens[1] = Token("is", tag_="VBZ", pos_="VERB")
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace("VERB", namespace="pos_tags")
        cop_index = vocab.add_token_to_namespace("VBZ", namespace="pos_tags")
        none_index = vocab.add_token_to_namespace("NONE", namespace="pos_tags")
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace("DET", namespace="pos_tags")
        vocab.add_token_to_namespace("NOUN", namespace="pos_tags")
        vocab.add_token_to_namespace("PUNCT", namespace="pos_tags")

        indexer = PosTagIndexer(namespace="pos_tags", coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab)
        assert len(indices) == 1
        assert "tokens" in indices
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        indexer._coarse_tags = False
        assert indexer.tokens_to_indices([tokens[1]], vocab) == {
            "tokens": [cop_index]
        }
 def test_as_array_produces_token_sequence(self):
     indexer = PosTagIndexer()
     padded_tokens = indexer.as_padded_tensor_dict(
         {"tokens": [1, 2, 3, 4, 5]}, {"tokens": 10})
     assert padded_tokens["tokens"].tolist() == [
         1, 2, 3, 4, 5, 0, 0, 0, 0, 0
     ]
    def __init__(self,
                 word_indexer: Optional[TokenIndexer] = None,
                 is_bert: bool = False,
                 conceptnet_path: Optional[Path] = None):
        super().__init__(lazy=False)
        self.pos_indexers = {"pos_tokens": PosTagIndexer()}
        self.ner_indexers = {"ner_tokens": NerTagIndexer()}
        self.rel_indexers = {
            "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')
        }

        if is_bert:
            splitter = BertBasicWordSplitter()
        else:
            splitter = SpacyWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        word_splitter = SpacyWordSplitter(pos_tags=True, ner=True, parse=True)
        self.word_tokeniser = WordTokenizer(word_splitter=word_splitter)
        bert_splitter = BertBasicWordSplitter()
        self.bert_tokeniser = WordTokenizer(word_splitter=bert_splitter)

        if word_indexer is None:
            if is_bert:
                word_indexer = PretrainedBertIndexer(
                    pretrained_model='bert-base-uncased',
                    truncate_long_sequences=False)
            else:
                word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
        self.word_indexers = {'tokens': word_indexer}

        self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
 def test_blank_pos_tag(self):
     tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
     for token in tokens:
         token.pos_ = ""
     indexer = PosTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     # spacy uses an empty string to indicate "no POS tag"
     # we convert it to "NONE"
     assert counter["pos_tokens"]["NONE"] == 4
     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index('NONE', 'pos_tokens')
     # should raise no exception
     indices = indexer.tokens_to_indices(tokens, vocab, index_name="pos")
     assert {"pos": [none_index, none_index, none_index, none_index]} == indices
 def test_blank_pos_tag(self):
     tokens = [Token(token) for token in "allennlp is awesome .".split(" ")]
     indexer = PosTagIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     # spacy uses an empty string to indicate "no POS tag"
     # we convert it to "NONE"
     assert counter["pos_tokens"]["NONE"] == 4
     vocab = Vocabulary(counter)
     none_index = vocab.get_token_index("NONE", "pos_tokens")
     # should raise no exception
     indices = indexer.tokens_to_indices(tokens, vocab)
     assert {
         "tokens": [none_index, none_index, none_index, none_index]
     } == indices
Example #6
 def update_sentence_pos(
     self,
     fields,
     tokens
 ) -> Field:
     indexers = {'pos_tag': PosTagIndexer(namespace='pos_tag')}
     textfield = TextField(tokens, indexers)
     return textfield
Example #7
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {
            'DT': 2,
            'VBZ': 1,
            '.': 1,
            'NN': 1,
            'NONE': 2
        }

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {
            'VERB': 1,
            'PUNCT': 1,
            'DET': 2,
            'NOUN': 1,
            'NONE': 2
        }
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.tokenize("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        # Hard-coding this because spacy's POS tagger keeps changing on us, wanting to call this AUX
        # in some runs.
        tokens[2] = Token("is", tag_="VBZ", pos_="VERB")
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "DT": 2,
            "VBZ": 1,
            ".": 1,
            "NN": 1,
            "NONE": 2
        }

        indexer._coarse_tags = True
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "VERB": 1,
            "PUNCT": 1,
            "DET": 2,
            "NOUN": 1,
            "NONE": 2
        }
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "DT": 2,
            "VBZ": 1,
            ".": 1,
            "NN": 1,
            "NONE": 2
        }

        indexer._coarse_tags = True
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "VERB": 1,
            "PUNCT": 1,
            "DET": 2,
            "NOUN": 1,
            "NONE": 2
        }
Example #10
    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words(u"This is a sentence.")
        tokens = [t for t in tokens] + [Token(u"</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace(u'VERB', namespace=u'pos_tags')
        cop_index = vocab.add_token_to_namespace(u'VBZ', namespace=u'pos_tags')
        none_index = vocab.add_token_to_namespace(u'NONE', namespace=u'pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace(u'DET', namespace=u'pos_tags')
        vocab.add_token_to_namespace(u'NOUN', namespace=u'pos_tags')
        vocab.add_token_to_namespace(u'PUNCT', namespace=u'pos_tags')

        indexer = PosTagIndexer(coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, u"tokens")
        assert len(indices) == 1
        assert u"tokens" in indices
        assert indices[u"tokens"][1] == verb_index
        assert indices[u"tokens"][-1] == none_index

        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, u"coarse") == {u"coarse": [cop_index]}
Example #11
    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace('DET', namespace='pos_tags')
        vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
        vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')

        indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
        assert len(indices) == 1
        assert "tokens" in indices
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, "coarse") == {"coarse": [cop_index]}
Example #12
 def test_token_to_indices_uses_pos_tags(self):
     tokens = self.tokenizer.split_words("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
     cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
     none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
     indexer = PosTagIndexer(coarse_tags=True)
     assert indexer.token_to_indices(tokens[1], vocab) == verb_index
     assert indexer.token_to_indices(tokens[-1], vocab) == none_index
     indexer._coarse_tags = False  # pylint: disable=protected-access
     assert indexer.token_to_indices(tokens[1], vocab) == cop_index
Example #13
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'DT': 2, 'VBZ': 1, '.': 1, 'NN': 1, 'NONE': 2}

        indexer._coarse_tags = True  # pylint: disable=protected-access
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tags"] == {'VERB': 1, 'PUNCT': 1, 'DET': 2, 'NOUN': 1, 'NONE': 2}
Example #14
 def test_as_array_produces_token_sequence(self):
     indexer = PosTagIndexer()
     padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
     assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
Example #15
 def test_as_array_produces_token_sequence(self):
     indexer = PosTagIndexer()
     padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]},
                                                {'key': 10}, {})
     assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
Example #16
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer, PosTagIndexer
from allennlp.data.tokenizers import WordTokenizer, CharacterTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data import Vocabulary

# Splits text into words (instead of wordpieces or characters), and does part-of-speech tagging with
# spacy while we're at it.
tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(pos_tags=True))

# Represents each token with (1) an id from a vocabulary, (2) a sequence of characters, and (3)
# part of speech tag ids.
token_indexers = {
    'tokens': SingleIdTokenIndexer(namespace='token_vocab'),
    'token_characters': TokenCharactersIndexer(namespace='character_vocab'),
    'pos_tags': PosTagIndexer(namespace='pos_tag_vocab')
}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'],
                              namespace='token_vocab')
vocab.add_tokens_to_namespace(
    ['T', 'h', 'i', 's', ' ', 'o', 'm', 'e', 't', 'x', '.'],
    namespace='character_vocab')
vocab.add_tokens_to_namespace(['DT', 'VBZ', 'NN', '.'],
                              namespace='pos_tag_vocab')

text = "This is some text."
tokens = tokenizer.tokenize(text)
print(tokens)
print([token.tag_ for token in tokens])
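
# The example above stops after printing the tokens and their POS tags. The lines below are not
# part of the original snippet; they are a minimal continuation sketch assuming the `tokens`,
# `token_indexers`, and `vocab` built above and the standard allennlp TextField API
# (index / get_padding_lengths / as_tensor). The resulting tensor dict should contain one entry
# per indexer, with the PosTagIndexer output under 'pos_tags' (exact key layout varies by
# allennlp version).
text_field = TextField(tokens, token_indexers)       # ties the tokens to all three indexers
text_field.index(vocab)                              # look up ids in each indexer's namespace
padding_lengths = text_field.get_padding_lengths()   # how much padding each indexer needs
tensor_dict = text_field.as_tensor(padding_lengths)  # build the padded tensors
print(tensor_dict)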
Example #17
 def test_as_array_produces_token_sequence(self):
     indexer = PosTagIndexer()
     padded_tokens = indexer.as_padded_tensor({'key': [1, 2, 3, 4, 5]},
                                              {'key': 10}, {})
     assert padded_tokens["key"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
Example #18
 def test_padding_functions(self):
     indexer = PosTagIndexer()
     assert indexer.get_padding_token() == 0
     assert indexer.get_padding_lengths(0) == {}
Example #19
 def test_as_array_produces_token_sequence(self):
     indexer = PosTagIndexer()
     padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
     assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
Example #20
 def test_padding_functions(self):
     indexer = PosTagIndexer()
     assert indexer.get_padding_token() == 0
     assert indexer.get_padding_lengths(0) == {}
Example #21
 def test_as_array_produces_token_sequence(self):
     indexer = PosTagIndexer()
     padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
     assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
Example #22
    model_dir = "C:/Users/t-ofarvi/PycharmProjects/tryout-model"
    vocab_dir = f'{model_dir}/vocabulary'

    # NOTE: The word tokenizer is a SpaCy tokenizer, which is a little different from the BERT tokenizer.
    # This was done for convenience.
    word_tokenizer = SpacyMultilingualWhitespaceWordSplitter()

    bert_indexer = PretrainedBertIndexer(pretrained_model=bert_mode,
                                         do_lowercase=bert_do_lowercase,
                                         truncate_long_sequences=False)
    word_indexer = {
        "bert": bert_indexer,
        "deps": DepLabelIndexer(namespace="deps_tags"),
        "ner": NerTagIndexer(),
        "pos": PosTagIndexer(),
        "lang": LanguageIndexer()
    }

    train_ds, validation_ds = (
        UccaSpanParserDatasetReader(word_tokenizer, word_indexer).read(folder)
        for folder in [train_dataset_folder, validation_dataset_folder])

    if os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(
            itertools.chain(train_ds, validation_ds))
        vocab.save_to_files(vocab_dir)

    vocab_namespaces = vocab._index_to_token.keys()