 def test_count_vocab_items_uses_pos_tags(self):
     tokens = self.tokenizer.split_words("This is a sentence.")
     tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
     indexer = DepLabelIndexer()
     counter = defaultdict(lambda: defaultdict(int))
     for token in tokens:
         indexer.count_vocab_items(token, counter)
     assert counter["dep_labels"] == {"ROOT": 1, "nsubj": 1, "det": 1, "NONE": 2, "attr": 1, "punct": 1}
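The test above depends on a SpaCy-backed tokenizer held by the test class. As a minimal, self-contained sketch (assuming the AllenNLP 0.9-style API), the same behaviour can be shown with hand-built tokens: count_vocab_items reads each token's dep_ attribute and falls back to "NONE" when it is missing.

from collections import defaultdict

from allennlp.data.token_indexers import DepLabelIndexer
from allennlp.data.tokenizers import Token

# Dependency labels are set explicitly instead of coming from a SpaCy parse.
tokens = [Token("<S>"),                      # no dep_ -> counted as "NONE"
          Token("This", dep_="nsubj"),
          Token("is", dep_="ROOT"),
          Token("a", dep_="det"),
          Token("sentence", dep_="attr"),
          Token(".", dep_="punct"),
          Token("</S>")]                     # no dep_ -> counted as "NONE"

indexer = DepLabelIndexer()
counter = defaultdict(lambda: defaultdict(int))
for token in tokens:
    indexer.count_vocab_items(token, counter)

print(dict(counter["dep_labels"]))
# {'NONE': 2, 'nsubj': 1, 'ROOT': 1, 'det': 1, 'attr': 1, 'punct': 1}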
Example #2
 def test_tokens_to_indices_uses_pos_tags(self):
     tokens = self.tokenizer.split_words("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
     none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
     indexer = DepLabelIndexer()
     assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
     assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
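A self-contained sketch of the same call (assuming the AllenNLP 0.9-style tokens_to_indices signature, which takes an index name) with hand-built tokens. Note that "dep_labels" matches the vocabulary's default non-padded namespace patterns, so the added labels typically receive indices 0 and 1.

from allennlp.data import Vocabulary
from allennlp.data.token_indexers import DepLabelIndexer
from allennlp.data.tokenizers import Token

vocab = Vocabulary()
# "dep_labels" is a non-padded namespace by default, so no padding/unknown
# entries are reserved ahead of these labels.
root_index = vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
none_index = vocab.add_token_to_namespace("NONE", namespace="dep_labels")

indexer = DepLabelIndexer()
tokens = [Token("is", dep_="ROOT"), Token("</S>")]  # "</S>" has no dep_ -> "NONE"
print(indexer.tokens_to_indices(tokens, vocab, "dep"))
# typically {'dep': [0, 1]}, i.e. [root_index, none_index]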
 def test_tokens_to_indices_uses_pos_tags(self):
     tokens = self.tokenizer.tokenize("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     root_index = vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
     none_index = vocab.add_token_to_namespace("NONE", namespace="dep_labels")
     indexer = DepLabelIndexer()
     assert indexer.tokens_to_indices([tokens[1]], vocab) == {"tokens": [root_index]}
     assert indexer.tokens_to_indices([tokens[-1]], vocab) == {"tokens": [none_index]}
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = DepLabelIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        assert counter["dep_labels"] == {"ROOT": 1, "nsubj": 1,
                                         "det": 1, "NONE": 2, "attr": 1, "punct": 1}
    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.split_words(u"This is a sentence.")
        tokens = [Token(u"<S>")] + [t for t in tokens] + [Token(u"</S>")]
        indexer = DepLabelIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        assert counter[u"dep_labels"] == {
            u"ROOT": 1,
            u"nsubj": 1,
            u"det": 1,
            u"NONE": 2,
            u"attr": 1,
            u"punct": 1
        }
Example #7
 def update_sentence_dep(
     self,
     fields,
     tokens
 ) -> Field:
     indexers = {'dep_tag': DepLabelIndexer(namespace='dep_tag')}
     textfield = TextField(tokens, indexers)
     return textfield
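update_sentence_dep simply wraps the tokens in a TextField keyed by a DepLabelIndexer. A hedged sketch (again assuming the AllenNLP 0.9-style API) of how such a field gets indexed and converted to tensors inside an Instance:

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import DepLabelIndexer
from allennlp.data.tokenizers import Token

tokens = [Token("This", dep_="nsubj"), Token("is", dep_="ROOT")]
field = TextField(tokens, {"dep_tag": DepLabelIndexer(namespace="dep_tag")})
instance = Instance({"sentence": field})

# Build a vocabulary from the instance, index the fields, then convert to tensors.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
tensors = instance.as_tensor_dict()
print(tensors["sentence"]["dep_tag"])  # a LongTensor of dependency-label indices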
 def test_as_array_produces_token_sequence(self):
     indexer = DepLabelIndexer()
     padded_tokens = indexer.as_padded_tensor({"key": [1, 2, 3, 4, 5]},
                                              {"key": 10}, {})
     assert padded_tokens["key"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
 def test_padding_functions(self):
     indexer = DepLabelIndexer()
     assert indexer.get_padding_lengths(0) == {}
 def test_as_array_produces_token_sequence(self):
     indexer = DepLabelIndexer()
     padded_tokens = indexer.pad_token_sequence([1, 2, 3, 4, 5], 10, {})
     assert padded_tokens == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
 def test_as_array_produces_token_sequence(self):
     indexer = DepLabelIndexer()
     padded_tokens = indexer.pad_token_sequence({u'key': [1, 2, 3, 4, 5]},
                                                {u'key': 10}, {})
     assert padded_tokens == {u'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
 def test_as_array_produces_token_sequence(self):
     indexer = DepLabelIndexer()
     padded_tokens = indexer.pad_token_sequence({'key': [1, 2, 3, 4, 5]}, {'key': 10}, {})
     assert padded_tokens == {'key': [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]}
 def test_padding_functions(self):
     indexer = DepLabelIndexer()
     assert indexer.get_padding_token() == 0
     assert indexer.get_padding_lengths(0) == {}
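These tests call get_padding_token / pad_token_sequence (renamed as_padded_tensor in later releases) directly. In practice the padding value 0 is applied when instances of different lengths are batched together; a hedged sketch assuming the AllenNLP 0.9-style API:

from allennlp.data import Instance, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import DepLabelIndexer
from allennlp.data.tokenizers import Token

indexers = {"dep": DepLabelIndexer()}
short = Instance({"words": TextField([Token("is", dep_="ROOT")], indexers)})
longer = Instance({"words": TextField([Token("This", dep_="nsubj"),
                                       Token("is", dep_="ROOT")], indexers)})

vocab = Vocabulary.from_instances([short, longer])
batch = Batch([short, longer])
batch.index_instances(vocab)
tensors = batch.as_tensor_dict()
print(tensors["words"]["dep"])  # shape (2, 2); the shorter row is padded with 0

Since the default "dep_labels" namespace is non-padded, index 0 also corresponds to a real label here; passing a padded namespace to DepLabelIndexer avoids that overlap.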
Example #15
    train_dataset_folder = "C:/Users/t-ofarvi/PycharmProjects/UCCA_Dataset_29-06-09/tryout"  #"C:/Users/t-ofarvi/Desktop/train_allen"
    validation_dataset_folder = "C:/Users/t-ofarvi/PycharmProjects/UCCA_Dataset_29-06-09/tryout-validation"  #"C:/Users/t-ofarvi/Desktop/dev_allen"

    model_dir = "C:/Users/t-ofarvi/PycharmProjects/tryout-model"
    vocab_dir = f'{model_dir}/vocabulary'

    # NOTE: The word tokenizer is a SpaCy tokenizer, which is a little different from the BERT tokenizer.
    # This was done for convenience.
    word_tokenizer = SpacyMultilingualWhitespaceWordSplitter()

    bert_indexer = PretrainedBertIndexer(pretrained_model=bert_mode,
                                         do_lowercase=bert_do_lowercase,
                                         truncate_long_sequences=False)
    word_indexer = {
        "bert": bert_indexer,
        "deps": DepLabelIndexer(namespace="deps_tags"),
        "ner": NerTagIndexer(),
        "pos": PosTagIndexer(),
        "lang": LanguageIndexer()
    }

    train_ds, validation_ds = (
        UccaSpanParserDatasetReader(word_tokenizer, word_indexer).read(folder)
        for folder in [train_dataset_folder, validation_dataset_folder])

    if os.path.exists(vocab_dir):
        vocab = Vocabulary.from_files(vocab_dir)
    else:
        vocab = Vocabulary.from_instances(
            itertools.chain(train_ds, validation_ds))
        vocab.save_to_files(vocab_dir)
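A possible continuation, sketched under the assumption of AllenNLP 0.9-style iterators (the "tokens" field name below is hypothetical and depends on what UccaSpanParserDatasetReader emits): once the vocabulary is built or loaded, an iterator indexed with it yields padded tensor batches, including the indices produced by the "deps" DepLabelIndexer.

    from allennlp.data.iterators import BasicIterator

    iterator = BasicIterator(batch_size=8)
    iterator.index_with(vocab)
    for batch in iterator(train_ds, num_epochs=1, shuffle=False):
        # "tokens" is a hypothetical field name; inspect the reader's instances
        # (e.g. next(iter(train_ds)).fields) to see the real keys.
        print({name: tensor.shape for name, tensor in batch["tokens"].items()})
        break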