Example #1
    def test_from_dataset_respects_max_vocab_size_single_int(self):
        max_vocab_size = 1
        vocab = Vocabulary.from_instances(self.dataset, max_vocab_size=max_vocab_size)
        words = vocab.get_index_to_token_vocabulary().values()
        # Additional 2 tokens are '@@PADDING@@' and '@@UNKNOWN@@' by default
        assert len(words) == max_vocab_size + 2

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert len(words) == 5
Example #2
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()
Example #3
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()
Example #4
    def test_unknown_token(self):
        # pylint: disable=protected-access
        # We're putting this behavior in a test so that the behavior is documented.  There is
        # solver code that depends in a small way on how we treat the unknown token, so any
        # breaking change to this behavior should break a test, so you know you've done something
        # that needs more consideration.
        vocab = Vocabulary()
        oov_token = vocab._oov_token
        oov_index = vocab.get_token_index(oov_token)
        assert oov_index == 1
        assert vocab.get_token_index("unseen word") == oov_index
Example #5
    def test_from_dataset_respects_min_count(self):
        vocab = Vocabulary.from_instances(self.dataset, min_count={'tokens': 4})
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' not in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset, min_count=None)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' in words
Example #6
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5}, "text2": {"num_tokens": 6}}

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].asnumpy()
        text2 = tensors["text2"]["tokens"].asnumpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                           self.token_indexer)
        field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return instances
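
As an informal aside (not part of the test), the expected arrays above follow directly from the setUp vocabulary, assuming the default padding index 0 and OOV index 1, so that "this"=2, "is"=3, "a"=4, "sentence"=5 and "."=6:

vocab = Vocabulary()
for token in ["this", "is", "a", "sentence", "."]:
    vocab.add_token_to_namespace(token)  # indices 2..6 after padding (0) and OOV (1)
assert [vocab.get_token_index(t) for t in ["this", "is", "short"]] == [2, 3, 1]
# "short" is out of vocabulary, so it maps to 1; as_tensor_dict then right-pads the row
# with zeros to the longest "text2" field in the batch, giving [2, 3, 1, 0, 0, 0]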
Example #7
    def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
        indexer = ELMoTokenCharactersIndexer()
        indices = [
            indexer.token_to_indices(Token(token), Vocabulary())
            for token in ['Second', '.']
        ]
        padded_tokens = indexer.pad_token_sequence(indices,
                                                   desired_num_tokens=3,
                                                   padding_lengths={})
        expected_padded_tokens = [
            [
                259, 84, 102, 100, 112, 111, 101, 260, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
            ],
            [
                259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
            ],
            [
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0
            ]
        ]

        assert padded_tokens == expected_padded_tokens
Example #8
    def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths()).asnumpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0]))
Example #9
    def test_bos_to_char_ids(self):
        indexer = ELMoTokenCharactersIndexer()
        indices = indexer.token_to_indices(Token('<S>'), Vocabulary())
        expected_indices = [
            259, 257, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
            261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
            261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
            261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
        ]
        assert indices == expected_indices
Example #10
    def test_unicode_to_char_ids(self):
        indexer = ELMoTokenCharactersIndexer()
        indices = indexer.token_to_indices(Token(chr(256) + 't'), Vocabulary())
        expected_indices = [
            259, 197, 129, 117, 260, 261, 261, 261, 261, 261, 261, 261, 261,
            261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
            261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
            261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
        ]
        assert indices == expected_indices
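
The long character-id lists in the two ELMo examples above follow a fixed pattern. Here is a minimal sketch, not the library's actual implementation, of one way to reproduce the ids for ordinary tokens, assuming each token's UTF-8 bytes are wrapped in begin/end-of-word markers, padded out to 50 characters, and shifted by one so that 0 stays free for token-level padding (sentence-boundary tokens such as '<S>' instead get a single reserved id, the 257 seen above):

def elmo_char_ids(word, max_chars=50, bow=258, eow=259, pad=260):
    # raw ids: begin-of-word marker, the token's UTF-8 bytes, end-of-word marker, padding
    ids = [bow] + list(word.encode('utf-8'))[:max_chars - 2] + [eow]
    ids += [pad] * (max_chars - len(ids))
    return [i + 1 for i in ids]  # +1 offset, matching the 259/260/261 values in the tests

assert elmo_char_ids(chr(256) + 't')[:5] == [259, 197, 129, 117, 260]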
Example #11
def get_data(options, train_path, vocab_path):
    max_word_length = options['char_cnn']['max_characters_per_token']
    batch_size = options['batch_size']
    steps = options['unroll_steps']
    if max_word_length is not None:
        vocab = UnicodeCharsVocabulary(vocab_path, max_word_length=max_word_length)
    else:
        vocab = Vocabulary(vocab_path)
    data = BidirectionalLMDataset(train_path, vocab)
    data_gen = data.iter_batches(batch_size, steps)
    return data_gen
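
A hypothetical call to get_data, with placeholder paths and values; only the keys shown are the ones the function actually reads from options:

options = {
    'char_cnn': {'max_characters_per_token': 50},  # placeholder value
    'batch_size': 128,                             # placeholder value
    'unroll_steps': 20,                            # placeholder value
}
data_gen = get_data(options, 'train/*.txt', 'vocab.txt')  # hypothetical paths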
Example #12
    def token_to_indices(self, token: Token, vocabulary: Vocabulary) -> int:
        if getattr(token, 'text_id', None) is not None:
            # `text_id` being set on the token means that we aren't using the vocab; we just use
            # this id instead.
            index = token.text_id
        else:
            text = token.text
            if self.lowercase_tokens:
                text = text.lower()
            index = vocabulary.get_token_index(text, self.namespace)
        return index
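
A small usage sketch for the method above, assuming an indexer constructed with lowercase_tokens=True so that lookups are case-insensitive:

vocab = Vocabulary()
index = vocab.add_token_to_namespace("sentence")
indexer = SingleIdTokenIndexer(lowercase_tokens=True)
# "Sentence" is lowercased before the vocabulary lookup, so it resolves to the same id
assert indexer.token_to_indices(Token("Sentence"), vocab) == index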
Example #13
    def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding='utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2
Example #14
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words": SingleIdTokenIndexer(namespace="words"),
                "characters": TokenCharactersIndexer(namespace="characters")
            })
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
Example #15
    def test_from_dataset_respects_inclusive_embedding_file(self):
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("a 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("b 0.1 0.4 -4.0\n".encode('utf-8'))

        vocab = Vocabulary.from_instances(self.dataset,
                                          min_count={'tokens': 4},
                                          pretrained_files={'tokens': embeddings_filename},
                                          only_include_pretrained_words=False)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' not in words

        vocab = Vocabulary.from_instances(self.dataset,
                                          pretrained_files={'tokens': embeddings_filename},
                                          only_include_pretrained_words=False)
        words = vocab.get_index_to_token_vocabulary().values()
        assert 'a' in words
        assert 'b' in words
        assert 'c' in words
Example #16
    def test_set_from_file_reads_non_padded_files(self):
        # pylint: disable=protected-access
        vocab_filename = self.TEST_DIR + 'vocab_file'
        with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
            vocab_file.write('B-PERS\n')
            vocab_file.write('I-PERS\n')
            vocab_file.write('O\n')
            vocab_file.write('B-ORG\n')
            vocab_file.write('I-ORG\n')

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')
        assert vocab.get_token_index("B-PERS", namespace='tags') == 0
        assert vocab.get_token_index("I-PERS", namespace='tags') == 1
        assert vocab.get_token_index("O", namespace='tags') == 2
        assert vocab.get_token_index("B-ORG", namespace='tags') == 3
        assert vocab.get_token_index("I-ORG", namespace='tags') == 4
        assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
        assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
        assert vocab.get_token_from_index(2, namespace='tags') == "O"
        assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
        assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
Example #17
    def token_to_indices(self, token: Token,
                         vocabulary: Vocabulary) -> List[int]:
        indices = []
        if token.text is None:
            raise ConfigurationError(
                'TokenCharactersIndexer needs a tokenizer that retains text')
        for character in self._character_tokenizer.tokenize(token.text):
            if getattr(character, 'text_id', None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab; we just
                # use this id instead.
                index = character.text_id
            else:
                index = vocabulary.get_token_index(character.text,
                                                   self._namespace)
            indices.append(index)
        return indices
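
For contrast with the single-id indexer in Example #12, a short sketch of how this method produces one id per character, assuming the default 'token_characters' namespace and the default character tokenizer that simply splits the text into characters:

vocab = Vocabulary()
char_ids = [vocab.add_token_to_namespace(c, namespace="token_characters") for c in "cat"]
indexer = TokenCharactersIndexer()
# each character is looked up separately, so the result is a list of ids
assert indexer.token_to_indices(Token("cat"), vocab) == char_ids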
Example #18
    def test_saving_and_loading(self):
        # pylint: disable=protected-access
        vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')

        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")

        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)

        assert vocab2._non_padded_namespaces == ["a", "c"]

        # Check namespace a.
        assert vocab2.get_vocab_size(namespace='a') == 3
        assert vocab2.get_token_from_index(0, namespace='a') == 'a0'
        assert vocab2.get_token_from_index(1, namespace='a') == 'a1'
        assert vocab2.get_token_from_index(2, namespace='a') == 'a2'
        assert vocab2.get_token_index('a0', namespace='a') == 0
        assert vocab2.get_token_index('a1', namespace='a') == 1
        assert vocab2.get_token_index('a2', namespace='a') == 2

        # Check namespace b.
        assert vocab2.get_vocab_size(namespace='b') == 4  # (unk + padding + two tokens)
        assert vocab2.get_token_from_index(0, namespace='b') == vocab._padding_token
        assert vocab2.get_token_from_index(1, namespace='b') == vocab._oov_token
        assert vocab2.get_token_from_index(2, namespace='b') == 'b2'
        assert vocab2.get_token_from_index(3, namespace='b') == 'b3'
        assert vocab2.get_token_index(vocab._padding_token, namespace='b') == 0
        assert vocab2.get_token_index(vocab._oov_token, namespace='b') == 1
        assert vocab2.get_token_index('b2', namespace='b') == 2
        assert vocab2.get_token_index('b3', namespace='b') == 3

        # Check the dictionaries containing the reverse mapping are identical.
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")
Example #19
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters")
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters")
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words": SingleIdTokenIndexer(namespace="words"),
                "characters": TokenCharactersIndexer(namespace="characters")
            })
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters"),
                "words": SingleIdTokenIndexer("words")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

    def test_as_tensor_handles_words(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"].asnumpy(),
                                                numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].asnumpy(),
            numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].asnumpy(), expected_character_array)

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["a", "sentence", "."]],
            token_indexers={
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(tensor_dict["words"].asnumpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].asnumpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Example #20
    def index(self, vocab: Vocabulary):
        if self._label_id is None:
            self._label_id = vocab.get_token_index(
                self.label, self._label_namespace)  # type: ignore
Example #21
    def test_add_word_to_index_gives_consistent_results(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1

        # Now add it again, and make sure nothing changes.
        vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1
Example #22
    def test_from_params(self):
        # Save a vocab to check we can load it from_params.
        vocab_dir = os.path.join(self.TEST_DIR, 'vocab_save')
        vocab = Vocabulary(non_padded_namespaces=["a", "c"])
        vocab.add_token_to_namespace("a0", namespace="a")  # non-padded, should start at 0
        vocab.add_token_to_namespace("a1", namespace="a")
        vocab.add_token_to_namespace("a2", namespace="a")
        vocab.add_token_to_namespace("b2", namespace="b")  # padded, should start at 2
        vocab.add_token_to_namespace("b3", namespace="b")
        vocab.save_to_files(vocab_dir)

        params = Params({"directory_path": vocab_dir})
        vocab2 = Vocabulary.from_params(params)
        assert vocab.get_index_to_token_vocabulary("a") == vocab2.get_index_to_token_vocabulary("a")
        assert vocab.get_index_to_token_vocabulary("b") == vocab2.get_index_to_token_vocabulary("b")

        # Test case where we build a vocab from a dataset.
        vocab2 = Vocabulary.from_params(Params({}), self.dataset)
        assert vocab2.get_index_to_token_vocabulary("tokens") == {0: '@@PADDING@@',
                                                                  1: '@@UNKNOWN@@',
                                                                  2: 'a', 3: 'c', 4: 'b'}
        # Test from_params raises when we have neither a dataset nor a vocab_directory.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({}))

        # Test from_params raises when there are any other dict keys
        # present apart from 'directory_path' and we aren't calling from_dataset.
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(Params({"directory_path": vocab_dir, "min_count": {'tokens': 2}}))
Example #23
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace='1')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
        assert vocab.get_token_index("word", namespace='1') == word_index
        assert vocab.get_token_from_index(word_index, namespace='1') == "word"
        assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # Now add it again, in a different namespace and a different word, and make sure it's like
        # new.
        word2_index = vocab.add_token_to_namespace("word2", namespace='2')
        word_index = vocab.add_token_to_namespace("word", namespace='2')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert vocab.get_token_index("word", namespace='2') == word_index
        assert vocab.get_token_index("word2", namespace='2') == word2_index
        assert vocab.get_token_from_index(word_index, namespace='2') == "word"
        assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
        assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
Example #24
    def test_set_from_file_reads_padded_files(self):
        # pylint: disable=protected-access
        vocab_filename = self.TEST_DIR + 'vocab_file'
        with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
            vocab_file.write('<S>\n')
            vocab_file.write('</S>\n')
            vocab_file.write('<UNK>\n')
            vocab_file.write('a\n')
            vocab_file.write('tricky\x0bchar\n')
            vocab_file.write('word\n')
            vocab_file.write('another\n')

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=True, oov_token="<UNK>")

        assert vocab._oov_token == DEFAULT_OOV_TOKEN
        assert vocab.get_token_index("random string") == 3
        assert vocab.get_token_index("<S>") == 1
        assert vocab.get_token_index("</S>") == 2
        assert vocab.get_token_index(DEFAULT_OOV_TOKEN) == 3
        assert vocab.get_token_index("a") == 4
        assert vocab.get_token_index("tricky\x0bchar") == 5
        assert vocab.get_token_index("word") == 6
        assert vocab.get_token_index("another") == 7
        assert vocab.get_token_from_index(0) == vocab._padding_token
        assert vocab.get_token_from_index(1) == "<S>"
        assert vocab.get_token_from_index(2) == "</S>"
        assert vocab.get_token_from_index(3) == DEFAULT_OOV_TOKEN
        assert vocab.get_token_from_index(4) == "a"
        assert vocab.get_token_from_index(5) == "tricky\x0bchar"
        assert vocab.get_token_from_index(6) == "word"
        assert vocab.get_token_from_index(7) == "another"