Example #1
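Counting is case-insensitive when the indexer is built with lowercase_tokens=True: "Hello" and "hello" fold into a single "hello" entry in the "words" namespace.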
    def test_count_vocab_items_case_insensitive(self):
        indexer = SingleIdTokenIndexer("words", lowercase_tokens=True)
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)

        assert counter["words"] == {"hello": 2}
Example #2
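The complementary test: with the default settings, casing is preserved and the two spellings are counted as separate vocabulary items.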
    def test_count_vocab_items_respect_casing(self):
        indexer = SingleIdTokenIndexer("words")
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token("Hello"), counter)
        indexer.count_vocab_items(Token("hello"), counter)

        assert counter["words"] == {"hello": 1, "Hello": 1}
Example #3
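Indexing a TextField maps each token to the ids stored in the vocabulary; word-level and character-level indexers can be attached individually or side by side under different keys.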
    def test_index_converts_field_correctly(self):
        vocab = Dictionary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"characters": TokenCharacterIndexer(namespace="characters",
                                                 min_padding_length=1)})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [
            [capital_a_char_index],
            [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
        ]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words": SingleIdTokenIndexer(namespace="words"),
                "characters": TokenCharacterIndexer(namespace="characters",
                                                    min_padding_length=1),
            })
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [
            [capital_a_char_index],
            [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
        ]
Example #4
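token_min_padding_length puts a floor on the padded length each indexer reports, so a two-token field still pads its word- and character-level outputs out to length 3, with the extra positions zero-filled.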
    def test_token_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
                "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
                "characters": TokenCharacterIndexer("characters",
                                                    min_padding_length=1,
                                                    token_min_padding_length=3),
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 3,
            'words_length': 3,
            'characters_length': 3,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        tensors = field.as_tensor(padding_lengths)
        assert tensors['additional_key'].tolist()[-1] == 0
        assert tensors['words'].tolist()[-1] == 0
        assert tensors['characters'].tolist()[-1] == [0] * 8
Example #5
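An indexer may emit several keys at once (DictReturningTokenIndexer produces token_ids and additional_key); each key gets its own padding length, and those lengths can be raised by hand before as_tensor is called.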
    def test_token_indexer_returns_dict(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(),
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharacterIndexer("characters", min_padding_length=1),
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 2,
            'words_length': 2,
            'characters_length': 2,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        padding_lengths['token_ids_length'] = 7
        padding_lengths['additional_key_length'] = 3
        padding_lengths['words_length'] = 4
        padding_lengths['characters_length'] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors['token_ids'].shape) == [7]
        assert list(tensors['additional_key'].shape) == [3]
        assert list(tensors['words'].shape) == [4]
        assert list(tensors['characters'].shape) == [4, 8]
Example #6
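When the requested padding lengths exceed what the field actually needs, as_tensor zero-pads both the token dimension and the per-token character dimension.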
    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["a", "sentence", "."]],
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 5
        padding_lengths["characters_length"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Example #7
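A test fixture that prepares a five-token TextField indexed by a word-level SingleIdTokenIndexer.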
    def setUp(self):
        super(TestIndexField, self).setUp()

        self.text = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            {"words": SingleIdTokenIndexer("words")})
Example #8
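as_tensor on a word-indexed field yields a 1-D tensor of vocabulary ids, one per token.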
    def test_as_tensor_handles_words(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1]))
Example #9
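get_padding_lengths always reports num_tokens plus one length per indexer key; the character indexer additionally reports num_token_characters, the character count of the longest token.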
    def test_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words_length": 5, "num_tokens": 5}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "num_tokens": 5,
            "characters_length": 5,
            "num_token_characters": 8
        }

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1),
                "words":
                SingleIdTokenIndexer("words")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "num_tokens": 5,
            "characters_length": 5,
            "words_length": 5,
            "num_token_characters": 8
        }
Example #10
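A dataset-reader constructor built around SingleIdTokenIndexer as the default: when no indexers are supplied, the source side gets {"tokens": SingleIdTokenIndexer()}, and the target side falls back to the source settings.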
    def __init__(self,
                 source_tokenizer: Tokenizer = None,
                 target_tokenizer: Tokenizer = None,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 target_token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 delimiter: str = "\t",
                 lazy: bool = False) -> None:
        super(Seq2SeqDatasetReader, self).__init__(lazy)

        self.source_tokenizer = source_tokenizer or WordTokenizer()
        self.target_tokenizer = target_tokenizer or self.source_tokenizer
        self.source_token_indexers = source_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self.target_token_indexers = target_token_indexers or self.source_token_indexers
        self.source_add_start_token = source_add_start_token
        self.delimiter = delimiter
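
A minimal usage sketch (hypothetical, not part of the original source; it assumes Seq2SeqDatasetReader, WordTokenizer, and SingleIdTokenIndexer are importable as named above). Because every argument is optional, spelling out the keyword arguments below is equivalent to passing nothing at all:

    # Hypothetical usage: these keyword arguments mirror the defaults the
    # constructor falls back to when nothing is passed in.
    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        source_token_indexers={"tokens": SingleIdTokenIndexer()},
        source_add_start_token=True,
        delimiter="\t",
        lazy=False)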
Example #11
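Another fixture in the same style as Example #7, this time for a sequence-labeling test.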
    def setUp(self):
        super(TestSequenceLabelField, self).setUp()

        self.text = TextField(
            [Token(t) for t in ["here", "are", "some", "words", "."]],
            {"words": SingleIdTokenIndexer("words")})
Example #12
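count_vocab_items tallies each token (or each of its characters) into the namespace its indexer was configured with; only the namespaces of the attached indexers show up in the counter.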
    def test_field_counts_vocab_items_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}
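
A hedged follow-up sketch (not in the original tests; it reuses only the Dictionary API shown in Example #3) of how such counts could be pushed into a vocabulary:

    # Hypothetical glue code: register every counted token under the
    # namespace it was counted in, using the Dictionary from Example #3.
    vocab = Dictionary()
    for namespace, counts in namespace_token_counts.items():
        for token_text in counts:
            vocab.add_token_to_namespace(token_text, namespace=namespace)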
Example #13
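A smoke test checking that a TextField can be printed without raising.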
    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        print(field)