def test_as_tensor_produces_integer_targets(self):
        vocab = Dictionary()
        vocab.add_token_to_namespace("B", namespace='*labels')
        vocab.add_token_to_namespace("I", namespace='*labels')
        vocab.add_token_to_namespace("O", namespace='*labels')

        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags,
                                                  self.text,
                                                  label_namespace="*labels")
        sequence_label_field.index(vocab)
        padding_lengths = sequence_label_field.get_padding_lengths()
        tensor = sequence_label_field.as_tensor(
            padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor,
                                                numpy.array([0, 1, 2, 2, 2]))
    def test_index_converts_field_correctly(self):
        vocab = Dictionary()
        b_index = vocab.add_token_to_namespace("B", namespace='*labels')
        i_index = vocab.add_token_to_namespace("I", namespace='*labels')
        o_index = vocab.add_token_to_namespace("O", namespace='*labels')

        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags,
                                                  self.text,
                                                  label_namespace="*labels")
        sequence_label_field.index(vocab)

        # pylint: disable=protected-access
        assert sequence_label_field.indexed_labels == [
            b_index, i_index, o_index, o_index, o_index
        ]
    def test_additional_start_and_end_tokens(self):
        vocab = Dictionary()
        vocab.add_token_to_namespace("A", namespace='characters')  # 2
        vocab.add_token_to_namespace("s", namespace='characters')  # 3
        vocab.add_token_to_namespace("e", namespace='characters')  # 4
        vocab.add_token_to_namespace("n", namespace='characters')  # 5
        vocab.add_token_to_namespace("t", namespace='characters')  # 6
        vocab.add_token_to_namespace("c", namespace='characters')  # 7
        vocab.add_token_to_namespace("<", namespace='characters')  # 8
        vocab.add_token_to_namespace(">", namespace='characters')  # 9
        vocab.add_token_to_namespace("/", namespace='characters')  # 10

        indexer = TokenCharacterIndexer("characters",
                                        start_tokens=["<s>"],
                                        end_tokens=["</s>"],
                                        min_padding_length=1)
        indices = indexer.tokens_to_indices([Token("sentential")], vocab,
                                            "char")

        expected_ = {
            "char": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]
        }

        assert indices == expected_
    def test_label_field_can_index_with_vocab(self):
        vocab = Dictionary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths())
        assert tensor.item() == 0
Exemple #5
0
    def test_index_converts_field_correctly(self):
        vocab = Dictionary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]], {
                "characters":
                TokenCharacterIndexer(namespace="characters",
                                      min_padding_length=1)
            })
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words":
                SingleIdTokenIndexer(namespace="words"),
                "characters":
                TokenCharacterIndexer(namespace="characters",
                                      min_padding_length=1)
            })
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
    def test_token2indices_correct_characters(self):
        vocab = Dictionary()
        vocab.add_token_to_namespace("A", namespace='characters')  # 2
        vocab.add_token_to_namespace("s", namespace='characters')  # 3
        vocab.add_token_to_namespace("e", namespace='characters')  # 4
        vocab.add_token_to_namespace("n", namespace='characters')  # 5
        vocab.add_token_to_namespace("t", namespace='characters')  # 6
        vocab.add_token_to_namespace("c", namespace='characters')  # 7

        indexer = TokenCharacterIndexer("characters", min_padding_length=1)
        indices = indexer.tokens_to_indices([Token("sentential")], vocab,
                                            "char")

        expected_ = {"char": [[3, 4, 5, 6, 4, 5, 6, 1, 1, 1]]}

        assert indices == expected_
    def test_minimal_padding_length(self):
        sentence = "This is a test ."
        tokens = [Token(t) for t in sentence.split(" ")]
        vocab = Dictionary()
        vocab.add_token_to_namespace("T", namespace='characters')  # 2
        vocab.add_token_to_namespace("h", namespace='characters')  # 3
        vocab.add_token_to_namespace("i", namespace='characters')  # 4
        vocab.add_token_to_namespace("s", namespace='characters')  # 5
        vocab.add_token_to_namespace("a", namespace='characters')  # 6
        vocab.add_token_to_namespace("t", namespace='characters')  # 7
        vocab.add_token_to_namespace("e", namespace='characters')  # 8
        vocab.add_token_to_namespace(".", namespace='characters')  # 9
        vocab.add_token_to_namespace("y", namespace='characters')  # 10
        vocab.add_token_to_namespace("m", namespace='characters')  # 11
        vocab.add_token_to_namespace("n", namespace='characters')  # 12

        indexer = TokenCharacterIndexer("characters", min_padding_length=10)
        indices = indexer.tokens_to_indices(tokens, vocab, "char")
        key_padding_lengths = "num_token_characters"
        value_padding_lengths = 0

        for token in indices["char"]:
            item = indexer.get_padding_lengths(token)
            value = item.values()
            value_padding_lengths = max(value_padding_lengths, max(value))
        padded_ = indexer.pad_token_sequence(
            indices, {"char": len(indices["char"])},
            {key_padding_lengths: value_padding_lengths})

        expected_ = {
            "char": [[2, 3, 4, 5, 0, 0, 0, 0, 0, 0],
                     [4, 5, 0, 0, 0, 0, 0, 0, 0, 0],
                     [6, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [7, 8, 5, 7, 0, 0, 0, 0, 0, 0],
                     [9, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
        }

        assert padded_ == expected_
Exemple #8
0
class TestTextField(unittest.TestCase):
    def setUp(self):
        self.vocab = Dictionary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        vocab = Dictionary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]], {
                "characters":
                TokenCharacterIndexer(namespace="characters",
                                      min_padding_length=1)
            })
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words":
                SingleIdTokenIndexer(namespace="words"),
                "characters":
                TokenCharacterIndexer(namespace="characters",
                                      min_padding_length=1)
            })
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words_length": 5, "num_tokens": 5}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "num_tokens": 5,
            "characters_length": 5,
            "num_token_characters": 8
        }

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1),
                "words":
                SingleIdTokenIndexer("words")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "num_tokens": 5,
            "characters_length": 5,
            "words_length": 5,
            "num_token_characters": 8
        }

    def test_as_tensor_handles_words(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            expected_character_array)

    def test_as_tensor_handles_characters_if_empty_field(self):
        field = TextField(
            [],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([])
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            expected_character_array)

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["a", "sentence", "."]],
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 5
        padding_lengths["characters_length"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        print(field)

    def test_token_indexer_returns_dict(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict":
                DictReturningTokenIndexer(),
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 2,
            'words_length': 2,
            'characters_length': 2,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        padding_lengths['token_ids_length'] = 7
        padding_lengths['additional_key_length'] = 3
        padding_lengths['words_length'] = 4
        padding_lengths['characters_length'] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors['token_ids'].shape) == [7]
        assert list(tensors['additional_key'].shape) == [3]
        assert list(tensors['words'].shape) == [4]
        assert list(tensors['characters'].shape) == [4, 8]

    def test_token_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict":
                DictReturningTokenIndexer(token_min_padding_length=3),
                "words":
                SingleIdTokenIndexer("words", token_min_padding_length=3),
                "characters":
                TokenCharacterIndexer("characters",
                                      min_padding_length=1,
                                      token_min_padding_length=3)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 3,
            'words_length': 3,
            'characters_length': 3,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        tensors = field.as_tensor(padding_lengths)
        assert tensors['additional_key'].tolist()[-1] == 0
        assert tensors['words'].tolist()[-1] == 0
        assert tensors['characters'].tolist()[-1] == [0] * 8

    def test_sequence_methods(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]], {})

        assert len(field) == 5
        assert field[1].text == "is"
        assert [token.text
                for token in field] == ["This", "is", "a", "sentence", "."]