Code example #1
0
File: test_index_field.py — Project: naetherm/NSEC_NMT
    def setUp(self):
        """Build the shared TextField fixture used by the tests in this class."""
        super(TestIndexField, self).setUp()

        tokens = [Token(word) for word in ["This", "is", "a", "sentence", "."]]
        self.text = TextField(tokens, {"words": SingleIdTokenIndexer("words")})
Code example #2
0
    def test_sequence_methods(self):
        """TextField supports len(), indexing, and iteration over its tokens."""
        words = ["This", "is", "a", "sentence", "."]
        field = TextField([Token(word) for word in words], {})

        assert len(field) == 5
        assert field[1].text == "is"
        assert [tok.text for tok in field] == ["This", "is", "a", "sentence", "."]
Code example #3
0
 def test_token_padding_lengths_are_computed_correctly(self):
     """Per-indexer ``token_min_padding_length`` must be reflected in the lengths."""
     indexers = {
         "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
         "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
         "characters": TokenCharacterIndexer("characters",
                                             min_padding_length=1,
                                             token_min_padding_length=3),
     }
     text_field = TextField([Token(w) for w in ["A", "sentence"]],
                            token_indexers=indexers)
     text_field.index(self.vocab)
     lengths = text_field.get_padding_lengths()
     expected_lengths = {
         'token_ids_length': 5,
         'additional_key_length': 3,
         'words_length': 3,
         'characters_length': 3,
         'num_token_characters': 8,
         'num_tokens': 5,
     }
     assert lengths == expected_lengths
     tensors = text_field.as_tensor(lengths)
     # Only two tokens were provided but the minimum is three, so the last
     # position of each keyed tensor must be padding (zeros).
     assert tensors['additional_key'].tolist()[-1] == 0
     assert tensors['words'].tolist()[-1] == 0
     assert tensors['characters'].tolist()[-1] == [0] * 8
Code example #4
0
 def test_token_indexer_returns_dict(self):
     """An indexer that returns a dict contributes one padding key per entry."""
     indexers = {
         "field_with_dict": DictReturningTokenIndexer(),
         "words": SingleIdTokenIndexer("words"),
         "characters": TokenCharacterIndexer("characters", min_padding_length=1),
     }
     text_field = TextField([Token(w) for w in ["A", "sentence"]],
                            token_indexers=indexers)
     text_field.index(self.vocab)
     lengths = text_field.get_padding_lengths()
     assert lengths == {
         'token_ids_length': 5,
         'additional_key_length': 2,
         'words_length': 2,
         'characters_length': 2,
         'num_token_characters': 8,
         'num_tokens': 5,
     }
     # Request extra padding on every key and verify the tensor shapes follow.
     lengths.update({'token_ids_length': 7,
                     'additional_key_length': 3,
                     'words_length': 4,
                     'characters_length': 4})
     tensors = text_field.as_tensor(lengths)
     assert list(tensors['token_ids'].shape) == [7]
     assert list(tensors['additional_key'].shape) == [3]
     assert list(tensors['words'].shape) == [4]
     assert list(tensors['characters'].shape) == [4, 8]
Code example #5
0
    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        """Padding lengths larger than the field's natural lengths pad with zeros."""
        indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharacterIndexer("characters", min_padding_length=1),
        }
        text_field = TextField([Token(w) for w in ["a", "sentence", "."]],
                               token_indexers=indexers)
        text_field.index(self.vocab)
        lengths = text_field.get_padding_lengths()
        # Stretch three tokens to five, and each word to ten characters.
        lengths["words_length"] = 5
        lengths["characters_length"] = 5
        lengths["num_token_characters"] = 10
        tensors = text_field.as_tensor(lengths)

        numpy.testing.assert_array_almost_equal(
            tensors["words"].detach().cpu().numpy(),
            numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensors["characters"].detach().cpu().numpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Code example #6
0
File: seq2seq_reader.py — Project: naetherm/NSEC_NMT
    def text_to_instance(self,
                         source_string: str,
                         target_string: str = None) -> Instance:
        """Tokenize a source sentence (and optional target) into an Instance.

        END_SYMBOL is appended to both sequences; START_SYMBOL is always
        prepended to the target, but only prepended to the source when
        ``source_add_start_token`` is set.
        """
        source_tokens = self.source_tokenizer.tokenize(source_string)
        if self.source_add_start_token:
            source_tokens.insert(0, Token(START_SYMBOL))
        source_tokens.append(Token(END_SYMBOL))

        fields = {
            "source_tokens": TextField(source_tokens,
                                       self.source_token_indexers)
        }
        if target_string is not None:
            target_tokens = self.target_tokenizer.tokenize(target_string)
            target_tokens.insert(0, Token(START_SYMBOL))
            target_tokens.append(Token(END_SYMBOL))
            fields["target_tokens"] = TextField(target_tokens,
                                                self.target_token_indexers)
        return Instance(fields)
Code example #7
0
 def test_as_tensor_handles_words(self):
     """A single-id indexer yields exactly one vocabulary id per token."""
     tokens = [Token(w) for w in ["This", "is", "a", "sentence", "."]]
     text_field = TextField(
         tokens, token_indexers={"words": SingleIdTokenIndexer("words")})
     text_field.index(self.vocab)
     tensors = text_field.as_tensor(text_field.get_padding_lengths())
     numpy.testing.assert_array_almost_equal(
         tensors["words"].detach().cpu().numpy(),
         numpy.array([1, 1, 1, 2, 1]))
Code example #8
0
 def test_as_tensor_handles_characters_if_empty_field(self):
     """An empty token list must tensorize to an empty character array."""
     text_field = TextField(
         [],
         token_indexers={
             "characters": TokenCharacterIndexer("characters",
                                                 min_padding_length=1)
         })
     text_field.index(self.vocab)
     tensors = text_field.as_tensor(text_field.get_padding_lengths())
     numpy.testing.assert_array_almost_equal(
         tensors["characters"].detach().cpu().numpy(),
         numpy.array([]))
Code example #9
0
 def test_as_tensor_handles_characters(self):
     """Each token becomes a row of character ids, zero-padded to the longest word."""
     tokens = [Token(w) for w in ["This", "is", "a", "sentence", "."]]
     text_field = TextField(
         tokens,
         token_indexers={
             "characters": TokenCharacterIndexer("characters",
                                                 min_padding_length=1)
         })
     text_field.index(self.vocab)
     tensors = text_field.as_tensor(text_field.get_padding_lengths())
     expected = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                             [1, 3, 0, 0, 0, 0, 0, 0],
                             [1, 0, 0, 0, 0, 0, 0, 0],
                             [3, 4, 5, 6, 4, 5, 7, 4],
                             [1, 0, 0, 0, 0, 0, 0, 0]])
     numpy.testing.assert_array_almost_equal(
         tensors["characters"].detach().cpu().numpy(), expected)
Code example #10
0
    def test_field_counts_vocab_items_correctly(self):
        """count_vocab_items tallies tokens under each indexer's namespace only."""
        sentence = ["This", "is", "a", "sentence", "."]
        word_counts = {"This": 1, "is": 1, "a": 1, "sentence": 1, ".": 1}
        char_counts = {"T": 1, "h": 1, "i": 2, "s": 3, "a": 1,
                       "e": 3, "n": 2, "t": 1, "c": 1, ".": 1}

        # Words-only indexer: counts land in the "words" namespace alone.
        field = TextField(
            [Token(t) for t in sentence],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        for word, count in word_counts.items():
            assert namespace_token_counts["words"][word] == count
        assert list(namespace_token_counts.keys()) == ["words"]

        # Characters-only indexer: counts land in "characters" alone.
        field = TextField(
            [Token(t) for t in sentence],
            token_indexers={
                "characters": TokenCharacterIndexer("characters",
                                                    min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        for char, count in char_counts.items():
            assert namespace_token_counts["characters"][char] == count
        assert list(namespace_token_counts.keys()) == ["characters"]

        # Both indexers together populate both namespaces.
        field = TextField(
            [Token(t) for t in sentence],
            token_indexers={
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharacterIndexer("characters",
                                                    min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        for char, count in char_counts.items():
            assert namespace_token_counts["characters"][char] == count
        for word, count in word_counts.items():
            assert namespace_token_counts["words"][word] == count
        assert set(namespace_token_counts.keys()) == {"words", "characters"}
Code example #11
0
 def test_printing_doesnt_crash(self):
     """Exercise TextField's string representation — it must not raise."""
     tokens = [Token(w) for w in ["A", "sentence"]]
     field = TextField(tokens,
                       {"words": SingleIdTokenIndexer(namespace="words")})
     print(field)
Code example #12
0
    def test_padding_lengths_are_computed_correctly(self):
        """get_padding_lengths reports one length per indexer plus num_tokens."""
        sentence = ["This", "is", "a", "sentence", "."]

        # Single-id indexer only.
        field = TextField(
            [Token(t) for t in sentence],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        assert field.get_padding_lengths() == {"words_length": 5,
                                               "num_tokens": 5}

        # Character indexer adds the longest-word character count.
        field = TextField(
            [Token(t) for t in sentence],
            token_indexers={
                "characters": TokenCharacterIndexer("characters",
                                                    min_padding_length=1)
            })
        field.index(self.vocab)
        assert field.get_padding_lengths() == {
            "num_tokens": 5,
            "characters_length": 5,
            "num_token_characters": 8
        }

        # Both indexers contribute their keys side by side.
        field = TextField(
            [Token(t) for t in sentence],
            token_indexers={
                "characters": TokenCharacterIndexer("characters",
                                                    min_padding_length=1),
                "words": SingleIdTokenIndexer("words")
            })
        field.index(self.vocab)
        assert field.get_padding_lengths() == {
            "num_tokens": 5,
            "characters_length": 5,
            "words_length": 5,
            "num_token_characters": 8
        }
Code example #13
0
    def test_index_converts_field_correctly(self):
        """Indexing a field stores the vocabulary ids each indexer produces."""
        vocab = Dictionary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        expected_words = [capital_a_index, sentence_index]
        expected_chars = [[capital_a_char_index],
                          [s_index, e_index, n_index, t_index,
                           e_index, n_index, c_index, e_index]]

        # Word-level indexer alone.
        words_field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"words": SingleIdTokenIndexer(namespace="words")})
        words_field.index(vocab)
        # pylint: disable=protected-access
        assert words_field._indexed_tokens["words"] == expected_words

        # Character-level indexer alone.
        chars_field = TextField(
            [Token(t) for t in ["A", "sentence"]], {
                "characters": TokenCharacterIndexer(namespace="characters",
                                                    min_padding_length=1)
            })
        chars_field.index(vocab)
        assert chars_field._indexed_tokens["characters"] == expected_chars

        # Both indexers together.
        both_field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words": SingleIdTokenIndexer(namespace="words"),
                "characters": TokenCharacterIndexer(namespace="characters",
                                                    min_padding_length=1)
            })
        both_field.index(vocab)
        assert both_field._indexed_tokens["words"] == expected_words
        assert both_field._indexed_tokens["characters"] == expected_chars