Example #1
from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import TokenCharactersIndexer


class TestTokenCharactersIndexer:  # illustrative wrapper so the excerpt runs under pytest
    def test_min_padding_length(self):
        sentence = "AllenNLP is awesome ."
        tokens = [Token(token) for token in sentence.split(" ")]
        vocab = Vocabulary()
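        # Indices 0 and 1 are reserved for padding and unknown tokens, so
        # the first explicit addition below lands at index 2.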
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("l", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("N", namespace="characters")  # 6
        vocab.add_token_to_namespace("L", namespace="characters")  # 7
        vocab.add_token_to_namespace("P", namespace="characters")  # 8
        vocab.add_token_to_namespace("i", namespace="characters")  # 9
        vocab.add_token_to_namespace("s", namespace="characters")  # 10
        vocab.add_token_to_namespace("a", namespace="characters")  # 11
        vocab.add_token_to_namespace("w", namespace="characters")  # 12
        vocab.add_token_to_namespace("o", namespace="characters")  # 13
        vocab.add_token_to_namespace("m", namespace="characters")  # 14
        vocab.add_token_to_namespace(".", namespace="characters")  # 15

        # min_padding_length=10 forces every token to be padded to at least
        # 10 characters, even though the longest token ("AllenNLP") has 8.
        indexer = TokenCharactersIndexer("characters", min_padding_length=10)
        indices = indexer.tokens_to_indices(tokens, vocab, "char")

        # Take the maximum padding length reported across all tokens; with
        # min_padding_length=10, every token reports at least 10.
        key_padding_lengths = "num_token_characters"
        value_padding_lengths = 0
        for token_indices in indices["char"]:
            lengths = indexer.get_padding_lengths(token_indices)
            value_padding_lengths = max(value_padding_lengths, max(lengths.values()))

        # Pad out to all 4 tokens and 10 characters per token.
        padded = indexer.as_padded_tensor(
            indices, {"char": len(indices["char"])},
            {key_padding_lengths: value_padding_lengths})
        assert padded["char"].tolist() == [[2, 3, 3, 4, 5, 6, 7, 8, 0, 0],
                                           [9, 10, 0, 0, 0, 0, 0, 0, 0, 0],
                                           [11, 12, 4, 10, 13, 14, 4, 0, 0, 0],
                                           [15, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
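
For contrast, a minimal sketch with min_padding_length left at its default of 0 (this snippet is not from the test suite; it assumes the same AllenNLP API as above, and the character loop is just a compact rewrite of Example #1's vocabulary additions): the padded width then collapses to the longest token, len("AllenNLP") == 8.

from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import TokenCharactersIndexer

tokens = [Token(t) for t in "AllenNLP is awesome .".split(" ")]
vocab = Vocabulary()
for ch in "AlenNLPisawom.":  # the sentence's 14 distinct characters, indices 2..15
    vocab.add_token_to_namespace(ch, namespace="characters")

indexer = TokenCharactersIndexer("characters")  # min_padding_length defaults to 0
indices = indexer.tokens_to_indices(tokens, vocab, "char")
max_chars = max(
    max(indexer.get_padding_lengths(t).values()) for t in indices["char"])
padded = indexer.as_padded_tensor(
    indices, {"char": len(indices["char"])},
    {"num_token_characters": max_chars})
assert padded["char"].shape == (4, 8)  # width 8 ("AllenNLP"), not 10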
Example #2
from allennlp.data.token_indexers import TokenCharactersIndexer


class TestTokenCharactersIndexer:  # illustrative wrapper so the excerpt runs under pytest
    def test_as_array_produces_token_sequence(self):
        indexer = TokenCharactersIndexer("characters", min_padding_length=1)
        # Pad three character-index lists out to 4 tokens of 10 characters
        # each; the missing fourth token becomes an all-zero row.
        padded_tokens = indexer.as_padded_tensor(
            {"k": [[1, 2, 3, 4, 5], [1, 2, 3], [1]]},
            desired_num_tokens={"k": 4},
            padding_lengths={"num_token_characters": 10})
        assert padded_tokens["k"].tolist() == [[1, 2, 3, 4, 5, 0, 0, 0, 0, 0],
                                               [1, 2, 3, 0, 0, 0, 0, 0, 0, 0],
                                               [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
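
The all-zero fourth row above comes from desired_num_tokens={"k": 4}: only three index lists are supplied, so one padding row is appended, and every row is right-padded with zeros out to num_token_characters=10. Going the other way, a hedged sketch (not from the original tests; it assumes as_padded_tensor truncates over-long input, as the pad_sequence_to_length utility underneath it does in this API version):

from allennlp.data.token_indexers import TokenCharactersIndexer

indexer = TokenCharactersIndexer("characters", min_padding_length=1)
truncated = indexer.as_padded_tensor(
    {"k": [[1, 2, 3, 4, 5], [1, 2, 3], [1]]},
    desired_num_tokens={"k": 2},  # fewer than the 3 lists provided
    padding_lengths={"num_token_characters": 5})
assert truncated["k"].tolist() == [[1, 2, 3, 4, 5],
                                   [1, 2, 3, 0, 0]]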