Esempio n. 1
0
    def test_init_with_nested_field_as_nesting_field(self):
        """Passing a NestedField as the nesting field must raise ValueError."""
        inner = data.NestedField(data.Field())

        with pytest.raises(ValueError) as excinfo:
            data.NestedField(inner)

        message = str(excinfo.value)
        assert "nesting field must not be another NestedField" in message
Esempio n. 2
0
    def test_init_when_nesting_field_has_include_lengths_equal_true(self):
        """A nesting field built with include_lengths=True must be rejected."""
        inner = data.Field(include_lengths=True)

        with pytest.raises(ValueError) as excinfo:
            data.NestedField(inner)

        message = str(excinfo.value)
        assert "nesting field cannot have include_lengths=True" in message
Esempio n. 3
0
    def test_pad_when_pad_first_is_true(self):
        """Exercise ``NestedField.pad`` with ``pad_first=True``.

        The same minibatch is padded twice: once with the default return
        value and once with ``include_lengths=True``, which is unpacked as
        (padded array, sentence lengths, word lengths).
        """

        def make_chars(**extra):
            # Each scenario gets its own nesting field and NestedField.
            char_field = data.Field(tokenize=list,
                                    unk_token="<cunk>",
                                    pad_token="<cpad>",
                                    init_token="<w>",
                                    eos_token="</w>")
            return data.NestedField(char_field,
                                    init_token="<s>",
                                    eos_token="</s>",
                                    pad_first=True,
                                    **extra)

        def row(symbols, n_pad):
            # One expected row: <w> + symbols + </w>, then n_pad pad tokens.
            return ["<w>"] + list(symbols) + ["</w>"] + ["<cpad>"] * n_pad

        sentences = ("john loves mary", "mary cries")
        minibatch = [[list(word) for word in sentence.split()]
                     for sentence in sentences]

        all_pad = ["<cpad>"] * 7
        expected = [
            [
                row(["<s>"], 4),
                row("john", 1),
                row("loves", 0),
                row("mary", 1),
                row(["</s>"], 4),
            ],
            [
                # The shorter sentence receives its padding row first.
                all_pad,
                row(["<s>"], 4),
                row("mary", 1),
                row("cries", 0),
                row(["</s>"], 4),
            ],
        ]

        CHARS = make_chars()
        assert CHARS.pad(minibatch) == expected

        # test include_length
        CHARS = make_chars(include_lengths=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [5, 4]
        assert words_len == [[3, 6, 7, 6, 3], [0, 3, 6, 7, 3]]
Esempio n. 4
0
    def test_preprocess(self):
        """Check that both the nested and the nesting preprocessing hooks run.

        The outer hook reverses the word order; the inner hook upper-cases
        every character of each word.
        """

        def upper_chars(chars):
            return [ch.upper() for ch in chars]

        def backwards(words):
            return reversed(words)

        char_field = data.Field(tokenize=list, preprocessing=upper_chars)
        field = data.NestedField(char_field, preprocessing=backwards)

        result = field.preprocess("john loves mary")
        wanted = [list(word) for word in ("MARY", "LOVES", "JOHN")]

        assert result == wanted
Esempio n. 5
0
    def test_pad_when_nesting_field_has_fix_length(self):
        """Exercise ``NestedField.pad`` when the nesting field has fix_length=5.

        Checked with the default return value and with
        ``include_lengths=True``.
        """

        def make_chars(**extra):
            # Each scenario gets its own nesting field and NestedField.
            char_field = data.Field(tokenize=list,
                                    unk_token="<cunk>",
                                    pad_token="<cpad>",
                                    init_token="<w>",
                                    eos_token="</w>",
                                    fix_length=5)
            return data.NestedField(char_field,
                                    init_token="<s>",
                                    eos_token="</s>",
                                    **extra)

        def row(symbols, n_pad):
            # One expected row: <w> + symbols + </w>, then n_pad pad tokens.
            return ["<w>"] + list(symbols) + ["</w>"] + ["<cpad>"] * n_pad

        minibatch = [sentence.split()
                     for sentence in ("john loves mary", "mary cries")]

        # Words are expected truncated to three characters so that each row,
        # including <w> and </w>, is exactly five tokens long.
        expected = [
            [
                row(["<s>"], 2),
                row("joh", 0),
                row("lov", 0),
                row("mar", 0),
                row(["</s>"], 2),
            ],
            [
                row(["<s>"], 2),
                row("mar", 0),
                row("cri", 0),
                row(["</s>"], 2),
                ["<cpad>"] * 5,
            ],
        ]

        CHARS = make_chars()
        assert CHARS.pad(minibatch) == expected

        # test include length
        CHARS = make_chars(include_lengths=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [5, 4]
        assert words_len == [[3, 5, 5, 5, 3], [3, 5, 5, 3, 0]]
Esempio n. 6
0
    def test_pad_when_fix_length_is_not_none(self):
        """Exercise ``NestedField.pad`` when the NestedField has fix_length=3.

        Checked with the default return value and with
        ``include_lengths=True``.
        """

        def make_chars(**extra):
            # Each scenario gets its own nesting field and NestedField.
            char_field = data.Field(tokenize=list,
                                    unk_token="<cunk>",
                                    pad_token="<cpad>",
                                    init_token="<w>",
                                    eos_token="</w>")
            return data.NestedField(char_field,
                                    init_token="<s>",
                                    eos_token="</s>",
                                    fix_length=3,
                                    **extra)

        def row(symbols, n_pad):
            # One expected row: <w> + symbols + </w>, then n_pad pad tokens.
            return ["<w>"] + list(symbols) + ["</w>"] + ["<cpad>"] * n_pad

        minibatch = [sentence.split()
                     for sentence in ("john loves mary", "mary cries")]

        # Only the first word of each sentence is expected to survive between
        # the <s> and </s> rows.
        expected = [
            [row(["<s>"], 4), row("john", 1), row(["</s>"], 4)],
            [row(["<s>"], 4), row("mary", 1), row(["</s>"], 4)],
        ]

        CHARS = make_chars()
        assert CHARS.pad(minibatch) == expected

        # test include length
        CHARS = make_chars(include_lengths=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [3, 3]
        assert words_len == [[3, 6, 3], [3, 6, 3]]
Esempio n. 7
0
    def test_build_vocab_from_iterable(self):
        """Build a vocab from two plain nested-list sources and check its entries."""
        char_field = data.Field(unk_token="<cunk>", pad_token="<cpad>")
        CHARS = data.NestedField(char_field)

        first_source = [
            [list("aaa"), list("bbb"), ["c"]],
            [list("bbb"), list("aaa")],
        ]
        second_source = [
            [list("ccc"), list("bbb")],
            [list("bbb")],
        ]
        CHARS.build_vocab(first_source, second_source)

        expected = ["a", "b", "c", "<cunk>", "<cpad>"]
        assert len(CHARS.vocab) == len(expected)
        for token in expected:
            assert token in CHARS.vocab.stoi
Esempio n. 8
0
    def test_pad_when_nesting_field_is_not_sequential(self):
        """Exercise ``NestedField.pad`` with a non-sequential nesting field.

        The expected output is a flat, word-level padded batch that uses the
        "<pad>" token rather than the nesting field's "<cpad>".
        """
        word_field = data.Field(sequential=False,
                                unk_token="<cunk>",
                                pad_token="<cpad>",
                                init_token="<w>",
                                eos_token="</w>")
        CHARS = data.NestedField(word_field,
                                 init_token="<s>",
                                 eos_token="</s>")

        minibatch = [sentence.split()
                     for sentence in ("john loves mary", "mary cries")]
        expected = [
            "<s> john loves mary </s>".split(),
            "<s> mary cries </s> <pad>".split(),
        ]

        padded = CHARS.pad(minibatch)
        assert padded == expected
Esempio n. 9
0
    def test_build_vocab_from_dataset(self):
        """Build a vocab from a two-example Dataset with ``min_freq=2``.

        The expected vocab holds "a", "b" and the six special tokens; "c",
        which occurs once in the examples, is not expected.
        """
        char_field = data.Field(tokenize=list,
                                unk_token="<cunk>",
                                pad_token="<cpad>",
                                init_token="<w>",
                                eos_token="</w>")
        CHARS = data.NestedField(char_field,
                                 init_token="<s>",
                                 eos_token="</s>")

        schema = [("chars", CHARS)]
        examples = [
            data.Example.fromlist(["aaa bbb c"], [("chars", CHARS)]),
            data.Example.fromlist(["bbb aaa"], [("chars", CHARS)]),
        ]
        dataset = data.Dataset(examples, schema)

        CHARS.build_vocab(dataset, min_freq=2)

        expected = ["a", "b", "<w>", "</w>", "<s>", "</s>", "<cunk>", "<cpad>"]
        assert len(CHARS.vocab) == len(expected)
        for token in expected:
            assert token in CHARS.vocab.stoi
Esempio n. 10
0
    def test_pad_when_no_init_and_eos_tokens(self):
        """Exercise ``NestedField.pad`` when the NestedField has no init/eos tokens."""

        def row(symbols, n_pad):
            # One expected row: <w> + symbols + </w>, then n_pad pad tokens.
            return ["<w>"] + list(symbols) + ["</w>"] + ["<cpad>"] * n_pad

        char_field = data.Field(tokenize=list,
                                unk_token="<cunk>",
                                pad_token="<cpad>",
                                init_token="<w>",
                                eos_token="</w>")
        CHARS = data.NestedField(char_field)

        minibatch = [sentence.split()
                     for sentence in ("john loves mary", "mary cries")]
        expected = [
            [row("john", 1), row("loves", 0), row("mary", 1)],
            # The shorter sentence ends with one all-padding row.
            [row("mary", 1), row("cries", 0), ["<cpad>"] * 7],
        ]

        assert CHARS.pad(minibatch) == expected
Esempio n. 11
0
    def test_build_vocab(self):
        """Call ``build_vocab`` with pretrained vectors on character-level data.

        This test has no assertions; it only checks that the call completes.
        """
        char_field = data.Field(tokenize=list,
                                init_token="<w>",
                                eos_token="</w>")
        field = data.NestedField(char_field,
                                 init_token='<s>',
                                 eos_token='</s>',
                                 include_lengths=True,
                                 pad_first=True)

        # Three sentences, each word split into its characters.
        sentences = (
            "a sentence of data .",
            "yet another",
            "one last sent",
        )
        sources = [[list(word) for word in sentence.split()]
                   for sentence in sentences]

        vocab_options = {
            "vectors": 'glove.6B.50d',
            "unk_init": init.xavier_normal,
            "vectors_cache": ".vector_cache",
        }
        field.build_vocab(sources, **vocab_options)
Esempio n. 12
0
    def test_init_minimal(self):
        """Check the attributes of a NestedField built from only a nesting field."""
        inner = data.Field()
        field = data.NestedField(inner)

        assert isinstance(field, data.Field)
        assert field.nesting_field is inner

        # Flags expected to be truthy, then flags expected to be falsy.
        for flag in ("sequential", "use_vocab", "batch_first"):
            assert getattr(field, flag)
        for flag in ("include_lengths", "pad_first"):
            assert not getattr(field, flag)

        # Attributes expected to be None.
        for name in ("init_token", "eos_token", "fix_length",
                     "preprocessing", "postprocessing"):
            assert getattr(field, name) is None

        # Attributes expected to equal the nesting field's values.
        for name in ("unk_token", "lower", "pad_token"):
            assert getattr(field, name) == getattr(inner, name)

        assert field.dtype is torch.long
        assert field.tokenize("a b c") == "a b c".split()
Esempio n. 13
0
    def test_init_full(self):
        """Check that explicit constructor arguments are stored on the NestedField."""

        def flip(xs):
            return list(reversed(xs))

        def shout(xs):
            return [x.upper() for x in xs]

        inner = data.Field()
        options = dict(
            use_vocab=False,
            init_token="<s>",
            eos_token="</s>",
            fix_length=10,
            dtype=torch.float,
            preprocessing=flip,
            postprocessing=shout,
            tokenize=list,
            pad_first=True,
        )
        field = data.NestedField(inner, **options)

        words = "a b c".split()

        assert not field.use_vocab
        assert field.init_token == "<s>"
        assert field.eos_token == "</s>"
        assert field.fix_length == 10
        assert field.dtype is torch.float
        assert field.preprocessing(words) == "c b a".split()
        assert field.postprocessing(words) == "A B C".split()
        assert field.tokenize("abc") == ["a", "b", "c"]
        assert field.pad_first
Esempio n. 14
0
    def test_numericalize(self):
        """Exercise ``NestedField.numericalize`` with and without lengths.

        Both scenarios build a fresh field and vocab from the same two
        sentences, numericalize the same padded batch, and check each
        example via ``verify_numericalized_example``.
        """

        def build_field(**extra):
            # Fresh nesting field, NestedField and vocab for each scenario.
            inner = data.Field(batch_first=True)
            nested = data.NestedField(inner, **extra)
            ex1 = data.Example.fromlist(["john loves mary"],
                                        [("words", nested)])
            ex2 = data.Example.fromlist(["mary cries"], [("words", nested)])
            dataset = data.Dataset([ex1, ex2], [("words", nested)])
            nested.build_vocab(dataset)
            return nested

        def row(symbols, n_pad):
            # One batch row: <w> + symbols + </w>, then n_pad pad tokens.
            return ["<w>"] + list(symbols) + ["</w>"] + ["<cpad>"] * n_pad

        def padded_batch():
            # A new copy of the padded batch is built on every call.
            return [
                [
                    row(["<s>"], 4),
                    row("john", 1),
                    row("loves", 0),
                    row("mary", 1),
                    row(["</s>"], 4),
                ],
                [
                    row(["<s>"], 4),
                    row("mary", 1),
                    row("cries", 0),
                    row(["</s>"], 4),
                    ["<cpad>"] * 7,
                ],
            ]

        def check_examples(nested, batch, tensor):
            # Per-example comparison is delegated to the shared helper.
            for example, numericalized_example in zip(batch, tensor):
                verify_numericalized_example(nested,
                                             example,
                                             numericalized_example,
                                             batch_first=True)

        field = build_field()
        examples_data = padded_batch()
        numericalized = field.numericalize(examples_data)

        assert numericalized.dim() == 3
        assert numericalized.size(0) == len(examples_data)
        check_examples(field, examples_data, numericalized)

        # test include_lengths
        field = build_field(include_lengths=True)
        examples_data = padded_batch()
        sentence_lengths = [5, 4]
        word_lengths = [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]]

        numericalized, seq_len, word_len = field.numericalize(
            (examples_data, sentence_lengths, word_lengths))

        assert numericalized.dim() == 3
        assert len(seq_len) == 2
        assert len(word_len) == 2

        assert numericalized.size(0) == len(examples_data)
        check_examples(field, examples_data, numericalized)
Esempio n. 15
0
    def test_init_when_nesting_field_is_not_sequential(self):
        """A NestedField over a non-sequential field must use "<pad>" as pad token."""
        inner = data.Field(sequential=False)
        nested = data.NestedField(inner)

        assert nested.pad_token == "<pad>"
Esempio n. 16
0
                                                  batch_size=3,
                                                  device="cuda:0")

# Pull the first batch from the training iterator and print its three
# attributes for inspection.
batch = next(iter(train_iter))

print("words", batch.word)
print("udtags", batch.udtag)
print("ptbtags", batch.ptbtag)

# Now let's try both word and character embeddings
WORD = data.Field(init_token="<bos>", eos_token="<eos>")
PTB_TAG = data.Field(init_token="<bos>", eos_token="<eos>")

# We'll use NestedField to tokenize each word into list of chars
CHAR_NESTING = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
CHAR = data.NestedField(CHAR_NESTING, init_token="<bos>", eos_token="<eos>")

# The first entry pairs two names with two fields; presumably both fields are
# fed from the same input column, and the (None, None) entry presumably skips
# a column -- TODO confirm against the torchtext Dataset/Example documentation.
fields = [(('word', 'char'), (WORD, CHAR)), (None, None), ('ptbtag', PTB_TAG)]
train, val, test = datasets.UDPOS.splits(fields=fields)

# Print the dataset's fields, its size and the attributes of the first example.
print(train.fields)
print(len(train))
print(vars(train[0]))

# Word and character vocabularies are built from all three splits; the tag
# vocabulary is built from the training split only.
# NOTE(review): dim is passed to GloVe as the string '300', not an int --
# confirm that GloVe accepts it.
WORD.build_vocab(train.word,
                 val.word,
                 test.word,
                 vectors=[GloVe(name='6B', dim='300')])
CHAR.build_vocab(train.char, val.char, test.char)
PTB_TAG.build_vocab(train.ptbtag)