Beispiel #1
0
def test_custom_node_without_label():
    """Run an unlabeled corpus through a pipeline that contains a custom node.

    The custom node only installs an identity callable, so the assertions
    check the batching of the surrounding pipeline: one batch holding all
    samples, each sample being a list of tokens.
    """
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    @dataclass
    class CustomPreprocessor(Node):
        """Custom node whose ``node`` callable is the identity function."""

        def __post_init__(self):
            super().__init__()
            # Identity callable: whatever is passed in is returned as-is.
            self.node = lambda x: x

    pipe = (Corpus("custom") >>
            CustomPreprocessor() >>
            Tokenizer("bert", vocab_path) >>
            DataManager(batch_size=32))
    data = pipe.run(data_path)
    # batch_size=32 is expected to swallow the whole corpus in one batch
    assert len(data) == 1

    batch_count = 0
    for batch_count, batch in enumerate(data, start=1):
        assert isinstance(batch, list)
        # the single batch is expected to carry 10 samples (below batch_size)
        assert len(batch) == 10
        # each entry of the batch is one tokenized sample
        assert isinstance(batch[0], list)
        # a sample holds more than one token
        assert len(batch[0]) > 1
    assert batch_count == 1
Beispiel #2
0
def test_custom_node_with_label():
    """Run a labeled corpus through a pipeline that contains a custom node.

    The custom node only installs an identity callable. Each batch is
    expected to be an ``(inputs, labels)`` tuple with 5 entries per side.
    """
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"

    @dataclass
    class CustomPreprocessor(Node):
        """Custom node whose ``node`` callable is the identity function."""

        def __post_init__(self):
            super().__init__()
            # Identity callable: whatever is passed in is returned as-is.
            self.node = lambda x: x

    pipe = (Corpus("custom") >>
            CustomPreprocessor() >>
            Tokenizer("bert", vocab_path) >>
            DataManager(batch_size=5))
    data = pipe.run(data_path)
    # batch_size=5 is expected to split the corpus into two batches
    assert len(data) == 2

    batch_count = 0
    for batch_count, batch in enumerate(data, start=1):
        assert isinstance(batch, tuple)
        batch_x, batch_y = batch
        # batch size is 5 for both inputs and labels
        assert len(batch_x) == 5
        assert len(batch_y) == 5
        # each input entry is one tokenized sample
        assert isinstance(batch_x[0], list)
        # a sample holds more than one token
        assert len(batch_x[1]) > 1
        # labels are the strings "1" or "0"
        assert batch_y[0] in ("1", "0")
    assert batch_count == 2
Beispiel #3
0
def test_data_manager_sequence_pad(input_data):
    """Check the "sequence" DataManager with ``batch_size=3``.

    The expected output is a single batch in which the shorter entry is
    extended with zeros up to the length of the longer entry.
    """
    # NOTE: min_seq_len is not passed; the original note says it defaults to 1.
    sentence = ["I", "love", "you", ",", "and", "you", "love", "me", "."]
    padded_ids = [1, 2, 3, 4, 5, 0, 0, 0, 0]

    batches = list(DataManager(batch_size=3, name="sequence")(input_data))

    assert batches == [[sentence, padded_ids]]
Beispiel #4
0
def test_data_manager_sequence_default(input_data):
    """Check the "sequence" DataManager when no batch size is given.

    The expected output is two single-entry batches with no padding.
    """
    # NOTE: batch_size is not passed; the original note says it defaults to 1.
    sentence = ["I", "love", "you", ",", "and", "you", "love", "me", "."]
    ids = [1, 2, 3, 4, 5]

    batches = list(DataManager(name="sequence")(input_data))

    assert batches == [[sentence], [ids]]
Beispiel #5
0
def test_data_manager_sequence_drop_last(input_data):
    """Check the "sequence" DataManager with ``drop_last=True`` and batch size 2.

    One extra sample is appended to the fixture; only a single two-entry
    batch is expected to remain. Either ordering of the two entries inside
    that batch is accepted.
    """
    sentence = ["I", "love", "you", ",", "and", "you", "love", "me", "."]
    padded_ids = [1, 2, 3, 4, 5, 0, 0, 0, 0]

    # Build a new list so the fixture object itself is not mutated.
    extended = input_data + [["1", "2", "3", "4", "5"]]
    sequence_manager = DataManager(drop_last=True, batch_size=2, name="sequence")
    batches = list(sequence_manager(extended))

    ids_first = [[padded_ids, sentence]]
    sentence_first = [[sentence, padded_ids]]
    assert batches == ids_first or batches == sentence_first
Beispiel #6
0
def test_pipeline_without_label():
    """Run an unlabeled corpus through a ``>>``-chained pipeline.

    With ``batch_size=2`` the pipeline is expected to yield 5 batches, each a
    list of 2 tokenized samples.
    """
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"
    pipe = (Corpus("custom") >>
            Preprocessor("common") >>
            Tokenizer("bert", vocab_path) >>
            DataManager(batch_size=2))
    data = pipe.run(data_path)
    assert len(data) == 5

    batch_count = 0
    for batch_count, batch in enumerate(data, start=1):
        assert isinstance(batch, list)
        # batch size is 2
        assert len(batch) == 2
        # each entry of the batch is one tokenized sample
        assert isinstance(batch[0], list)
        # a sample holds more than one token
        assert len(batch[0]) > 1
    assert batch_count == 5
Beispiel #7
0
def test_normal_without_label():
    """Call each processing stage by hand on an unlabeled corpus.

    No pipeline operator is used: the stages are invoked one after another.
    ``DataManager()`` is created without arguments and is expected to yield
    10 batches of one tokenized sample each.
    """
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"
    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager()

    # Same call order as nesting the calls: corpus -> preprocess -> tokenize
    # -> batch.
    raw = corpus(data_path)
    cleaned = preprocessor(raw)
    tokenized = tokenizer(cleaned)
    data = datamanager(tokenized)
    assert len(data) == 10

    batch_count = 0
    for batch_count, batch in enumerate(data, start=1):
        assert isinstance(batch, list)
        # no batch_size was given, so one sample per batch is expected
        assert len(batch) == 1
        # each entry of the batch is one tokenized sample
        assert isinstance(batch[0], list)
        # a sample holds more than one token
        assert len(batch[0]) > 1
    assert batch_count == 10
Beispiel #8
0
def test_functional_pipeline_without_label():
    """Compose the stages with ``N(...) >> N(...)`` on an unlabeled corpus.

    The composed pipeline is called directly with the data path. With
    ``batch_size=2`` it is expected to yield 5 batches of 2 tokenized samples.
    """
    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"
    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager(batch_size=2)
    pipe = N(corpus) >> N(preprocessor) >> N(tokenizer) >> N(datamanager)
    data = pipe(data_path)
    assert len(data) == 5

    batch_count = 0
    for batch_count, batch in enumerate(data, start=1):
        assert isinstance(batch, list)
        # batch size is 2
        assert len(batch) == 2
        # each entry of the batch is one tokenized sample
        assert isinstance(batch[0], list)
        # a sample holds more than one token
        assert len(batch[0]) > 1
    assert batch_count == 5
Beispiel #9
0
def test_pipeline_with_label():
    """Run a labeled corpus through a ``>>``-chained pipeline with drop_last.

    With ``batch_size=3`` and ``drop_last=True`` the pipeline is expected to
    yield 3 batches, each an ``(inputs, labels)`` tuple with 3 entries per
    side.
    """
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"
    pipe = (Corpus("custom") >>
            Preprocessor("common") >>
            Tokenizer("bert", vocab_path) >>
            DataManager(batch_size=3, drop_last=True))
    data = pipe.run(data_path)
    # NOTE(review): presumably a trailing partial batch is dropped here --
    # confirm against the corpus size.
    assert len(data) == 3

    batch_count = 0
    for batch_count, batch in enumerate(data, start=1):
        assert isinstance(batch, tuple)
        batch_x, batch_y = batch
        # batch size is 3 for both inputs and labels
        assert len(batch_x) == 3
        assert len(batch_y) == 3
        # each input entry is one tokenized sample
        assert isinstance(batch_x[0], list)
        # a sample holds more than one token
        assert len(batch_x[1]) > 1
        # labels are the strings "1" or "0"
        assert batch_y[0] in ("1", "0")
    assert batch_count == 3
Beispiel #10
0
def test_normal_with_label():
    """Call each processing stage by hand on a labeled corpus.

    No pipeline operator is used: the stages are invoked one after another.
    With ``batch_size=2`` the result is expected to be 5 batches, each an
    ``(inputs, labels)`` tuple with 2 entries per side.
    """
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"
    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager(batch_size=2)

    # Same call order as nesting the calls: corpus -> preprocess -> tokenize
    # -> batch.
    raw = corpus(data_path)
    cleaned = preprocessor(raw)
    tokenized = tokenizer(cleaned)
    data = datamanager(tokenized)
    assert len(data) == 5

    batch_count = 0
    for batch_count, batch in enumerate(data, start=1):
        assert isinstance(batch, tuple)
        batch_x, batch_y = batch
        # batch size is 2 for both inputs and labels
        assert len(batch_x) == 2
        assert len(batch_y) == 2
        # each input entry is one tokenized sample
        assert isinstance(batch_x[0], list)
        # a sample holds more than one token
        assert len(batch_x[1]) > 1
        # labels are the strings "1" or "0"
        assert batch_y[0] in ("1", "0")
    assert batch_count == 5
Beispiel #11
0
def test_functional_pipeline_with_label():
    """Compose the stages with ``N(...) >> N(...)`` on a labeled corpus.

    The composed pipeline is called directly with the data path. With
    ``batch_size=3`` and ``drop_last=True`` it is expected to yield 3
    batches, each an ``(inputs, labels)`` tuple with 3 entries per side.
    """
    data_path = "tests/dataprocessor/corpus_data.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"
    corpus = Corpus("custom")
    preprocessor = Preprocessor("common")
    tokenizer = Tokenizer("bert", vocab_path)
    datamanager = DataManager(batch_size=3, drop_last=True)
    pipe = N(corpus) >> N(preprocessor) >> N(tokenizer) >> N(datamanager)
    data = pipe(data_path)
    assert len(data) == 3

    batch_count = 0
    for batch_count, batch in enumerate(data, start=1):
        assert isinstance(batch, tuple)
        batch_x, batch_y = batch
        # batch size is 3 for both inputs and labels
        assert len(batch_x) == 3
        assert len(batch_y) == 3
        # each input entry is one tokenized sample
        assert isinstance(batch_x[0], list)
        # a sample holds more than one token
        assert len(batch_x[1]) > 1
        # labels are the strings "1" or "0"
        assert batch_y[0] in ("1", "0")
    assert batch_count == 3
Beispiel #12
0
def test_pretrained_processor_input_dataloader():
    """Feed batched pipeline output into ``PretrainedBasicProcessor``.

    Every produced item must contain exactly four entries, and the four
    tensors checked below must all have shape ``(batch_size, seq_len)``.
    """
    pp = PretrainedBasicProcessor()

    batch_size = 10
    seq_len = 32
    expected_shape = torch.Size([batch_size, seq_len])
    # Checked in this order for every output.
    tensor_keys = (
        "input_ids",
        "attention_mask",
        "token_type_ids",
        "position_ids",
    )

    data_path = "tests/dataprocessor/corpus_data_without_label.txt"
    vocab_path = "tests/dataprocessor/vocab.txt"
    pipe = (
        Corpus("custom")
        >> Preprocessor("common")
        >> Tokenizer("bert", vocab_path)
        >> DataManager(batch_size=batch_size)
    )
    data = pipe.run(data_path)

    outputs = pp(data)

    for output in outputs:
        print(output)
        assert len(output) == 4
        for key in tensor_keys:
            assert output[key].shape == expected_shape
Beispiel #13
0
def test_data_manager_random_drop_last(input_data):
    """Check the "random" DataManager with ``drop_last=True`` and batch size 2.

    Only a structural property is asserted: the first two entries of the
    first batch have the same length.
    """
    # Build a new list so the fixture object itself is not mutated.
    extended = input_data + [["1", "2", "3", "4", "5"]]
    random_manager = DataManager(drop_last=True, batch_size=2, name="random")
    batches = list(random_manager(extended))

    first_batch = batches[0]
    assert len(first_batch[0]) == len(first_batch[1])
Beispiel #14
0
def test_data_manager_random_seq_len(input_data):
    """Check the "random" DataManager with ``min_seq_len=6`` and batch size 3.

    The expected output is one batch holding only the nine-token sentence.
    """
    sentence = ["I", "love", "you", ",", "and", "you", "love", "me", "."]

    random_manager = DataManager(min_seq_len=6, batch_size=3, name="random")
    batches = random_manager(input_data)

    assert list(batches) == [[sentence]]