Example #1
0
def test_c_py_compose_transforms_module():
    """
    Test combining Python and C++ transforms
    """
    ds.config.set_seed(0)

    def run_pipeline(source, in_cols, out_cols, transforms):
        # Build a NumpySlicesDataset, apply the mixed transform list, and
        # collect every output column of every row as nested Python lists.
        dataset = ds.NumpySlicesDataset(source, column_names=in_cols, shuffle=False)
        dataset = dataset.map(operations=transforms, input_columns=in_cols,
                              output_columns=out_cols, column_order=out_cols)
        collected = []
        for row in dataset.create_dict_iterator(output_numpy=True):
            for name in out_cols:
                collected.append(row[name].tolist())
        return collected

    source = [1, 0]
    # Python OneHot followed by a C++ Mask
    assert run_pipeline(source, ["cols"], ["cols"],
                        [py_transforms.OneHotOp(2), c_transforms.Mask(c_transforms.Relational.EQ, 1)]) == \
           [[[False, True]], [[True, False]]]
    # Python OneHot, a pyfunc, then a C++ Fill
    assert run_pipeline(source, ["cols"], ["cols"],
                        [py_transforms.OneHotOp(2), (lambda x: x + x), c_transforms.Fill(1)]) == \
           [[[1, 1]], [[1, 1]]]
    # pyfuncs on both sides of a C++ op
    assert run_pipeline(source, ["cols"], ["cols"],
                        [py_transforms.OneHotOp(2), (lambda x: x + x), c_transforms.Fill(1), (lambda x: x + x)]) == \
           [[[2, 2]], [[2, 2]]]
    # C++ PadEnd followed by a pyfunc
    assert run_pipeline([[1, 3]], ["cols"], ["cols"],
                        [c_transforms.PadEnd([3], -1), (lambda x: x + x)]) == [[2, 6, -2]]

    # Two input columns reduced to a single output column
    assert run_pipeline(([[1]], [[3]]), ["col0", "col1"], ["a"],
                        [(lambda x, y: x + y), c_transforms.PadEnd([2], -1)]) == [[4, -1]]
def process_tnews_clue_dataset(data_dir, label_list, bert_vocab_path,
                               data_usage='train', shuffle_dataset=False, max_seq_len=128, batch_size=64):
    """
    Process TNEWS dataset.

    Loads the requested split of the TNEWS CLUE dataset, converts labels and
    sentences into BERT input features (text_ids, mask_ids, segment_ids), and
    batches the result.

    Args:
        data_dir (str): directory containing train.json / dev.json / test.json.
        label_list (list[str]): label vocabulary used for the train/eval splits.
        bert_vocab_path (str): path to the BERT vocabulary file.
        data_usage (str): one of 'train', 'eval', 'test'.
        shuffle_dataset (bool): whether to shuffle the source dataset.
        max_seq_len (int): fixed (padded) sequence length.
        batch_size (int): batch size.

    Returns:
        tuple of lists: (label, text_ids, mask_ids, segment_ids), one entry per batch.
    """
    ### Loading TNEWS from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    # The 'eval' split is stored as dev.json; the other splits match their name.
    usage_to_file = {'train': "train.json", 'eval': "dev.json", 'test': "test.json"}
    dataset = ds.CLUEDataset(os.path.join(data_dir, usage_to_file[data_usage]), task='TNEWS',
                             usage=data_usage, shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        # The test split has no labels: duplicate "id" into a dummy "label_id"
        # column and zero it out. Fixed keyword to column_order (was the
        # inconsistent columns_order) to match the map() calls used by every
        # other pipeline in this file.
        dataset = dataset.map(input_columns=["id"], output_columns=["id", "label_id"],
                              column_order=["id", "label_id", "sentence"], operations=ops.Duplicate())
        dataset = dataset.map(input_columns=["label_id"], operations=ops.Fill(0))
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(input_columns="label_desc", output_columns="label_id", operations=label_lookup)
    ### Processing sentence
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    dataset = dataset.map(input_columns=["sentence"], operations=tokenizer)
    # NOTE(review): process_ner_msra_dataset slices to max_seq_len - 2 before
    # appending [CLS]/[SEP]; here tokens are sliced to max_seq_len first, so
    # the sequence can exceed max_seq_len after the special tokens are added —
    # confirm this is intended.
    dataset = dataset.map(input_columns=["sentence"], operations=ops.Slice(slice(0, max_seq_len)))
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence"], output_columns=["text_ids"], operations=lookup)
    dataset = dataset.map(input_columns=["text_ids"], operations=ops.PadEnd([max_seq_len], 0))
    # Generate mask_ids: 1 where text_ids is non-zero, 0 over the padding.
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "mask_ids"],
                          column_order=["label_id", "text_ids", "mask_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["mask_ids"], operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    # Generate segment_ids: all zeros for this single-sentence task.
    dataset = dataset.map(input_columns=["text_ids"], output_columns=["text_ids", "segment_ids"],
                          column_order=["label_id", "text_ids", "mask_ids", "segment_ids"], operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["segment_ids"], operations=ops.Fill(0))
    dataset = dataset.batch(batch_size)
    label = []
    text_ids = []
    mask_ids = []
    segment_ids = []
    for data in dataset:
        label.append(data[0])
        text_ids.append(data[1])
        mask_ids.append(data[2])
        segment_ids.append(data[3])
    return label, text_ids, mask_ids, segment_ids
def test_eager_fill():
    """
    Test Fill op is callable
    """
    # Invoke the op eagerly (outside a pipeline) on a plain Python list.
    result = data_trans.Fill(3)([4, 5, 6, 7])
    assert np.array_equal(result, np.array([3, 3, 3, 3]))
def process_ner_msra_dataset(data_dir,
                             label_list,
                             bert_vocab_path,
                             max_seq_len=128,
                             class_filter=None,
                             split_begin=None,
                             split_end=None):
    """Process MSRA dataset"""
    ### Loading MSRA from CLUEDataset
    generator = process_msra(data_dir, class_filter, split_begin, split_end)
    dataset = ds.GeneratorDataset(generator, column_names=['text', 'label'])

    ### Processing label
    label_to_id = text.Lookup(text.Vocab.from_list(label_list))
    dataset = dataset.map(input_columns="label",
                          output_columns="label_ids",
                          operations=label_to_id)
    dataset = dataset.map(input_columns=["label_ids"],
                          operations=ops.Concatenate(prepend=np.array([0], dtype='i')))
    dataset = dataset.map(input_columns=["label_ids"],
                          operations=ops.Slice(slice(0, max_seq_len)))
    dataset = dataset.map(input_columns=["label_ids"],
                          operations=ops.PadEnd([max_seq_len], 0))
    ### Processing sentence
    token_to_id = text.Lookup(text.Vocab.from_file(bert_vocab_path), unknown_token='[UNK]')
    dataset = dataset.map(input_columns=["text"],
                          output_columns=["sentence"],
                          operations=text.UnicodeCharTokenizer())
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Slice(slice(0, max_seq_len - 2)))
    dataset = dataset.map(input_columns=["sentence"],
                          operations=ops.Concatenate(prepend=np.array(["[CLS]"], dtype='S'),
                                                     append=np.array(["[SEP]"], dtype='S')))
    dataset = dataset.map(input_columns=["sentence"],
                          output_columns=["input_ids"],
                          operations=token_to_id)
    dataset = dataset.map(input_columns=["input_ids"],
                          operations=ops.PadEnd([max_seq_len], 0))
    # input_mask: 1 where input_ids is non-zero, 0 over the padding.
    dataset = dataset.map(input_columns=["input_ids"],
                          output_columns=["input_ids", "input_mask"],
                          column_order=["input_ids", "input_mask", "label_ids"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["input_mask"],
                          operations=ops.Mask(ops.Relational.NE, 0, mstype.int32))
    # segment_ids: all zeros for this single-sentence task.
    dataset = dataset.map(input_columns=["input_ids"],
                          output_columns=["input_ids", "segment_ids"],
                          column_order=["input_ids", "input_mask", "segment_ids", "label_ids"],
                          operations=ops.Duplicate())
    dataset = dataset.map(input_columns=["segment_ids"],
                          operations=ops.Fill(0))
    return dataset
Example #5
0
def test_compose():
    """
    Test C++ and Python Compose Op
    """
    ds.config.set_seed(0)

    def run_ops(source, op_list):
        # Run op_list over a single-column dataset; return the rows as lists,
        # or the error string if construction/validation fails.
        try:
            dataset = ds.NumpySlicesDataset(source, column_names="col", shuffle=False)
            dataset = dataset.map(input_columns=["col"], operations=op_list)
            return [row["col"].tolist()
                    for row in dataset.create_dict_iterator(output_numpy=True)]
        except (TypeError, ValueError) as err:
            return str(err)

    # Test simple compose with only 1 op, this would generate a warning
    assert run_ops([[1, 0], [3, 4]], ops.Compose([ops.Fill(2)])) == [[2, 2], [2, 2]]
    # Test 1 column -> 2 columns -> 1 -> 2 -> 1
    assert run_ops([[1, 0]],
                   ops.Compose([ops.Duplicate(), ops.Concatenate(),
                                ops.Duplicate(), ops.Concatenate()])) == [[1, 0] * 4]
    # Test one Python transform followed by a C transform. Type after OneHot is a float (mixed use-case)
    assert run_ops([1, 0],
                   ops.Compose([py_ops.OneHotOp(2), ops.TypeCast(mstype.int32)])) == \
           [[[0, 1]], [[1, 0]]]
    # Test exceptions.
    with pytest.raises(TypeError) as error_info:
        ops.Compose([1, ops.TypeCast(mstype.int32)])
    assert "op_list[0] is not a c_transform op (TensorOp) nor a callable pyfunc." in str(error_info.value)
    # Test empty op list
    with pytest.raises(ValueError) as error_info:
        run_ops([1, 0], ops.Compose([]))
    assert "op_list can not be empty." in str(error_info.value)

    # Test Python compose op
    assert run_ops([1, 0], py_ops.Compose([py_ops.OneHotOp(2)])) == [[[0, 1]], [[1, 0]]]
    assert run_ops([1, 0],
                   py_ops.Compose([py_ops.OneHotOp(2), (lambda x: x + x)])) == \
           [[[0, 2]], [[2, 0]]]
    # Test nested Python compose op
    assert run_ops([1, 0],
                   py_ops.Compose([py_ops.Compose([py_ops.OneHotOp(2)]), (lambda x: x + x)])) == \
           [[[0, 2]], [[2, 0]]]

    with pytest.raises(TypeError) as error_info:
        py_ops.Compose([(lambda x: x + x)])()
    assert "Compose was called without an image. Fix invocation (avoid it being invoked as Compose([...])())." in str(
        error_info.value)
Example #6
0
def test_py_vision_with_c_transforms():
    """
    Test combining Python vision operations with C++ transforms operations
    """

    ds.config.set_seed(0)

    def collect_images(op_list):
        # Apply op_list to the image column and gather the transformed images.
        source_dir = "../data/dataset/testImageNetData/train/"
        dataset = ds.ImageFolderDataset(dataset_dir=source_dir, shuffle=False)
        dataset = dataset.map(operations=op_list, input_columns=["image"])
        images = []
        for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
            images.append(row["image"])
        return images

    # Test with Mask Op
    mask_output = collect_images([py_vision.Decode(),
                                  py_vision.CenterCrop((2)), np.array,
                                  c_transforms.Mask(c_transforms.Relational.GE, 100)])
    single_mask = np.array([[[True, False, False], [True, False, False]],
                            [[True, False, False], [True, False, False]]])
    for expected, actual in zip([single_mask, single_mask], mask_output):
        np.testing.assert_array_equal(expected, actual)

    # Test with Fill Op
    fill_output = collect_images([py_vision.Decode(),
                                  py_vision.CenterCrop((4)), np.array,
                                  c_transforms.Fill(10)])
    for expected, actual in zip([np.ones((4, 4, 3)) * 10] * 2, fill_output):
        np.testing.assert_array_equal(expected, actual)

    # Test with Concatenate Op, which will raise an error since ConcatenateOp only supports rank 1 tensors.
    with pytest.raises(RuntimeError) as error_info:
        collect_images([py_vision.Decode(),
                        py_vision.CenterCrop((2)), np.array,
                        c_transforms.Concatenate(0)])
    assert "Only 1D tensors supported" in str(error_info.value)
Example #7
0
def test_fillop_bytes():
    """Fill a bytes column with a bytes fill value."""
    def source():
        yield (np.array(["A", "B", "C"], dtype='S'), )

    dataset = ds.GeneratorDataset(source, column_names=["col"])
    dataset = dataset.map(operations=data_trans.Fill(b'abc'), input_columns=["col"])

    expected = np.array([b'abc', b'abc', b'abc'], dtype='S')
    for row in dataset.create_tuple_iterator(output_numpy=True):
        np.testing.assert_array_equal(row[0], expected)
Example #8
0
def test_fillop_string():
    """Fill a string column with a string fill value."""
    def source():
        yield (np.array(["45555", "45555"], dtype='S'), )

    dataset = ds.GeneratorDataset(source, column_names=["col"])
    dataset = dataset.map(operations=data_trans.Fill("error"), input_columns=["col"])

    expected = np.array(['error', 'error'], dtype='S')
    for row in dataset.create_tuple_iterator(output_numpy=True):
        np.testing.assert_array_equal(row[0], expected)
Example #9
0
def test_fillop_up_type_cast():
    """
    Test Fill when the int fill value is up-cast to the column's float type.
    """
    def gen():
        # np.float64 replaces the deprecated ``np.float`` alias, which was
        # removed in NumPy 1.24; both mean the builtin float (64-bit).
        yield (np.array([4, 5, 6, 7], dtype=np.float64), )

    data = ds.GeneratorDataset(gen, column_names=["col"])
    fill_op = data_trans.Fill(3)

    data = data.map(operations=fill_op, input_columns=["col"])
    # Integer 3 is up-cast to 3.0 to match the float64 column.
    expected = np.array([3., 3., 3., 3.], dtype=np.float64)
    for data_row in data:
        np.testing.assert_array_equal(data_row[0].asnumpy(), expected)
Example #10
0
def test_fillop_down_type_cast():
    """Fill a uint8 column with a negative value (wraps on down-cast)."""
    def source():
        yield (np.array([4, 5, 6, 7], dtype=np.uint8),)

    dataset = ds.GeneratorDataset(source, column_names=["col"])
    dataset = dataset.map(input_columns=["col"], operations=data_trans.Fill(-3))

    # -3 wraps around to 253 when stored as uint8.
    expected = np.array([253, 253, 253, 253], dtype=np.uint8)
    for row in dataset:
        np.testing.assert_array_equal(row[0], expected)
Example #11
0
def test_fillop_error_handling():
    """Filling an int column with a string value must fail at execution."""
    def source():
        yield (np.array([4, 4, 4, 4]),)

    dataset = ds.GeneratorDataset(source, column_names=["col"])
    dataset = dataset.map(input_columns=["col"], operations=data_trans.Fill("words"))

    # The type mismatch is only detected when the pipeline is consumed.
    with pytest.raises(RuntimeError) as error_info:
        for row in dataset:
            print(row)
    assert "Types do not match" in repr(error_info.value)
Example #12
0
def test_fillop_error_handling():
    """Fill value with a mismatched datatype raises at iteration time."""
    def source():
        yield (np.array([4, 4, 4, 4]), )

    dataset = ds.GeneratorDataset(source, column_names=["col"])
    dataset = dataset.map(operations=data_trans.Fill("words"), input_columns=["col"])

    # The mismatch surfaces only when the pipeline is consumed.
    with pytest.raises(RuntimeError) as error_info:
        for _ in dataset:
            pass
    assert "fill datatype does not match the input datatype" in str(
        error_info.value)
def skip_test_serdes_fill(remove_json_files=True):
    """
    Test serdes on Fill data transform.
    """
    def source():
        yield (np.array([4, 5, 6, 7], dtype=np.int32), )

    pipeline = ds.GeneratorDataset(source, column_names=["col"])
    pipeline = pipeline.map(operations=c.Fill(3), input_columns=["col"])

    # Sanity-check the pipeline output before serializing it.
    expected = np.array([3, 3, 3, 3], dtype=np.int32)
    for row in pipeline:
        np.testing.assert_array_equal(row[0].asnumpy(), expected)

    # FIXME - need proper serdes support for Fill's fill_value parameter
    util_check_serialize_deserialize_file(pipeline, "fill_pipeline",
                                          remove_json_files)
Example #14
0
def test_compose():
    """Test ops.Compose over C++ and mixed C++/Python transform lists."""
    ds.config.set_seed(0)

    def apply_compose(source, op_list):
        # Wrap op_list in ops.Compose and run it; return rows as lists, or
        # the validation error string when Compose rejects the list.
        try:
            dataset = ds.NumpySlicesDataset(source, column_names="col", shuffle=False)
            dataset = dataset.map(operations=ops.Compose(op_list), input_columns=["col"])
            return [row["col"].tolist()
                    for row in dataset.create_dict_iterator(num_epochs=1, output_numpy=True)]
        except (TypeError, ValueError) as err:
            return str(err)

    # test simple compose with only 1 op, this would generate a warning
    assert apply_compose([[1, 0], [3, 4]], [ops.Fill(2)]) == [[2, 2], [2, 2]]
    # test 1 column -> 2columns -> 1 -> 2 -> 1
    assert apply_compose([[1, 0]],
                         [ops.Duplicate(), ops.Concatenate(),
                          ops.Duplicate(), ops.Concatenate()]) == [[1, 0] * 4]
    # test one python transform followed by a C transform. type after oneHot is float (mixed use-case)
    assert apply_compose([1, 0],
                         [py_ops.OneHotOp(2), ops.TypeCast(mstype.int32)]) == \
           [[[0, 1]], [[1, 0]]]
    # test exceptions. compose, randomApply randomChoice use the same validator
    assert "op_list[0] is not a c_transform op" in apply_compose(
        [1, 0], [1, ops.TypeCast(mstype.int32)])
    # test empty op list
    assert "op_list can not be empty." in apply_compose([1, 0], [])
def process_cmnli_clue_dataset(data_dir,
                               label_list,
                               bert_vocab_path,
                               data_usage='train',
                               shuffle_dataset=False,
                               max_seq_len=128,
                               batch_size=64,
                               drop_remainder=True):
    """Process CMNLI dataset.

    Builds a batched pipeline of BERT sentence-pair features: tokenizes both
    sentences, truncates the pair to max_seq_len - 3, adds [CLS]/[SEP]
    markers, and produces text_ids, segment_ids, mask_ids and label_id columns.

    Args:
        data_dir (str): directory containing train.json / dev.json / test.json.
        label_list (list[str]): label vocabulary used for the train/eval splits.
        bert_vocab_path (str): path to the BERT vocabulary file.
        data_usage (str): one of 'train', 'eval', 'test'.
        shuffle_dataset (bool): whether to shuffle the source dataset.
        max_seq_len (int): fixed (padded) sequence length.
        batch_size (int): batch size.
        drop_remainder (bool): whether to drop the final partial batch.

    Returns:
        The batched dataset with columns text_ids, mask_ids, segment_ids, label_id.
    """
    ### Loading CMNLI from CLUEDataset
    assert data_usage in ['train', 'eval', 'test']
    # The 'eval' split is stored as dev.json; the other splits match their name.
    if data_usage == 'train':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "train.json"),
                                 task='CMNLI',
                                 usage=data_usage,
                                 shuffle=shuffle_dataset)
    elif data_usage == 'eval':
        dataset = ds.CLUEDataset(os.path.join(data_dir, "dev.json"),
                                 task='CMNLI',
                                 usage=data_usage,
                                 shuffle=shuffle_dataset)
    else:
        dataset = ds.CLUEDataset(os.path.join(data_dir, "test.json"),
                                 task='CMNLI',
                                 usage=data_usage,
                                 shuffle=shuffle_dataset)
    ### Processing label
    if data_usage == 'test':
        # The test split has no labels: duplicate "id" into a dummy
        # "label_id" column and zero it out.
        dataset = dataset.map(
            operations=ops.Duplicate(),
            input_columns=["id"],
            output_columns=["id", "label_id"],
            column_order=["id", "label_id", "sentence1", "sentence2"])
        dataset = dataset.map(operations=ops.Fill(0),
                              input_columns=["label_id"])
    else:
        label_vocab = text.Vocab.from_list(label_list)
        label_lookup = text.Lookup(label_vocab)
        dataset = dataset.map(operations=label_lookup,
                              input_columns="label",
                              output_columns="label_id")
    ### Processing sentence pairs
    vocab = text.Vocab.from_file(bert_vocab_path)
    tokenizer = text.BertTokenizer(vocab, lower_case=True)
    lookup = text.Lookup(vocab, unknown_token='[UNK]')
    ### Tokenizing sentences and truncate sequence pair
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence1"])
    dataset = dataset.map(operations=tokenizer, input_columns=["sentence2"])
    # max_seq_len - 3 leaves room for [CLS] + two [SEP] tokens.
    dataset = dataset.map(operations=text.TruncateSequencePair(max_seq_len -
                                                               3),
                          input_columns=["sentence1", "sentence2"])
    ### Adding special tokens
    dataset = dataset.map(operations=ops.Concatenate(
        prepend=np.array(["[CLS]"], dtype='S'),
        append=np.array(["[SEP]"], dtype='S')),
                          input_columns=["sentence1"])
    dataset = dataset.map(
        operations=ops.Concatenate(append=np.array(["[SEP]"], dtype='S')),
        input_columns=["sentence2"])
    ### Generating segment_ids
    # type_sentence1/2 mark which sentence each token belongs to (0 / 1);
    # they are concatenated and padded into the final segment_ids column.
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["sentence1"],
        output_columns=["sentence1", "type_sentence1"],
        column_order=["sentence1", "type_sentence1", "sentence2", "label_id"])
    dataset = dataset.map(operations=ops.Duplicate(),
                          input_columns=["sentence2"],
                          output_columns=["sentence2", "type_sentence2"],
                          column_order=[
                              "sentence1", "type_sentence1", "sentence2",
                              "type_sentence2", "label_id"
                          ])
    dataset = dataset.map(operations=[lookup, ops.Fill(0)],
                          input_columns=["type_sentence1"])
    dataset = dataset.map(operations=[lookup, ops.Fill(1)],
                          input_columns=["type_sentence2"])
    dataset = dataset.map(
        operations=ops.Concatenate(),
        input_columns=["type_sentence1", "type_sentence2"],
        output_columns=["segment_ids"],
        column_order=["sentence1", "sentence2", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["segment_ids"])
    ### Generating text_ids
    dataset = dataset.map(operations=ops.Concatenate(),
                          input_columns=["sentence1", "sentence2"],
                          output_columns=["text_ids"],
                          column_order=["text_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=lookup, input_columns=["text_ids"])
    dataset = dataset.map(operations=ops.PadEnd([max_seq_len], 0),
                          input_columns=["text_ids"])
    ### Generating mask_ids
    # mask_ids: 1 where text_ids is non-zero, 0 over the padding.
    dataset = dataset.map(
        operations=ops.Duplicate(),
        input_columns=["text_ids"],
        output_columns=["text_ids", "mask_ids"],
        column_order=["text_ids", "mask_ids", "segment_ids", "label_id"])
    dataset = dataset.map(operations=ops.Mask(ops.Relational.NE, 0,
                                              mstype.int32),
                          input_columns=["mask_ids"])
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset