Example #1
def test_numpyslices_sampler_chain_batch():
    """
    Test NumpySlicesDataset sampler chaining, with batch
    """
    logger.info("test_numpyslices_sampler_chain_batch")

    # Create NumpySlicesDataset with sampler chain
    np_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    sampler = ds.SequentialSampler(start_index=1, num_samples=3)
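    # Note: add_child() returns None (see the AttributeError in Example #23),
    # so this reassignment leaves sampler as None and the dataset falls back
    # to its default sampler over all 10 elements, giving 4 batches of size 3.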
    sampler = sampler.add_child(
        ds.SequentialSampler(start_index=1, num_samples=2))
    data1 = ds.NumpySlicesDataset(np_data, sampler=sampler)
    data1 = data1.batch(batch_size=3, drop_remainder=False)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 4

    # Verify number of rows
    assert sum([1 for _ in data1]) == 4

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
Example #2
def test_numpyslices_sampler_chain2():
    """
    Test NumpySlicesDataset sampler chain
    """
    logger.info("test_numpyslices_sampler_chain2")

    # Create NumpySlicesDataset with sampler chain
    # Use 2 statements to add child sampler
    np_data = [1, 2, 3, 4]
    sampler = ds.SequentialSampler(start_index=1, num_samples=1)
    child_sampler = ds.SequentialSampler(start_index=1, num_samples=2)
    sampler.add_child(child_sampler)
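    # The parent sampler requests num_samples=1, which caps the chained
    # output at a single row.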
    data1 = ds.NumpySlicesDataset(np_data, sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 1

    # Verify number of rows
    assert sum([1 for _ in data1]) == 1

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
def test_cv_minddataset_sequential_sampler_exceed_size(add_and_remove_cv_file):
    data = get_data(CV_DIR_NAME, True)
    columns_list = ["data", "file_name", "label"]
    num_readers = 4
    sampler = ds.SequentialSampler(2, 10)
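    # SequentialSampler(2, 10): start at index 2 and draw 10 samples; with a
    # 10-record file this runs past the end, which the modulo lookup below
    # suggests wraps around to the beginning.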
    data_set = ds.MindDataset(CV_FILE_NAME + "0",
                              columns_list,
                              num_readers,
                              sampler=sampler)
    dataset_size = data_set.get_dataset_size()
    assert dataset_size == 10
    num_iter = 0
    for item in data_set.create_dict_iterator():
        logger.info(
            "-------------- cv reader basic: {} ------------------------".
            format(num_iter))
        logger.info(
            "-------------- item[data]: {}  -----------------------------".
            format(item["data"]))
        logger.info(
            "-------------- item[file_name]: {} ------------------------".
            format(item["file_name"]))
        logger.info(
            "-------------- item[label]: {} ----------------------------".
            format(item["label"]))
        assert item['file_name'] == np.array(data[(num_iter + 2) %
                                                  dataset_size]['file_name'],
                                             dtype='S')
        num_iter += 1
    assert num_iter == 10
Example #4
def test_add_sampler_invalid_input():
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"
    _ = {
        (172876, 0): 0,
        (54214, 0): 1,
        (54214, 1): 2,
        (173673, 0): 3,
        (64631, 1): 4
    }
    data1 = ds.ManifestDataset(manifest_file)

    with pytest.raises(TypeError) as info:
        data1.use_sampler(1)
    assert "not an instance of a sampler" in str(info.value)

    with pytest.raises(TypeError) as info:
        data1.use_sampler("sampler")
    assert "not an instance of a sampler" in str(info.value)

    sampler = ds.SequentialSampler()
    with pytest.raises(RuntimeError) as info:
        data2 = ds.ManifestDataset(manifest_file,
                                   sampler=sampler,
                                   num_samples=20)
    assert "sampler and num_samples cannot be specified at the same time" in str(
        info.value)
Example #5
def test_chained_sampler_03():
    logger.info("Test Case Chained Sampler - Random and Sequential, with repeat then batch")

    # Create chained sampler, random and sequential
    sampler = ds.RandomSampler()
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)
    # Create ImageFolderDataset with sampler
    data1 = ds.ImageFolderDataset(DATA_DIR, sampler=sampler)

    data1 = data1.repeat(count=2)
    data1 = data1.batch(batch_size=5, drop_remainder=False)
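    # Assuming the same 44-image folder as the golden labels in Example #9:
    # 44 images * 2 repeats = 88 rows, and ceil(88 / 5) = 18 batches with
    # drop_remainder=False.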

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 18

    # Verify number of iterations
    num_iter = 0
    for item in data1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 18
Example #6
def test_generator_num_samples():
    source = [(np.array([x]), ) for x in range(64)]
    num_samples = 32
    ds1 = ds.GeneratorDataset(
        source, ["data"],
        sampler=ds.SequentialSampler(num_samples=num_samples))
    ds2 = ds.GeneratorDataset(source, ["data"],
                              sampler=[i for i in range(32)],
                              num_samples=num_samples)
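    # generator_1d is assumed to be a 1-D generator defined elsewhere in this
    # test module; note that a plain list of indices (ds2 above) is also
    # accepted as a sampler.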
    ds3 = ds.GeneratorDataset(generator_1d, ["data"], num_samples=num_samples)

    count = 0
    for _ in ds1.create_dict_iterator():
        count = count + 1
    assert count == num_samples

    count = 0
    for _ in ds2.create_dict_iterator():
        count = count + 1
    assert count == num_samples

    count = 0
    for _ in ds3.create_dict_iterator():
        count = count + 1
    assert count == num_samples
Example #7
def test_raise_error():
    data1 = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)},
             {'image': np.zeros(3, np.uint8)}, {'image': np.zeros(4, np.uint8)},
             {'image': np.zeros(5, np.uint8)}]
    data2 = [{'image': np.zeros(6, np.uint8)}, {'image': np.zeros(7, np.uint8)},
             {'image': np.zeros(8, np.uint8)}]

    ds1 = ds.PaddedDataset(data1)
    ds4 = ds1.batch(2)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds4 + ds2
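    # Sampler replacement on a concatenated (+) dataset is restricted: the
    # cases below suggest only an unshuffled DistributedSampler without
    # num_samples is accepted, and not when a child pipeline has already
    # been batched.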

    with pytest.raises(TypeError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=None)
        ds3.use_sampler(testsampler)
    assert excinfo.type is TypeError

    with pytest.raises(TypeError) as excinfo:
        otherSampler = ds.SequentialSampler()
        ds3.use_sampler(otherSampler)
    assert excinfo.type is TypeError

    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=True, num_samples=None)
        ds3.use_sampler(testsampler)
    assert excinfo.type is ValueError

    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=5)
        ds3.use_sampler(testsampler)
    assert excinfo.type is ValueError
Example #8
def test_add_sampler_invalid_input():
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"
    _ = {
        (172876, 0): 0,
        (54214, 0): 1,
        (54214, 1): 2,
        (173673, 0): 3,
        (64631, 1): 4
    }
    data1 = ds.ManifestDataset(manifest_file)

    with pytest.raises(TypeError) as info:
        data1.use_sampler(1)
    assert "not an instance of a sampler" in str(info.value)

    with pytest.raises(TypeError) as info:
        data1.use_sampler("sampler")
    assert "not an instance of a sampler" in str(info.value)

    sampler = ds.SequentialSampler()
    with pytest.raises(ValueError) as info:
        data2 = ds.ManifestDataset(manifest_file,
                                   sampler=sampler,
                                   num_samples=20)
    assert "Conflicting arguments during sampler assignments" in str(
        info.value)
Example #9
def test_sequential_sampler():
    logger.info("Test Case SequentialSampler")

    golden = [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
    ]

    # define parameters
    repeat_count = 1

    # apply dataset operations
    sampler = ds.SequentialSampler()
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    result = []
    num_iter = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        result.append(item["label"])
        num_iter += 1

    logger.info("Result: {}".format(result))
    assert result == golden
Example #10
def test_manifest_sampler_chain_batch_repeat():
    """
    Test ManifestDataset sampler chain DistributedSampler->SequentialSampler, with batch then repeat
    """
    logger.info("test_manifest_sampler_chain_batch_repeat")
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"

    # Create sampler chain DistributedSampler->SequentialSampler
    sampler = ds.DistributedSampler(num_shards=1,
                                    shard_id=0,
                                    shuffle=False,
                                    num_samples=5)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)

    # Create ManifestDataset with sampler chain
    data1 = ds.ManifestDataset(manifest_file, decode=True, sampler=sampler)
    one_hot_encode = c_transforms.OneHot(3)
    data1 = data1.map(operations=one_hot_encode, input_columns=["label"])
    data1 = data1.batch(batch_size=5, drop_remainder=False)
    data1 = data1.repeat(count=2)
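    # 5 samples / batch_size=5 = 1 batch, repeated twice -> dataset size 2.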

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 2
Example #11
def test_voc_sampler_chain():
    """
    Test VOC sampler chain
    """
    logger.info("test_voc_sampler_chain")

    sampler = ds.DistributedSampler(num_shards=2,
                                    shard_id=0,
                                    shuffle=False,
                                    num_samples=5)
    child_sampler = ds.SequentialSampler(start_index=0)
    sampler.add_child(child_sampler)
    data1 = ds.VOCDataset(VOC_DATA_DIR, task="Segmentation", sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 5

    # Verify number of rows
    assert sum([1 for _ in data1]) == 5

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
Example #12
def test_manifest_sampler_chain_repeat():
    """
    Test ManifestDataset sampler chain DistributedSampler->SequentialSampler, with repeat
    """
    logger.info("test_manifest_sampler_chain_batch")
    manifest_file = "../data/dataset/testManifestData/test5trainimgs.json"

    # Create sampler chain DistributedSampler->SequentialSampler
    sampler = ds.DistributedSampler(num_shards=1,
                                    shard_id=0,
                                    shuffle=False,
                                    num_samples=5)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)

    # Create ManifestDataset with sampler chain
    data1 = ds.ManifestDataset(manifest_file, sampler=sampler)
    data1 = data1.repeat(count=2)
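    # 5 samples repeated twice -> 10 rows.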

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 10

    # Verify number of rows
    assert sum([1 for _ in data1]) == 10

    # Verify dataset contents
    filename = "sampler_chain_manifest_repeat_result.npz"
    save_and_check_md5(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #13
def test_cifar_sampler_chain():
    """
    Test Cifar sampler chain
    """
    logger.info("test_cifar_sampler_chain")

    sampler = ds.DistributedSampler(num_shards=2,
                                    shard_id=0,
                                    shuffle=False,
                                    num_samples=5)
    child_sampler = ds.RandomSampler(replacement=True, num_samples=4)
    child_sampler2 = ds.SequentialSampler(start_index=0, num_samples=2)
    child_sampler.add_child(child_sampler2)
    sampler.add_child(child_sampler)
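    # Each sampler draws from its child's output: the innermost
    # SequentialSampler yields 2 indices, and splitting across num_shards=2
    # appears to leave a single row (hence the size of 1 below).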
    data1 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, sampler=sampler)
    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 1

    # Verify number of rows
    assert sum([1 for _ in data1]) == 1

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
Example #14
def test_generator_sequential_sampler():
    source = [(np.array([x]),) for x in range(64)]
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler())
    i = 0
    for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(data["data"], golden)
        i = i + 1
Example #15
def test_sequential_sampler():
    source = [(np.array([x]),) for x in range(64)]
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler())
    i = 0
    for data in ds1.create_dict_iterator():  # each data is a dictionary
        golden = np.array([i])
        assert np.array_equal(data["data"], golden)
        i = i + 1
Example #16
def test_numpy_slices_sequential_sampler():
    logger.info("Test numpy_slices_dataset with SequentialSampler and repeat.")

    np_data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]
    ds = de.NumpySlicesDataset(np_data, sampler=de.SequentialSampler()).repeat(2)

    for i, data in enumerate(ds):
        assert np.equal(data[0].asnumpy(), np_data[i % 8]).all()
Example #17
    def test_config(start_index, num_samples):
        sampler = ds.SequentialSampler(start_index, num_samples)
        d = ds.ManifestDataset(manifest_file, sampler=sampler)

        res = []
        for item in d.create_dict_iterator():
            res.append(map_[(item["image"].shape[0], item["label"].item())])

        return res
Example #18
def test_cifar10_with_chained_sampler_get_dataset_size():
    """
    Test Cifar10Dataset with PKSampler chained with a SequentialSampler and get_dataset_size
    """
    sampler = ds.SequentialSampler(start_index=0, num_samples=5)
    child_sampler = ds.PKSampler(4)
    sampler.add_child(child_sampler)
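    # The parent SequentialSampler's num_samples=5 caps the chained output
    # at 5 rows.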
    data = ds.Cifar10Dataset(DATA_DIR_10, sampler=sampler)
    num_iter = 0
    ds_sz = data.get_dataset_size()
    for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        num_iter += 1
    assert ds_sz == num_iter == 5
Example #19
    def test_config(num_samples, num_repeats=None):
        sampler = ds.SequentialSampler()
        data1 = ds.ManifestDataset(manifest_file, num_samples=num_samples, sampler=sampler)
        if num_repeats is not None:
            data1 = data1.repeat(num_repeats)
        res = []
        for item in data1.create_dict_iterator():
            logger.info("item[image].shape[0]: {}, item[label].item(): {}"
                        .format(item["image"].shape[0], item["label"].item()))
            res.append(map_[(item["image"].shape[0], item["label"].item())])
        if print_res:
            logger.info("image.shapes and labels: {}".format(res))
        return res
Example #20
    def test_config(num_shards, shard_id):
        sampler = ds.DistributedSampler(num_shards, shard_id, False)
        child_sampler = ds.SequentialSampler()
        sampler.add_child(child_sampler)

        data1 = ds.ManifestDataset(manifest_file, num_samples=5, sampler=sampler)

        res = []
        for item in data1.create_dict_iterator():
            logger.info("item[image].shape[0]: {}, item[label].item(): {}"
                        .format(item["image"].shape[0], item["label"].item()))
            res.append(map_[(item["image"].shape[0], item["label"].item())])
        return res
Example #21
def test_cifar10_sequential_sampler():
    """
    Test Cifar10Dataset with SequentialSampler
    """
    logger.info("Test Cifar10Dataset Op with SequentialSampler")
    num_samples = 30
    sampler = ds.SequentialSampler(num_samples=num_samples)
    data1 = ds.Cifar10Dataset(DATA_DIR_10, sampler=sampler)
    data2 = ds.Cifar10Dataset(DATA_DIR_10, shuffle=False, num_samples=num_samples)
    num_iter = 0
    for item1, item2 in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
        np.testing.assert_equal(item1["label"], item2["label"])
        num_iter += 1
    assert num_iter == num_samples
Example #22
def test_case_14():
    """
    Test 1D Generator MP + CPP sampler
    """
    logger.info("Test 1D Generator MP : 0 - 63")

    source = [(np.array([x]),) for x in range(256)]
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(), num_parallel_workers=4).repeat(2)
    i = 0
    for data in ds1.create_dict_iterator():  # each data is a dictionary
        golden = np.array([i])
        assert np.array_equal(data["data"], golden)
        i = i + 1
        if i == 256:
            i = 0
Example #23
def test_sampler_chain_errors():
    """
    Test error cases for sampler chains
    """
    logger.info("test_sampler_chain_errors")

    error_msg_1 = "'NoneType' object has no attribute 'add_child'"
    # Test add child sampler within child sampler
    sampler = ds.SequentialSampler(start_index=1, num_samples=2)
    sampler = sampler.add_child(
        ds.SequentialSampler(start_index=1, num_samples=2))
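    # add_child() returns None, so sampler is now None -- hence the
    # AttributeError matched below.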
    with pytest.raises(AttributeError, match=error_msg_1):
        sampler.add_child(ds.SequentialSampler(start_index=1, num_samples=2))

    # error_msg_2 = "'NoneType' object has no attribute 'add_child'"
    # Test add second and nested child sampler
    sampler = ds.SequentialSampler(start_index=1, num_samples=2)
    child_sampler = ds.SequentialSampler(start_index=1, num_samples=2)
    sampler.add_child(child_sampler)
    child_sampler2 = ds.SequentialSampler(start_index=1, num_samples=2)
    sampler.add_child(child_sampler2)
    # FIXME - no error is raised; uncomment after code issue is resolved
    # with pytest.raises(AttributeError, match=error_msg_2):
    #     sampler.add_child(child_sampler2)
    #     np_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    #     data1 = ds.NumpySlicesDataset(np_data, sampler=sampler)

    error_msg_3 = "Conflicting arguments during sampler assignments."
    # Test conflicting arguments (sampler and shuffle=False) for sampler (no chain)
    np_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    sampler = ds.SequentialSampler(start_index=1, num_samples=3)
    with pytest.raises(ValueError, match=error_msg_3):
        ds.NumpySlicesDataset(np_data, shuffle=False, sampler=sampler)

    # error_msg_4 = "Conflicting arguments during sampler assignments."
    # Test conflicting arguments (sampler and shuffle=False) for sampler chaining
    np_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    sampler = ds.SequentialSampler(start_index=1, num_samples=3)
    sampler = sampler.add_child(
        ds.SequentialSampler(start_index=1, num_samples=2))
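The error cases above all trace back to one pitfall: add_child() mutates the parent sampler in place and returns None, so its return value must not be reassigned. A minimal sketch of the safe idiom, reusing the names from the examples above (build_sampler_chain is a hypothetical helper, not part of the test suite):

def build_sampler_chain():
    # Chain two SequentialSamplers and return the resulting dataset.
    np_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    sampler = ds.SequentialSampler(start_index=1, num_samples=3)
    # Mutate in place; do NOT write `sampler = sampler.add_child(...)`,
    # which would rebind sampler to None.
    sampler.add_child(ds.SequentialSampler(start_index=1, num_samples=2))
    return ds.NumpySlicesDataset(np_data, sampler=sampler)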
Example #24
def test_case_16():
    """
    Test multi column generator MP + CPP sampler
    """
    logger.info("Test multi column generator")

    source = [(np.array([x]), np.array([x + 1])) for x in range(256)]
    # apply dataset operations
    data1 = ds.GeneratorDataset(source, ["col0", "col1"], sampler=ds.SequentialSampler())

    i = 0
    for item in data1.create_dict_iterator():  # each data is a dictionary
        golden = np.array([i])
        assert np.array_equal(item["col0"], golden)
        golden = np.array([i + 1])
        assert np.array_equal(item["col1"], golden)
        i = i + 1
Example #25
def test_mnist_sequential_sampler():
    """
    Test MnistDataset with SequentialSampler
    """
    logger.info("Test MnistDataset Op with SequentialSampler")
    num_samples = 50
    sampler = ds.SequentialSampler(num_samples=num_samples)
    data1 = ds.MnistDataset(DATA_DIR, sampler=sampler)
    data2 = ds.MnistDataset(DATA_DIR, shuffle=False, num_samples=num_samples)
    label_list1, label_list2 = [], []
    num_iter = 0
    for item1, item2 in zip(data1.create_dict_iterator(num_epochs=1), data2.create_dict_iterator(num_epochs=1)):
        label_list1.append(item1["label"].asnumpy())
        label_list2.append(item2["label"].asnumpy())
        num_iter += 1
    np.testing.assert_array_equal(label_list1, label_list2)
    assert num_iter == num_samples
Example #26
def test_sampler_py_api():
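    # This appears to exercise the legacy low-level sampler interface
    # (create()/set_num_rows/initialize); Example #28 shows the parse()-based
    # counterpart.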
    sampler = ds.SequentialSampler().create()
    sampler.set_num_rows(128)
    sampler.set_num_samples(64)
    sampler.initialize()
    sampler.get_indices()

    sampler = ds.RandomSampler().create()
    sampler.set_num_rows(128)
    sampler.set_num_samples(64)
    sampler.initialize()
    sampler.get_indices()

    sampler = ds.DistributedSampler(8, 4).create()
    sampler.set_num_rows(128)
    sampler.set_num_samples(64)
    sampler.initialize()
    sampler.get_indices()
Example #27
def test_imagefolder_sampler_chain():
    """
    Test ImageFolderDataset sampler chain
    """
    logger.info("test_imagefolder_sampler_chain")

    sampler = ds.SequentialSampler(start_index=1, num_samples=3)
    child_sampler = ds.PKSampler(2)
    sampler.add_child(child_sampler)
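    # The parent SequentialSampler takes num_samples=3 (from start_index=1)
    # of the child PKSampler's output, hence 3 rows.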
    data1 = ds.ImageFolderDataset(IMAGENET_RAWDATA_DIR, sampler=sampler)
    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 3
    # Verify number of rows
    assert sum([1 for _ in data1]) == 3

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))
Example #28
def test_sampler_py_api():
    sampler = ds.SequentialSampler().parse()
    sampler1 = ds.RandomSampler().parse()
    sampler1.add_child(sampler)
Example #29
def test_serdes_imagefolder_dataset(remove_json_files=True):
    """
    Test simulating resnet50 dataset pipeline.
    """
    data_dir = "../data/dataset/testPK/data"
    ds.config.set_seed(1)

    # define data augmentation parameters
    rescale = 1.0 / 255.0
    shift = 0.0
    resize_height, resize_width = 224, 224
    weights = [
        1.0, 0.1, 0.02, 0.3, 0.4, 0.05, 1.2, 0.13, 0.14, 0.015, 0.16, 1.1
    ]

    # Constructing DE pipeline
    sampler = ds.WeightedRandomSampler(weights, 11)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)
    data1 = ds.ImageFolderDataset(data_dir, sampler=sampler)
    data1 = data1.repeat(1)
    data1 = data1.map(operations=[vision.Decode(True)],
                      input_columns=["image"])
    rescale_op = vision.Rescale(rescale, shift)

    resize_op = vision.Resize((resize_height, resize_width), Inter.LINEAR)
    data1 = data1.map(operations=[rescale_op, resize_op],
                      input_columns=["image"])
    data1 = data1.batch(2)
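    # 11 weighted samples batched by 2 -> ceil(11 / 2) = 6 batches, matching
    # the num_samples assertion below.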

    # Serialize the dataset pre-processing pipeline.
    # data1 should still work after saving.
    ds.serialize(data1, "imagenet_dataset_pipeline.json")
    ds1_dict = ds.serialize(data1)
    assert validate_jsonfile("imagenet_dataset_pipeline.json") is True

    # Print the serialized pipeline to stdout
    ds.show(data1)

    # Deserialize the serialized json file
    data2 = ds.deserialize(json_filepath="imagenet_dataset_pipeline.json")

    # Serialize the pipeline we just deserialized.
    # The content of the json file should be the same as the previous serialization.
    ds.serialize(data2, "imagenet_dataset_pipeline_1.json")
    assert validate_jsonfile("imagenet_dataset_pipeline_1.json") is True
    assert filecmp.cmp('imagenet_dataset_pipeline.json',
                       'imagenet_dataset_pipeline_1.json')

    # Deserialize the latest json file again
    data3 = ds.deserialize(json_filepath="imagenet_dataset_pipeline_1.json")
    data4 = ds.deserialize(input_dict=ds1_dict)
    num_samples = 0
    # Iterate and compare the data in the original pipeline (data1) against the deserialized pipelines (data2, data3, data4)
    for item1, item2, item3, item4 in zip(
            data1.create_dict_iterator(num_epochs=1, output_numpy=True),
            data2.create_dict_iterator(num_epochs=1, output_numpy=True),
            data3.create_dict_iterator(num_epochs=1, output_numpy=True),
            data4.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1['image'], item2['image'])
        np.testing.assert_array_equal(item1['image'], item3['image'])
        np.testing.assert_array_equal(item1['label'], item2['label'])
        np.testing.assert_array_equal(item1['label'], item3['label'])
        np.testing.assert_array_equal(item3['image'], item4['image'])
        np.testing.assert_array_equal(item3['label'], item4['label'])
        num_samples += 1

    logger.info("Number of data in data1: {}".format(num_samples))
    assert num_samples == 6

    # Remove the generated json file
    if remove_json_files:
        delete_json_files()