Example #1
def test_profiling_complex_pipeline():
    """
    Generator -> Map     ->
                             -> Zip
    TFReader  -> Shuffle ->
    """
    os.environ['PROFILING_MODE'] = 'true'
    os.environ['MINDDATA_PROFILING_DIR'] = '.'
    os.environ['DEVICE_ID'] = '1'

    source = [(np.array([x]),) for x in range(1024)]
    data1 = ds.GeneratorDataset(source, ["gen"])
    data1 = data1.map("gen", operations=[(lambda x: x + 1)])

    pattern = DATASET_ROOT + "/test.data"
    data2 = ds.TFRecordDataset(pattern, SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(4)

    data3 = ds.zip((data1, data2))

    for _ in data3:
        pass

    assert os.path.exists(PIPELINE_FILE_SIZE) is True
    os.remove(PIPELINE_FILE_SIZE)
    assert os.path.exists(PIPELINE_FILE_THR) is True
    os.remove(PIPELINE_FILE_THR)
    assert os.path.exists(DATASET_ITERATOR_FILE) is True
    os.remove(DATASET_ITERATOR_FILE)
    del os.environ['PROFILING_MODE']
    del os.environ['MINDDATA_PROFILING_DIR']
Example #2
def test_filter_by_generator_Partial2():
    dataset1 = ds.GeneratorDataset(source=generator_mc_p0(),
                                   column_names=["col1", "col2"])
    dataset2 = ds.GeneratorDataset(source=generator_mc_p1(),
                                   column_names=["col3", "col4"])

    dataset1f = dataset1.filter(input_columns=["col1"],
                                predicate=lambda x: x not in [3, 7, 9],
                                num_parallel_workers=2)
    dataset2f = dataset2.filter(input_columns=["col3"],
                                predicate=lambda x: x not in [203, 207, 209],
                                num_parallel_workers=2)
    dataset_zip = ds.zip((dataset1f, dataset2f))
    dataset_map = dataset_zip.map(input_columns=["col1", "col3"],
                                  output_columns=["out1", "out3"],
                                  operations=lambda x1, x3:
                                  (x1 + 400, x3 + 500))
    ret1 = []
    ret3 = []
    for item in dataset_map.create_dict_iterator():
        ret1.append(item["out1"])
        ret3.append(item["out3"])
    assert ret1[0] == 400
    assert ret1[6] == 408
    assert ret3[0] == 700
    assert ret3[6] == 708
Example #3
def test_get_column_name_zip():
    data1 = ds.Cifar10Dataset(CIFAR10_DIR)
    assert data1.get_col_names() == ["image", "label"]
    data2 = ds.CSVDataset(CSV_DIR)
    assert data2.get_col_names() == ["1", "2", "3", "4"]
    data = ds.zip((data1, data2))
    assert data.get_col_names() == ["image", "label", "1", "2", "3", "4"]
def test_imagefolder_zip():
    logger.info("Test Case zip")
    # define parameters
    repeat_count = 2

    # apply dataset operations
    data1 = ds.ImageFolderDataset(DATA_DIR, num_samples=10)
    data2 = ds.ImageFolderDataset(DATA_DIR, num_samples=10)

    data1 = data1.repeat(repeat_count)
    # rename dataset2 for no conflict
    data2 = data2.rename(input_columns=["image", "label"],
                         output_columns=["image1", "label1"])
    data3 = ds.zip((data1, data2))

    num_iter = 0
    for item in data3.create_dict_iterator(
            num_epochs=1):  # each data is a dictionary
        # in this example, each dictionary has keys "image", "label", "image1" and "label1"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10
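As the two tests above show, ds.zip appends the columns of the second dataset after those of the first, which is why data2 is renamed before zipping to avoid a column-name clash. A minimal sketch of that behaviour, assuming MindSpore is installed and using made-up in-memory generators (not part of the test suite):

import numpy as np
import mindspore.dataset as ds

def gen_left():
    for i in range(4):
        yield np.array([i]), np.array([i])

def gen_right():
    for i in range(4):
        yield np.array([i + 100]), np.array([i + 100])

left = ds.GeneratorDataset(gen_left, column_names=["image", "label"])
right = ds.GeneratorDataset(gen_right, column_names=["image1", "label1"])
zipped = ds.zip((left, right))
# The zipped pipeline exposes the columns of both inputs, first dataset first.
assert zipped.get_col_names() == ["image", "label", "image1", "label1"]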
Example #5
def test_zip_exception_07():
    """
    Test zip: zip with string as parameter
    """
    logger.info("test_zip_exception_07")

    try:
        dataz = ds.zip(('dataset1', 'dataset2'))

        num_iter = 0
        for _ in dataz.create_dict_iterator(num_epochs=1, output_numpy=True):
            num_iter += 1
        assert False

    except Exception as e:
        logger.info("Got an exception in DE: {}".format(str(e)))

    try:
        data = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
        dataz = data.zip(('dataset1', ))

        num_iter = 0
        for _ in dataz.create_dict_iterator(num_epochs=1, output_numpy=True):
            num_iter += 1
        assert False

    except Exception as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
Example #6
def test_map_reorder1():
    def generator_mc(maxid=1):
        for _ in range(maxid):
            yield (np.array([0]), np.array([1]), np.array([2]))

    # Three map and zip
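    # Note: columns_order is the parameter name used by older MindSpore releases;
    # later releases spell it column_order (compare Example #16 below).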
    data0 = ds.GeneratorDataset(generator_mc, ["a0", "a1", "a2"])
    data0 = data0.map(input_columns="a0",
                      columns_order=["a2", "a1", "a0"],
                      operations=(lambda x: x))
    data1 = ds.GeneratorDataset(generator_mc, ["b0", "b1", "b2"])
    data1 = data1.map(input_columns="b0",
                      columns_order=["b1", "b2", "b0"],
                      operations=(lambda x: x))
    data2 = ds.zip((data0, data1))
    data2 = data2.map(input_columns="a0",
                      columns_order=["b2", "a2", "b1", "a1", "b0", "a0"],
                      operations=(lambda x: x))

    for item in data2.create_tuple_iterator():
        assert item == [
            np.array(2),
            np.array(2),
            np.array(1),
            np.array(1),
            np.array(0),
            np.array(0)
        ]
Example #7
def test_case3():
    data1 = ds.TFRecordDataset(FILES, SCHEMA_FILE).batch(2).repeat(10)
    data2 = ds.TFRecordDataset(FILES, SCHEMA_FILE).batch(2).repeat(5)
    data3 = ds.TFRecordDataset(FILES, SCHEMA_FILE).batch(2)

    data4 = ds.zip((data1, data2, data3))

    assert data4.get_dataset_size() == 6
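The assertion above reflects how zip sizes its output: with the 12-row source used elsewhere in these tests, data1, data2 and data3 hold 60, 30 and 6 batches, and the zipped dataset stops at the shortest input. A small sketch of the same rule, assuming MindSpore is installed and using hypothetical in-memory data:

import numpy as np
import mindspore.dataset as ds

shorter = ds.NumpySlicesDataset({"a": np.arange(6)}, shuffle=False)
longer = ds.NumpySlicesDataset({"b": np.arange(60)}, shuffle=False)

zipped = ds.zip((shorter, longer))
# The zipped size is the minimum of the input sizes.
assert zipped.get_dataset_size() == 6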
Example #8
def zip_test(dataset):
    dataset_1 = copy.deepcopy(dataset)
    dataset_2 = copy.deepcopy(dataset)
    dataset_1 = dataset_1.apply(apply_func)
    dataset_zip = ds.zip((dataset_1, dataset_2))
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset_zip.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]
Example #9
def test_filter_by_generator_Partial0():
    dataset1 = ds.GeneratorDataset(source=generator_mc_p0(), column_names=["col1", "col2"])
    dataset2 = ds.GeneratorDataset(source=generator_mc_p1(), column_names=["col3", "col4"])
    dataset_zip = ds.zip((dataset1, dataset2))
    dataset_f1 = dataset_zip.filter(predicate=filter_func_Partial_0, num_parallel_workers=2)
    ret = []
    for item in dataset_f1.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret.append(item["col1"])
    assert ret[0] == 5
    assert ret[6] == 12
def test_serdes_zip_dataset(remove_json_files=True):
    """
    Test serdes on zip dataset pipeline.
    """
    files = ["../data/dataset/testTFTestAllTypes/test.data"]
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema.json"
    ds.config.set_seed(1)

    ds0 = ds.TFRecordDataset(files,
                             schema=schema_file,
                             shuffle=ds.Shuffle.GLOBAL)
    data1 = ds.TFRecordDataset(files,
                               schema=schema_file,
                               shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.TFRecordDataset(files,
                               schema=schema_file,
                               shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)
    data2 = data2.rename(input_columns=[
        "col_sint16", "col_sint32", "col_sint64", "col_float", "col_1d",
        "col_2d", "col_3d", "col_binary"
    ],
                         output_columns=[
                             "column_sint16", "column_sint32", "column_sint64",
                             "column_float", "column_1d", "column_2d",
                             "column_3d", "column_binary"
                         ])
    data3 = ds.zip((data1, data2))
    ds.serialize(data3, "zip_dataset_pipeline.json")
    assert validate_jsonfile("zip_dataset_pipeline.json") is True
    assert validate_jsonfile("zip_dataset_pipeline_typo.json") is False

    data4 = ds.deserialize(json_filepath="zip_dataset_pipeline.json")
    ds.serialize(data4, "zip_dataset_pipeline_1.json")
    assert validate_jsonfile("zip_dataset_pipeline_1.json") is True
    assert filecmp.cmp('zip_dataset_pipeline.json',
                       'zip_dataset_pipeline_1.json')

    rows = 0
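    # Note: this is Python's builtin zip over the three iterators, not ds.zip.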
    for d0, d3, d4 in zip(ds0.create_tuple_iterator(output_numpy=True),
                          data3.create_tuple_iterator(output_numpy=True),
                          data4.create_tuple_iterator(output_numpy=True)):
        num_cols = len(d0)
        offset = 0
        for t1 in d0:
            np.testing.assert_array_equal(t1, d3[offset])
            np.testing.assert_array_equal(t1, d3[offset + num_cols])
            np.testing.assert_array_equal(t1, d4[offset])
            np.testing.assert_array_equal(t1, d4[offset + num_cols])
            offset += 1
        rows += 1
    assert rows == 12

    if remove_json_files:
        delete_json_files()
Example #11
def test_filter_by_generator_Partial0():
    ds.config.load('../data/dataset/declient_filter.cfg')
    dataset1 = ds.GeneratorDataset(source=generator_mc_p0(), column_names=["col1", "col2"])
    dataset2 = ds.GeneratorDataset(source=generator_mc_p1(), column_names=["col3", "col4"])
    dataset_zip = ds.zip((dataset1, dataset2))
    dataset_f1 = dataset_zip.filter(predicate=filter_func_Partial_0, num_parallel_workers=2)
    ret = []
    for item in dataset_f1.create_dict_iterator():
        ret.append(item["col1"])
    assert ret[0] == 5
    assert ret[6] == 12
Example #12
def test_filter_by_generator_Partial1():
    ds.config.load('../data/dataset/declient_filter.cfg')
    dataset1 = ds.GeneratorDataset(source=generator_mc_p0(), column_names=["col1", "col2"])
    dataset2 = ds.GeneratorDataset(source=generator_mc_p1(), column_names=["col3", "col4"])
    dataset_zip = ds.zip((dataset1, dataset2))
    dataset_f1 = dataset_zip.filter(predicate=filter_func_Partial_0, num_parallel_workers=2)
    dataset_map = dataset_f1.map(input_columns=["col1"], output_columns=["out1"], operations=lambda x1: x1 + 400)
    ret = []
    for item in dataset_map.create_dict_iterator():
        ret.append(item["out1"])
    assert ret[0] == 405
    assert ret[6] == 412
Example #13
def test_zip_03():
    """
    Test zip: zip 2 datasets, #rows-data1 > #rows-data2, #cols-data1 > #cols-data2
    """
    logger.info("test_zip_03")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
    data2 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 3 rows and 7 columns
    filename = "zip_03_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)
Example #14
def test_zip_04():
    """
    Test zip: zip >2 datasets
    """
    logger.info("test_zip_04")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2)
    data3 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    dataz = ds.zip((data1, data2, data3))
    # Note: zipped dataset has 3 rows and 9 columns
    filename = "zip_04_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)
Example #15
def test_case4():
    data1 = ds.TFRecordDataset(FILES, SCHEMA_FILE).batch(2).repeat(10)
    data2 = ds.TFRecordDataset(FILES)
    assert data2.get_dataset_size() == 12
    data2 = data2.batch(2)
    assert data2.get_dataset_size() == 6
    data2 = data2.shuffle(100)
    assert data2.get_dataset_size() == 6
    data2 = data2.repeat(3)
    assert data2.get_dataset_size() == 18

    data3 = ds.zip((data1, data2))

    assert data3.get_dataset_size() == 18
Example #16
def makeup_train_dataset(ds1, ds2, ds3, batchsize, epoch):
    ds1 = ds1.rename(input_columns=["label", "image"],
                     output_columns=["label1", "data1"])
    ds2 = ds2.rename(input_columns=["label", "image"],
                     output_columns=["label2", "data2"])
    ds3 = ds3.rename(input_columns=["image"], output_columns=["data3"])
    ds_new = ds.zip((ds1, ds2))
    ds_new = ds_new.project(columns=['data1', 'data2'])
    ds_new = ds.zip((ds3, ds_new))
    ds_new = ds_new.map(input_columns=['label'],
                        output_columns=['label'],
                        column_order=['data3', 'data2', 'data1', 'label'],
                        operations=lambda x: x)
    # to keep the order : data3 data2 data1 label

    # ds_new = ds_new.shuffle(ds_new.get_dataset_size())
    print("dataset batchsize:", batchsize)
    ds_new = ds_new.batch(batchsize)
    ds_new = ds_new.repeat(epoch)

    print("batch_size:", ds_new.get_batch_size(), "batch_num:",
          ds_new.get_dataset_size())

    return ds_new
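The project() call above exists to drop the renamed label columns of ds1 and ds2 before the second zip, so only ds3's label survives and no column name is duplicated. A hedged sketch of the same idea, assuming MindSpore is installed and using invented column names and data:

import numpy as np
import mindspore.dataset as ds

a = ds.NumpySlicesDataset({"data1": np.arange(4), "label": np.arange(4)}, shuffle=False)
b = ds.NumpySlicesDataset({"data2": np.arange(4), "label": np.arange(4)}, shuffle=False)

# Keep only the feature column of `a`, so zipping with `b` introduces no duplicate "label".
a = a.project(columns=["data1"])
merged = ds.zip((a, b))
assert sorted(merged.get_col_names()) == ["data1", "data2", "label"]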
Example #17
def test_case3():
    data1 = ds.TFRecordDataset(FILES, SCHEMA_FILE, columns_list=["col_sint64"])
    data1 = data1.batch(2).repeat(10).rename(["col_sint64"], ["a1"])
    data2 = ds.TFRecordDataset(FILES, SCHEMA_FILE, columns_list=["col_sint64"])
    data2 = data2.batch(2).repeat(5).rename(["col_sint64"], ["a2"])
    data3 = ds.TFRecordDataset(FILES, SCHEMA_FILE, columns_list=["col_sint64"])
    data3 = data3.batch(2).rename(["col_sint64"], ["a3"])

    data4 = ds.zip((data1, data2, data3))

    assert data4.get_dataset_size() == 6
Example #18
def test_filter_by_generator_with_map_all_sort():
    dataset1 = ds.GeneratorDataset(generator_sort1(10), ["col1", "col2", "col3"])
    dataset2 = ds.GeneratorDataset(generator_sort2(10), ["col4", "col5", "col6"])

    dataz = ds.zip((dataset1, dataset2))
    dataset_f = dataz.filter(predicate=filter_func_part_sort, num_parallel_workers=1)
    num_iter = 0
    ret_data = []
    for item in dataset_f.create_dict_iterator():
        num_iter += 1
        ret_data.append(item)

    assert num_iter == 10
    assert ret_data[0]["col1"] == 0
    assert ret_data[9]["col6"] == 509
Example #19
def test_filter_by_generator_with_zip():
    dataset1 = ds.GeneratorDataset(generator_1d_zip1, ["data1"])
    dataset2 = ds.GeneratorDataset(generator_1d_zip2, ["data2"])
    dataz = ds.zip((dataset1, dataset2))
    dataset_f = dataz.filter(predicate=filter_func_zip, num_parallel_workers=1)
    num_iter = 0
    ret_data = []
    for item in dataset_f.create_dict_iterator():
        num_iter += 1
        ret_data.append({"data1": item["data1"], "data2": item["data2"]})
    assert num_iter == 21
    assert ret_data[0]["data1"] == 0
    assert ret_data[0]["data2"] == 100
    assert ret_data[5]["data1"] == 5
    assert ret_data[5]["data2"] == 105
Example #20
def test_zip_02():
    """
    Test zip: zip 2 datasets, #rows-data1 < #rows-data2, #cols-data1 == #cols-data2
    """
    logger.info("test_zip_02")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2)
    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 3 rows and 4 columns
    filename = "zip_02_result.npz"
    parameters = {"params": {}}
    save_and_check_dict(dataz,
                        parameters,
                        filename,
                        generate_golden=GENERATE_GOLDEN)
Example #21
def test_filter_by_generator_with_zip_after():
    dataset1 = ds.GeneratorDataset(generator_1d_zip1, ["data1"])
    dataset2 = ds.GeneratorDataset(generator_1d_zip1, ["data2"])
    dt1 = dataset1.filter(predicate=filter_func_zip_after, num_parallel_workers=4)
    dt2 = dataset2.filter(predicate=filter_func_zip_after, num_parallel_workers=4)
    dataz = ds.zip((dt1, dt2))
    num_iter = 0
    ret_data = []
    for item in dataz.create_dict_iterator(num_epochs=1, output_numpy=True):
        num_iter += 1
        ret_data.append({"data1": item["data1"], "data2": item["data2"]})
    assert num_iter == 21
    assert ret_data[0]["data1"] == 0
    assert ret_data[0]["data2"] == 0
    assert ret_data[5]["data1"] == 5
    assert ret_data[5]["data2"] == 5
Example #22
def test_zip_exception_06():
    """
    Test zip: zip with non-tuple of 1 dataset
    """
    logger.info("test_zip_exception_06")
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)

    try:
        dataz = ds.zip(data1)

        num_iter = 0
        for _, item in enumerate(dataz.create_dict_iterator()):
            logger.info("item[input_mask] is {}".format(item["input_mask"]))
            num_iter += 1
        logger.info("Number of data in zipped dataz: {}".format(num_iter))

    except BaseException as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
Example #23
def test_zip_exception_04():
    """
    Test zip: zip with empty tuple of datasets
    """
    logger.info("test_zip_exception_04")

    try:
        dataz = ds.zip(())
        dataz = dataz.repeat(2)

        num_iter = 0
        for _, item in enumerate(dataz.create_dict_iterator()):
            logger.info("item[input_mask] is {}".format(item["input_mask"]))
            num_iter += 1
        logger.info("Number of data in zipped dataz: {}".format(num_iter))

    except BaseException as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
Example #24
def test_case4():
    data1 = ds.TFRecordDataset(FILES, SCHEMA_FILE, columns_list=["col_sint64"])
    data1 = data1.batch(2).repeat(10).rename(["col_sint64"], ["a1"])
    data2 = ds.TFRecordDataset(FILES, columns_list=["col_sint64"])
    data2 = data2.rename(["col_sint64"], ["a2"])
    assert data2.get_dataset_size() == 12
    data2 = data2.batch(2)
    assert data2.get_dataset_size() == 6
    data2 = data2.shuffle(100)
    assert data2.get_dataset_size() == 6
    data2 = data2.repeat(3)
    assert data2.get_dataset_size() == 18

    data3 = ds.zip((data1, data2))

    assert data3.get_dataset_size() == 18
Example #25
def test_zip_05():
    """
    Test zip: zip dataset with renamed columns
    """
    logger.info("test_zip_05")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4, shuffle=True)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=True)

    data2 = data2.rename(input_columns="input_ids",
                         output_columns="new_input_ids")
    data2 = data2.rename(input_columns="segment_ids",
                         output_columns="new_segment_ids")

    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 5 rows and 9 columns
    filename = "zip_05_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)
Example #26
def skip_test_zip_exception_02():
    """
    Test zip: zip datasets with duplicate column name
    """
    logger.info("test_zip_exception_02")
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
    data2 = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4)

    try:
        dataz = ds.zip((data1, data2))

        num_iter = 0
        for _, item in enumerate(dataz.create_dict_iterator()):
            logger.info("item[input_mask] is {}".format(item["input_mask"]))
            num_iter += 1
        logger.info("Number of data in zipped dataz: {}".format(num_iter))

    except BaseException as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
Example #27
def test_zip_exception_01():
    """
    Test zip: zip same datasets
    """
    logger.info("test_zip_exception_01")
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)

    try:
        dataz = ds.zip((data1, data1))

        num_iter = 0
        for _, item in enumerate(
                dataz.create_dict_iterator(num_epochs=1, output_numpy=True)):
            logger.info("item[input_mask] is {}".format(item["input_mask"]))
            num_iter += 1
        logger.info("Number of data in zipped dataz: {}".format(num_iter))

    except Exception as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
Example #28
def test_rename():
    data1 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)

    data2 = data2.rename(input_columns=["input_ids", "segment_ids"],
                         output_columns=["masks", "seg_ids"])

    data = ds.zip((data1, data2))
    data = data.repeat(3)

    num_iter = 0

    for i, item in enumerate(data.create_dict_iterator()):
        logger.info("item[mask] is {}".format(item["masks"]))
        np.testing.assert_equal(item["masks"], item["input_ids"])
        logger.info("item[seg_ids] is {}".format(item["seg_ids"]))
        np.testing.assert_equal(item["segment_ids"], item["seg_ids"])
        # need to consume the data in the buffer
        num_iter += 1
    logger.info("Number of data in data: {}".format(num_iter))
    assert num_iter == 15
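test_rename relies on zip rejecting duplicate column names (the case skip_test_zip_exception_02 exercises), so one branch is renamed first and the zipped rows then carry both copies side by side. A small sketch of the same pattern, assuming MindSpore is installed and using invented data:

import numpy as np
import mindspore.dataset as ds

src = {"input_ids": np.arange(5), "segment_ids": np.arange(5)}
d1 = ds.NumpySlicesDataset(src, shuffle=False)
d2 = ds.NumpySlicesDataset(src, shuffle=False)

# Rename one branch so the zipped pipeline has no duplicate column names.
d2 = d2.rename(input_columns=["input_ids", "segment_ids"],
               output_columns=["masks", "seg_ids"])

for row in ds.zip((d1, d2)).create_dict_iterator(num_epochs=1, output_numpy=True):
    np.testing.assert_equal(row["input_ids"], row["masks"])
    np.testing.assert_equal(row["segment_ids"], row["seg_ids"])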
Example #29
def test_profiling_complex_pipeline():
    """
    Generator -> Map     ->
                             -> Zip
    TFReader  -> Shuffle ->
    """
    os.environ['PROFILING_MODE'] = 'true'
    os.environ['MINDDATA_PROFILING_DIR'] = '.'
    os.environ['DEVICE_ID'] = '1'

    source = [(np.array([x]),) for x in range(1024)]
    data1 = ds.GeneratorDataset(source, ["gen"])
    data1 = data1.map(operations=[(lambda x: x + 1)], input_columns=["gen"])

    pattern = DATASET_ROOT + "/test.data"
    data2 = ds.TFRecordDataset(pattern, SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(4)

    data3 = ds.zip((data1, data2))

    for _ in data3:
        pass

    with open(PIPELINE_FILE) as f:
        data = json.load(f)
        op_info = data["op_info"]
        assert len(op_info) == 5
        for i in range(5):
            assert "size" in op_info[i]["metrics"]["output_queue"]
            assert "length" in op_info[i]["metrics"]["output_queue"]
            assert "throughput" in op_info[i]["metrics"]["output_queue"]

    assert os.path.exists(PIPELINE_FILE) is True
    os.remove(PIPELINE_FILE)
    assert os.path.exists(DATASET_ITERATOR_FILE) is True
    os.remove(DATASET_ITERATOR_FILE)
    del os.environ['PROFILING_MODE']
    del os.environ['MINDDATA_PROFILING_DIR']
Example #30
def test_zip_06():
    """
    Test zip: zip dataset with renamed columns and repeat zipped dataset
    """
    logger.info("test_zip_06")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4, shuffle=False)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)

    data2 = data2.rename(input_columns="input_ids",
                         output_columns="new_input_ids")
    data2 = data2.rename(input_columns="segment_ids",
                         output_columns="new_segment_ids")

    dataz = ds.zip((data1, data2))
    dataz = dataz.repeat(2)
    # Note: resultant dataset has 10 rows and 9 columns
    filename = "zip_06_result.npz"
    parameters = {"params": {}}
    save_and_check_dict(dataz,
                        parameters,
                        filename,
                        generate_golden=GENERATE_GOLDEN)