def test_profiling_complex_pipeline():
    """
    Generator -> Map     ->
                             -> Zip -> Batch
    TFReader  -> Shuffle ->
    """
    os.environ['PROFILING_MODE'] = 'true'
    os.environ['MINDDATA_PROFILING_DIR'] = '.'
    os.environ['DEVICE_ID'] = '1'

    source = [(np.array([x]),) for x in range(1024)]
    data1 = ds.GeneratorDataset(source, ["gen"])
    data1 = data1.map(operations=[(lambda x: x + 1)], input_columns=["gen"])

    pattern = DATASET_ROOT + "/test.data"
    data2 = ds.TFRecordDataset(pattern, SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(4)

    data3 = ds.zip((data1, data2))

    for _ in data3:
        pass

    assert os.path.exists(PIPELINE_FILE_SIZE) is True
    os.remove(PIPELINE_FILE_SIZE)
    assert os.path.exists(PIPELINE_FILE_THR) is True
    os.remove(PIPELINE_FILE_THR)
    assert os.path.exists(DATASET_ITERATOR_FILE) is True
    os.remove(DATASET_ITERATOR_FILE)
    del os.environ['PROFILING_MODE']
    del os.environ['MINDDATA_PROFILING_DIR']

def test_filter_by_generator_Partial2():
    dataset1 = ds.GeneratorDataset(source=generator_mc_p0(), column_names=["col1", "col2"])
    dataset2 = ds.GeneratorDataset(source=generator_mc_p1(), column_names=["col3", "col4"])
    dataset1f = dataset1.filter(input_columns=["col1"], predicate=lambda x: x not in [3, 7, 9],
                                num_parallel_workers=2)
    dataset2f = dataset2.filter(input_columns=["col3"], predicate=lambda x: x not in [203, 207, 209],
                                num_parallel_workers=2)
    dataset_zip = ds.zip((dataset1f, dataset2f))
    dataset_map = dataset_zip.map(input_columns=["col1", "col3"], output_columns=["out1", "out3"],
                                  operations=lambda x1, x3: (x1 + 400, x3 + 500))
    ret1 = []
    ret3 = []
    for item in dataset_map.create_dict_iterator():
        ret1.append(item["out1"])
        ret3.append(item["out3"])
    assert ret1[0] == 400
    assert ret1[6] == 408
    assert ret3[0] == 700
    assert ret3[6] == 708

def test_get_column_name_zip():
    data1 = ds.Cifar10Dataset(CIFAR10_DIR)
    assert data1.get_col_names() == ["image", "label"]
    data2 = ds.CSVDataset(CSV_DIR)
    assert data2.get_col_names() == ["1", "2", "3", "4"]
    data = ds.zip((data1, data2))
    assert data.get_col_names() == ["image", "label", "1", "2", "3", "4"]

def test_imagefolder_zip():
    logger.info("Test Case zip")
    # define parameters
    repeat_count = 2

    # apply dataset operations
    data1 = ds.ImageFolderDataset(DATA_DIR, num_samples=10)
    data2 = ds.ImageFolderDataset(DATA_DIR, num_samples=10)

    data1 = data1.repeat(repeat_count)
    # rename dataset2 for no conflict
    data2 = data2.rename(input_columns=["image", "label"], output_columns=["image1", "label1"])
    data3 = ds.zip((data1, data2))

    num_iter = 0
    for item in data3.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10

def test_zip_exception_07():
    """
    Test zip: zip with string as parameter
    """
    logger.info("test_zip_exception_07")

    try:
        dataz = ds.zip(('dataset1', 'dataset2'))
        num_iter = 0
        for _ in dataz.create_dict_iterator(num_epochs=1, output_numpy=True):
            num_iter += 1
        assert False
    except Exception as e:
        logger.info("Got an exception in DE: {}".format(str(e)))

    try:
        data = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
        dataz = data.zip(('dataset1',))
        num_iter = 0
        for _ in dataz.create_dict_iterator(num_epochs=1, output_numpy=True):
            num_iter += 1
        assert False
    except Exception as e:
        logger.info("Got an exception in DE: {}".format(str(e)))

def test_map_reorder1():
    def generator_mc(maxid=1):
        for _ in range(maxid):
            yield (np.array([0]), np.array([1]), np.array([2]))

    # Three map and zip
    data0 = ds.GeneratorDataset(generator_mc, ["a0", "a1", "a2"])
    data0 = data0.map(input_columns="a0", columns_order=["a2", "a1", "a0"], operations=(lambda x: x))
    data1 = ds.GeneratorDataset(generator_mc, ["b0", "b1", "b2"])
    data1 = data1.map(input_columns="b0", columns_order=["b1", "b2", "b0"], operations=(lambda x: x))
    data2 = ds.zip((data0, data1))
    data2 = data2.map(input_columns="a0", columns_order=["b2", "a2", "b1", "a1", "b0", "a0"],
                      operations=(lambda x: x))

    for item in data2.create_tuple_iterator():
        assert item == [np.array(2), np.array(2), np.array(1), np.array(1), np.array(0), np.array(0)]

def test_case3():
    data1 = ds.TFRecordDataset(FILES, SCHEMA_FILE).batch(2).repeat(10)
    data2 = ds.TFRecordDataset(FILES, SCHEMA_FILE).batch(2).repeat(5)
    data3 = ds.TFRecordDataset(FILES, SCHEMA_FILE).batch(2)
    data4 = ds.zip((data1, data2, data3))

    assert data4.get_dataset_size() == 6

def zip_test(dataset):
    dataset_1 = copy.deepcopy(dataset)
    dataset_2 = copy.deepcopy(dataset)
    dataset_1 = dataset_1.apply(apply_func)
    dataset_zip = ds.zip((dataset_1, dataset_2))
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset_zip.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]

def test_filter_by_generator_Partial0():
    dataset1 = ds.GeneratorDataset(source=generator_mc_p0(), column_names=["col1", "col2"])
    dataset2 = ds.GeneratorDataset(source=generator_mc_p1(), column_names=["col3", "col4"])
    dataset_zip = ds.zip((dataset1, dataset2))
    dataset_f1 = dataset_zip.filter(predicate=filter_func_Partial_0, num_parallel_workers=2)
    ret = []
    for item in dataset_f1.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret.append(item["col1"])
    assert ret[0] == 5
    assert ret[6] == 12

def test_serdes_zip_dataset(remove_json_files=True):
    """
    Test serdes on zip dataset pipeline.
    """
    files = ["../data/dataset/testTFTestAllTypes/test.data"]
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema.json"
    ds.config.set_seed(1)

    ds0 = ds.TFRecordDataset(files, schema=schema_file, shuffle=ds.Shuffle.GLOBAL)
    data1 = ds.TFRecordDataset(files, schema=schema_file, shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.TFRecordDataset(files, schema=schema_file, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)
    data2 = data2.rename(input_columns=["col_sint16", "col_sint32", "col_sint64", "col_float",
                                        "col_1d", "col_2d", "col_3d", "col_binary"],
                         output_columns=["column_sint16", "column_sint32", "column_sint64", "column_float",
                                         "column_1d", "column_2d", "column_3d", "column_binary"])
    data3 = ds.zip((data1, data2))
    ds.serialize(data3, "zip_dataset_pipeline.json")
    assert validate_jsonfile("zip_dataset_pipeline.json") is True
    assert validate_jsonfile("zip_dataset_pipeline_typo.json") is False

    data4 = ds.deserialize(json_filepath="zip_dataset_pipeline.json")
    ds.serialize(data4, "zip_dataset_pipeline_1.json")
    assert validate_jsonfile("zip_dataset_pipeline_1.json") is True
    assert filecmp.cmp('zip_dataset_pipeline.json', 'zip_dataset_pipeline_1.json')

    rows = 0
    for d0, d3, d4 in zip(ds0.create_tuple_iterator(output_numpy=True),
                          data3.create_tuple_iterator(output_numpy=True),
                          data4.create_tuple_iterator(output_numpy=True)):
        num_cols = len(d0)
        offset = 0
        for t1 in d0:
            np.testing.assert_array_equal(t1, d3[offset])
            np.testing.assert_array_equal(t1, d3[offset + num_cols])
            np.testing.assert_array_equal(t1, d4[offset])
            np.testing.assert_array_equal(t1, d4[offset + num_cols])
            offset += 1
        rows += 1
    assert rows == 12

    if remove_json_files:
        delete_json_files()

def test_filter_by_generator_Partial0():
    ds.config.load('../data/dataset/declient_filter.cfg')
    dataset1 = ds.GeneratorDataset(source=generator_mc_p0(), column_names=["col1", "col2"])
    dataset2 = ds.GeneratorDataset(source=generator_mc_p1(), column_names=["col3", "col4"])
    dataset_zip = ds.zip((dataset1, dataset2))
    dataset_f1 = dataset_zip.filter(predicate=filter_func_Partial_0, num_parallel_workers=2)
    ret = []
    for item in dataset_f1.create_dict_iterator():
        ret.append(item["col1"])
    assert ret[0] == 5
    assert ret[6] == 12

def test_filter_by_generator_Partial1():
    ds.config.load('../data/dataset/declient_filter.cfg')
    dataset1 = ds.GeneratorDataset(source=generator_mc_p0(), column_names=["col1", "col2"])
    dataset2 = ds.GeneratorDataset(source=generator_mc_p1(), column_names=["col3", "col4"])
    dataset_zip = ds.zip((dataset1, dataset2))
    dataset_f1 = dataset_zip.filter(predicate=filter_func_Partial_0, num_parallel_workers=2)
    dataset_map = dataset_f1.map(input_columns=["col1"], output_columns=["out1"],
                                 operations=lambda x1: x1 + 400)
    ret = []
    for item in dataset_map.create_dict_iterator():
        ret.append(item["out1"])
    assert ret[0] == 405
    assert ret[6] == 412

def test_zip_03():
    """
    Test zip: zip 2 datasets, #rows-data1 > #rows-data2, #cols-data1 > #cols-data2
    """
    logger.info("test_zip_03")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
    data2 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 3 rows and 7 columns
    filename = "zip_03_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)

def test_zip_04():
    """
    Test zip: zip >2 datasets
    """
    logger.info("test_zip_04")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2)
    data3 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    dataz = ds.zip((data1, data2, data3))
    # Note: zipped dataset has 3 rows and 9 columns
    filename = "zip_04_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)

def test_case4():
    data1 = ds.TFRecordDataset(FILES, SCHEMA_FILE).batch(2).repeat(10)
    data2 = ds.TFRecordDataset(FILES)
    assert data2.get_dataset_size() == 12
    data2 = data2.batch(2)
    assert data2.get_dataset_size() == 6
    data2 = data2.shuffle(100)
    assert data2.get_dataset_size() == 6
    data2 = data2.repeat(3)
    assert data2.get_dataset_size() == 18

    data3 = ds.zip((data1, data2))
    assert data3.get_dataset_size() == 18

def makeup_train_dataset(ds1, ds2, ds3, batchsize, epoch):
    ds1 = ds1.rename(input_columns=["label", "image"], output_columns=["label1", "data1"])
    ds2 = ds2.rename(input_columns=["label", "image"], output_columns=["label2", "data2"])
    ds3 = ds3.rename(input_columns=["image"], output_columns=["data3"])
    ds_new = ds.zip((ds1, ds2))
    ds_new = ds_new.project(columns=['data1', 'data2'])
    ds_new = ds.zip((ds3, ds_new))
    ds_new = ds_new.map(input_columns=['label'], output_columns=['label'],
                        column_order=['data3', 'data2', 'data1', 'label'],
                        operations=lambda x: x)  # to keep the order : data3 data2 data1 label
    # ds_new = ds_new.shuffle(ds_new.get_dataset_size())
    print("dataset batchsize:", batchsize)
    ds_new = ds_new.batch(batchsize)
    ds_new = ds_new.repeat(epoch)
    print("batch_size:", ds_new.get_batch_size(), "batch_num:", ds_new.get_dataset_size())
    return ds_new

def test_case3():
    data1 = ds.TFRecordDataset(FILES, SCHEMA_FILE, columns_list=["col_sint64"]).batch(2).repeat(10).rename(
        ["col_sint64"], ["a1"])
    data2 = ds.TFRecordDataset(FILES, SCHEMA_FILE, columns_list=["col_sint64"]).batch(2).repeat(5).rename(
        ["col_sint64"], ["a2"])
    data3 = ds.TFRecordDataset(FILES, SCHEMA_FILE, columns_list=["col_sint64"]).batch(2).rename(
        ["col_sint64"], ["a3"])
    data4 = ds.zip((data1, data2, data3))

    assert data4.get_dataset_size() == 6

def test_filter_by_generator_with_map_all_sort():
    dataset1 = ds.GeneratorDataset(generator_sort1(10), ["col1", "col2", "col3"])
    dataset2 = ds.GeneratorDataset(generator_sort2(10), ["col4", "col5", "col6"])
    dataz = ds.zip((dataset1, dataset2))
    dataset_f = dataz.filter(predicate=filter_func_part_sort, num_parallel_workers=1)
    num_iter = 0
    ret_data = []
    for item in dataset_f.create_dict_iterator():
        num_iter += 1
        ret_data.append(item)

    assert num_iter == 10
    assert ret_data[0]["col1"] == 0
    assert ret_data[9]["col6"] == 509

def test_filter_by_generator_with_zip():
    dataset1 = ds.GeneratorDataset(generator_1d_zip1, ["data1"])
    dataset2 = ds.GeneratorDataset(generator_1d_zip2, ["data2"])
    dataz = ds.zip((dataset1, dataset2))
    dataset_f = dataz.filter(predicate=filter_func_zip, num_parallel_workers=1)
    num_iter = 0
    ret_data = []
    for item in dataset_f.create_dict_iterator():
        num_iter += 1
        ret_data.append({"data1": item["data1"], "data2": item["data2"]})

    assert num_iter == 21
    assert ret_data[0]["data1"] == 0
    assert ret_data[0]["data2"] == 100
    assert ret_data[5]["data1"] == 5
    assert ret_data[5]["data2"] == 105

def test_zip_02():
    """
    Test zip: zip 2 datasets, #rows-data1 < #rows-data2, #cols-data1 == #cols-data2
    """
    logger.info("test_zip_02")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2)
    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 3 rows and 4 columns
    filename = "zip_02_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)

def test_filter_by_generator_with_zip_after():
    dataset1 = ds.GeneratorDataset(generator_1d_zip1, ["data1"])
    dataset2 = ds.GeneratorDataset(generator_1d_zip1, ["data2"])
    dt1 = dataset1.filter(predicate=filter_func_zip_after, num_parallel_workers=4)
    dt2 = dataset2.filter(predicate=filter_func_zip_after, num_parallel_workers=4)
    dataz = ds.zip((dt1, dt2))
    num_iter = 0
    ret_data = []
    for item in dataz.create_dict_iterator(num_epochs=1, output_numpy=True):
        num_iter += 1
        ret_data.append({"data1": item["data1"], "data2": item["data2"]})

    assert num_iter == 21
    assert ret_data[0]["data1"] == 0
    assert ret_data[0]["data2"] == 0
    assert ret_data[5]["data1"] == 5
    assert ret_data[5]["data2"] == 5

def test_zip_exception_06():
    """
    Test zip: zip with non-tuple of 1 dataset
    """
    logger.info("test_zip_exception_06")
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)

    try:
        dataz = ds.zip(data1)

        num_iter = 0
        for _, item in enumerate(dataz.create_dict_iterator()):
            logger.info("item[input_mask] is {}".format(item["input_mask"]))
            num_iter += 1
        logger.info("Number of data in zipped dataz: {}".format(num_iter))

    except BaseException as e:
        logger.info("Got an exception in DE: {}".format(str(e)))

def test_zip_exception_04():
    """
    Test zip: zip with empty tuple of datasets
    """
    logger.info("test_zip_exception_04")

    try:
        dataz = ds.zip(())
        dataz = dataz.repeat(2)

        num_iter = 0
        for _, item in enumerate(dataz.create_dict_iterator()):
            logger.info("item[input_mask] is {}".format(item["input_mask"]))
            num_iter += 1
        logger.info("Number of data in zipped dataz: {}".format(num_iter))

    except BaseException as e:
        logger.info("Got an exception in DE: {}".format(str(e)))

def test_case4():
    data1 = ds.TFRecordDataset(FILES, SCHEMA_FILE, columns_list=["col_sint64"]).batch(2).repeat(10).rename(
        ["col_sint64"], ["a1"])
    data2 = ds.TFRecordDataset(FILES, columns_list=["col_sint64"]).rename(["col_sint64"], ["a2"])
    assert data2.get_dataset_size() == 12
    data2 = data2.batch(2)
    assert data2.get_dataset_size() == 6
    data2 = data2.shuffle(100)
    assert data2.get_dataset_size() == 6
    data2 = data2.repeat(3)
    assert data2.get_dataset_size() == 18

    data3 = ds.zip((data1, data2))
    assert data3.get_dataset_size() == 18

def test_zip_05():
    """
    Test zip: zip dataset with renamed columns
    """
    logger.info("test_zip_05")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4, shuffle=True)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=True)
    data2 = data2.rename(input_columns="input_ids", output_columns="new_input_ids")
    data2 = data2.rename(input_columns="segment_ids", output_columns="new_segment_ids")
    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 5 rows and 9 columns
    filename = "zip_05_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)

def skip_test_zip_exception_02():
    """
    Test zip: zip datasets with duplicate column name
    """
    logger.info("test_zip_exception_02")
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
    data2 = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4)

    try:
        dataz = ds.zip((data1, data2))

        num_iter = 0
        for _, item in enumerate(dataz.create_dict_iterator()):
            logger.info("item[input_mask] is {}".format(item["input_mask"]))
            num_iter += 1
        logger.info("Number of data in zipped dataz: {}".format(num_iter))

    except BaseException as e:
        logger.info("Got an exception in DE: {}".format(str(e)))

def test_zip_exception_01():
    """
    Test zip: zip same datasets
    """
    logger.info("test_zip_exception_01")
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)

    try:
        dataz = ds.zip((data1, data1))

        num_iter = 0
        for _, item in enumerate(dataz.create_dict_iterator(num_epochs=1, output_numpy=True)):
            logger.info("item[input_mask] is {}".format(item["input_mask"]))
            num_iter += 1
        logger.info("Number of data in zipped dataz: {}".format(num_iter))
    except Exception as e:
        logger.info("Got an exception in DE: {}".format(str(e)))

def test_rename():
    data1 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)

    data2 = data2.rename(input_columns=["input_ids", "segment_ids"],
                         output_columns=["masks", "seg_ids"])

    data = ds.zip((data1, data2))
    data = data.repeat(3)

    num_iter = 0
    for _, item in enumerate(data.create_dict_iterator()):
        logger.info("item[mask] is {}".format(item["masks"]))
        np.testing.assert_equal(item["masks"], item["input_ids"])
        logger.info("item[seg_ids] is {}".format(item["seg_ids"]))
        np.testing.assert_equal(item["segment_ids"], item["seg_ids"])
        # need to consume the data in the buffer
        num_iter += 1
    logger.info("Number of data in data: {}".format(num_iter))
    assert num_iter == 15

def test_profiling_complex_pipeline():
    """
    Generator -> Map     ->
                             -> Zip
    TFReader  -> Shuffle ->
    """
    os.environ['PROFILING_MODE'] = 'true'
    os.environ['MINDDATA_PROFILING_DIR'] = '.'
    os.environ['DEVICE_ID'] = '1'

    source = [(np.array([x]),) for x in range(1024)]
    data1 = ds.GeneratorDataset(source, ["gen"])
    data1 = data1.map(operations=[(lambda x: x + 1)], input_columns=["gen"])

    pattern = DATASET_ROOT + "/test.data"
    data2 = ds.TFRecordDataset(pattern, SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(4)

    data3 = ds.zip((data1, data2))

    for _ in data3:
        pass

    with open(PIPELINE_FILE) as f:
        data = json.load(f)
        op_info = data["op_info"]
        assert len(op_info) == 5
        for i in range(5):
            assert "size" in op_info[i]["metrics"]["output_queue"]
            assert "length" in op_info[i]["metrics"]["output_queue"]
            assert "throughput" in op_info[i]["metrics"]["output_queue"]

    assert os.path.exists(PIPELINE_FILE) is True
    os.remove(PIPELINE_FILE)
    assert os.path.exists(DATASET_ITERATOR_FILE) is True
    os.remove(DATASET_ITERATOR_FILE)
    del os.environ['PROFILING_MODE']
    del os.environ['MINDDATA_PROFILING_DIR']

def test_zip_06():
    """
    Test zip: zip dataset with renamed columns and repeat zipped dataset
    """
    logger.info("test_zip_06")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4, shuffle=False)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)
    data2 = data2.rename(input_columns="input_ids", output_columns="new_input_ids")
    data2 = data2.rename(input_columns="segment_ids", output_columns="new_segment_ids")
    dataz = ds.zip((data1, data2))
    dataz = dataz.repeat(2)
    # Note: resultant dataset has 10 rows and 9 columns
    filename = "zip_06_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)