def test_tfrecord_pad():
    """
    Test TFRecordDataset with a schema that pads bytes columns to 10 bytes,
    comparing the pipeline output against the stored golden file.
    """
    logger.info("test_tfrecord_pad")
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchemaPadBytes10.json"
    padded = ds.TFRecordDataset(FILES, schema_file, shuffle=ds.Shuffle.FILES)
    save_and_check_dict(padded, "tfrecord_pad_bytes10.npz", generate_golden=GENERATE_GOLDEN)
def test_zip_03():
    """
    Test zip: zip 2 datasets, #rows-data1 > #rows-data2, #cols-data1 > #cols-data2
    """
    logger.info("test_zip_03")
    ds.config.set_seed(1)
    left = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
    right = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    # Note: zipped dataset has 3 rows and 7 columns
    zipped = ds.zip((left, right))
    save_and_check_dict(zipped, "zip_03_result.npz", generate_golden=GENERATE_GOLDEN)
def test_zip_04():
    """
    Test zip: zip >2 datasets
    """
    logger.info("test_zip_04")
    ds.config.set_seed(1)
    sources = (
        ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1),
        ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2),
        ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3),
    )
    # Note: zipped dataset has 3 rows and 9 columns
    zipped = ds.zip(sources)
    save_and_check_dict(zipped, "zip_04_result.npz", generate_golden=GENERATE_GOLDEN)
def test_tf_repeat_01():
    """
    Test a simple repeat operation.
    """
    logger.info("Test Simple Repeat")
    # Repeat the unshuffled dataset twice and compare with the golden file.
    repeated = ds.TFRecordDataset(DATA_DIR_TF, SCHEMA_DIR_TF, shuffle=False).repeat(2)
    save_and_check_dict(repeated, "repeat_result.npz", generate_golden=GENERATE_GOLDEN)
def test_batch_12():
    """
    Test batch: batch_size boolean value True, treated as valid value 1
    """
    logger.info("test_batch_12")
    # True is accepted as batch_size and behaves like 1, so all 12 rows survive.
    batched = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES).batch(batch_size=True)
    assert sum(1 for _ in batched) == 12
    save_and_check_dict(batched, "batch_12_result.npz", generate_golden=GENERATE_GOLDEN)
def test_batch_05():
    """
    Test batch: batch_size=1 (minimum valid size), drop_remainder default
    """
    logger.info("test_batch_05")
    # batch_size=1 keeps every row as its own batch: 12 batches total.
    batched = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES).batch(1)
    assert sum(1 for _ in batched) == 12
    save_and_check_dict(batched, "batch_05_result.npz", generate_golden=GENERATE_GOLDEN)
def test_batch_04():
    """
    Test batch: batch_size>1, drop_remainder=False, remainder exists
    """
    logger.info("test_batch_04")
    # 12 rows / batch_size 7 with the remainder kept -> 2 batches (7 + 5).
    batched = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES).batch(7, False)
    assert sum(1 for _ in batched) == 2
    save_and_check_dict(batched, "batch_04_result.npz", generate_golden=GENERATE_GOLDEN)
def test_batch_08():
    """
    Test batch: num_parallel_workers=1, drop_remainder default
    """
    logger.info("test_batch_08")
    # 12 rows / batch_size 6 -> exactly 2 full batches.
    batched = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES).batch(6, num_parallel_workers=1)
    assert sum(1 for _ in batched) == 2
    save_and_check_dict(batched, "batch_08_result.npz", generate_golden=GENERATE_GOLDEN)
def test_batch_06():
    """
    Test batch: batch_size = number-of-rows-in-dataset, drop_remainder=False, reorder params
    """
    logger.info("test_batch_06")
    # batch_size equals the dataset size (12), so there is exactly one batch
    # and drop_remainder has no effect; args are passed as keywords in
    # reversed order on purpose.
    batched = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES).batch(
        drop_remainder=False, batch_size=12)
    assert sum(1 for _ in batched) == 1
    save_and_check_dict(batched, "batch_06_result.npz", generate_golden=GENERATE_GOLDEN)
def test_zip_02():
    """
    Test zip: zip 2 datasets, #rows-data1 < #rows-data2, #cols-data1 == #cols-data2
    """
    logger.info("test_zip_02")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2)
    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 3 rows and 4 columns
    filename = "zip_02_result.npz"
    # Fix: a stray unused `parameters` dict was passed positionally, shifting
    # `filename` into the wrong argument slot. save_and_check_dict takes
    # (data, filename, generate_golden=...), as every other call here does.
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)
def test_shuffle_05():
    """
    Test shuffle: buffer_size > number-of-rows-in-dataset
    """
    logger.info("test_shuffle_05")
    # Buffer (13) exceeds the 12-row dataset, i.e. a full global shuffle.
    dataset = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    ds.config.set_seed(1)
    shuffled = dataset.shuffle(buffer_size=13)
    save_and_check_dict(shuffled, "shuffle_05_result.npz", generate_golden=GENERATE_GOLDEN)
def test_shuffle_04():
    """
    Test shuffle: buffer_size=2 (minimum size), number-of-rows-in-dataset = 2
    """
    logger.info("test_shuffle_04")
    # Restrict the dataset to 2 rows and shuffle with the minimum buffer.
    dataset = ds.TFRecordDataset(DATA_DIR, num_samples=2)
    ds.config.set_seed(1)
    shuffled = dataset.shuffle(buffer_size=2)
    save_and_check_dict(shuffled, "shuffle_04_result.npz", generate_golden=GENERATE_GOLDEN)
def test_2ops_batch_repeat():
    """
    Test Batch then Repeat
    """
    logger.info("Test Batch then Repeat")
    # Batch into groups of 5 (dropping the remainder) before repeating twice.
    pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    pipeline = pipeline.batch(5, drop_remainder=True).repeat(2)
    save_and_check_dict(pipeline, "test_2ops_batch_repeat.npz", generate_golden=GENERATE_GOLDEN)
def test_batch_10():
    """
    Test batch: batch_size > number-of-rows-in-dataset, drop_remainder=True
    """
    logger.info("test_batch_10")
    # batch_size (99) exceeds the 12 available rows and the partial batch is
    # dropped, so the result is empty.
    batched = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES).batch(99, drop_remainder=True)
    assert sum(1 for _ in batched) == 0
    save_and_check_dict(batched, "batch_10_result.npz", generate_golden=GENERATE_GOLDEN)
def test_batch_11():
    """
    Test batch: batch_size=1 and dataset-size=1
    """
    logger.info("test_batch_11")
    # Use schema file with 1 row
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema1Row.json"
    batched = ds.TFRecordDataset(DATA_DIR, schema_file).batch(1)
    assert sum(1 for _ in batched) == 1
    save_and_check_dict(batched, "batch_11_result.npz", generate_golden=GENERATE_GOLDEN)
def test_tf_repeat_04():
    """
    Test a simple repeat operation with column list.
    """
    logger.info("Test Simple Repeat Column List")
    # Project onto two columns, then repeat the unshuffled dataset twice.
    selected_columns = ["col_sint64", "col_sint32"]
    repeated = ds.TFRecordDataset(DATA_DIR_TF, SCHEMA_DIR_TF,
                                  columns_list=selected_columns, shuffle=False).repeat(2)
    save_and_check_dict(repeated, "repeat_list_result.npz", generate_golden=GENERATE_GOLDEN)
def test_2ops_batch_shuffle():
    """
    Test Batch then Shuffle
    """
    logger.info("Test Batch then Shuffle")
    # Batch first (size 2, remainder dropped), then shuffle whole batches.
    pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    pipeline = pipeline.batch(2, drop_remainder=True)
    ds.config.set_seed(0)
    pipeline = pipeline.shuffle(buffer_size=5)
    save_and_check_dict(pipeline, "test_2ops_batch_shuffle.npz", generate_golden=GENERATE_GOLDEN)
def test_batch_07():
    """
    Test batch: num_parallel_workers>1, drop_remainder=False, reorder params
    """
    logger.info("test_batch_07")
    # 12 rows / batch_size 4 -> 3 full batches; keyword args are deliberately
    # given out of positional order.
    batched = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES).batch(
        num_parallel_workers=2, drop_remainder=False, batch_size=4)
    assert sum(1 for _ in batched) == 3
    save_and_check_dict(batched, "batch_07_result.npz", generate_golden=GENERATE_GOLDEN)
def test_zip_05():
    """
    Test zip: zip dataset with renamed columns
    """
    logger.info("test_zip_05")
    ds.config.set_seed(1)
    left = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4, shuffle=True)
    right = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=True)
    # Rename to avoid column-name collisions between the two branches.
    right = right.rename(input_columns="input_ids", output_columns="new_input_ids")
    right = right.rename(input_columns="segment_ids", output_columns="new_segment_ids")
    # Note: zipped dataset has 5 rows and 9 columns
    zipped = ds.zip((left, right))
    save_and_check_dict(zipped, "zip_05_result.npz", generate_golden=GENERATE_GOLDEN)
def test_2ops_shuffle_repeat():
    """
    Test Shuffle then Repeat
    """
    logger.info("Test Shuffle then Repeat")
    # Shuffle with a 5-row buffer, then repeat the shuffled stream twice.
    pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    ds.config.set_seed(0)
    pipeline = pipeline.shuffle(buffer_size=5).repeat(2)
    save_and_check_dict(pipeline, "test_2ops_shuffle_repeat.npz", generate_golden=GENERATE_GOLDEN)
def test_zip_06():
    """
    Test zip: zip dataset with renamed columns and repeat zipped dataset
    """
    logger.info("test_zip_06")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4, shuffle=False)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)
    data2 = data2.rename(input_columns="input_ids", output_columns="new_input_ids")
    data2 = data2.rename(input_columns="segment_ids", output_columns="new_segment_ids")
    dataz = ds.zip((data1, data2))
    dataz = dataz.repeat(2)
    # Note: resultant dataset has 10 rows and 9 columns
    filename = "zip_06_result.npz"
    # Fix: a stray unused `parameters` dict was passed positionally, shifting
    # `filename` into the wrong argument slot. save_and_check_dict takes
    # (data, filename, generate_golden=...), as every other call here does.
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)
def test_tfrecord_no_schema():
    """Test TFRecordDataset without a schema file, checked against the golden file."""
    logger.info("test_tfrecord_no_schema")
    dataset = ds.TFRecordDataset(FILES, shuffle=ds.Shuffle.FILES)
    save_and_check_dict(dataset, "tfrecord_no_schema.npz", generate_golden=GENERATE_GOLDEN)
def test_tfrecord_files_basic():
    """Basic TFRecordDataset read with a schema file, checked against the golden file."""
    logger.info("test_tfrecord_files_basic")
    dataset = ds.TFRecordDataset(FILES, SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
    save_and_check_dict(dataset, "tfrecord_files_basic.npz", generate_golden=GENERATE_GOLDEN)