def test_2ops_repeat_shuffle():
    """
    Test Repeat then Shuffle
    """
    logger.info("Test Repeat then Shuffle")

    # pipeline parameters, recorded alongside the golden output
    repeat_count = 2
    buffer_size = 5
    seed = 0
    parameters = {
        "params": {
            'repeat_count': repeat_count,
            'buffer_size': buffer_size,
            'seed': seed,
        }
    }

    # build pipeline: read -> repeat -> shuffle (seed fixed for determinism)
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    data1 = data1.repeat(repeat_count)
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)

    filename = "test_2ops_repeat_shuffle.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_case_1():
    """
    Test Repeat then Batch
    """
    logger.info("Test Repeat then Batch")

    # parameters saved with the golden result for reproducibility
    repeat_count = 2
    batch_size = 5
    parameters = {
        "params": {
            'repeat_count': repeat_count,
            'batch_size': batch_size,
        }
    }

    # build pipeline: read -> repeat -> batch (partial batches dropped)
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    data1 = data1.repeat(repeat_count)
    data1 = data1.batch(batch_size, drop_remainder=True)

    filename = "test_case_1_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_batch_08():
    """
    Test batch: num_parallel_workers=1, drop_remainder default
    """
    logger.info("test_batch_08")

    # parameters recorded with the golden output
    batch_size = 6
    num_parallel_workers = 1
    parameters = {
        "params": {
            'batch_size': batch_size,
            'num_parallel_workers': num_parallel_workers,
        }
    }

    # build pipeline: read -> batch with a single worker
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size, num_parallel_workers=num_parallel_workers)

    # 12 rows / batch_size 6 -> expect exactly 2 batches
    assert sum([1 for _ in data1]) == 2

    filename = "batch_08_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_batch_07():
    """
    Test batch: num_parallel_workers>1, drop_remainder=False, reorder params
    """
    logger.info("test_batch_07")

    # parameters recorded with the golden output
    batch_size = 4
    drop_remainder = False
    num_parallel_workers = 2
    parameters = {
        "params": {
            'batch_size': batch_size,
            'drop_remainder': drop_remainder,
            'num_parallel_workers': num_parallel_workers,
        }
    }

    # build pipeline; keyword arguments deliberately passed out of order
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(num_parallel_workers=num_parallel_workers,
                        drop_remainder=drop_remainder,
                        batch_size=batch_size)

    # 12 rows / batch_size 4 -> expect exactly 3 batches
    assert sum([1 for _ in data1]) == 3

    filename = "batch_07_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_batch_10():
    """
    Test batch: batch_size > number-of-rows-in-dataset, drop_remainder=True
    """
    logger.info("test_batch_10")

    # parameters recorded with the golden output
    batch_size = 99
    drop_remainder = True
    parameters = {
        "params": {
            'batch_size': batch_size,
            'drop_remainder': drop_remainder,
        }
    }

    # build pipeline: oversized batch plus drop_remainder yields no batches
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size, drop_remainder=drop_remainder)

    # the only (partial) batch is dropped -> expect 0 batches
    assert sum([1 for _ in data1]) == 0

    filename = "batch_10_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_batch_03():
    """
    Test batch: batch_size>1, drop_remainder=False, no remainder exists
    """
    logger.info("test_batch_03")

    # parameters recorded with the golden output
    batch_size = 3
    drop_remainder = False
    parameters = {
        "params": {
            'batch_size': batch_size,
            'drop_remainder': drop_remainder,
        }
    }

    # build pipeline; batch size divides the row count evenly
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size=batch_size, drop_remainder=drop_remainder)

    # 12 rows / batch_size 3 -> expect exactly 4 batches
    assert sum([1 for _ in data1]) == 4

    filename = "batch_03_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_case_tf_file_no_schema():
    """Read a TFRecord file without providing a schema."""
    logger.info("reading data from: {}".format(FILES[0]))

    # no pipeline parameters for this case
    parameters = {"params": {}}

    # schema argument omitted: columns are inferred from the file
    data = ds.TFRecordDataset(FILES, shuffle=ds.Shuffle.FILES)

    filename = "tf_file_no_schema.npz"
    save_and_check(data, parameters, filename, generate_golden=GENERATE_GOLDEN)
def skip_test_case_0_reverse():
    """
    Test Shuffle then Repeat
    """
    # NOTE: "skip_" prefix keeps pytest from collecting this test
    logger.info("Test Shuffle then Repeat")

    # parameters recorded with the golden output
    repeat_count = 2
    buffer_size = 5
    seed = 0
    parameters = {
        "params": {
            'repeat_count': repeat_count,
            'buffer_size': buffer_size,
            'reshuffle_each_iteration': False,
            'seed': seed,
        }
    }

    # build pipeline: read -> shuffle -> repeat (reverse of the paired test)
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)
    data1 = data1.repeat(repeat_count)

    filename = "test_case_0_reverse_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_case_2_reverse():
    """
    Test Shuffle then Batch
    """
    logger.info("Test Shuffle then Batch")

    # parameters recorded with the golden output
    buffer_size = 5
    seed = 0
    batch_size = 2
    parameters = {
        "params": {
            'buffer_size': buffer_size,
            'seed': seed,
            'batch_size': batch_size,
        }
    }

    # build pipeline: read -> shuffle (seeded) -> batch
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)
    data1 = data1.batch(batch_size, drop_remainder=True)

    filename = "test_case_2_reverse_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_case_tf_file_pad():
    """Read a TFRecord file with a schema that pads bytes columns to 10."""
    logger.info("reading data from: {}".format(FILES[0]))

    # no pipeline parameters for this case
    parameters = {"params": {}}

    # schema requests byte padding of the string columns
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchemaPadBytes10.json"
    data = ds.TFRecordDataset(FILES, schema_file, shuffle=ds.Shuffle.FILES)

    filename = "tf_file_padBytes10.npz"
    save_and_check(data, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_case_storage():
    """
    test StorageDataset
    """
    logger.info("Test Simple StorageDataset")

    # no pipeline parameters for this case
    parameters = {"params": {}}

    # plain read with schema, no further operations
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)

    filename = "storage_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_batch_05():
    """
    Test batch: batch_size=1 (minimum valid size), drop_remainder default
    """
    logger.info("test_batch_05")

    # parameters recorded with the golden output
    batch_size = 1
    parameters = {"params": {'batch_size': batch_size}}

    # build pipeline: read -> batch with the minimum legal batch size
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size)

    filename = "batch_05_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_tf_repeat_01():
    """
    a simple repeat operation.
    """
    logger.info("Test Simple Repeat")

    # parameters recorded with the golden output
    repeat_count = 2
    parameters = {"params": {'repeat_count': repeat_count}}

    # build pipeline: read -> repeat
    data1 = ds.TFRecordDataset(DATA_DIR_TF, SCHEMA_DIR_TF, shuffle=False)
    data1 = data1.repeat(repeat_count)

    filename = "repeat_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_case_columns_list():
    """
    a simple repeat operation.
    """
    logger.info("Test Simple Repeat")

    # parameters recorded with the golden output
    repeat_count = 2
    parameters = {"params": {'repeat_count': repeat_count}}

    # restrict the reader to an explicit subset of columns
    columns_list = ["col_sint64", "col_sint32"]
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR,
                               columns_list=columns_list, shuffle=False)
    data1 = data1.repeat(repeat_count)

    filename = "columns_list_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_batch_12():
    """
    Test batch: batch_size boolean value True, treated as valid value 1
    """
    logger.info("test_batch_12")

    # bool True is accepted since it is an int subtype equal to 1
    batch_size = True
    parameters = {"params": {'batch_size': batch_size}}

    # build pipeline: read -> batch(True) behaves like batch(1)
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size=batch_size)

    # 12 rows with batch size 1 -> expect 12 batches
    assert sum([1 for _ in data1]) == 12

    filename = "batch_12_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_batch_11():
    """
    Test batch: batch_size=1 and dataset-size=1
    """
    logger.info("test_batch_11")

    # parameters recorded with the golden output
    batch_size = 1
    parameters = {"params": {'batch_size': batch_size}}

    # schema file restricts the dataset to a single row
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema1Row.json"
    data1 = ds.TFRecordDataset(DATA_DIR, schema_file)
    data1 = data1.batch(batch_size)

    filename = "batch_11_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_batch_02():
    """
    Test batch: batch_size>1, drop_remainder=True, remainder exists
    """
    logger.info("test_batch_02")

    # parameters recorded with the golden output
    batch_size = 5
    drop_remainder = True
    parameters = {
        "params": {
            'batch_size': batch_size,
            'drop_remainder': drop_remainder,
        }
    }

    # build pipeline; the final partial batch is discarded
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size, drop_remainder=drop_remainder)

    filename = "batch_02_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_shuffle_01():
    """
    Test shuffle: buffer_size < number-of-rows-in-dataset
    """
    logger.info("test_shuffle_01")

    # parameters recorded with the golden output
    buffer_size = 5
    seed = 1
    parameters = {"params": {'buffer_size': buffer_size, "seed": seed}}

    # build pipeline: seeded shuffle with a buffer smaller than the dataset
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)

    filename = "shuffle_01_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_shuffle_04():
    """
    Test shuffle: buffer_size=2 (minimum size), number-of-rows-in-dataset = 2
    """
    logger.info("test_shuffle_04")

    # parameters recorded with the golden output
    buffer_size = 2
    seed = 1
    parameters = {"params": {'buffer_size': buffer_size, "seed": seed}}

    # limit the dataset to 2 samples so buffer covers the whole dataset
    data1 = ds.TFRecordDataset(DATA_DIR, num_samples=2)
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)

    filename = "shuffle_04_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)
def test_batch_06():
    """
    Test batch: batch_size = number-of-rows-in-dataset, drop_remainder=False, reorder params
    """
    # NOTE: docstring previously claimed drop_remainder=True, contradicting
    # the code below which sets it to False; corrected to match the code.
    logger.info("test_batch_06")

    # parameters recorded with the golden output
    batch_size = 12
    drop_remainder = False
    parameters = {
        "params": {
            'batch_size': batch_size,
            'drop_remainder': drop_remainder,
        }
    }

    # build pipeline; keyword arguments deliberately passed out of order
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(drop_remainder=drop_remainder, batch_size=batch_size)

    filename = "batch_06_result.npz"
    save_and_check(data1, parameters, filename, generate_golden=GENERATE_GOLDEN)