def test_tfrecord_pad():
    """
    Test TFRecordDataset with a schema that pads bytes columns to 10 bytes
    """
    logger.info("test_tfrecord_pad")

    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchemaPadBytes10.json"
    data = ds.TFRecordDataset(FILES, schema_file, shuffle=ds.Shuffle.FILES)
    filename = "tfrecord_pad_bytes10.npz"
    save_and_check_dict(data, filename, generate_golden=GENERATE_GOLDEN)
Example #2
def test_zip_03():
    """
    Test zip: zip 2 datasets, #rows-data1 > #rows-data2, #cols-data1 > #cols-data2
    """
    logger.info("test_zip_03")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
    data2 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 3 rows and 7 columns
    filename = "zip_03_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)
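
# Note on the expected shape: ds.zip keeps the minimum row count of its
# inputs and concatenates their columns, which is where the 3 rows and
# 7 columns above come from. A minimal illustrative check (not part of
# the original test), assuming get_dataset_size() and get_col_names()
# are available as in recent MindSpore releases:
def check_zip_shape(data1, data2, dataz):
    assert dataz.get_dataset_size() == min(data1.get_dataset_size(),
                                           data2.get_dataset_size())
    assert set(dataz.get_col_names()) == (set(data1.get_col_names()) |
                                          set(data2.get_col_names()))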
Example #3
def test_zip_04():
    """
    Test zip: zip >2 datasets
    """
    logger.info("test_zip_04")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_1, SCHEMA_DIR_1)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2)
    data3 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    dataz = ds.zip((data1, data2, data3))
    # Note: zipped dataset has 3 rows and 9 columns
    filename = "zip_04_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)
Example #4
def test_tf_repeat_01():
    """
    Test a simple repeat operation.
    """
    logger.info("Test Simple Repeat")
    # define parameters
    repeat_count = 2

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR_TF, SCHEMA_DIR_TF, shuffle=False)
    data1 = data1.repeat(repeat_count)

    filename = "repeat_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
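
# Hedged sketch (not part of the original test): repeat(n) scales the
# per-epoch row count by n, so the golden file above holds two passes
# over the data. Assumes get_dataset_size() reports the post-repeat size,
# as it does in recent MindSpore releases.
base = ds.TFRecordDataset(DATA_DIR_TF, SCHEMA_DIR_TF, shuffle=False)
base_rows = base.get_dataset_size()
assert base.repeat(2).get_dataset_size() == 2 * base_rows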
Example #5
def test_batch_12():
    """
    Test batch: batch_size=True (boolean), treated as the valid value 1
    """
    logger.info("test_batch_12")
    # define parameters
    batch_size = True

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size=batch_size)

    assert sum([1 for _ in data1]) == 12
    filename = "batch_12_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
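
# Why this works: Python's bool is a subclass of int, so True compares
# equal to 1 and batch(True) behaves exactly like batch(1), producing
# one row per batch (hence the 12 batches asserted above).
assert isinstance(True, int)
assert True == 1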
Example #6
def test_batch_05():
    """
    Test batch: batch_size=1 (minimum valid size), drop_remainder default
    """
    logger.info("test_batch_05")
    # define parameters
    batch_size = 1

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size)

    assert sum([1 for _ in data1]) == 12
    filename = "batch_05_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #7
def test_batch_04():
    """
    Test batch: batch_size>1, drop_remainder=False, remainder exists
    """
    logger.info("test_batch_04")
    # define parameters
    batch_size = 7
    drop_remainder = False

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size, drop_remainder)

    assert sum([1 for _ in data1]) == 2
    filename = "batch_04_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
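
# The asserted batch counts in these tests follow from simple division:
# drop_remainder=False rounds up, drop_remainder=True rounds down. A
# standalone illustrative helper (not part of the original suite):
import math

def expected_batches(num_rows, batch_size, drop_remainder=False):
    if drop_remainder:
        return num_rows // batch_size
    return math.ceil(num_rows / batch_size)

assert expected_batches(12, 7, drop_remainder=False) == 2
assert expected_batches(12, 7, drop_remainder=True) == 1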
Example #8
def test_batch_08():
    """
    Test batch: num_parallel_workers=1, drop_remainder default
    """
    logger.info("test_batch_08")
    # define parameters
    batch_size = 6
    num_parallel_workers = 1

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size, num_parallel_workers=num_parallel_workers)

    assert sum([1 for _ in data1]) == 2
    filename = "batch_08_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #9
def test_batch_06():
    """
    Test batch: batch_size = number-of-rows-in-dataset, drop_remainder=False, reorder params
    """
    logger.info("test_batch_06")
    # define parameters
    batch_size = 12
    drop_remainder = False

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(drop_remainder=drop_remainder, batch_size=batch_size)

    assert sum([1 for _ in data1]) == 1
    filename = "batch_06_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #10
def test_zip_02():
    """
    Test zip: zip 2 datasets, #rows-data1 < #rows-data2, #cols-data1 == #cols-data2
    """
    logger.info("test_zip_02")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_3, SCHEMA_DIR_3)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2)
    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 3 rows and 4 columns
    filename = "zip_02_result.npz"
    parameters = {"params": {}}
    save_and_check_dict(dataz,
                        parameters,
                        filename,
                        generate_golden=GENERATE_GOLDEN)
Example #11
def test_shuffle_05():
    """
    Test shuffle: buffer_size > number-of-rows-in-dataset
    """
    logger.info("test_shuffle_05")
    # define parameters
    buffer_size = 13
    seed = 1

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)

    filename = "shuffle_05_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
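
# With only 12 rows, a buffer_size of 13 holds the entire dataset, so
# this is a global (uniform) shuffle; smaller buffers only shuffle
# locally within a sliding window. Hedged sketch (not part of the
# original test): shuffling reorders rows but never changes the count.
shuffled = ds.TFRecordDataset(DATA_DIR, shuffle=False).shuffle(buffer_size=13)
assert sum(1 for _ in shuffled) == 12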
Example #12
def test_shuffle_04():
    """
    Test shuffle: buffer_size=2 (minimum size), number-of-rows-in-dataset = 2
    """
    logger.info("test_shuffle_04")
    # define parameters
    buffer_size = 2
    seed = 1

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, num_samples=2)
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)

    filename = "shuffle_04_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #13
def test_2ops_batch_repeat():
    """
    Test Batch then Repeat
    """
    logger.info("Test Batch then Repeat")
    # define parameters
    repeat_count = 2
    batch_size = 5

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    data1 = data1.batch(batch_size, drop_remainder=True)
    data1 = data1.repeat(repeat_count)

    filename = "test_2ops_batch_repeat.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #14
def test_batch_10():
    """
    Test batch: batch_size > number-of-rows-in-dataset, drop_remainder=True
    """
    logger.info("test_batch_10")
    # define parameters
    batch_size = 99
    drop_remainder = True

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(batch_size, drop_remainder=drop_remainder)

    assert sum([1 for _ in data1]) == 0
    filename = "batch_10_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
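
# Extreme remainder case: with 12 rows and batch_size=99, floor division
# gives 12 // 99 == 0, so every row is dropped as remainder and the
# iteration above yields no batches at all.
assert 12 // 99 == 0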
Example #15
def test_batch_11():
    """
    Test batch: batch_size=1 and dataset-size=1
    """
    logger.info("test_batch_11")
    # define parameters
    batch_size = 1

    # apply dataset operations
    # Use schema file with 1 row
    schema_file = "../data/dataset/testTFTestAllTypes/datasetSchema1Row.json"
    data1 = ds.TFRecordDataset(DATA_DIR, schema_file)
    data1 = data1.batch(batch_size)

    assert sum([1 for _ in data1]) == 1
    filename = "batch_11_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #16
def test_tf_repeat_04():
    """
    Test a simple repeat operation with column list.
    """
    logger.info("Test Simple Repeat Column List")
    # define parameters
    repeat_count = 2
    columns_list = ["col_sint64", "col_sint32"]
    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR_TF,
                               SCHEMA_DIR_TF,
                               columns_list=columns_list,
                               shuffle=False)
    data1 = data1.repeat(repeat_count)

    filename = "repeat_list_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #17
def test_2ops_batch_shuffle():
    """
    Test Batch then Shuffle
    """
    logger.info("Test Batch then Shuffle")
    # define parameters
    buffer_size = 5
    seed = 0
    batch_size = 2

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    data1 = data1.batch(batch_size, drop_remainder=True)
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)

    filename = "test_2ops_batch_shuffle.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
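
# Order matters here: because shuffle comes after batch, the unit being
# shuffled is a whole 2-row batch, so rows never cross batch boundaries.
# Hedged sketch assuming the 12-row test data used throughout and a
# get_dataset_size() that reports the post-batch count:
pipeline = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
pipeline = pipeline.batch(2, drop_remainder=True)
assert pipeline.get_dataset_size() == 6  # 12 rows / 2 rows per batch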
Example #18
def test_batch_07():
    """
    Test batch: num_parallel_workers>1, drop_remainder=False, reorder params
    """
    logger.info("test_batch_07")
    # define parameters
    batch_size = 4
    drop_remainder = False
    num_parallel_workers = 2

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, shuffle=ds.Shuffle.FILES)
    data1 = data1.batch(num_parallel_workers=num_parallel_workers, drop_remainder=drop_remainder,
                        batch_size=batch_size)

    assert sum([1 for _ in data1]) == 3
    filename = "batch_07_result.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #19
def test_zip_05():
    """
    Test zip: zip dataset with renamed columns
    """
    logger.info("test_zip_05")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4, shuffle=True)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=True)

    data2 = data2.rename(input_columns="input_ids",
                         output_columns="new_input_ids")
    data2 = data2.rename(input_columns="segment_ids",
                         output_columns="new_segment_ids")

    dataz = ds.zip((data1, data2))
    # Note: zipped dataset has 5 rows and 9 columns
    filename = "zip_05_result.npz"
    save_and_check_dict(dataz, filename, generate_golden=GENERATE_GOLDEN)
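
# The renames above are needed because ds.zip requires column names to
# be unique across its inputs. Hedged sketch of the failure mode (the
# exact exception type, and whether it surfaces at zip time or at
# iteration time, may vary by MindSpore version):
a = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2)
b = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2)
try:
    for _ in ds.zip((a, b)):
        pass
except Exception as err:
    logger.info("duplicate column names rejected: %s", err)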
Example #20
def test_2ops_shuffle_repeat():
    """
    Test Shuffle then Repeat
    """
    logger.info("Test Shuffle then Repeat")
    # define parameters
    repeat_count = 2
    buffer_size = 5
    seed = 0

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)
    data1 = data1.repeat(repeat_count)

    filename = "test_2ops_shuffle_repeat.npz"
    save_and_check_dict(data1, filename, generate_golden=GENERATE_GOLDEN)
Example #21
def test_zip_06():
    """
    Test zip: zip dataset with renamed columns and repeat zipped dataset
    """
    logger.info("test_zip_06")
    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(DATA_DIR_4, SCHEMA_DIR_4, shuffle=False)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)

    data2 = data2.rename(input_columns="input_ids",
                         output_columns="new_input_ids")
    data2 = data2.rename(input_columns="segment_ids",
                         output_columns="new_segment_ids")

    dataz = ds.zip((data1, data2))
    dataz = dataz.repeat(2)
    # Note: resultant dataset has 10 rows and 9 columns
    filename = "zip_06_result.npz"
    parameters = {"params": {}}
    save_and_check_dict(dataz,
                        parameters,
                        filename,
                        generate_golden=GENERATE_GOLDEN)
def test_tfrecord_no_schema():
    """
    Test TFRecordDataset without providing a schema file
    """
    logger.info("test_tfrecord_no_schema")

    data = ds.TFRecordDataset(FILES, shuffle=ds.Shuffle.FILES)
    filename = "tfrecord_no_schema.npz"
    save_and_check_dict(data, filename, generate_golden=GENERATE_GOLDEN)
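
# Hedged sketch (not part of the original test): without a schema file,
# TFRecordDataset derives the column layout from the data itself;
# get_col_names() (assumed available, as in recent MindSpore releases)
# shows what was inferred.
inferred = ds.TFRecordDataset(FILES, shuffle=ds.Shuffle.FILES)
logger.info("inferred columns: %s", inferred.get_col_names())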
def test_tfrecord_files_basic():
    """
    Test TFRecordDataset with a list of files and a schema file
    """
    logger.info("test_tfrecord_files_basic")

    data = ds.TFRecordDataset(FILES, SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
    filename = "tfrecord_files_basic.npz"
    save_and_check_dict(data, filename, generate_golden=GENERATE_GOLDEN)