def test_sampler_list():
    """
    Validate ImageFolderDataset with list/int samplers, and verify the error
    messages produced by unsupported sampler values.
    """
    # A list sampler of indices [1, 3, 5] should yield exactly those samples,
    # reproduced here via take/skip windows over the unshuffled dataset.
    sampled = ds.ImageFolderDataset("../data/dataset/testPK/data", sampler=[1, 3, 5])
    window1 = ds.ImageFolderDataset("../data/dataset/testPK/data", shuffle=False).take(2).skip(1)
    window2 = ds.ImageFolderDataset("../data/dataset/testPK/data", shuffle=False).take(4).skip(3)
    window3 = ds.ImageFolderDataset("../data/dataset/testPK/data", shuffle=False).take(6).skip(5)
    dataset_equal(sampled, window1 + window2 + window3, 0)

    # A bare int sampler behaves like a single-element index list.
    single = ds.ImageFolderDataset("../data/dataset/testPK/data", sampler=1)
    dataset_equal(single, window1, 0)

    def bad_pipeline(sampler, msg):
        # Building/iterating the pipeline must raise with the expected message.
        with pytest.raises(Exception) as info:
            bad_data = ds.ImageFolderDataset("../data/dataset/testPK/data", sampler=sampler)
            for _ in bad_data:
                pass
        assert msg in str(info.value)

    # (sampler, expected error-message fragment) pairs for invalid samplers.
    failure_cases = (
        ([1.5, 7],
         "Type of indices element must be int, but got list[0]: 1.5, type: <class 'float'>"),
        (["a", "b"],
         "Type of indices element must be int, but got list[0]: a, type: <class 'str'>."),
        ("a",
         "Unsupported sampler object of type (<class 'str'>)"),
        ("",
         "Unsupported sampler object of type (<class 'str'>)"),
        (np.array([1, 2]),
         "Type of indices element must be int, but got list[0]: 1, type: <class 'numpy.int64'>."),
    )
    for sampler, msg in failure_cases:
        bad_pipeline(sampler=sampler, msg=msg)
def test_deterministic_run_distribution():
    """
    Test deterministic run with setting the seed being used in a distribution
    """
    logger.info("test_deterministic_run_distribution")

    # Save original configuration values so they can be restored at the end
    num_parallel_workers_original = ds.config.get_num_parallel_workers()
    seed_original = ds.config.get_seed()

    # When we set the seed, all operations within our dataset should be deterministic
    ds.config.set_seed(0)
    ds.config.set_num_parallel_workers(1)

    # First dataset
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    random_horizontal_flip_op = c_vision.RandomHorizontalFlip(0.1)
    decode_op = c_vision.Decode()
    data1 = data1.map(operations=decode_op, input_columns=["image"])
    data1 = data1.map(operations=random_horizontal_flip_op, input_columns=["image"])

    # Second dataset
    data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    data2 = data2.map(operations=decode_op, input_columns=["image"])
    # If seed is set up on constructor, the two ops output a deterministic sequence
    random_horizontal_flip_op2 = c_vision.RandomHorizontalFlip(0.1)
    data2 = data2.map(operations=random_horizontal_flip_op2, input_columns=["image"])

    # Both pipelines were built under the same seed, so outputs must match exactly
    dataset_equal(data1, data2, 0)

    # Restore original configuration values
    ds.config.set_num_parallel_workers(num_parallel_workers_original)
    ds.config.set_seed(seed_original)
def test_seed_deterministic():
    """
    Test deterministic run with setting the seed; only works with num_parallel_workers = 1
    """
    logger.info("test_seed_deterministic")

    # Save original configuration values so they can be restored at the end
    num_parallel_workers_original = ds.config.get_num_parallel_workers()
    seed_original = ds.config.get_seed()

    ds.config.set_seed(0)
    ds.config.set_num_parallel_workers(1)

    # First dataset
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    # Seed will be read in during constructor call
    random_crop_op = c_vision.RandomCrop([512, 512], [200, 200, 200, 200])
    decode_op = c_vision.Decode()
    data1 = data1.map(operations=decode_op, input_columns=["image"])
    data1 = data1.map(operations=random_crop_op, input_columns=["image"])

    # Second dataset
    data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    data2 = data2.map(operations=decode_op, input_columns=["image"])
    # If seed is set up on constructor, the two ops output a deterministic sequence
    random_crop_op2 = c_vision.RandomCrop([512, 512], [200, 200, 200, 200])
    data2 = data2.map(operations=random_crop_op2, input_columns=["image"])

    # Both pipelines were built under the same seed, so outputs must match exactly
    dataset_equal(data1, data2, 0)

    # Restore original configuration values
    ds.config.set_num_parallel_workers(num_parallel_workers_original)
    ds.config.set_seed(seed_original)
def test_sampler_list_basic():
    """
    Validate that a list sampler yields exactly the indexed samples.

    NOTE(review): renamed from ``test_sampler_list`` — a function with that
    name is defined earlier in this file, and this later duplicate definition
    shadowed it, so the earlier (more thorough) test was never collected or
    run by pytest.
    """
    # Samples at indices [1, 3, 5] should equal the corresponding take/skip
    # windows over the unshuffled dataset.
    data1 = ds.ImageFolderDataset("../data/dataset/testPK/data", sampler=[1, 3, 5])
    data21 = ds.ImageFolderDataset("../data/dataset/testPK/data", shuffle=False).take(2).skip(1)
    data22 = ds.ImageFolderDataset("../data/dataset/testPK/data", shuffle=False).take(4).skip(3)
    data23 = ds.ImageFolderDataset("../data/dataset/testPK/data", shuffle=False).take(6).skip(5)
    dataset_equal(data1, data21 + data22 + data23, 0)
def test_schema_file_vs_string():
    """
    Verify that a Schema constructed from a file path produces the same
    dataset as a Schema populated from that file's parsed JSON content.
    """
    logger.info("test_schema_file_vs_string")

    schema_from_path = ds.Schema(SCHEMA_FILE)

    with open(SCHEMA_FILE) as schema_file:
        parsed = json.load(schema_file)
        schema_from_json = ds.Schema()
        schema_from_json.from_json(parsed)

    dataset_a = ds.TFRecordDataset(FILES, schema_from_path)
    dataset_b = ds.TFRecordDataset(FILES, schema_from_json)
    dataset_equal(dataset_a, dataset_b, 0)
def test_deterministic_run_fail():
    """
    Test RandomCrop with seed, expected to fail
    """
    logger.info("test_deterministic_run_fail")

    # Remember the global config so it can be restored afterwards
    workers_backup = ds.config.get_num_parallel_workers()
    seed_backup = ds.config.get_seed()

    # With the seed fixed, every op in the pipeline should be deterministic
    ds.config.set_seed(0)
    ds.config.set_num_parallel_workers(1)

    decode_op = c_vision.Decode()
    # A single RandomCrop instance captures one seed and emits one random
    # stream (pretend it is "a" = [1, 2, 3, 4, 5, 6]); sharing the SAME
    # instance between two pipelines makes them split that stream, so their
    # outputs are expected to diverge.
    shared_crop_op = c_vision.RandomCrop([512, 512], [200, 200, 200, 200])

    # First dataset
    first = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    first = first.map(operations=decode_op, input_columns=["image"])
    first = first.map(operations=shared_crop_op, input_columns=["image"])

    # Second dataset reuses the exact same crop op instance
    second = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    second = second.map(operations=decode_op, input_columns=["image"])
    second = second.map(operations=shared_crop_op, input_columns=["image"])

    try:
        dataset_equal(first, second, 0)
    except Exception as e:
        # The two pipelines split the numbers out of the shared sequence
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "Array" in str(e)

    # Restore the saved global configuration
    ds.config.set_num_parallel_workers(workers_backup)
    ds.config.set_seed(seed_backup)
def test_seed_undeterministic():
    """
    Test seed with num parallel workers in c, this test is expected to fail some of the time
    """
    logger.info("test_seed_undeterministic")

    # Remember the global config so it can be restored afterwards
    workers_backup = ds.config.get_num_parallel_workers()
    seed_backup = ds.config.get_seed()

    ds.config.set_seed(0)
    # With multiple workers, per-sample op ordering is not guaranteed, so the
    # two pipelines below may or may not produce identical output.
    ds.config.set_num_parallel_workers(3)

    decode_op = c_vision.Decode()

    # First dataset; the crop op reads the seed when it is constructed
    first = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    crop_a = c_vision.RandomCrop([512, 512], [200, 200, 200, 200])
    first = first.map(operations=decode_op, input_columns=["image"])
    first = first.map(operations=crop_a, input_columns=["image"])

    # Second dataset with its own crop op built under the same seed.
    # Both ops therefore draw from the same generated sequence
    # (pretend "a" = [1, 2, 3, 4, 5, 6]).
    second = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    second = second.map(operations=decode_op, input_columns=["image"])
    crop_b = c_vision.RandomCrop([512, 512], [200, 200, 200, 200])
    second = second.map(operations=crop_b, input_columns=["image"])

    try:
        dataset_equal(first, second, 0)
    except Exception as e:
        # Both datasets consume numbers from the shared sequence "a"
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "Array" in str(e)

    # Restore the saved global configuration
    ds.config.set_num_parallel_workers(workers_backup)
    ds.config.set_seed(seed_backup)