import mindspore.dataset as ds
from mindspore import log as logger

# NOTE: the dataset paths and helpers referenced below (CIFAR10_DATA_DIR,
# MANIFEST_DATA_FILE, COCO_DATA_DIR, ANNOTATION_FILE, CV_DIR_NAME,
# CV_FILE_NAME, DATA_DIR, manifest_file, get_data, map_, print_res, and the
# add_and_remove_cv_file fixture) are assumed to be defined elsewhere in
# this test module.


def test_cifar_sampler_chain():
    """
    Test Cifar sampler chain
    """
    logger.info("test_cifar_sampler_chain")

    # Chain: DistributedSampler -> RandomSampler -> SequentialSampler
    sampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=5)
    child_sampler = ds.RandomSampler(replacement=True, num_samples=4)
    child_sampler2 = ds.SequentialSampler(start_index=0, num_samples=2)
    child_sampler.add_child(child_sampler2)
    sampler.add_child(child_sampler)
    data1 = ds.Cifar10Dataset(CIFAR10_DATA_DIR, sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 1

    # Verify number of rows
    assert sum([1 for _ in data1]) == 1

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))


def test_manifest_sampler_chain():
    """
    Test Manifest sampler chain
    """
    logger.info("test_manifest_sampler_chain")

    # Chain: RandomSampler -> DistributedSampler
    sampler = ds.RandomSampler(replacement=True, num_samples=2)
    child_sampler = ds.DistributedSampler(num_shards=1, shard_id=0, shuffle=False, num_samples=3, offset=1)
    sampler.add_child(child_sampler)
    data1 = ds.ManifestDataset(MANIFEST_DATA_FILE, sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 2

    # Verify number of rows
    assert sum([1 for _ in data1]) == 2

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))


def test_coco_sampler_chain():
    """
    Test Coco sampler chain
    """
    logger.info("test_coco_sampler_chain")

    # Chain: DistributedSampler -> RandomSampler
    sampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=5)
    child_sampler = ds.RandomSampler(replacement=True, num_samples=2)
    sampler.add_child(child_sampler)
    data1 = ds.CocoDataset(COCO_DATA_DIR, annotation_file=ANNOTATION_FILE, task="Detection", decode=True,
                           sampler=sampler)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 1

    # Verify number of rows
    assert sum([1 for _ in data1]) == 1

    # Verify dataset contents
    res = []
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        logger.info("item: {}".format(item))
        res.append(item)
    logger.info("dataset: {}".format(res))


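# A minimal pure-Python sketch (not part of the original suite) of how the
# chained samplers above appear to arrive at the sizes asserted in the three
# chain tests: a parent sampler draws from its child's output, so each link
# caps the effective sample count, and DistributedSampler then divides the
# surviving rows across its shards.
def chained_sampler_size_sketch():
    def capped(parent_num_samples, child_rows):
        # A parent cannot draw more rows than its child yields.
        return min(parent_num_samples, child_rows)

    # test_cifar_sampler_chain: SequentialSampler yields 2 rows,
    # RandomSampler is capped at min(4, 2) = 2, and DistributedSampler
    # splits 2 rows across 2 shards -> 1 row per shard.
    assert capped(4, 2) // 2 == 1

    # test_manifest_sampler_chain: DistributedSampler yields up to 3 rows,
    # RandomSampler on top is capped at min(2, 3) = 2 rows.
    assert capped(2, 3) == 2

    # test_coco_sampler_chain: RandomSampler yields 2 rows, DistributedSampler
    # splits them across 2 shards -> 1 row per shard.
    assert capped(5, 2) // 2 == 1

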
def test_cv_minddataset_random_sampler_basic(add_and_remove_cv_file):
    data = get_data(CV_DIR_NAME, True)
    columns_list = ["data", "file_name", "label"]
    num_readers = 4
    sampler = ds.RandomSampler()
    data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, sampler=sampler)
    assert data_set.get_dataset_size() == 10

    num_iter = 0
    new_dataset = []
    for item in data_set.create_dict_iterator():
        logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
        logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
        logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
        logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
        num_iter += 1
        new_dataset.append(item['file_name'])
    assert num_iter == 10
    # The randomly sampled order should differ from the original file order
    assert new_dataset != [x['file_name'] for x in data]


def test_cv_minddataset_random_sampler_replacement(add_and_remove_cv_file):
    columns_list = ["data", "file_name", "label"]
    num_readers = 4
    sampler = ds.RandomSampler(replacement=True, num_samples=5)
    data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, sampler=sampler)
    assert data_set.get_dataset_size() == 5

    num_iter = 0
    for item in data_set.create_dict_iterator():
        logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
        logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
        logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
        logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
        num_iter += 1
    assert num_iter == 5


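# A small pure-Python sketch (illustrative only, not from the original suite)
# of the replacement semantics exercised above: replacement=True corresponds
# to drawing with random.choices, where the same index may appear more than
# once, while replacement=False corresponds to random.sample, where every
# drawn index is distinct.
def replacement_semantics_sketch(num_rows=10, num_samples=5):
    import random
    with_replacement = random.choices(range(num_rows), k=num_samples)  # duplicates possible
    without_replacement = random.sample(range(num_rows), k=num_samples)  # all distinct
    assert len(set(without_replacement)) == num_samples
    return with_replacement, without_replacement

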
def test_chained_sampler_04():
    logger.info("Test Case Chained Sampler - Distributed and Random, with batch then repeat")

    # Create chained sampler, distributed and random
    sampler = ds.DistributedSampler(num_shards=4, shard_id=3)
    child_sampler = ds.RandomSampler()
    sampler.add_child(child_sampler)
    # Create ImageFolderDataset with sampler
    data1 = ds.ImageFolderDataset(DATA_DIR, sampler=sampler)

    data1 = data1.batch(batch_size=5, drop_remainder=True)
    data1 = data1.repeat(count=3)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 6

    # Verify number of iterations
    num_iter = 0
    for item in data1.create_dict_iterator(num_epochs=1):
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1
    logger.info("Number of data in data1: {}".format(num_iter))
    # Note: Each of the 4 shards has 44/4 = 11 samples
    # Note: Number of iterations is (11/5 = 2) * 3 = 6
    assert num_iter == 6


def test_chained_sampler_03():
    logger.info("Test Case Chained Sampler - Random and Sequential, with repeat then batch")

    # Create chained sampler, random and sequential
    sampler = ds.RandomSampler()
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)
    # Create ImageFolderDataset with sampler
    data1 = ds.ImageFolderDataset(DATA_DIR, sampler=sampler)

    data1 = data1.repeat(count=2)
    data1 = data1.batch(batch_size=5, drop_remainder=False)

    # Verify dataset size
    data1_size = data1.get_dataset_size()
    logger.info("dataset size is: {}".format(data1_size))
    assert data1_size == 18

    # Verify number of iterations
    num_iter = 0
    for item in data1.create_dict_iterator(num_epochs=1):
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1
    logger.info("Number of data in data1: {}".format(num_iter))
    # Note: 44 samples repeated twice = 88 rows; ceil(88/5) = 18 batches
    assert num_iter == 18


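# A small helper sketch (illustrative; assumes the 44-sample ImageFolder
# dataset implied by the asserts above) reproducing the batch-count
# arithmetic of the two chained-sampler tests.
def expected_batches(num_rows, batch_size, drop_remainder, repeat=1):
    """Rows become num_rows * repeat; drop_remainder floors, otherwise ceils."""
    import math
    rows = num_rows * repeat
    return rows // batch_size if drop_remainder else math.ceil(rows / batch_size)


def batch_count_sketch():
    # test_chained_sampler_04: 44 rows / 4 shards = 11 per shard;
    # 11 // 5 = 2 batches per epoch, then repeat(3) -> 6 iterations.
    assert expected_batches(11, 5, drop_remainder=True) * 3 == 6
    # test_chained_sampler_03: repeat(2) yields 88 rows; ceil(88/5) = 18 batches.
    assert expected_batches(44, 5, drop_remainder=False, repeat=2) == 18

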
def test_cv_minddataset_random_sampler_repeat(add_and_remove_cv_file):
    columns_list = ["data", "file_name", "label"]
    num_readers = 4
    sampler = ds.RandomSampler()
    data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers, sampler=sampler)
    assert data_set.get_dataset_size() == 10
    ds1 = data_set.repeat(3)

    num_iter = 0
    epoch1_dataset = []
    epoch2_dataset = []
    epoch3_dataset = []
    for item in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
        logger.info("-------------- item[data]: {} -----------------------------".format(item["data"]))
        logger.info("-------------- item[file_name]: {} ------------------------".format(item["file_name"]))
        logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
        num_iter += 1
        if num_iter <= 10:
            epoch1_dataset.append(item['file_name'])
        elif num_iter <= 20:
            epoch2_dataset.append(item['file_name'])
        else:
            epoch3_dataset.append(item['file_name'])
    assert num_iter == 30
    # Each repeat is reshuffled, so no two epochs should see the same order
    assert epoch1_dataset not in (epoch2_dataset, epoch3_dataset)
    assert epoch2_dataset not in (epoch1_dataset, epoch3_dataset)
    assert epoch3_dataset not in (epoch1_dataset, epoch2_dataset)


def test_imagefolder_numsamples():
    logger.info("Test Case numSamples")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10, num_parallel_workers=2)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 10

    random_sampler = ds.RandomSampler(num_samples=3, replacement=True)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10, num_parallel_workers=2,
                                    sampler=random_sampler)

    num_iter = 0
    for item in data1.create_dict_iterator():
        num_iter += 1
    assert num_iter == 3

    random_sampler = ds.RandomSampler(num_samples=3, replacement=False)
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, num_samples=10, num_parallel_workers=2,
                                    sampler=random_sampler)

    num_iter = 0
    for item in data1.create_dict_iterator():
        num_iter += 1
    assert num_iter == 3


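# Observation drawn from the asserts in test_imagefolder_numsamples above
# (comment only): when both num_samples and a sampler are passed to the
# dataset constructor, the sampler's own num_samples takes precedence, so
# the pipeline returns 3 rows rather than 10.

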
# Helper (presumably nested in an enclosing test, closing over manifest_file,
# map_, and print_res): runs a RandomSampler configuration and collects the
# (image shape, label) keys it produced.
def test_config(replacement, num_samples, num_repeats):
    sampler = ds.RandomSampler(replacement=replacement, num_samples=num_samples)
    data1 = ds.ManifestDataset(manifest_file, sampler=sampler)
    data1 = data1.repeat(num_repeats)
    res = []
    for item in data1.create_dict_iterator():
        res.append(map_[(item["image"].shape[0], item["label"].item())])
    if print_res:
        logger.info("image.shapes and labels: {}".format(res))
    return res


# A second helper of the same shape: it iterates the pipeline up to
# num_repeats times and passes as soon as one epoch's sorted results differ
# from `validate`; if every epoch matches, num_repeats reaches 0 and the
# final assert fails.
def test_config(replacement, num_samples, num_repeats, validate):
    sampler = ds.RandomSampler(replacement=replacement, num_samples=num_samples)
    data1 = ds.ManifestDataset(manifest_file, sampler=sampler)
    while num_repeats > 0:
        res = []
        for item in data1.create_dict_iterator():
            res.append(map_[(item["image"].shape[0], item["label"].item())])
        if print_res:
            logger.info("image.shapes and labels: {}".format(res))
        if validate != sorted(res):
            break
        num_repeats -= 1
    assert num_repeats > 0


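# A pure-Python sketch (illustrative only) of the property the helper above
# checks: sampling with replacement almost surely yields a multiset that
# differs from the full dataset, so sorted(res) != validate within a few
# repeats. The function name and parameters here are hypothetical.
def eventually_differs(validate, num_repeats, num_rows=5, seed=None):
    import random
    rng = random.Random(seed)
    while num_repeats > 0:
        res = rng.choices(range(num_rows), k=len(validate))  # draw with replacement
        if validate != sorted(res):
            return True
        num_repeats -= 1
    return False


# Example: matching the full sorted dataset on every one of 10 repeats is
# vanishingly unlikely, so this holds with near certainty.
assert eventually_differs(validate=[0, 1, 2, 3, 4], num_repeats=10)

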
def test_sampler_py_api():
    # Exercises the older low-level sampler API (create / set_num_rows /
    # set_num_samples / initialize / get_indices); the later test of the
    # same name below uses the newer parse() API instead.
    sampler = ds.SequentialSampler().create()
    sampler.set_num_rows(128)
    sampler.set_num_samples(64)
    sampler.initialize()
    sampler.get_indices()

    sampler = ds.RandomSampler().create()
    sampler.set_num_rows(128)
    sampler.set_num_samples(64)
    sampler.initialize()
    sampler.get_indices()

    sampler = ds.DistributedSampler(8, 4).create()
    sampler.set_num_rows(128)
    sampler.set_num_samples(64)
    sampler.initialize()
    sampler.get_indices()


def test_random_sampler():
    logger.info("Test Case RandomSampler")
    # define parameters
    repeat_count = 1

    # apply dataset operations
    sampler = ds.RandomSampler()
    data1 = ds.ImageFolderDatasetV2(DATA_DIR, sampler=sampler)
    data1 = data1.repeat(repeat_count)

    num_iter = 0
    for item in data1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        logger.info("image is {}".format(item["image"]))
        logger.info("label is {}".format(item["label"]))
        num_iter += 1

    logger.info("Number of data in data1: {}".format(num_iter))
    assert num_iter == 44


def test_sampler_py_api():
    # Newer parse()-based sampler API; this definition shadows the
    # create()-based test of the same name above.
    sampler = ds.SequentialSampler().parse()
    sampler1 = ds.RandomSampler().parse()
    sampler1.add_child(sampler)
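

# Assumed entry point, mirroring the usual convention in such test modules
# (not confirmed by this excerpt); only the fixture-free tests are invoked
# directly here.
if __name__ == '__main__':
    test_cifar_sampler_chain()
    test_manifest_sampler_chain()
    test_coco_sampler_chain()
    test_chained_sampler_04()
    test_chained_sampler_03()
    test_imagefolder_numsamples()
    test_random_sampler()
    test_sampler_py_api()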