def test_tfrecord():
    """Compare a TFRecord reader using a derived index against the original index.

    ``skip_second`` is given the original index path and a destination path in
    a temporary directory (presumably it writes an index with every second
    entry dropped - confirm against its definition).  The reference pipeline
    is advanced one extra time per iteration, so each output of the
    derived-index pipeline is compared with every other reference output.
    """
    class TFRecordPipeline(Pipeline):
        """Pipeline that returns only the ``image/encoded`` feature."""

        def __init__(self, batch_size, num_threads, device_id, num_gpus, data, data_idx):
            # num_gpus is accepted but not used by this pipeline.
            super().__init__(batch_size, num_threads, device_id)
            self.input = ops.readers.TFRecord(
                path=data,
                index_path=data_idx,
                features={
                    "image/encoded": tfrec.FixedLenFeature((), tfrec.string, ""),
                    "image/class/label": tfrec.FixedLenFeature([1], tfrec.int64, -1),
                },
            )

        def define_graph(self):
            inputs = self.input(name="Reader")
            return inputs["image/encoded"]

    tfrecord = os.path.join(get_dali_extra_path(), 'db', 'tfrecord', 'train')
    tfrecord_idx_org = os.path.join(get_dali_extra_path(), 'db', 'tfrecord', 'train.idx')
    tfrecord_idx = "tfr_train.idx"

    # The context manager removes the temporary directory deterministically,
    # even when an assertion below fails.
    with tempfile.TemporaryDirectory() as idx_files_dir:
        idx_file = os.path.join(idx_files_dir, tfrecord_idx)
        skip_second(tfrecord_idx_org, idx_file)

        pipe = TFRecordPipeline(1, 1, 0, 1, tfrecord, idx_file)
        pipe_org = TFRecordPipeline(1, 1, 0, 1, tfrecord, tfrecord_idx_org)
        pipe.build()
        pipe_org.build()

        iters = pipe.epoch_size("Reader")
        for _ in range(iters):
            out = pipe.run()
            out_ref = pipe_org.run()
            for a, b in zip(out, out_ref):
                assert np.array_equal(a.as_array(), b.as_array())
            # Discard one reference batch so both pipelines stay aligned.
            _ = pipe_org.run()
def test_tfrecord_reader_alias():
    """Compare ``fn.readers.tfrecord`` with the ``fn.tfrecord_reader`` alias.

    Both pipelines are created by ``tfrecord_pipe`` on the same data and index
    files and handed to ``compare_pipelines`` for 50 iterations.
    """
    dataset, dataset_index = (
        os.path.join(get_dali_extra_path(), 'db', 'tfrecord', leaf)
        for leaf in ('train', 'train.idx')
    )
    pipe_under_test = tfrecord_pipe(fn.readers.tfrecord, dataset, dataset_index)
    reference_pipe = tfrecord_pipe(fn.tfrecord_reader, dataset, dataset_index)
    compare_pipelines(pipe_under_test, reference_pipe, batch_size_alias_test, 50)
def test_pax_format():
    """Compare the MNIST archive with its ``pax`` counterpart, shard by shard.

    The first pipeline reads ``MNIST/devel-0.tar`` with a generated index file;
    the second reads ``pax/devel-0.tar`` with ``None`` as the index argument.
    """
    num_samples = 1000
    num_shards = 100
    tar_file_path = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar")
    pax_tar_file_path = os.path.join(get_dali_extra_path(), "db/webdataset/pax/devel-0.tar")
    index_file = generate_temp_index_file(tar_file_path)

    for shard_id in range(num_shards):
        # Keyword arguments shared by both pipelines of this shard.
        shard_kwargs = dict(
            num_shards=num_shards,
            shard_id=shard_id,
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
        )
        indexed_pipe = webdataset_raw_pipeline(
            tar_file_path, index_file.name, ["jpg", "cls"], **shard_kwargs)
        pax_pipe = webdataset_raw_pipeline(
            pax_tar_file_path, None, ext=["jpg", "cls"], **shard_kwargs)
        iterations = math.ceil(num_samples / num_shards / test_batch_size) * 2
        compare_pipelines(indexed_pipe, pax_pipe, test_batch_size, iterations)
def test_recordio():
    """Compare an MXNet RecordIO reader using a derived index against the original.

    ``skip_second`` is given the original index path and a destination path in
    a temporary directory (presumably it writes an index with every second
    entry dropped - confirm against its definition).  The reference pipeline
    is advanced one extra time per iteration, so each output of the
    derived-index pipeline is compared with every other reference output.
    """
    class MXNetReaderPipeline(Pipeline):
        """Pipeline that returns only the first output of the MXNet reader."""

        def __init__(self, batch_size, num_threads, device_id, num_gpus, data, data_idx):
            super().__init__(batch_size, num_threads, device_id)
            self.input = ops.readers.MXNet(
                path=[data],
                index_path=[data_idx],
                shard_id=device_id,
                num_shards=num_gpus,
            )

        def define_graph(self):
            images, _ = self.input(name="Reader")
            return images

    recordio = os.path.join(get_dali_extra_path(), 'db', 'recordio', 'train.rec')
    recordio_idx_org = os.path.join(get_dali_extra_path(), 'db', 'recordio', 'train.idx')
    recordio_idx = "rio_train.idx"

    # The context manager removes the temporary directory deterministically,
    # even when an assertion below fails.
    with tempfile.TemporaryDirectory() as idx_files_dir:
        idx_file = os.path.join(idx_files_dir, recordio_idx)
        skip_second(recordio_idx_org, idx_file)

        pipe = MXNetReaderPipeline(1, 1, 0, 1, recordio, idx_file)
        pipe_org = MXNetReaderPipeline(1, 1, 0, 1, recordio, recordio_idx_org)
        pipe.build()
        pipe_org.build()

        iters = pipe.epoch_size("Reader")
        for _ in range(iters):
            out = pipe.run()
            out_ref = pipe_org.run()
            for a, b in zip(out, out_ref):
                assert np.array_equal(a.as_array(), b.as_array())
            # Discard one reference batch so both pipelines stay aligned.
            _ = pipe_org.run()
def test_mxnet_reader_alias():
    """Compare ``fn.readers.mxnet`` with the ``fn.mxnet_reader`` alias.

    Both pipelines are created by ``mxnet_pipe`` on the same single-element
    path lists and handed to ``compare_pipelines`` for 50 iterations.
    """
    rec_files, rec_indices = (
        [os.path.join(get_dali_extra_path(), 'db', 'recordio', leaf)]
        for leaf in ('train.rec', 'train.idx')
    )
    pipe_under_test = mxnet_pipe(fn.readers.mxnet, rec_files, rec_indices)
    reference_pipe = mxnet_pipe(fn.mxnet_reader, rec_files, rec_indices)
    compare_pipelines(pipe_under_test, reference_pipe, batch_size_alias_test, 50)
def test_image_decoders():
    """Yield decoder test cases for several image formats.

    For each extension the plain, crop and slice decoder pipelines are yielded
    with ``test_decoders_check`` and the random-crop pipeline with
    ``test_decoders_run``, on the 'cpu' and 'mixed' backends.  A final case
    checks ``fn.peek_image_shape`` on '.jpg' with the 'cpu' backend only.
    """
    def _start_pipe(max_batch_size, input_data):
        # Shared preamble: create the pipeline first, then the CPU external source.
        pipeline = Pipeline(batch_size=max_batch_size, num_threads=4, device_id=0)
        source = fn.external_source(source=input_data, cycle=False, device='cpu')
        return pipeline, source

    def image_decoder_pipe(max_batch_size, input_data, device):
        pipeline, source = _start_pipe(max_batch_size, input_data)
        pipeline.set_outputs(fn.decoders.image(source, device=device))
        return pipeline

    def image_decoder_crop_pipe(max_batch_size, input_data, device):
        pipeline, source = _start_pipe(max_batch_size, input_data)
        pipeline.set_outputs(fn.decoders.image_crop(source, device=device))
        return pipeline

    def image_decoder_slice_pipe(max_batch_size, input_data, device):
        pipeline, source = _start_pipe(max_batch_size, input_data)
        anchor = fn.constant(fdata=.1)
        extent = fn.constant(fdata=.4)
        pipeline.set_outputs(
            fn.decoders.image_slice(source, anchor, extent, axes=0, device=device))
        return pipeline

    def image_decoder_rcrop_pipe(max_batch_size, input_data, device):
        pipeline, source = _start_pipe(max_batch_size, input_data)
        pipeline.set_outputs(fn.decoders.image_random_crop(source, device=device))
        return pipeline

    def peek_image_shape_pipe(max_batch_size, input_data, device):
        pipeline, source = _start_pipe(max_batch_size, input_data)
        pipeline.set_outputs(fn.peek_image_shape(source, device=device))
        return pipeline

    def _single_images_dir():
        # Re-evaluated for every yielded case, as in the inline expression it replaces.
        return os.path.join(test_utils.get_dali_extra_path(), 'db', 'single')

    checked_pipes = (image_decoder_pipe, image_decoder_crop_pipe, image_decoder_slice_pipe)
    for ext in ('.jpg', '.bmp', '.png', '.pnm', '.jp2'):
        for pipe in checked_pipes:
            yield test_decoders_check, pipe, _single_images_dir(), ext, ['cpu', 'mixed']
        yield (test_decoders_run, image_decoder_rcrop_pipe, _single_images_dir(),
               ext, ['cpu', 'mixed'])
    yield test_decoders_check, peek_image_shape_pipe, _single_images_dir(), '.jpg', ['cpu']
def test_index_generation():
    """Compare a webdataset reader given an empty index list with a file reader.

    Three MNIST archives are passed with ``[]`` as the index argument and
    ``missing_component_behavior="error"``.  Each of 100 shards is compared
    against a file-reader pipeline over the extracted archive contents.
    """
    num_samples = 3000
    num_shards = 100
    tar_file_paths = [
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-1.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-2.tar"),
    ]
    extract_dirs = [generate_temp_extract(archive) for archive in tar_file_paths]

    def _numeric_stem(path):
        # Sort key: integer value of the text between the last "/" and the last ".".
        return int(path[path.rfind("/") + 1:path.rfind(".")])

    equivalent_files = []
    for extract_dir in extract_dirs:
        equivalent_files.extend(sorted(glob(extract_dir.name + "/*"), key=_numeric_stem))

    for shard_id in range(num_shards):
        shard_kwargs = dict(
            num_shards=num_shards,
            shard_id=shard_id,
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
        )
        wds_pipe = webdataset_raw_pipeline(
            tar_file_paths, [], ["jpg", "cls"],
            missing_component_behavior="error", **shard_kwargs)
        files_pipe = file_reader_pipeline(equivalent_files, ["jpg", "cls"], **shard_kwargs)
        iterations = math.ceil(num_samples / num_shards / test_batch_size) * 2
        compare_pipelines(wds_pipe, files_pipe, test_batch_size, iterations)
def test_return_empty():
    """Compare ``missing_component_behavior="empty"`` output with a file reader.

    The webdataset pipeline reads ``missing.tar`` asking for "jpg" and "txt";
    the reference file reader is given ``["jpg", []]`` as its extension list.
    """
    num_samples = 1000
    tar_file_path = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/missing.tar")
    index_file = generate_temp_index_file(tar_file_path)
    extract_dir = generate_temp_extract(tar_file_path)

    def _numeric_stem(path):
        # Sort key: integer value of the text between the last "/" and the last ".".
        return int(path[path.rfind("/") + 1:path.rfind(".")])

    equivalent_files = glob(extract_dir.name + "/*")
    equivalent_files.sort(key=_numeric_stem)

    wds_pipe = webdataset_raw_pipeline(
        tar_file_path,
        index_file.name,
        ["jpg", "txt"],
        batch_size=test_batch_size,
        device_id=0,
        num_threads=1,
        missing_component_behavior="empty",
    )
    files_pipe = file_reader_pipeline(
        equivalent_files, ["jpg", []],
        batch_size=test_batch_size, device_id=0, num_threads=1)
    compare_pipelines(
        wds_pipe, files_pipe, test_batch_size, math.ceil(num_samples / test_batch_size))
def test_wrong_feature_shape():
    """Expect a RuntimeError when FixedLenFeature shapes are too small for the data."""
    features = {
        'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""),
        'image/object/bbox': tfrec.FixedLenFeature([], tfrec.float32, -1.0),
        'image/object/class/label': tfrec.FixedLenFeature([], tfrec.int64, -1),
    }
    test_dummy_data_path = os.path.join(get_dali_extra_path(), 'db', 'coco_dummy')
    pipe = Pipeline(1, 1, 0)
    with pipe:
        records = fn.readers.tfrecord(
            path=os.path.join(test_dummy_data_path, 'small_coco.tfrecord'),
            index_path=os.path.join(test_dummy_data_path, 'small_coco_index.idx'),
            features=features,
        )
    pipe.set_outputs(
        records['image/encoded'],
        records['image/object/class/label'],
        records['image/object/bbox'],
    )
    pipe.build()
    # the error is raised because FixedLenFeature is used with insufficient shape
    # to house the input
    expected_message = (
        "Error when executing CPU operator*readers*tfrecord*"
        "Output tensor shape is too small*[]*Expected at least 4 elements"
    )
    assert_raises(RuntimeError, pipe.run, glob=expected_message)
def test_dtypes():
    """Check the values read from ``dtypes.tar`` for three requested dtypes.

    For sample ``i`` each of the float16, int32 and float64 components is
    expected to compare equal to ten copies of ``i``.
    """
    num_samples = 100
    tar_file_path = os.path.join(get_dali_extra_path(), "db/webdataset/sample-tar/dtypes.tar")
    index_file = generate_temp_index_file(tar_file_path)

    wds_pipeline = webdataset_raw_pipeline(
        tar_file_path,
        index_file.name,
        ["float16", "int32", "float64"],
        dtypes=[dali.types.FLOAT16, dali.types.INT32, dali.types.FLOAT64],
        batch_size=test_batch_size,
        device_id=0,
        num_threads=1,
    )
    wds_pipeline.build()

    for sample_idx in range(num_samples):
        slot = sample_idx % test_batch_size
        if slot == 0:
            # A new batch is fetched whenever the previous one is exhausted.
            f16, i32, f64 = wds_pipeline.run()
        for batch, as_number in ((f16, float), (i32, int), (f64, float)):
            assert (batch.as_array()[slot] == [as_number(sample_idx)] * 10).all()
def get_mix_size_image_pipeline(batch_size, num_threads, device, device_id=0, shard_id=0,
                                num_shards=1, def_for_dataset=False):
    """Create a COCO reader + image decoder pipeline with no resizing.

    The decoder runs on 'mixed' when ``device`` is 'gpu', otherwise on 'cpu'.
    ``def_for_dataset`` is accepted but not used here.

    Returns:
        Tuple ``(pipe, shapes, dtypes)``: the pipeline, a one-element tuple
        with shape ``(batch_size, None, None, None)``, and ``(tf.float32,)``.
    """
    coco_root = os.path.join(get_dali_extra_path(), 'db', 'coco_dummy')
    file_root = os.path.join(coco_root, 'images')
    annotations_file = os.path.join(coco_root, 'instances.json')
    decoder_device = 'mixed' if device == 'gpu' else 'cpu'

    pipe = Pipeline(batch_size, num_threads, device_id)
    with pipe:
        # The reader also returns image ids; they are not used by this pipeline.
        jpegs, _, _, _ = fn.readers.coco(
            file_root=file_root,
            annotations_file=annotations_file,
            shard_id=shard_id,
            num_shards=num_shards,
            ratio=False,
            image_ids=True,
        )
        decoded = fn.decoders.image(jpegs, device=decoder_device, output_type=types.RGB)
        pipe.set_outputs(decoded)

    return pipe, ((batch_size, None, None, None), ), (tf.float32, )
def general_corner_case(test_batch_size=base.test_batch_size, dtypes=None,
                        missing_component_behavior="", **kwargs):
    """Compare a webdataset pipeline with a file reader on ``MNIST/devel-0.tar``.

    Args:
        test_batch_size: Batch size used for both pipelines and the comparison.
        dtypes: Forwarded to the webdataset pipeline only.
        missing_component_behavior: Forwarded to the webdataset pipeline only.
        **kwargs: Forwarded unchanged to both pipelines.
    """
    num_samples = 1000
    tar_file_path = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar")
    index_file = base.generate_temp_index_file(tar_file_path)
    extract_dir = base.generate_temp_extract(tar_file_path)

    def _numeric_stem(path):
        # Sort key: integer value of the text between the last "/" and the last ".".
        return int(path[path.rfind("/") + 1:path.rfind(".")])

    equivalent_files = glob(extract_dir.name + "/*")
    equivalent_files.sort(key=_numeric_stem)

    wds_pipe = base.webdataset_raw_pipeline(
        tar_file_path,
        index_file.name,
        ["jpg", "cls"],
        missing_component_behavior=missing_component_behavior,
        dtypes=dtypes,
        batch_size=test_batch_size,
        device_id=0,
        num_threads=1,
        **kwargs)
    files_pipe = base.file_reader_pipeline(
        equivalent_files, ["jpg", "cls"],
        batch_size=test_batch_size, device_id=0, num_threads=1, **kwargs)
    compare_pipelines(
        wds_pipe, files_pipe, test_batch_size, math.ceil(num_samples / test_batch_size))
def paths_index_paths_error():
    """Build a webdataset pipeline with three archives but a single index path.

    NOTE(review): the name suggests callers expect ``build`` to raise for the
    mismatched list lengths - confirm against the caller.
    """
    archives = [
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-1.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-2.tar"),
    ]
    index_paths = ["test.idx"]
    webdataset_pipeline = base.webdataset_raw_pipeline(
        archives, index_paths, ["jpg", "cls"],
        batch_size=1, device_id=0, num_threads=1)
    webdataset_pipeline.build()
def test_wds_sharding():
    """Compare a three-archive webdataset pipeline with an equivalent file reader.

    Each archive gets its own generated index file; the reference pipeline
    reads the extracted files of all archives, ordered numerically per archive.
    """
    num_samples = 3000
    tar_file_paths = [
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-1.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-2.tar"),
    ]
    index_files = [generate_temp_index_file(archive) for archive in tar_file_paths]
    extract_dirs = [generate_temp_extract(archive) for archive in tar_file_paths]

    def _numeric_stem(path):
        # Sort key: integer value of the text between the last "/" and the last ".".
        return int(path[path.rfind("/") + 1:path.rfind(".")])

    equivalent_files = []
    for extract_dir in extract_dirs:
        equivalent_files.extend(sorted(glob(extract_dir.name + "/*"), key=_numeric_stem))

    wds_pipe = webdataset_raw_pipeline(
        tar_file_paths,
        [index_file.name for index_file in index_files],
        ["jpg", "cls"],
        batch_size=test_batch_size,
        device_id=0,
        num_threads=1,
    )
    files_pipe = file_reader_pipeline(
        equivalent_files, ["jpg", "cls"],
        batch_size=test_batch_size, device_id=0, num_threads=1)
    compare_pipelines(
        wds_pipe, files_pipe, test_batch_size, math.ceil(num_samples / test_batch_size))
def _test_tfr_properties(device):
    """Check the source-info strings of the first TFRecord batch.

    Each entry of every output is expected to decode to
    ``"<root_path> at index <offset>"`` for the reference offsets below.
    Outputs are copied to the CPU first unless ``device`` is 'cpu'.
    """
    tfrecord_dir = os.path.join(get_dali_extra_path(), 'db', 'tfrecord')
    root_path = os.path.join(tfrecord_dir, 'train')
    index_path = os.path.join(tfrecord_dir, 'train.idx')
    idx = [0, 171504, 553687, 651500, 820966, 1142396, 1380096, 1532947]

    p = tfr_properties(root_path, index_path, device, batch_size=8, num_threads=4, device_id=0)
    p.build()
    for out in p.run():
        if device != 'cpu':
            out = out.as_cpu()
        for source_info, ref_idx in zip(out, idx):
            decoded = _uint8_tensor_to_string(source_info)
            assert decoded == f"{root_path} at index {ref_idx}"
def test_tfrecord_reader_alias2():
    """Check dtypes and shapes of the outputs of ``tfrecord_pipe_empty_fields``.

    Output 0 must hold non-empty uint8 arrays; outputs 1 and 2 must hold
    zero-dimensional arrays of dtype int64 and float32 respectively.
    """
    tfrecord, tfrecord_idx = (
        os.path.join(get_dali_extra_path(), 'db', 'tfrecord', leaf)
        for leaf in ('train', 'train.idx')
    )
    pipe = tfrecord_pipe_empty_fields(tfrecord, tfrecord_idx)
    pipe.build()
    out = pipe.run()

    for tensor in out[0]:
        encoded = np.array(tensor)
        assert len(encoded) != 0
        assert encoded.dtype == np.uint8

    for position, expected_dtype in ((1, np.int64), (2, np.float32)):
        for tensor in out[position]:
            value = np.array(tensor)
            assert len(value.shape) == 0
            assert value.dtype == expected_dtype
def _test_improper_property(device):
    """Build and run the ``improper_property`` pipeline once on ``MNIST/devel-0.tar``.

    NOTE(review): presumably the caller expects this to raise - confirm.
    """
    archive = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar")
    pipeline = improper_property(archive, device, batch_size=8, num_threads=4, device_id=0)
    pipeline.build()
    pipeline.run()
def test_audio_decoders():
    """Yield one decoder test case for ``fn.decoders.audio`` on '.wav' files."""
    def audio_decoder_pipe(max_batch_size, input_data, device):
        pipeline = Pipeline(batch_size=max_batch_size, num_threads=4, device_id=0)
        source = fn.external_source(source=input_data, cycle=False, device='cpu')
        # Only the first output of the decoder is returned by the pipeline.
        audio, _ = fn.decoders.audio(source, downmix=True, sample_rate=12345, device=device)
        pipeline.set_outputs(audio)
        return pipeline

    audio_dir = os.path.join(test_utils.get_dali_extra_path(), 'db', 'audio')
    yield test_decoders_check, audio_decoder_pipe, audio_dir, '.wav'
def _test_lambda_np_readfromfile(name, py_callback_pickler):
    """Compare pipelines whose callback is a lambda reading files with ``np.fromfile``.

    Args:
        name: Not used in this function.
        py_callback_pickler: Forwarded to ``_create_and_compare_simple_pipelines``.
    """
    data_root = get_dali_extra_path()
    images_dir = os.path.join(data_root, 'db', 'single', 'jpeg')
    with open(os.path.join(images_dir, "image_list.txt"), 'r') as f:
        # Lines read from a file keep their newline, so comparing against ''
        # never filters anything; strip() is what actually skips blank lines,
        # which would otherwise break the two-column unpacking below.
        file_label = [line.rstrip().split(' ') for line in f if line.strip()]
    files, _ = zip(*file_label)
    _create_and_compare_simple_pipelines(
        lambda x: (np.fromfile(
            os.path.join(images_dir, files[x.idx_in_epoch % len(files)]),
            dtype=np.uint8)),
        py_callback_pickler,
        batch_size=8,
        py_num_workers=2)
def get_image_pipeline(batch_size, num_threads, device, device_id=0, shard_id=0,
                       num_shards=1, def_for_dataset=False):
    """Create a COCO pipeline producing normalized 224x224 images and image ids.

    The decoder runs on 'mixed' when ``device`` is 'gpu', otherwise on 'cpu';
    image ids are moved to the GPU in the 'gpu' case.  ``def_for_dataset`` is
    accepted but not used here.

    Returns:
        Tuple ``(pipe, shapes, dtypes)`` describing the three outputs: images,
        ids reshaped to ``[1, 1]``, and ids cast to INT16.
    """
    coco_root = os.path.join(get_dali_extra_path(), 'db', 'coco_dummy')
    file_root = os.path.join(coco_root, 'images')
    annotations_file = os.path.join(coco_root, 'instances.json')
    on_gpu = device == 'gpu'

    pipe = Pipeline(batch_size, num_threads, device_id)
    with pipe:
        jpegs, _, _, image_ids = fn.readers.coco(
            file_root=file_root,
            annotations_file=annotations_file,
            shard_id=shard_id,
            num_shards=num_shards,
            ratio=False,
            image_ids=True,
        )
        decoded = fn.decoders.image(
            jpegs, device='mixed' if on_gpu else 'cpu', output_type=types.RGB)
        resized = fn.resize(
            decoded, resize_x=224, resize_y=224, interp_type=types.INTERP_LINEAR)
        normalized = fn.crop_mirror_normalize(
            resized, dtype=types.FLOAT, mean=[128., 128., 128.], std=[1., 1., 1.])

        if on_gpu:
            image_ids = image_ids.gpu()
        ids_reshaped = fn.reshape(image_ids, shape=[1, 1])
        ids_int16 = fn.cast(image_ids, dtype=types.INT16)

        pipe.set_outputs(normalized, ids_reshaped, ids_int16)

    shapes = ((batch_size, 3, 224, 224), (batch_size, 1, 1), (batch_size, 1))
    dtypes = (tf.float32, tf.int32, tf.int16)
    return pipe, shapes, dtypes
def general_index_error(index_file_contents, tar_file_path="db/webdataset/MNIST/devel-0.tar",
                        ext="jpg"):
    """Run a webdataset pipeline against a hand-written index file.

    The given contents are written to a temporary index file, then a pipeline
    using it is built and run twice.

    Args:
        index_file_contents: Data written verbatim to the temporary index file.
            The file is opened in NamedTemporaryFile's default (binary) mode.
        tar_file_path: Archive path relative to ``get_dali_extra_path()``.
        ext: Extension argument forwarded to ``base.webdataset_raw_pipeline``.
    """
    # The context manager closes (and thereby removes) the temporary file
    # deterministically, including when build/run raises.
    with tempfile.NamedTemporaryFile() as index_file:
        index_file.write(index_file_contents)
        # Flush so the contents are on disk before the pipeline opens the file by name.
        index_file.flush()
        webdataset_pipeline = base.webdataset_raw_pipeline(
            os.path.join(get_dali_extra_path(), tar_file_path),
            index_file.name,
            ext,
            batch_size=1,
            device_id=0,
            num_threads=1,
        )
        webdataset_pipeline.build()
        webdataset_pipeline.run()
        webdataset_pipeline.run()
def _test_wds_properties(device):
    """Check the source-info strings of the first webdataset batch.

    Each entry of every output is compared with a string built from the
    archive path and one of the reference component offsets below.  Outputs
    are copied to the CPU first unless ``device`` is 'cpu'.
    """
    root_path = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar")
    ref_offset = [1536, 4096, 6144, 8704, 11264, 13824, 16384, 18432]

    p = wds_properties(root_path, device, batch_size=8, num_threads=4, device_id=0)
    p.build()
    for out in p.run():
        if device != 'cpu':
            out = out.as_cpu()
        for source_info, offset in zip(out, ref_offset):
            decoded = _uint8_tensor_to_string(source_info)
            assert decoded == f"archive {root_path}tar file at \"{root_path}\"component offset {offset}"
def test_raise_error_on_missing():
    """Expect ``build`` to raise RuntimeError on ``missing.tar`` in "error" mode."""
    tar_file_path = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/missing.tar")
    index_file = generate_temp_index_file(tar_file_path)
    pipeline_kwargs = dict(
        missing_component_behavior="error",
        batch_size=test_batch_size,
        device_id=0,
        num_threads=1,
    )
    wds_pipeline = webdataset_raw_pipeline(
        tar_file_path, index_file.name, ["jpg", "cls"], **pipeline_kwargs)
    assert_raises(RuntimeError, wds_pipeline.build, glob="Underful sample detected")
def test_skip_sample():
    """Check ``missing_component_behavior="skip"`` on ``missing.tar``.

    The pipeline is compared with a file reader restricted to extracted files
    whose numeric name is below 2500.  A second, identically configured
    pipeline must then report an epoch size of ``num_samples``.
    """
    num_samples = 500
    tar_file_path = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/missing.tar")
    index_file = generate_temp_index_file(tar_file_path)
    extract_dir = generate_temp_extract(tar_file_path)

    def _numeric_stem(path):
        # Integer value of the text between the last "/" and the last ".".
        return int(path[path.rfind("/") + 1:path.rfind(".")])

    equivalent_files = [
        path
        for path in sorted(glob(extract_dir.name + "/*"), key=_numeric_stem)
        if _numeric_stem(path) < 2500
    ]

    def _skipping_pipeline():
        return webdataset_raw_pipeline(
            tar_file_path,
            index_file.name,
            ["jpg", "cls"],
            missing_component_behavior="skip",
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
        )

    compare_pipelines(
        _skipping_pipeline(),
        file_reader_pipeline(equivalent_files, ["jpg", "cls"],
                             batch_size=test_batch_size, device_id=0, num_threads=1),
        test_batch_size,
        math.ceil(num_samples / test_batch_size),
    )

    wds_pipeline = _skipping_pipeline()
    wds_pipeline.build()
    epoch_sizes = list(wds_pipeline.epoch_size().values())
    assert_equal(epoch_sizes[0], num_samples)
def create_closure_callback_img_reader(data_set_size):
    """Create a per-sample callback that reads encoded JPEG files and labels.

    File names and labels come from ``image_list.txt`` (two space-separated
    columns per line) in the ``db/single/jpeg`` directory.

    Args:
        data_set_size: Number of samples per epoch; the callback raises
            ``StopIteration`` once ``idx_in_epoch`` reaches it.

    Returns:
        A callable taking ``sample_info`` and returning ``(encoded_img, label)``,
        a uint8 array of the file bytes and a one-element int32 array.
    """
    data_root = get_dali_extra_path()
    images_dir = os.path.join(data_root, 'db', 'single', 'jpeg')
    with open(os.path.join(images_dir, "image_list.txt"), 'r') as f:
        # Lines read from a file keep their newline, so comparing against ''
        # never filters anything; strip() is what actually skips blank lines,
        # which would otherwise break the two-column unpacking below.
        file_label = [line.rstrip().split(' ') for line in f if line.strip()]
    files, labels = zip(*file_label)

    def py_file_reader(sample_info):
        if sample_info.idx_in_epoch >= data_set_size:
            raise StopIteration
        # Wrap around when the requested epoch is longer than the file list.
        sample_idx = sample_info.idx_in_epoch % len(files)
        jpeg_filename = files[sample_idx]
        label = np.int32([labels[sample_idx]])
        with open(os.path.join(images_dir, jpeg_filename), 'rb') as f:
            encoded_img = np.frombuffer(f.read(), dtype=np.uint8)
        return encoded_img, label

    return py_file_reader
def test_rn50_benchmark(pipe_fun=rn50_pipeline, batch_size=8, num_threads=2, num_samples=256,
                        data_path=None, save_df=None):
    """Benchmark ``pipe_fun`` via ``run_benchmark`` with its flag False, then True.

    Timing summaries for both runs are printed.

    Args:
        pipe_fun: Pipeline function forwarded to ``run_benchmark``.
        batch_size: Batch size forwarded to ``run_benchmark``.
        num_threads: Thread count forwarded to ``run_benchmark``.
        num_samples: Sample count forwarded to ``run_benchmark``.
        data_path: Image directory; defaults to ``db/single/jpeg`` under the
            DALI extra path when falsy.
        save_df: Optional DataFrame to append the results to.

    Returns:
        ``save_df`` concatenated with a two-row results frame, or ``None``
        when ``save_df`` is ``None``.
    """
    data_path = data_path or os.path.join(get_dali_extra_path(), 'db/single/jpeg')

    print(f'num_threads: {num_threads}, batch_size: {batch_size}')

    # First run: the flag argument of run_benchmark is False.
    full_stand, build_stand, times_stand = run_benchmark(
        pipe_fun, batch_size, num_threads, num_samples, False, data_path)
    # The first timing is excluded from the mean (it is reported together with
    # the build time in the summary line).
    iter_time_stand = np.mean(times_stand[1:]) / batch_size
    avg_speed_stand = num_samples / full_stand
    print(
        f'Stand pipeline --- time: {full_stand:8.5f} [s] --- build + 1st iter time: {build_stand:.5f} [s] --- '
        f'avg iter time per sample: {iter_time_stand:7.5f} [s] --- avg speed: {avg_speed_stand:8.3f} [img/s]'
    )

    # Second run: the flag argument of run_benchmark is True.
    full_debug, build_debug, times_debug = run_benchmark(
        pipe_fun, batch_size, num_threads, num_samples, True, data_path)
    iter_time_debug = np.mean(times_debug[1:]) / batch_size
    avg_speed_debug = num_samples / full_debug
    print(
        f'Debug pipeline --- time: {full_debug:8.5f} [s] --- build + 1st iter time: {build_debug:.5f} [s] --- '
        f'avg iter time per sample: {iter_time_debug:7.5f} [s] --- avg speed: {avg_speed_debug:8.3f} [img/s]'
    )

    if save_df is None:
        return None

    results = pd.DataFrame({
        'type': ['standard_sync', 'debug_old'],
        'batch_size': batch_size,
        'time': [full_stand, full_debug],
        'iter_time': [iter_time_stand, iter_time_debug],
        'avg_speed': [avg_speed_stand, avg_speed_debug]
    })
    return pd.concat([save_df, results])
def test_mixed_devices_decoder():
    """Compare ``mixed_image_decoder_pipeline`` output with eager GPU decoding.

    The same seed and batch size are used for the pipeline and for the eager
    file reader; every element-wise comparison must be all-true.
    """
    seed, batch_size = 42, 8
    file_root = os.path.join(get_dali_extra_path(), 'db/single/jpeg')

    pipe = mixed_image_decoder_pipeline(file_root, seed, batch_size=batch_size)
    pipe.build()
    (pipe_out, ) = pipe.run()

    eager_reader = eager.readers.file(file_root=file_root, batch_size=batch_size, seed=seed)
    jpeg, _ = next(eager_reader)
    eager_out = eager.decoders.image(jpeg, device="gpu")

    assert len(pipe_out) == len(eager_out)

    # The == comparison below is evaluated inside the eager arithmetic context.
    with eager.arithmetic():
        for comp_tensor in (pipe_out == eager_out):
            assert np.all(comp_tensor.as_cpu())
def create_closure_generator_img_reader(batch_size, data_set_size):
    """Create a generator function yielding batches of encoded JPEGs and labels.

    File names and labels come from ``image_list.txt`` (two space-separated
    columns per line) in the ``db/single/jpeg`` directory.

    Args:
        batch_size: Number of samples in every yielded batch.
        data_set_size: Upper bound on the number of samples produced; only
            complete batches are yielded.

    Returns:
        A zero-argument generator function yielding ``(batch_imgs, batch_labels)``,
        lists of uint8 arrays and one-element int32 arrays.
    """
    data_root = get_dali_extra_path()
    images_dir = os.path.join(data_root, 'db', 'single', 'jpeg')
    with open(os.path.join(images_dir, "image_list.txt"), 'r') as f:
        # Lines read from a file keep their newline, so comparing against ''
        # never filters anything; strip() is what actually skips blank lines,
        # which would otherwise break the two-column unpacking below.
        file_label = [line.rstrip().split(' ') for line in f if line.strip()]
    files, labels = zip(*file_label)

    def py_file_gen_reader():
        i = 0
        # Stop before a batch that would run past data_set_size.
        while i + batch_size <= data_set_size:
            batch_imgs, batch_labels = [], []
            for _ in range(batch_size):
                jpeg_filename = files[i]
                with open(os.path.join(images_dir, jpeg_filename), 'rb') as f:
                    batch_imgs.append(np.frombuffer(f.read(), dtype=np.uint8))
                batch_labels.append(np.int32([labels[i]]))
                i += 1
            yield batch_imgs, batch_labels

    return py_file_gen_reader
def test_wide_sample():
    """Read ``wide.tar`` with 1000 requested components and compare with a file reader.

    Afterwards a pipeline requesting only "txt" must report an epoch size of
    ``num_samples``.
    """
    test_batch_size = 1
    num_samples = 1
    num_components = 1000
    tar_file_path = os.path.join(get_dali_extra_path(), "db/webdataset/sample-tar/wide.tar")
    index_file = base.generate_temp_index_file(tar_file_path)
    extract_dir = base.generate_temp_extract(tar_file_path)
    equivalent_files = sorted(glob(extract_dir.name + "/*"))

    def _component_names():
        # A fresh list per call, so the two pipelines never share one object.
        return list(map(str, range(num_components)))

    common = dict(batch_size=test_batch_size, device_id=0, num_threads=1)
    wide_pipe = base.webdataset_raw_pipeline(
        tar_file_path, index_file.name, _component_names(), **common)
    files_pipe = base.file_reader_pipeline(equivalent_files, _component_names(), **common)
    compare_pipelines(
        wide_pipe,
        files_pipe,
        test_batch_size,
        math.ceil(num_samples / test_batch_size) * 10,
    )

    wds_pipeline = base.webdataset_raw_pipeline(
        tar_file_path, index_file.name, ["txt"], **common)
    wds_pipeline.build()
    epoch_sizes = list(wds_pipeline.epoch_size().values())
    assert_equal(epoch_sizes[0], num_samples)
def _test_wds_properties(device, generate_index):
    """Check the source-info strings of the first webdataset batch.

    Each entry of every output is expected to decode to
    ``"<root_path>:<offset>:<file name>"`` for the reference values below.
    Outputs are copied to the CPU first unless ``device`` is 'cpu'.

    Args:
        device: Device argument forwarded to ``wds_properties``.
        generate_index: When true, an index file is generated into a temporary
            directory and passed to the pipeline; otherwise ``None`` is passed.
    """
    root_path = os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar")
    ref_filenames = [
        "2000.jpg", "2001.jpg", "2002.jpg", "2003.jpg",
        "2004.jpg", "2005.jpg", "2006.jpg", "2007.jpg"
    ]
    ref_indices = [1536, 4096, 6144, 8704, 11264, 13824, 16384, 18432]

    def _build_and_run(index_paths):
        pipeline = wds_properties(root_path, device, index_paths,
                                  batch_size=8, num_threads=4, device_id=0)
        pipeline.build()
        # The pipeline is returned too, so it stays alive while outputs are checked.
        return pipeline, pipeline.run()

    if not generate_index:
        p, output = _build_and_run(None)
    else:
        # The index only needs to exist while the pipeline is built and run.
        with tempfile.TemporaryDirectory() as idx_dir:
            index_paths = [os.path.join(idx_dir, os.path.basename(root_path) + ".idx")]
            generate_wds_index(root_path, index_paths[0])
            p, output = _build_and_run(index_paths)

    for out in output:
        if device != 'cpu':
            out = out.as_cpu()
        for source_info, ref_fname, ref_idx in zip(out, ref_filenames, ref_indices):
            decoded = _uint8_tensor_to_string(source_info)
            assert decoded == f"{root_path}:{ref_idx}:{ref_fname}"