Example #1
def test_tfrecord():
    class TFRecordPipeline(Pipeline):
        def __init__(self, batch_size, num_threads, device_id, num_gpus, data, data_idx):
            super(TFRecordPipeline, self).__init__(batch_size, num_threads, device_id)
            self.input = ops.readers.TFRecord(path = data,
                                              index_path = data_idx,
                                              features = {"image/encoded" : tfrec.FixedLenFeature((), tfrec.string, ""),
                                                          "image/class/label": tfrec.FixedLenFeature([1], tfrec.int64,  -1)}
                                             )

        def define_graph(self):
            inputs = self.input(name="Reader")
            images = inputs["image/encoded"]
            return images

    tfrecord = os.path.join(get_dali_extra_path(), 'db', 'tfrecord', 'train')
    tfrecord_idx_org = os.path.join(get_dali_extra_path(), 'db', 'tfrecord', 'train.idx')
    tfrecord_idx = "tfr_train.idx"

    idx_files_dir = tempfile.TemporaryDirectory()
    idx_file = os.path.join(idx_files_dir.name, tfrecord_idx)

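    # skip_second (a helper defined elsewhere in this test file) produces an index that
    # skips every other entry; the extra pipe_org.run() in the loop below keeps the
    # reference pipeline aligned with it.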
    skip_second(tfrecord_idx_org, idx_file)

    pipe = TFRecordPipeline(1, 1, 0, 1, tfrecord, idx_file)
    pipe_org = TFRecordPipeline(1, 1, 0, 1, tfrecord, tfrecord_idx_org)
    pipe.build()
    pipe_org.build()
    iters = pipe.epoch_size("Reader")
    for _ in range(iters):
        out = pipe.run()
        out_ref = pipe_org.run()
        for a, b in zip(out, out_ref):
            assert np.array_equal(a.as_array(), b.as_array())
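        # Discard one sample from the reference pipeline to account for the entry
        # skipped in the pruned index.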
        _ = pipe_org.run()
Example #2
def test_tfrecord_reader_alias():
    tfrecord = os.path.join(get_dali_extra_path(), 'db', 'tfrecord', 'train')
    tfrecord_idx = os.path.join(get_dali_extra_path(), 'db', 'tfrecord',
                                'train.idx')
    new_pipe = tfrecord_pipe(fn.readers.tfrecord, tfrecord, tfrecord_idx)
    legacy_pipe = tfrecord_pipe(fn.tfrecord_reader, tfrecord, tfrecord_idx)
    compare_pipelines(new_pipe, legacy_pipe, batch_size_alias_test, 50)
Example #3
def test_pax_format():
    global test_batch_size
    num_samples = 1000
    tar_file_path = os.path.join(get_dali_extra_path(),
                                 "db/webdataset/MNIST/devel-0.tar")
    pax_tar_file_path = os.path.join(get_dali_extra_path(),
                                     "db/webdataset/pax/devel-0.tar")
    index_file = generate_temp_index_file(tar_file_path)

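    # The same data exists as a regular tar (read with a pre-generated index) and as a
    # pax-format tar, which is read below without an index file (None).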
    num_shards = 100
    for shard_id in range(num_shards):
        compare_pipelines(
            webdataset_raw_pipeline(
                tar_file_path,
                index_file.name,
                ["jpg", "cls"],
                num_shards=num_shards,
                shard_id=shard_id,
                batch_size=test_batch_size,
                device_id=0,
                num_threads=1,
            ),
            webdataset_raw_pipeline(
                pax_tar_file_path,
                None,
                ext=["jpg", "cls"],
                num_shards=num_shards,
                shard_id=shard_id,
                batch_size=test_batch_size,
                device_id=0,
                num_threads=1,
            ),
            test_batch_size,
            math.ceil(num_samples / num_shards / test_batch_size) * 2,
        )
Example #4
def test_recordio():
    class MXNetReaderPipeline(Pipeline):
        def __init__(self, batch_size, num_threads, device_id, num_gpus, data, data_idx):
            super(MXNetReaderPipeline, self).__init__(batch_size, num_threads, device_id)
            self.input = ops.readers.MXNet(path = [data], index_path=[data_idx],
                                           shard_id = device_id, num_shards = num_gpus)

        def define_graph(self):
            images, _ = self.input(name="Reader")
            return images

    recordio = os.path.join(get_dali_extra_path(), 'db', 'recordio', 'train.rec')
    recordio_idx_org = os.path.join(get_dali_extra_path(), 'db', 'recordio', 'train.idx')
    recordio_idx = "rio_train.idx"

    idx_files_dir = tempfile.TemporaryDirectory()
    idx_file = os.path.join(idx_files_dir.name, recordio_idx)

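    # As in the TFRecord test above, skip_second produces a RecordIO index with every
    # other entry removed.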
    skip_second(recordio_idx_org, idx_file)

    pipe = MXNetReaderPipeline(1, 1, 0, 1, recordio, idx_file)
    pipe_org = MXNetReaderPipeline(1, 1, 0, 1, recordio, recordio_idx_org)
    pipe.build()
    pipe_org.build()
    iters = pipe.epoch_size("Reader")
    for _ in range(iters):
        out = pipe.run()
        out_ref = pipe_org.run()
        for a, b in zip(out, out_ref):
            assert np.array_equal(a.as_array(), b.as_array())
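        # Discard one sample from the reference pipeline to account for the skipped
        # index entry.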
        _ = pipe_org.run()
Example #5
def test_mxnet_reader_alias():
    recordio = [
        os.path.join(get_dali_extra_path(), 'db', 'recordio', 'train.rec')
    ]
    recordio_idx = [
        os.path.join(get_dali_extra_path(), 'db', 'recordio', 'train.idx')
    ]
    new_pipe = mxnet_pipe(fn.readers.mxnet, recordio, recordio_idx)
    legacy_pipe = mxnet_pipe(fn.mxnet_reader, recordio, recordio_idx)
    compare_pipelines(new_pipe, legacy_pipe, batch_size_alias_test, 50)
Example #6
def test_image_decoders():
    def image_decoder_pipe(max_batch_size, input_data, device):
        pipe = Pipeline(batch_size=max_batch_size, num_threads=4, device_id=0)
        encoded = fn.external_source(source=input_data, cycle=False, device='cpu')
        decoded = fn.decoders.image(encoded, device=device)
        pipe.set_outputs(decoded)
        return pipe

    def image_decoder_crop_pipe(max_batch_size, input_data, device):
        pipe = Pipeline(batch_size=max_batch_size, num_threads=4, device_id=0)
        encoded = fn.external_source(source=input_data, cycle=False, device='cpu')
        decoded = fn.decoders.image_crop(encoded, device=device)
        pipe.set_outputs(decoded)
        return pipe

    def image_decoder_slice_pipe(max_batch_size, input_data, device):
        pipe = Pipeline(batch_size=max_batch_size, num_threads=4, device_id=0)
        encoded = fn.external_source(source=input_data, cycle=False, device='cpu')
        anch = fn.constant(fdata=.1)
        sh = fn.constant(fdata=.4)
        decoded = fn.decoders.image_slice(encoded, anch, sh, axes=0, device=device)
        pipe.set_outputs(decoded)
        return pipe

    def image_decoder_rcrop_pipe(max_batch_size, input_data, device):
        pipe = Pipeline(batch_size=max_batch_size, num_threads=4, device_id=0)
        encoded = fn.external_source(source=input_data, cycle=False, device='cpu')
        decoded = fn.decoders.image_random_crop(encoded, device=device)
        pipe.set_outputs(decoded)
        return pipe

    def peek_image_shape_pipe(max_batch_size, input_data, device):
        pipe = Pipeline(batch_size=max_batch_size, num_threads=4, device_id=0)
        encoded = fn.external_source(source=input_data, cycle=False, device='cpu')
        shape = fn.peek_image_shape(encoded, device=device)
        pipe.set_outputs(shape)
        return pipe

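    # Each decoder variant is exercised for every listed container format on both the
    # 'cpu' and 'mixed' backends; image_random_crop is only run (test_decoders_run)
    # rather than compared, and peek_image_shape is checked for JPEG on CPU only.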
    image_decoder_extensions = ['.jpg', '.bmp', '.png', '.pnm', '.jp2']
    image_decoder_pipes = [image_decoder_pipe,
                           image_decoder_crop_pipe,
                           image_decoder_slice_pipe,
                           ]

    for ext in image_decoder_extensions:
        for pipe in image_decoder_pipes:
            yield test_decoders_check, pipe, \
                  os.path.join(test_utils.get_dali_extra_path(), 'db', 'single'), \
                  ext, ['cpu', 'mixed']
        yield test_decoders_run, image_decoder_rcrop_pipe, \
              os.path.join(test_utils.get_dali_extra_path(), 'db', 'single'), \
              ext, ['cpu', 'mixed']

    yield test_decoders_check, peek_image_shape_pipe, \
          os.path.join(test_utils.get_dali_extra_path(), 'db', 'single'), '.jpg', ['cpu']
Example #7
def test_index_generation():
    global test_batch_size
    num_samples = 3000
    tar_file_paths = [
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-1.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-2.tar"),
    ]

    extract_dirs = [
        generate_temp_extract(tar_file_path)
        for tar_file_path in tar_file_paths
    ]
    equivalent_files = sum(
        list(
            sorted(glob(extract_dir.name + "/*"),
                   key=lambda s: int(s[s.rfind("/") + 1:s.rfind(".")])
                   )  # noqa: 203
            for extract_dir in extract_dirs),
        [],
    )

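    # An empty list of index paths makes the reader generate the indices itself; the
    # output must match a plain file reader over the extracted archives.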
    num_shards = 100
    for shard_id in range(num_shards):
        compare_pipelines(
            webdataset_raw_pipeline(
                tar_file_paths,
                [],
                ["jpg", "cls"],
                missing_component_behavior="error",
                num_shards=num_shards,
                shard_id=shard_id,
                batch_size=test_batch_size,
                device_id=0,
                num_threads=1,
            ),
            file_reader_pipeline(
                equivalent_files,
                ["jpg", "cls"],
                num_shards=num_shards,
                shard_id=shard_id,
                batch_size=test_batch_size,
                device_id=0,
                num_threads=1,
            ),
            test_batch_size,
            math.ceil(num_samples / num_shards / test_batch_size) * 2,
        )
Example #8
def test_return_empty():
    global test_batch_size
    num_samples = 1000
    tar_file_path = os.path.join(get_dali_extra_path(),
                                 "db/webdataset/MNIST/missing.tar")
    index_file = generate_temp_index_file(tar_file_path)

    extract_dir = generate_temp_extract(tar_file_path)
    equivalent_files = glob(extract_dir.name + "/*")
    equivalent_files = sorted(
        equivalent_files,
        key=(lambda s: int(s[s.rfind("/") + 1:s.rfind(".")])))  # noqa: 203

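    # With missing_component_behavior="empty", samples that lack a "txt" component
    # produce empty tensors, which the reference file reader mimics with the empty
    # list in its extension argument.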
    compare_pipelines(
        webdataset_raw_pipeline(
            tar_file_path,
            index_file.name,
            ["jpg", "txt"],
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
            missing_component_behavior="empty",
        ),
        file_reader_pipeline(equivalent_files, ["jpg", []],
                             batch_size=test_batch_size,
                             device_id=0,
                             num_threads=1),
        test_batch_size,
        math.ceil(num_samples / test_batch_size),
    )
Example #9
def test_wrong_feature_shape():
    features = {
        'image/encoded': tfrec.FixedLenFeature((), tfrec.string, ""),
        'image/object/bbox': tfrec.FixedLenFeature([], tfrec.float32, -1.0),
        'image/object/class/label': tfrec.FixedLenFeature([], tfrec.int64, -1),
    }
    test_dummy_data_path = os.path.join(get_dali_extra_path(), 'db',
                                        'coco_dummy')
    pipe = Pipeline(1, 1, 0)
    with pipe:
        input = fn.readers.tfrecord(path=os.path.join(test_dummy_data_path,
                                                      'small_coco.tfrecord'),
                                    index_path=os.path.join(
                                        test_dummy_data_path,
                                        'small_coco_index.idx'),
                                    features=features)
    pipe.set_outputs(input['image/encoded'], input['image/object/class/label'],
                     input['image/object/bbox'])
    pipe.build()
    # The error is raised because FixedLenFeature is declared with a shape too small to hold the input data.
    assert_raises(
        RuntimeError,
        pipe.run,
        glob="Error when executing CPU operator*readers*tfrecord*"
        "Output tensor shape is too small*[]*Expected at least 4 elements")
def test_dtypes():
    global test_batch_size
    num_samples = 100
    tar_file_path = os.path.join(get_dali_extra_path(),
                                 "db/webdataset/sample-tar/dtypes.tar")
    index_file = generate_temp_index_file(tar_file_path)

    wds_pipeline = webdataset_raw_pipeline(
        tar_file_path,
        index_file.name,
        ["float16", "int32", "float64"],
        dtypes=[dali.types.FLOAT16, dali.types.INT32, dali.types.FLOAT64],
        batch_size=test_batch_size,
        device_id=0,
        num_threads=1,
    )
    wds_pipeline.build()
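    # Each component stores ten values equal to the sample index; a new batch is pulled
    # every test_batch_size samples and the values are checked after dtype conversion.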
    for sample_idx in range(num_samples):
        if sample_idx % test_batch_size == 0:
            f16, i32, f64 = wds_pipeline.run()
        idx_in_batch = sample_idx % test_batch_size
        assert (f16.as_array()[idx_in_batch] == [float(sample_idx)] * 10).all()
        assert (i32.as_array()[idx_in_batch] == [int(sample_idx)] * 10).all()
        assert (f64.as_array()[idx_in_batch] == [float(sample_idx)] * 10).all()
Example #11
def get_mix_size_image_pipeline(batch_size,
                                num_threads,
                                device,
                                device_id=0,
                                shard_id=0,
                                num_shards=1,
                                def_for_dataset=False):
    test_data_root = get_dali_extra_path()
    file_root = os.path.join(test_data_root, 'db', 'coco_dummy', 'images')
    annotations_file = os.path.join(test_data_root, 'db', 'coco_dummy',
                                    'instances.json')

    pipe = Pipeline(batch_size, num_threads, device_id)
    with pipe:
        jpegs, _, _, image_ids = fn.readers.coco(
            file_root=file_root,
            annotations_file=annotations_file,
            shard_id=shard_id,
            num_shards=num_shards,
            ratio=False,
            image_ids=True)
        images = fn.decoders.image(
            jpegs,
            device=('mixed' if device == 'gpu' else 'cpu'),
            output_type=types.RGB)

        pipe.set_outputs(images)

    shapes = ((batch_size, None, None, None), )
    dtypes = (tf.float32, )

    return pipe, shapes, dtypes
Example #12
def general_corner_case(test_batch_size=base.test_batch_size,
                        dtypes=None,
                        missing_component_behavior="",
                        **kwargs):
    num_samples = 1000
    tar_file_path = os.path.join(get_dali_extra_path(),
                                 "db/webdataset/MNIST/devel-0.tar")
    index_file = base.generate_temp_index_file(tar_file_path)

    extract_dir = base.generate_temp_extract(tar_file_path)
    equivalent_files = sorted(
        glob(extract_dir.name + "/*"),
        key=lambda s: int(s[s.rfind("/") + 1:s.rfind(".")]))

    compare_pipelines(
        base.webdataset_raw_pipeline(
            tar_file_path,
            index_file.name, ["jpg", "cls"],
            missing_component_behavior=missing_component_behavior,
            dtypes=dtypes,
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
            **kwargs),
        base.file_reader_pipeline(equivalent_files, ["jpg", "cls"],
                                  batch_size=test_batch_size,
                                  device_id=0,
                                  num_threads=1,
                                  **kwargs),
        test_batch_size,
        math.ceil(num_samples / test_batch_size),
    )
Example #13
def paths_index_paths_error():
    webdataset_pipeline = base.webdataset_raw_pipeline(
        [
            os.path.join(get_dali_extra_path(),
                         "db/webdataset/MNIST/devel-0.tar"),
            os.path.join(get_dali_extra_path(),
                         "db/webdataset/MNIST/devel-1.tar"),
            os.path.join(get_dali_extra_path(),
                         "db/webdataset/MNIST/devel-2.tar"),
        ],
        ["test.idx"],
        ["jpg", "cls"],
        batch_size=1,
        device_id=0,
        num_threads=1,
    )
    webdataset_pipeline.build()
Example #14
def test_wds_sharding():
    global test_batch_size
    num_samples = 3000
    tar_file_paths = [
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-0.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-1.tar"),
        os.path.join(get_dali_extra_path(), "db/webdataset/MNIST/devel-2.tar"),
    ]
    index_files = [
        generate_temp_index_file(tar_file_path)
        for tar_file_path in tar_file_paths
    ]

    extract_dirs = [
        generate_temp_extract(tar_file_path)
        for tar_file_path in tar_file_paths
    ]
    equivalent_files = sum(
        list(
            sorted(glob(extract_dir.name + "/*"),
                   key=lambda s: int(s[s.rfind("/") + 1:s.rfind(".")])
                   )  # noqa: 203
            for extract_dir in extract_dirs),
        [],
    )

    compare_pipelines(
        webdataset_raw_pipeline(
            tar_file_paths,
            [index_file.name for index_file in index_files],
            ["jpg", "cls"],
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
        ),
        file_reader_pipeline(
            equivalent_files,
            ["jpg", "cls"],
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
        ),
        test_batch_size,
        math.ceil(num_samples / test_batch_size),
    )
Example #15
def _test_tfr_properties(device):
    root_path = os.path.join(get_dali_extra_path(), 'db', 'tfrecord', 'train')
    index_path = os.path.join(get_dali_extra_path(), 'db', 'tfrecord',
                              'train.idx')
    idx = [0, 171504, 553687, 651500, 820966, 1142396, 1380096, 1532947]
    p = tfr_properties(root_path,
                       index_path,
                       device,
                       batch_size=8,
                       num_threads=4,
                       device_id=0)
    p.build()
    output = p.run()
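    # source_info for every sample should name the TFRecord file and the offset taken
    # from its index (the values listed in idx).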
    for out in output:
        out = out if device == 'cpu' else out.as_cpu()
        for source_info, ref_idx in zip(out, idx):
            assert _uint8_tensor_to_string(
                source_info) == f"{root_path} at index {ref_idx}"
Example #16
def test_tfrecord_reader_alias2():
    tfrecord = os.path.join(get_dali_extra_path(), 'db', 'tfrecord', 'train')
    tfrecord_idx = os.path.join(get_dali_extra_path(), 'db', 'tfrecord',
                                'train.idx')
    pipe = tfrecord_pipe_empty_fields(tfrecord, tfrecord_idx)
    pipe.build()
    out = pipe.run()
    for tensor in out[0]:
        data = np.array(tensor)
        assert len(data) != 0
        assert data.dtype == np.uint8
    for tensor in out[1]:
        data = np.array(tensor)
        assert len(data.shape) == 0
        assert data.dtype == np.int64
    for tensor in out[2]:
        data = np.array(tensor)
        assert len(data.shape) == 0
        assert data.dtype == np.float32
Example #17
def _test_improper_property(device):
    root_path = os.path.join(get_dali_extra_path(),
                             "db/webdataset/MNIST/devel-0.tar")
    p = improper_property(root_path,
                          device,
                          batch_size=8,
                          num_threads=4,
                          device_id=0)
    p.build()
    p.run()
Example #18
def test_audio_decoders():
    def audio_decoder_pipe(max_batch_size, input_data, device):
        pipe = Pipeline(batch_size=max_batch_size, num_threads=4, device_id=0)
        encoded = fn.external_source(source=input_data, cycle=False, device='cpu')
        decoded, _ = fn.decoders.audio(encoded, downmix=True, sample_rate=12345, device=device)
        pipe.set_outputs(decoded)
        return pipe

    yield test_decoders_check, audio_decoder_pipe, \
          os.path.join(test_utils.get_dali_extra_path(), 'db', 'audio'), '.wav'
Example #19
def _test_lambda_np_readfromfile(name, py_callback_pickler):
    data_root = get_dali_extra_path()
    images_dir = os.path.join(data_root, 'db', 'single', 'jpeg')

    with open(os.path.join(images_dir, "image_list.txt"), 'r') as f:
        file_label = [line.rstrip().split(' ') for line in f if line != '']
        files, _ = zip(*file_label)

    _create_and_compare_simple_pipelines(lambda x: (np.fromfile(
        os.path.join(images_dir, files[x.idx_in_epoch % len(files)]),
        dtype=np.uint8)),
                                         py_callback_pickler,
                                         batch_size=8,
                                         py_num_workers=2)
Example #20
def get_image_pipeline(batch_size, num_threads, device, device_id=0, shard_id=0, num_shards=1,
        def_for_dataset=False):
    test_data_root = get_dali_extra_path()
    file_root = os.path.join(test_data_root, 'db', 'coco_dummy', 'images')
    annotations_file = os.path.join(
        test_data_root, 'db', 'coco_dummy', 'instances.json')

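    # COCO reader -> decode -> resize to 224x224 -> normalize; the image ids are also
    # returned reshaped to [1, 1] and cast to int16, matching the shapes and dtypes
    # declared at the end of the function.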
    pipe = Pipeline(batch_size, num_threads, device_id)
    with pipe:
        jpegs, _, _, image_ids = fn.readers.coco(
            file_root=file_root,
            annotations_file=annotations_file,
            shard_id=shard_id,
            num_shards=num_shards,
            ratio=False,
            image_ids=True)
        images = fn.decoders.image(
            jpegs,
            device=('mixed' if device == 'gpu' else 'cpu'),
            output_type=types.RGB)
        images = fn.resize(
            images,
            resize_x=224,
            resize_y=224,
            interp_type=types.INTERP_LINEAR)
        images = fn.crop_mirror_normalize(
            images,
            dtype=types.FLOAT,
            mean=[128., 128., 128.],
            std=[1., 1., 1.])
        if device == 'gpu':
            image_ids = image_ids.gpu()
        ids_reshaped = fn.reshape(image_ids, shape=[1, 1])
        ids_int16 = fn.cast(image_ids, dtype=types.INT16)

        pipe.set_outputs(images, ids_reshaped, ids_int16)

    shapes = (
        (batch_size, 3, 224, 224),
        (batch_size, 1, 1),
        (batch_size, 1))
    dtypes = (
        tf.float32,
        tf.int32,
        tf.int16)

    return pipe, shapes, dtypes
Example #21
def general_index_error(index_file_contents,
                        tar_file_path="db/webdataset/MNIST/devel-0.tar",
                        ext="jpg"):
    index_file = tempfile.NamedTemporaryFile()
    index_file.write(index_file_contents)
    index_file.flush()
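    # Build a reader over a real archive but with the caller-supplied index contents
    # (typically malformed), so running the pipeline exercises the index error path.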
    webdataset_pipeline = base.webdataset_raw_pipeline(
        os.path.join(get_dali_extra_path(), tar_file_path),
        index_file.name,
        ext,
        batch_size=1,
        device_id=0,
        num_threads=1,
    )
    webdataset_pipeline.build()
    webdataset_pipeline.run()
    webdataset_pipeline.run()
Example #22
def _test_wds_properties(device):
    root_path = os.path.join(get_dali_extra_path(),
                             "db/webdataset/MNIST/devel-0.tar")
    ref_offset = [1536, 4096, 6144, 8704, 11264, 13824, 16384, 18432]
    p = wds_properties(root_path,
                       device,
                       batch_size=8,
                       num_threads=4,
                       device_id=0)
    p.build()
    output = p.run()
    for out in output:
        out = out if device == 'cpu' else out.as_cpu()
        for source_info, offset in zip(out, ref_offset):
            assert _uint8_tensor_to_string(
                source_info
            ) == f"archive {root_path}tar file at \"{root_path}\"component offset {offset}"
def test_raise_error_on_missing():
    global test_batch_size
    tar_file_path = os.path.join(get_dali_extra_path(),
                                 "db/webdataset/MNIST/missing.tar")
    index_file = generate_temp_index_file(tar_file_path)
    wds_pipeline = webdataset_raw_pipeline(
        tar_file_path,
        index_file.name,
        ["jpg", "cls"],
        missing_component_behavior="error",
        batch_size=test_batch_size,
        device_id=0,
        num_threads=1,
    )
    assert_raises(RuntimeError,
                  wds_pipeline.build,
                  glob="Underful sample detected")
def test_skip_sample():
    global test_batch_size
    num_samples = 500
    tar_file_path = os.path.join(get_dali_extra_path(),
                                 "db/webdataset/MNIST/missing.tar")
    index_file = generate_temp_index_file(tar_file_path)

    extract_dir = generate_temp_extract(tar_file_path)
    equivalent_files = list(
        filter(lambda s: int(s[s.rfind("/") + 1:s.rfind(".")]) < 2500,
               sorted(glob(extract_dir.name + "/*"),
                      key=lambda s: int(s[s.rfind("/") + 1:s.rfind(".")]))))

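    # The reference file list keeps only samples with index below 2500; with
    # missing_component_behavior="skip" the webdataset reader is expected to drop the
    # incomplete samples and produce the same output.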
    compare_pipelines(
        webdataset_raw_pipeline(
            tar_file_path,
            index_file.name,
            ["jpg", "cls"],
            missing_component_behavior="skip",
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
        ),
        file_reader_pipeline(equivalent_files, ["jpg", "cls"],
                             batch_size=test_batch_size,
                             device_id=0,
                             num_threads=1),
        test_batch_size,
        math.ceil(num_samples / test_batch_size),
    )
    wds_pipeline = webdataset_raw_pipeline(
        tar_file_path,
        index_file.name,
        ["jpg", "cls"],
        missing_component_behavior="skip",
        batch_size=test_batch_size,
        device_id=0,
        num_threads=1,
    )
    wds_pipeline.build()
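    # Samples skipped because of missing components must not count toward the epoch size.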
    assert_equal(list(wds_pipeline.epoch_size().values())[0], num_samples)
Example #25
def create_closure_callback_img_reader(data_set_size):
    data_root = get_dali_extra_path()
    images_dir = os.path.join(data_root, 'db', 'single', 'jpeg')

    with open(os.path.join(images_dir, "image_list.txt"), 'r') as f:
        file_label = [line.rstrip().split(' ') for line in f if line != '']
        files, labels = zip(*file_label)

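    # Per-sample callback (as used with fn.external_source): returns the encoded JPEG
    # and its label for the requested index and ends the epoch after data_set_size
    # samples.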
    def py_file_reader(sample_info):
        if sample_info.idx_in_epoch >= data_set_size:
            raise StopIteration
        sample_idx = sample_info.idx_in_epoch % len(files)
        jpeg_filename = files[sample_idx]
        label = np.int32([labels[sample_idx]])
        with open(os.path.join(images_dir, jpeg_filename), 'rb') as f:
            encoded_img = np.frombuffer(f.read(), dtype=np.uint8)
        return encoded_img, label

    return py_file_reader
Example #26
def test_rn50_benchmark(pipe_fun=rn50_pipeline,
                        batch_size=8,
                        num_threads=2,
                        num_samples=256,
                        data_path=None,
                        save_df=None):
    if not data_path:
        data_path = os.path.join(get_dali_extra_path(), 'db/single/jpeg')

    print(f'num_threads: {num_threads}, batch_size: {batch_size}')

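    # run_benchmark is called twice, once with the flag False (standard pipeline) and
    # once with True (debug pipeline); build time, per-sample time and throughput are
    # reported for each.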
    full_stand, build_stand, times_stand = run_benchmark(
        pipe_fun, batch_size, num_threads, num_samples, False, data_path)
    iter_time_stand = np.mean(times_stand[1:]) / batch_size
    avg_speed_stand = num_samples / full_stand

    print(
        f'Stand pipeline --- time: {full_stand:8.5f} [s] --- build + 1st iter time: {build_stand:.5f} [s] --- '
        f'avg iter time per sample: {iter_time_stand:7.5f} [s] --- avg speed: {avg_speed_stand:8.3f} [img/s]'
    )

    full_debug, build_debug, times_debug = run_benchmark(
        pipe_fun, batch_size, num_threads, num_samples, True, data_path)
    iter_time_debug = np.mean(times_debug[1:]) / batch_size
    avg_speed_debug = num_samples / full_debug

    print(
        f'Debug pipeline --- time: {full_debug:8.5f} [s] --- build + 1st iter time: {build_debug:.5f} [s] --- '
        f'avg iter time per sample: {iter_time_debug:7.5f} [s] --- avg speed: {avg_speed_debug:8.3f} [img/s]'
    )

    if save_df is not None:
        df = pd.DataFrame({
            'type': ['standard_sync', 'debug_old'],
            'batch_size': batch_size,
            'time': [full_stand, full_debug],
            'iter_time': [iter_time_stand, iter_time_debug],
            'avg_speed': [avg_speed_stand, avg_speed_debug]
        })
        return pd.concat([save_df, df])

    return None
Example #27
def test_mixed_devices_decoder():
    """ Tests hidden functionality of exposing eager operators as classes. """
    seed = 42
    batch_size = 8
    file_root = os.path.join(get_dali_extra_path(), 'db/single/jpeg')

    pipe = mixed_image_decoder_pipeline(file_root, seed, batch_size=batch_size)
    pipe.build()
    pipe_out, = pipe.run()

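    # Decode the same files with the eager API and compare element-wise against the
    # pipeline output.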
    jpeg, _ = next(
        eager.readers.file(file_root=file_root,
                           batch_size=batch_size,
                           seed=seed))
    eager_out = eager.decoders.image(jpeg, device="gpu")

    assert len(pipe_out) == len(eager_out)

    with eager.arithmetic():
        for comp_tensor in (pipe_out == eager_out):
            assert np.all(comp_tensor.as_cpu())
Example #28
def create_closure_generator_img_reader(batch_size, data_set_size):
    data_root = get_dali_extra_path()
    images_dir = os.path.join(data_root, 'db', 'single', 'jpeg')

    with open(os.path.join(images_dir, "image_list.txt"), 'r') as f:
        file_label = [line.rstrip().split(' ') for line in f if line != '']
        files, labels = zip(*file_label)

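    # Batch generator: yields lists of encoded JPEGs and int32 labels until fewer than
    # batch_size samples remain.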
    def py_file_gen_reader():
        i = 0
        while i + batch_size <= data_set_size:
            batch_imgs, batch_labels = [], []
            for _ in range(batch_size):
                jpeg_filename = files[i]
                with open(os.path.join(images_dir, jpeg_filename), 'rb') as f:
                    batch_imgs.append(np.frombuffer(f.read(), dtype=np.uint8))
                batch_labels.append(np.int32([labels[i]]))
                i += 1
            yield batch_imgs, batch_labels

    return py_file_gen_reader
Example #29
def test_wide_sample():
    test_batch_size = 1
    num_samples = 1
    tar_file_path = os.path.join(get_dali_extra_path(),
                                 "db/webdataset/sample-tar/wide.tar")
    index_file = base.generate_temp_index_file(tar_file_path)

    extract_dir = base.generate_temp_extract(tar_file_path)
    equivalent_files = list(sorted(glob(extract_dir.name + "/*")))

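    # wide.tar holds a single sample with 1000 components; all of them are read and
    # compared against the extracted files, then the epoch size is verified with a
    # single-component ("txt") reader.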
    num_components = 1000
    compare_pipelines(
        base.webdataset_raw_pipeline(
            tar_file_path,
            index_file.name,
            [str(x) for x in range(num_components)],
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
        ),
        base.file_reader_pipeline(
            equivalent_files,
            [str(x) for x in range(num_components)],
            batch_size=test_batch_size,
            device_id=0,
            num_threads=1,
        ),
        test_batch_size,
        math.ceil(num_samples / test_batch_size) * 10,
    )
    wds_pipeline = base.webdataset_raw_pipeline(
        tar_file_path,
        index_file.name,
        ["txt"],
        batch_size=test_batch_size,
        device_id=0,
        num_threads=1,
    )
    wds_pipeline.build()
    assert_equal(list(wds_pipeline.epoch_size().values())[0], num_samples)
Example #30
def _test_wds_properties(device, generate_index):
    root_path = os.path.join(get_dali_extra_path(),
                             "db/webdataset/MNIST/devel-0.tar")
    ref_filenames = [
        "2000.jpg", "2001.jpg", "2002.jpg", "2003.jpg", "2004.jpg", "2005.jpg",
        "2006.jpg", "2007.jpg"
    ]
    ref_indices = [1536, 4096, 6144, 8704, 11264, 13824, 16384, 18432]
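    # source_info is checked both with an explicitly generated index file and without
    # one (index generated internally); it should combine the archive path, component
    # offset and original file name.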
    if generate_index:
        with tempfile.TemporaryDirectory() as idx_dir:
            index_paths = [
                os.path.join(idx_dir,
                             os.path.basename(root_path) + ".idx")
            ]
            generate_wds_index(root_path, index_paths[0])
            p = wds_properties(root_path,
                               device,
                               index_paths,
                               batch_size=8,
                               num_threads=4,
                               device_id=0)
            p.build()
            output = p.run()
    else:
        p = wds_properties(root_path,
                           device,
                           None,
                           batch_size=8,
                           num_threads=4,
                           device_id=0)
        p.build()
        output = p.run()
    for out in output:
        out = out if device == 'cpu' else out.as_cpu()
        for source_info, ref_fname, ref_idx in zip(out, ref_filenames,
                                                   ref_indices):
            assert _uint8_tensor_to_string(
                source_info) == f"{root_path}:{ref_idx}:{ref_fname}"