Example #1
def time_tiledb(dataset, batch_size=1):
    ds = hub.Dataset(dataset)
    path = dataset.split("/")[1] + "_tileDB"
    if os.path.exists(path):
        ds_tldb = tiledb.open(path)
    else:
        os.makedirs(path)
        # Flatten each image and append its label as the final column.
        ds_numpy = np.concatenate(
            (
                ds["image"].compute().reshape(ds.shape[0], -1),
                ds["label"].compute().reshape(ds.shape[0], -1),
            ),
            axis=1,
        )
        ds_tldb = tiledb.from_numpy(path, ds_numpy)

    assert type(ds_tldb) == tiledb.array.DenseArray

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, :-1],
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, -1],
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
Example #2
def time_batches(dataset, batch_size=1, num_batches=1, hub=False):
    np.random.seed(0)
    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(num_batches):
            if hub is False:
                dataset[batch * batch_size:(batch + 1) *
                        batch_size, :-1] = np.random.randint(255,
                                                             size=(batch_size,
                                                                   28 * 28))
                dataset[batch * batch_size:(batch + 1) * batch_size,
                        -1] = np.random.randint(10, size=(batch_size, ))
            else:
                dataset["image"][batch * batch_size:(batch + 1) *
                                 batch_size] = np.random.randint(
                                     255, size=(batch_size, 28, 28, 1))
                dataset["label"][batch * batch_size:(batch + 1) *
                                 batch_size] = np.random.randint(
                                     10, size=(batch_size, 1))
                dataset.flush()
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
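Every snippet on this page times its body with Timer from hub.utils. To run the helpers standalone, a minimal stand-in can be substituted; the sketch below is an assumption about its behavior, not the library's actual implementation.

from time import time


class Timer:
    """Minimal stand-in for hub.utils.Timer: prints elapsed wall-clock time."""

    def __init__(self, label):
        self.label = label

    def __enter__(self):
        self.start = time()
        return self

    def __exit__(self, *exc):
        print(f"{self.label}: {time() - self.start:.4f}s")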
Example #3
def test_pipeline():

    ds = hub.Dataset("./data/test/test_pipeline_multiple2",
                     mode="w",
                     shape=(100, ),
                     schema=my_schema)

    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence", i] = 0.2

    with Timer("multiple pipes"):

        @hub.transform(schema=my_schema)
        def my_transform(sample, multiplier: int = 2):
            return {
                "image": sample["image"] * multiplier,
                "label": sample["label"],
                "confidence": sample["confidence"] * multiplier,
            }

        out_ds = my_transform(ds, multiplier=2)
        out_ds = my_transform(out_ds, multiplier=2)
        out_ds = out_ds.store("./data/test/test_pipeline_multiple_4")

        assert (out_ds["image", 0].compute() == 4).all()
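test_pipeline reads a module-level my_schema that the snippet does not show. A plausible definition, inferred from the three fields the test writes, is sketched below (hypothetical; the repository's actual schema may differ).

from hub.schema import Tensor, Text

# Inferred from the writes above: a (28, 28, 4) int32 image, a short
# text label, and a scalar float confidence.
my_schema = {
    "image": Tensor((28, 28, 4), dtype="int32"),
    "label": Text(shape=(None,), max_shape=(20,)),
    "confidence": "float",
}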
Example #4
def test_to_from_pytorch():
    my_schema = {
        "image": Tensor((10, 10, 3), "uint8"),
        "label": {
            "c": Tensor((5, 3), "uint8"),
            "d": {
                "e": Tensor((5, 3), "uint8")
            },
            "f": "float",
        },
    }
    ds = hub.Dataset(
        schema=my_schema,
        shape=(10, ),
        url="./data/test_from_pytorch/test20",
        mode="w",
        cache=False,
    )
    for i in range(10):
        ds["image", i] = i * np.ones((10, 10, 3))
        ds["label", "d", "e", i] = i * np.ones((5, 3))

    ds = ds.to_pytorch()
    out_ds = hub.Dataset.from_pytorch(ds)
    with Timer("storing"):
        res_ds = out_ds.store("./data/test_from_pytorch/test30")

    for i in range(10):
        assert (res_ds["label", "d", "e", i].numpy() == i * np.ones(
            (5, 3))).all()
Example #5
def time_random_access(dataset_name="activeloop/mnist",
                       offset=1000,
                       span=1000,
                       field="image"):
    dset = Dataset(dataset_name, cache=False, storage_cache=False)
    with Timer(
            f"{dataset_name} read at offset {offset:03} of length {span:03}"):
        dset[field][offset:offset + span].compute()
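A sweep over offsets shows whether read latency depends on position within the dataset; a usage sketch, assuming the helper above is in scope:

# Each call opens the dataset fresh (cache=False), so timing
# differences reflect network and server-side variance.
for offset in (0, 1000, 50000):
    time_random_access("activeloop/mnist", offset=offset, span=1000)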
Example #6
def bench_pil_compression(img_path=img_path, count=count):
    img = Image.open(img_path)
    arr = np.array(img)
    print(arr.shape)
    with Timer("PIL compression"):
        for i in range(0, count):
            img = Image.fromarray(arr)
            b = BytesIO()
            img.save(b, format="png")
            assert b.tell() > 0
Example #7
def main():
    numcodecs.register_codec(PngCodec, "png")
    with Timer("Compress"):
        arr = zarr.create(
            shape=(10, 10, 1920, 1080, 7),
            dtype="uint8",
            compressor=PngCodec(solo_channel=True),
            store=zarr.MemoryStore(),
        )
        arr[:] = np.ones((10, 10, 1920, 1080, 7), dtype="uint8")
        print(arr[:].shape)
Example #8
def time_iter_hub_wasabi_tensorflow(
    dataset_info, batch_size=BATCH_SIZE, prefetch_factor=PREFETCH_SIZE, process=None
):
    dset = Dataset(dataset_info["hub_name"], cache=False, storage_cache=False, mode="r")
    loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor)

    with Timer("Hub (remote - Wasabi) `.to_tensorflow()`"):
        for batch in loader:
            image = batch["image"]
            label = batch["label"]
            if process is not None:
                process(image, label)
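This benchmark, like several others on this page, relies on module-level constants and a dataset_info mapping that the snippets omit. Plausible values are sketched below (assumptions, not the repository's actual configuration).

BATCH_SIZE = 16
PREFETCH_SIZE = 4
NUM_WORKERS = 1
ROOT = "./data/benchmarks"

dataset_info = {
    "name": "mnist",                 # TFDS dataset name
    "split": "train",                # TFDS split
    "hub_name": "activeloop/mnist",  # Hub path of the remote copy
}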
Example #9
def main():
    schema = {
        "image": Image(shape=(None, None), max_shape=(28, 28)),
        "label": ClassLabel(num_classes=10),
    }
    path = "./data/examples/new_api_intro2"

    ds = Dataset(path, shape=(10, ), mode="w", schema=schema)
    print(len(ds))
    for i in range(len(ds)):
        with Timer("writing single element"):
            ds["image", i] = np.ones((28, 28), dtype="uint8")
            ds["label", i] = 3

    ds.resize_shape(200)
    print(ds.shape)
    print(ds["label", 100:110].numpy())
    with Timer("Committing"):
        ds.flush()

    ds = Dataset(path)
    print(ds.schema)
    print(ds["image", 0].compute())
Example #10
def bench_hub_compression(times=REPEAT_TIMES):
    arr = np.array(IMG)
    ds = hub.Dataset(
        "./data/bench_png_compression",
        mode="w",
        shape=times,
        schema={"image": hub.schema.Image(arr.shape, compressor="png")},
    )

    batch = np.zeros((times, ) + arr.shape, dtype="uint8")
    for i in range(times):
        batch[i] = arr

    with Timer("Hub compression"):
        ds["image", :times] = batch
Example #11
def time_iter_hub_local_tensorflow(
    dataset_info, batch_size=BATCH_SIZE, prefetch_factor=PREFETCH_SIZE, process=None
):
    dset = Dataset.from_tfds(dataset_info["name"], split=dataset_info["split"])
    path = os.path.join(ROOT, "Hub_data", "tfds")
    dset.store(path)
    dset = Dataset(path, cache=False, storage_cache=False, mode="r")
    loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor)

    with Timer("Hub (local) `.to_tensorflow()`"):
        for batch in loader:
            image = batch["image"]
            label = batch["label"]
            if process is not None:
                process(image, label)
Example #12
def time_iter_tensorflow(dataset_name="activeloop/mnist",
                         batch_size=1,
                         prefetch_factor=0,
                         process=None):

    dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r")

    loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor)

    with Timer(
            f"{dataset_name} TF prefetch {prefetch_factor:03} in batches of {batch_size:03}"
    ):
        for idx, batch in enumerate(loader):
            image = batch["image"]
            label = batch["label"]
            if process is not None:
                process(idx, image, label)
Example #13
def time_hub(dataset, batch_size=1):
    ds = hub.Dataset(dataset, cache=False, storage_cache=False, mode="r")

    assert type(ds) == hub.api.dataset.Dataset

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds[batch * batch_size : (batch + 1) * batch_size]["image"].compute(),
                ds[batch * batch_size : (batch + 1) * batch_size]["label"].compute(),
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
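time_hub shares its signature with time_tiledb (Example #1) and time_zarr (Example #22), so the three backends can be compared head to head; a driver sketch, assuming all three helpers are in scope:

DATASET = "activeloop/mnist"

for bench in (time_hub, time_zarr, time_tiledb):
    print(f"--- {bench.__name__} ---")
    bench(DATASET, batch_size=128)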
Example #14
def bench_hub_compression(img_path=img_path, count=count):
    img = Image.open(img_path)
    arr = np.array(img)
    print(arr.shape)
    ds = hub.Dataset(
        "./data/benchmarks/bench_png_compression",
        mode="w",
        shape=count,
        schema={"image": hub.schema.Image(arr.shape, compressor="png")},
    )
    print(ds._tensors["/image"].chunks)
    bigarr = np.zeros((count, ) + arr.shape, dtype="uint8")
    for i in range(count):
        bigarr[i] = arr

    with Timer("Hub compression"):
        ds["image", :count] = bigarr
Example #15
def time_iter_tensorflow(dataset_info,
                         batch_size=BATCH_SIZE,
                         prefetch_factor=PREFETCH_SIZE,
                         process=None):
    # turn off optimizations
    options = tf.data.Options()
    blockAS = tf.data.experimental.AutoShardPolicy.OFF
    options.experimental_distribute.auto_shard_policy = blockAS
    options.experimental_optimization.autotune_cpu_budget = 1

    loader = tfds.load(dataset_info["name"],
                       split=dataset_info["split"]).with_options(options)

    with Timer("Tensorflow (local, native - TFDS)"):
        for batch in loader:
            image = batch["image"]
            label = batch["label"]
            if process is not None:
                process(image, label)
Example #16
def time_iter_hub_wasabi_pytorch(
    dataset_info,
    batch_size=BATCH_SIZE,
    prefetch_factor=PREFETCH_SIZE,
    num_workers=NUM_WORKERS,
    process=None,
):
    dset = Dataset(dataset_info["hub_name"], cache=False, storage_cache=False, mode="r")
    loader = torch.utils.data.DataLoader(
        dset.to_pytorch(),
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    with Timer("Hub (remote - Wasabi) `.to_pytorch()`"):
        for image, label in loader:
            if process is not None:
                process(image, label)
Example #17
def test_from_pytorch():
    from torch.utils.data import Dataset

    class TestDataset(Dataset):
        def __init__(self, transform=None):
            self.transform = transform

        def __len__(self):
            return 12

        def __iter__(self):
            for i in range(len(self)):
                yield self[i]

        def __getitem__(self, idx):
            image = 5 * np.ones((256, 256, 3))
            landmarks = 7 * np.ones((10, 10, 10))
            named = "testing text labels"
            sample = {
                "data": {
                    "image": image,
                    "landmarks": landmarks
                },
                "labels": {
                    "named": named
                },
            }

            if self.transform:
                sample = self.transform(sample)
            return sample

    tds = TestDataset()
    with Timer("from_pytorch"):
        ds = hub.Dataset.from_pytorch(tds)

    ds = ds.store("./data/test_from_pytorch/test1")

    assert (ds["data", "image", 3].numpy() == 5 * np.ones((256, 256, 3))).all()
    assert (ds["data", "landmarks", 2].numpy() == 7 * np.ones(
        (10, 10, 10))).all()
    assert ds["labels", "named", 5].numpy() == "testing text labels"
Example #18
def main():
    sample_count = 70000
    step = 10
    with Timer("Time"):

        ds = hub.Dataset(
            "./data/examples/mnist_upload_speed_benchmark",
            mode="w",
            schema=schema,
            shape=(sample_count, ),
            cache=2**26,
        )

        arr = (np.random.rand(step, 28, 28) * 100).astype("uint8")

        for i in range(0, sample_count, step):
            # with Timer(f"Sample {i}"):
            ds["image", i:i + step] = arr

        ds.commit()
Example #19
def time_iter_pytorch(dataset_name="activeloop/mnist",
                      batch_size=1,
                      prefetch_factor=0,
                      process=None):

    dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r")

    loader = torch.utils.data.DataLoader(
        dset.to_pytorch(),
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=1,
    )

    with Timer(
            f"{dataset_name} PyTorch prefetch {prefetch_factor:03} in batches of {batch_size:03}"
    ):
        for idx, (image, label) in enumerate(loader):
            if process is not None:
                process(idx, image, label)
Example #20
def time_iter_pytorch(
    dataset_info,
    batch_size=BATCH_SIZE,
    prefetch_factor=PREFETCH_SIZE,
    num_workers=NUM_WORKERS,
    process=None,
):
    dset = prepare_torch_dataset(dataset_info)

    loader = torch.utils.data.DataLoader(
        dset,
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    with Timer("PyTorch (local, native)"):
        for image, label in loader:
            if process is not None:
                process(image, label)
Example #21
def test_multiprocessing(sample_size=200,
                         width=100,
                         channels=4,
                         dtype="uint8"):

    my_schema = {
        "image":
        Image(
            (width, width, channels),
            dtype,
            (width, width, channels),
            chunks=(sample_size // 20),
            compressor="LZ4",
        ),
    }

    with Timer("multiprocesing"):

        @hub.transform(schema=my_schema, scheduler="threaded", workers=4)
        def my_transform(x):
            # Simulate CPU-bound work; the result below is intentionally unused.
            a = np.random.random((width, width, channels))
            for i in range(100):
                a *= np.random.random((width, width, channels))

            return {
                "image": (np.ones(
                    (width, width, channels), dtype=dtype) * 255),
            }

        ds = hub.Dataset(
            "./data/test/test_pipeline_basic_4",
            mode="w",
            shape=(sample_size, ),
            schema=my_schema,
            cache=2**26,
        )

        ds_t = my_transform(ds).store("./data/test/test_pipeline_basic_4")

    assert (ds_t["image", :].compute() == 255).all()
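Note that despite the test's name, scheduler="threaded" runs the transform on a thread pool; a process pool is selected with scheduler="processed", as Example #26 does.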
Example #22
def time_zarr(dataset, batch_size=1):
    ds = hub.Dataset(dataset)
    if os.path.exists(dataset.split("/")[1] + "_zarr"):
        ds_zarr = zarr.open(dataset.split("/")[1] + "_zarr")
    else:
        store = zarr.DirectoryStore(dataset.split("/")[1] + "_zarr")
        shape = [
            ds["image"].shape[0],
            ds["image"].shape[1] * ds["image"].shape[2] * ds["image"].shape[3]
            + 1,
        ]
        ds_zarr = zarr.create((shape[0], shape[1]),
                              store=store,
                              chunks=(batch_size, None))
        for batch in range(ds.shape[0] // batch_size):
            ds_numpy = np.concatenate(
                (
                    ds["image", batch * batch_size:(batch + 1) *
                       batch_size].compute().reshape(batch_size, -1),
                    ds["label", batch * batch_size:(batch + 1) *
                       batch_size].compute().reshape(batch_size, -1),
                ),
                axis=1,
            )
            ds_zarr[batch * batch_size:(batch + 1) * batch_size] = ds_numpy

    assert type(ds_zarr) == zarr.core.Array

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds_zarr[batch * batch_size:(batch + 1) * batch_size, :-1],
                ds_zarr[batch * batch_size:(batch + 1) * batch_size, -1],
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
Example #23
def main():
    with Timer("Time"):
        schema = {
            "image":
            Image(
                (None, None, 4),
                dtype="uint8",
                chunks=(1, 2048, 2048, 4),
                max_shape=(100000, 100000, 4),
            )
        }
        ds = hub.Dataset("./data/examples/big_image",
                         mode="w",
                         schema=schema,
                         shape=(10000, ))

        print(ds["image"].shape, ds["image"].dtype)

        ds["image", 3, 0:2048, 0:2048] = np.ones(
            (2048, 2048, 4), dtype="uint8")  # single chunk read/write
        print(ds._tensors["/image"].get_shape((3, )))
        ds.commit()
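Reading the region back also touches a single chunk; a sketch, assuming the dataset written by main() above:

# Re-open the dataset and time a one-chunk read.
ds = hub.Dataset("./data/examples/big_image", mode="r")
with Timer("single chunk read"):
    tile = ds["image", 3, 0:2048, 0:2048].compute()
print(tile.shape)  # (2048, 2048, 4)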
Example #24
def test():
    tv_cifar_ds = torchvision.datasets.CIFAR10(".", download=True)

    hub_cifar = HubAdapter2(tv_cifar_ds)

    pt2hb_ds = hub.Dataset.from_pytorch(hub_cifar, scheduler="threaded", workers=8)
    res_ds = pt2hb_ds.store("./data/test/cifar/train")
    hub_s3_ds = hub.Dataset(
        url="./data/test/cifar/train", cache=False, storage_cache=False
    )
    for key, value in hub_s3_ds._tensors.items():
        print(key, value.shape, value.chunks)
    hub_s3_ds = hub_s3_ds.to_pytorch()
    dl = torch.utils.data.DataLoader(hub_s3_ds, batch_size=10, num_workers=0)
    with Timer("Time"):
        counter = 0
        t0 = time()
        for i, b in enumerate(dl):
            x, y = b["label"], b["label"]
            counter += 100
            t1 = time()
            print(counter, f"dt: {t1 - t0}")
            t0 = t1
Example #25
def time_iter_hub_local_pytorch(
    dataset_info,
    batch_size=BATCH_SIZE,
    prefetch_factor=PREFETCH_SIZE,
    num_workers=NUM_WORKERS,
    process=None,
):
    mnist = prepare_torch_dataset(dataset_info)
    path = os.path.join(ROOT, "Hub_data", "torch")
    Dataset.from_pytorch(HubAdapter(mnist)).store(path)
    dset = Dataset(path, cache=False, storage_cache=False, mode="r")

    loader = torch.utils.data.DataLoader(
        dset.to_pytorch(),
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    with Timer("Hub (local) `.to_pytorch()`"):
        for image, label in loader:
            if process is not None:
                process(image, label)
Example #26
def benchmark(sample_size=100, width=1000, channels=4, dtype="int8"):
    numpy_arr = np.zeros((sample_size, width, width, channels), dtype=dtype)
    zarr_fs = zarr.zeros(
        (sample_size, width, width, channels),
        dtype=dtype,
        store=zarr.storage.FSStore("./data/test/array"),
        overwrite=True,
    )
    zarr_lmdb = zarr.zeros(
        (sample_size, width, width, channels),
        dtype=dtype,
        store=zarr.storage.LMDBStore("./data/test/array2"),
        overwrite=True,
    )

    my_schema = {
        "image": Tensor((width, width, channels), dtype,
                        (width, width, channels)),
    }

    ds_fs = hub.Dataset(
        "./data/test/test_pipeline_basic_3",
        mode="w",
        shape=(sample_size, ),
        schema=my_schema,
        cache=0,
    )

    ds_fs_cache = hub.Dataset(
        "./data/test/test_pipeline_basic_2",
        mode="w",
        shape=(sample_size, ),
        schema=my_schema,
    )
    if False:  # flip to True to also run the raw sequential-write benchmark
        print(
            f"~~~ Sequential write of {sample_size}x{width}x{width}x{channels} random arrays ~~~"
        )
        for name, arr in [
            ("Numpy", numpy_arr),
            ("Zarr FS", zarr_fs),
            ("Zarr LMDB", zarr_lmdb),
            ("Hub FS", ds_fs["image"]),
            ("Hub FS+Cache", ds_fs_cache["image"]),
        ]:
            with Timer(name):
                for i in range(sample_size):
                    arr[i] = (np.random.rand(width, width, channels) *
                              255).astype(dtype)

    print(
        f"~~~ Pipeline {sample_size}x{width}x{width}x{channels} random arrays ~~~"
    )
    for name, processes in [
        ("single", 1),
        ("processed", 10),
    ]:  # , ("ray", 10), ("green", 10), ("dask", 10)]:

        @hub.transform(schema=my_schema, scheduler=name, processes=processes)
        def my_transform(sample):
            return {
                "image":
                (np.random.rand(width, width, channels) * 255).astype(dtype),
            }

        with Timer(name):
            out_ds = my_transform(ds_fs)
            out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")
Example #27


if __name__ == "__main__":
    with Timer("Test Transform"):
        with Timer("test threaded"):
            test_threaded()

        with Timer("test pipeline"):
            test_pipeline()

        with Timer("test multiprocessing"):
            test_multiprocessing()

        with Timer("test Pipeline"):
            test_pipeline_basic()

        with Timer("test Pipeline Dynamic"):
            test_pipeline_dynamic()
Example #28
@pytest.mark.skipif(not pytorch_loaded(),
                    reason="requires pytorch to be loaded")
def test_to_pytorch_bug():
    ds = hub.Dataset("activeloop/mnist", mode="r")
    data = ds.to_pytorch()


@pytest.mark.skipif(not tensorflow_loaded(),
                    reason="requires tensorflow to be loaded")
def test_to_tensorflow_bug():
    ds = hub.Dataset("activeloop/coco_train")
    data = ds.to_tensorflow()


if __name__ == "__main__":
    with Timer("Test Converters"):
        with Timer("from MNIST"):
            test_from_tfds_mnist()

        with Timer("from COCO"):
            test_from_tfds_coco()

        with Timer("from TF"):
            test_from_tensorflow()

        with Timer("To From TF"):
            test_to_from_tensorflow()

        with Timer("To From PyTorch"):
            test_to_from_pytorch()
Example #29
def bench_pil_compression(times=REPEAT_TIMES):
    with Timer("PIL compression"):
        for i in range(times):
            b = BytesIO()
            IMG.save(b, format="png")
Example #30
import hub
from hub.utils import Timer
import tensorflow_datasets as tfds


def benchmark_coco(num=5):
    with tfds.testing.mock_data(num_examples=num):
        ds = hub.Dataset.from_tfds("coco", num=num)

        res_ds = ds.store(
            "./data/test_tfds/coco", length=num
        )  # mock data doesn't have length, so explicitly provided


if __name__ == "__main__":
    nums = [5, 100, 1000, 10000, 100000]
    for num in nums:
        with Timer("Coco " + str(num) + " samples"):
            benchmark_coco(num)