Example #1
def test_from_pytorch():
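    # Wrap a minimal map-style PyTorch Dataset, convert it with hub.Dataset.from_pytorch, and verify the stored values.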
    from torch.utils.data import Dataset

    class TestDataset(Dataset):
        def __init__(self, transform=None):
            self.transform = transform

        def __len__(self):
            return 12

        def __iter__(self):
            for i in range(len(self)):
                yield self[i]

        def __getitem__(self, idx):
            image = 5 * np.ones((256, 256, 3))
            landmarks = 7 * np.ones((10, 10, 10))
            named = "testing text labels"
            sample = {
                "data": {
                    "image": image,
                    "landmarks": landmarks
                },
                "labels": {
                    "named": named
                },
            }

            if self.transform:
                sample = self.transform(sample)
            return sample

    tds = TestDataset()
    with Timer("from_pytorch"):
        ds = hub.Dataset.from_pytorch(tds)

    ds = ds.store("./data/test_from_pytorch/test1")

    assert (ds["data", "image", 3].numpy() == 5 * np.ones((256, 256, 3))).all()
    assert (ds["data", "landmarks", 2].numpy() == 7 * np.ones(
        (10, 10, 10))).all()
    assert ds["labels", "named", 5].numpy() == "testing text labels"
Example #2
def time_iter_pytorch(
    dataset_info,
    batch_size=BATCH_SIZE,
    prefetch_factor=PREFETCH_SIZE,
    num_workers=NUM_WORKERS,
    process=None,
):
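    # Baseline: time iteration over a native PyTorch DataLoader reading locally prepared data.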
    dset = prepare_torch_dataset(dataset_info)

    loader = torch.utils.data.DataLoader(
        dset,
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    with Timer("PyTorch (local, native)"):
        for image, label in loader:
            if process is not None:
                process(image, label)
Example #3
def time_iter_pytorch(dataset_name="activeloop/mnist",
                      batch_size=1,
                      prefetch_factor=0,
                      process=None):

    dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r")
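    # Caching is disabled so the timing reflects raw remote reads.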

    loader = torch.utils.data.DataLoader(
        dset.to_pytorch(),
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=1,
    )

    with Timer(
            f"{dataset_name} PyTorch prefetch {prefetch_factor:03} in batches of {batch_size:03}"
    ):
        for idx, (image, label) in enumerate(loader):
            if process is not None:
                process(idx, image, label)
Example #4
def main():
    sample_count = 70000
    step = 10
    with Timer("Time"):

        ds = hub.Dataset(
            "./data/examples/mnist_upload_speed_benchmark",
            mode="w",
            schema=schema,
            shape=(sample_count, ),
            cache=2**26,
        )

        arr = (np.random.rand(step, 28, 28) * 100).astype("uint8")
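        # Reuse one pre-generated batch so the loop measures write throughput, not RNG cost.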

        for i in range(0, sample_count, step):
            # with Timer(f"Sample {i}"):
            ds["image", i:i + step] = arr

        ds.commit()
Example #5
def test_multiprocessing(sample_size=200,
                         width=100,
                         channels=4,
                         dtype="uint8"):

    my_schema = {
        "image":
        Image(
            (width, width, channels),
            dtype,
            (width, width, channels),
            chunks=(sample_size // 20),
            compressor="LZ4",
        ),
    }

    with Timer("multiprocesing"):

        @hub.transform(schema=my_schema, scheduler="threaded", workers=4)
        def my_transform(x):

            a = np.random.random((width, width, channels))
            for i in range(100):
                a *= np.random.random((width, width, channels))
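            # The multiplication loop above is throwaway work to keep each worker CPU-bound.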

            return {
                "image": (np.ones(
                    (width, width, channels), dtype=dtype) * 255),
            }

        ds = hub.Dataset(
            "./data/test/test_pipeline_basic_4",
            mode="w",
            shape=(sample_size, ),
            schema=my_schema,
            cache=2**26,
        )

        ds_t = my_transform(ds).store("./data/test/test_pipeline_basic_4")

    assert (ds_t["image", :].compute() == 255).all()
Example #6
def time_zarr(dataset, batch_size=1):
    ds = hub.Dataset(dataset)
    if os.path.exists(dataset.split("/")[1] + "_zarr"):
        ds_zarr = zarr.open(dataset.split("/")[1] + "_zarr")
    else:
        store = zarr.DirectoryStore(dataset.split("/")[1] + "_zarr")
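        # Lay out the zarr array as one flattened image per row, with the label appended as the last column.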
        shape = [
            ds["image"].shape[0],
            ds["image"].shape[1] * ds["image"].shape[2] * ds["image"].shape[3]
            + 1,
        ]
        ds_zarr = zarr.create((shape[0], shape[1]),
                              store=store,
                              chunks=(batch_size, None))
        for batch in range(ds.shape[0] // batch_size):
            ds_numpy = np.concatenate(
                (
                    ds["image", batch * batch_size:(batch + 1) *
                       batch_size].compute().reshape(batch_size, -1),
                    ds["label", batch * batch_size:(batch + 1) *
                       batch_size].compute().reshape(batch_size, -1),
                ),
                axis=1,
            )
            ds_zarr[batch * batch_size:(batch + 1) * batch_size] = ds_numpy

    assert isinstance(ds_zarr, zarr.core.Array)

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds_zarr[batch * batch_size:(batch + 1) * batch_size, :-1],
                ds_zarr[batch * batch_size:(batch + 1) * batch_size, -1],
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
Example #7
def time_iter_hub_wasabi_pytorch(
    dataset_info,
    batch_size=BATCH_SIZE,
    prefetch_factor=PREFETCH_SIZE,
    num_workers=NUM_WORKERS,
    process=None,
):
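    # Stream the Wasabi-hosted dataset through .to_pytorch() with caching disabled.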
    dset = Dataset(dataset_info["hub_name"],
                   cache=False,
                   storage_cache=False,
                   mode="r")
    loader = torch.utils.data.DataLoader(
        dset.to_pytorch(),
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    with Timer("Hub (remote - Wasabi) `.to_pytorch()`"):
        for image, label in loader:
            if process is not None:
                process(image, label)
Example #8
def main():
    with Timer("Time"):
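        # Declare images up to 100000x100000x4 stored in 2048x2048 chunks, so writing one 2048x2048 tile touches a single chunk.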
        schema = {
            "image":
            Image(
                (None, None, 4),
                dtype="uint8",
                chunks=(1, 2048, 2048, 4),
                max_shape=(100000, 100000, 4),
            )
        }
        ds = hub.Dataset("./data/examples/big_image",
                         mode="w",
                         schema=schema,
                         shape=(10000, ))

        print(ds["image"].shape, ds["image"].dtype)

        ds["image", 3, 0:2048, 0:2048] = np.ones(
            (2048, 2048, 4), dtype="uint8")  # single chunk read/write
        print(ds._tensors["/image"].get_shape((3, )))
        ds.commit()
Example #9
def test():
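    # Round-trip CIFAR10: torchvision -> Hub via from_pytorch, then back through .to_pytorch() into a DataLoader.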
    tv_cifar_ds = torchvision.datasets.CIFAR10(".", download=True)

    hub_cifar = HubAdapter2(tv_cifar_ds)

    pt2hb_ds = hub.Dataset.from_pytorch(hub_cifar, scheduler="threaded", workers=8)
    res_ds = pt2hb_ds.store("./data/test/cifar/train")
    hub_s3_ds = hub.Dataset(
        url="./data/test/cifar/train", cache=False, storage_cache=False
    )
    for key, value in hub_s3_ds._tensors.items():
        print(key, value.shape, value.chunks)
    hub_s3_ds = hub_s3_ds.to_pytorch()
    dl = torch.utils.data.DataLoader(hub_s3_ds, batch_size=10, num_workers=0)
    with Timer("Time"):
        counter = 0
        t0 = time()
        for i, b in enumerate(dl):
            x, y = b["label"], b["label"]
            counter += 100
            t1 = time()
            print(counter, f"dt: {t1 - t0}")
            t0 = t1
Example #10
def time_iter_hub_local_pytorch(
    dataset_info,
    batch_size=BATCH_SIZE,
    prefetch_factor=PREFETCH_SIZE,
    num_workers=NUM_WORKERS,
    process=None,
):
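    # Convert the dataset into a local Hub store once, then time iteration through .to_pytorch().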
    mnist = prepare_torch_dataset(dataset_info)
    path = os.path.join(ROOT, "Hub_data", "torch")
    Dataset.from_pytorch(HubAdapter(mnist)).store(path)
    dset = Dataset(path, cache=False, storage_cache=False, mode="r")

    loader = torch.utils.data.DataLoader(
        dset.to_pytorch(),
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    with Timer("Hub (local) `.to_pytorch()`"):
        for image, label in loader:
            if process is not None:
                process(image, label)
Example #11
    ]:  # , ("ray", 10), ("green", 10), ("dask", 10)]:

        @hub.transform(schema=my_schema, scheduler=name, processes=processes)
        def my_transform(sample):
            return {
                "image":
                (np.random.rand(width, width, channels) * 255).astype(dtype),
            }

        with Timer(name):
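            # Store each run under a scheduler-specific path so outputs don't collide.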
            out_ds = my_transform(ds_fs)
            out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")


if __name__ == "__main__":
    with Timer("Test Transform"):
        with Timer("test threaded"):
            test_threaded()

        with Timer("test pipeline"):
            test_pipeline()

        with Timer("test multiprocessing"):
            test_multiprocessing()

        with Timer("test Pipeline"):
            test_pipeline_basic()

        with Timer("test Pipeline Dynamic"):
            test_pipeline_dynamic()
Example #12
@pytest.mark.skipif(not pytorch_loaded(),
                    reason="requires pytorch to be loaded")
def test_to_pytorch_bug():
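    # Regression check: .to_pytorch() on a read-only remote dataset should not raise.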
    ds = hub.Dataset("activeloop/mnist", mode="r")
    data = ds.to_pytorch()


@pytest.mark.skipif(not pytorch_loaded(),
                    reason="requires pytorch to be loaded")
def test_to_tensorflow_bug():
    ds = hub.Dataset("activeloop/coco_train")
    data = ds.to_tensorflow()


if __name__ == "__main__":
    with Timer("Test Converters"):
        with Timer("from MNIST"):
            test_from_tfds_mnist()

        with Timer("from COCO"):
            test_from_tfds_coco()

        with Timer("from TF"):
            test_from_tensorflow()

        with Timer("To From TF"):
            test_to_from_tensorflow()

        with Timer("To From PyTorch"):
            test_to_from_pytorch()
Example #13
def bench_pil_compression(times=REPEAT_TIMES):
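    # Encode IMG to PNG into an in-memory buffer so only compression time is measured.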
    with Timer("PIL compression"):
        for i in range(times):
            b = BytesIO()
            IMG.save(b, format="png")
Example #14
import hub
from hub.utils import Timer
from hub import dev_mode

dev_mode()

if __name__ == "__main__":
    # path = "s3://snark-test/coco_dataset"
    path = "./data/test/coco"
    with Timer("Eurosat TFDS"):
        out_ds = hub.Dataset.from_tfds("coco", num=1000)

        res_ds = out_ds.store(path)
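        # Reload from the same path to confirm the dataset round-trips.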
        ds = hub.load(path)
Example #15
def benchmark(sample_size=100, width=1000, channels=4, dtype="int8"):
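    # Set up NumPy, zarr (FS and LMDB), and Hub datasets as write targets, then time transform pipelines per scheduler.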
    numpy_arr = np.zeros((sample_size, width, width, channels), dtype=dtype)
    zarr_fs = zarr.zeros(
        (sample_size, width, width, channels),
        dtype=dtype,
        store=zarr.storage.FSStore("./data/test/array"),
        overwrite=True,
    )
    zarr_lmdb = zarr.zeros(
        (sample_size, width, width, channels),
        dtype=dtype,
        store=zarr.storage.LMDBStore("./data/test/array2"),
        overwrite=True,
    )

    my_schema = {
        "image": Tensor((width, width, channels), dtype,
                        (width, width, channels)),
    }

    ds_fs = hub.Dataset(
        "./data/test/test_pipeline_basic_3",
        mode="w",
        shape=(sample_size, ),
        schema=my_schema,
        cache=0,
    )

    ds_fs_cache = hub.Dataset(
        "./data/test/test_pipeline_basic_2",
        mode="w",
        shape=(sample_size, ),
        schema=my_schema,
    )
    if False:  # set to True to also run the sequential-write comparison
        print(
            f"~~~ Sequential write of {sample_size}x{width}x{width}x{channels} random arrays ~~~"
        )
        for name, arr in [
            ("Numpy", numpy_arr),
            ("Zarr FS", zarr_fs),
            ("Zarr LMDB", zarr_lmdb),
            ("Hub FS", ds_fs["image"]),
            ("Hub FS+Cache", ds_fs_cache["image"]),
        ]:
            with Timer(name):
                for i in range(sample_size):
                    arr[i] = (np.random.rand(width, width, channels) *
                              255).astype(dtype)

    print(
        f"~~~ Pipeline {sample_size}x{width}x{width}x{channels} random arrays ~~~"
    )
    for name, processes in [
        ("single", 1),
        ("processed", 10),
    ]:  # , ("ray", 10), ("green", 10), ("dask", 10)]:

        @hub.transform(schema=my_schema, scheduler=name, processes=processes)
        def my_transform(sample):
            return {
                "image":
                (np.random.rand(width, width, channels) * 255).astype(dtype),
            }

        with Timer(name):
            out_ds = my_transform(ds_fs)
            out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")
Example #16
import hub
from hub.utils import Timer
import tensorflow_datasets as tfds


def benchmark_coco(num=5):
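    # tfds mock data fabricates examples in-process, so the benchmark excludes download time.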
    with tfds.testing.mock_data(num_examples=num):
        ds = hub.Dataset.from_tfds("coco", num=num)

        res_ds = ds.store(
            "./data/test_tfds/coco", length=num
        )  # mock data doesn't have length, so explicitly provided


if __name__ == "__main__":
    nums = [5, 100, 1000, 10000, 100000]
    for num in nums:
        with Timer("Coco " + str(num) + " samples"):
            benchmark_coco(num)
Example #17
def time_random_access(
    dataset_name="activeloop/mnist", offset=1000, span=1000, field="image"
):
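    # Time an uncached random-access read of span samples starting at offset.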
    dset = Dataset(dataset_name, cache=False, storage_cache=False)
    with Timer(f"{dataset_name} read at offset {offset:03} of length {span:03}"):
        dset[field][offset : offset + span].compute()