def test_from_pytorch():
    from torch.utils.data import Dataset

    class TestDataset(Dataset):
        def __init__(self, transform=None):
            self.transform = transform

        def __len__(self):
            return 12

        def __iter__(self):
            for i in range(len(self)):
                yield self[i]

        def __getitem__(self, idx):
            image = 5 * np.ones((256, 256, 3))
            landmarks = 7 * np.ones((10, 10, 10))
            named = "testing text labels"
            sample = {
                "data": {"image": image, "landmarks": landmarks},
                "labels": {"named": named},
            }

            if self.transform:
                sample = self.transform(sample)
            return sample

    tds = TestDataset()
    with Timer("from_pytorch"):
        ds = hub.Dataset.from_pytorch(tds)
        ds = ds.store("./data/test_from_pytorch/test1")

    assert (ds["data", "image", 3].numpy() == 5 * np.ones((256, 256, 3))).all()
    assert (ds["data", "landmarks", 2].numpy() == 7 * np.ones((10, 10, 10))).all()
    assert ds["labels", "named", 5].numpy() == "testing text labels"
def time_iter_pytorch(
    dataset_info,
    batch_size=BATCH_SIZE,
    prefetch_factor=PREFETCH_SIZE,
    num_workers=NUM_WORKERS,
    process=None,
):
    dset = prepare_torch_dataset(dataset_info)
    loader = torch.utils.data.DataLoader(
        dset,
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    with Timer("PyTorch (local, native)"):
        for image, label in loader:
            if process is not None:
                process(image, label)
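# A minimal sketch of a `process` callback accepted by the timing helpers in this
# file. The body below is an assumption -- a real benchmark would plug in whatever
# per-batch work it wants included in the measurement (normalization, a device
# copy, etc.).
def example_process(image, label):
    # Touch the tensors so that any lazy loading is forced inside the timed loop.
    _ = image.float().mean()
    _ = label.shape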
def time_iter_pytorch(
    dataset_name="activeloop/mnist",
    batch_size=1,
    prefetch_factor=0,
    process=None,
):
    dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r")
    loader = torch.utils.data.DataLoader(
        dset.to_pytorch(),
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=1,
    )

    with Timer(
        f"{dataset_name} PyTorch prefetch {prefetch_factor:03} in batches of {batch_size:03}"
    ):
        for idx, (image, label) in enumerate(loader):
            if process is not None:
                process(idx, image, label)
def main():
    sample_count = 70000
    step = 10
    with Timer("Time"):
        ds = hub.Dataset(
            "./data/examples/mnist_upload_speed_benchmark",
            mode="w",
            schema=schema,
            shape=(sample_count,),
            cache=2 ** 26,
        )

        arr = (np.random.rand(step, 28, 28) * 100).astype("uint8")
        for i in range(0, sample_count, step):
            # with Timer(f"Sample {i}"):
            ds["image", i : i + step] = arr

        ds.commit()
def test_multiprocessing(sample_size=200, width=100, channels=4, dtype="uint8"):
    my_schema = {
        "image": Image(
            (width, width, channels),
            dtype,
            (width, width, channels),
            chunks=(sample_size // 20),
            compressor="LZ4",
        ),
    }

    with Timer("multiprocessing"):

        @hub.transform(schema=my_schema, scheduler="threaded", workers=4)
        def my_transform(x):
            # Simulate a CPU-bound workload; the intermediate result is discarded.
            a = np.random.random((width, width, channels))
            for i in range(100):
                a *= np.random.random((width, width, channels))

            return {
                "image": (np.ones((width, width, channels), dtype=dtype) * 255),
            }

        ds = hub.Dataset(
            "./data/test/test_pipeline_basic_4",
            mode="w",
            shape=(sample_size,),
            schema=my_schema,
            cache=2 ** 26,
        )

        ds_t = my_transform(ds).store("./data/test/test_pipeline_basic_4")

    assert (ds_t["image", :].compute() == 255).all()
def time_zarr(dataset, batch_size=1):
    ds = hub.Dataset(dataset)
    if os.path.exists(dataset.split("/")[1] + "_zarr"):
        ds_zarr = zarr.open(dataset.split("/")[1] + "_zarr")
    else:
        store = zarr.DirectoryStore(dataset.split("/")[1] + "_zarr")
        shape = [
            ds["image"].shape[0],
            ds["image"].shape[1] * ds["image"].shape[2] * ds["image"].shape[3] + 1,
        ]
        ds_zarr = zarr.create(
            (shape[0], shape[1]), store=store, chunks=(batch_size, None)
        )
        for batch in range(ds.shape[0] // batch_size):
            # Each row holds the flattened image with the label appended as the last column.
            ds_numpy = np.concatenate(
                (
                    ds["image", batch * batch_size : (batch + 1) * batch_size]
                    .compute()
                    .reshape(batch_size, -1),
                    ds["label", batch * batch_size : (batch + 1) * batch_size]
                    .compute()
                    .reshape(batch_size, -1),
                ),
                axis=1,
            )
            ds_zarr[batch * batch_size : (batch + 1) * batch_size] = ds_numpy

    assert type(ds_zarr) == zarr.core.Array

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds_zarr[batch * batch_size : (batch + 1) * batch_size, :-1],
                ds_zarr[batch * batch_size : (batch + 1) * batch_size, -1],
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
def time_iter_hub_wasabi_pytorch(
    dataset_info,
    batch_size=BATCH_SIZE,
    prefetch_factor=PREFETCH_SIZE,
    num_workers=NUM_WORKERS,
    process=None,
):
    dset = Dataset(
        dataset_info["hub_name"], cache=False, storage_cache=False, mode="r"
    )
    loader = torch.utils.data.DataLoader(
        dset.to_pytorch(),
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    with Timer("Hub (remote - Wasabi) `.to_pytorch()`"):
        for image, label in loader:
            if process is not None:
                process(image, label)
def main():
    with Timer("Time"):
        schema = {
            "image": Image(
                (None, None, 4),
                dtype="uint8",
                chunks=(1, 2048, 2048, 4),
                max_shape=(100000, 100000, 4),
            )
        }
        ds = hub.Dataset(
            "./data/examples/big_image", mode="w", schema=schema, shape=(10000,)
        )

        print(ds["image"].shape, ds["image"].dtype)

        ds["image", 3, 0:2048, 0:2048] = np.ones(
            (2048, 2048, 4), dtype="uint8"
        )  # single chunk read/write
        print(ds._tensors["/image"].get_shape((3,)))

        ds.commit()
def test():
    tv_cifar_ds = torchvision.datasets.CIFAR10(".", download=True)

    hub_cifar = HubAdapter2(tv_cifar_ds)
    pt2hb_ds = hub.Dataset.from_pytorch(hub_cifar, scheduler="threaded", workers=8)
    res_ds = pt2hb_ds.store("./data/test/cifar/train")

    hub_s3_ds = hub.Dataset(
        url="./data/test/cifar/train", cache=False, storage_cache=False
    )
    for key, value in hub_s3_ds._tensors.items():
        print(key, value.shape, value.chunks)

    hub_s3_ds = hub_s3_ds.to_pytorch()
    dl = torch.utils.data.DataLoader(hub_s3_ds, batch_size=10, num_workers=0)

    with Timer("Time"):
        counter = 0
        t0 = time()
        for i, b in enumerate(dl):
            x, y = b["label"], b["label"]
            counter += 100
            t1 = time()
            print(counter, f"dt: {t1 - t0}")
            t0 = t1
def time_iter_hub_local_pytorch(
    dataset_info,
    batch_size=BATCH_SIZE,
    prefetch_factor=PREFETCH_SIZE,
    num_workers=NUM_WORKERS,
    process=None,
):
    mnist = prepare_torch_dataset(dataset_info)
    path = os.path.join(ROOT, "Hub_data", "torch")
    Dataset.from_pytorch(HubAdapter(mnist)).store(path)
    dset = Dataset(path, cache=False, storage_cache=False, mode="r")

    loader = torch.utils.data.DataLoader(
        dset.to_pytorch(),
        batch_size=batch_size,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    with Timer("Hub (local) `.to_pytorch()`"):
        for image, label in loader:
            if process is not None:
                process(image, label)
    ]:  # , ("ray", 10), ("green", 10), ("dask", 10)]:

        @hub.transform(schema=my_schema, scheduler=name, processes=processes)
        def my_transform(sample):
            return {
                "image": (np.random.rand(width, width, channels) * 255).astype(dtype),
            }

        with Timer(name):
            out_ds = my_transform(ds_fs)
            out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")


if __name__ == "__main__":
    with Timer("Test Transform"):
        with Timer("test threaded"):
            test_threaded()
        with Timer("test pipeline"):
            test_pipeline()
        with Timer("test multiprocessing"):
            test_multiprocessing()
        with Timer("test Pipeline"):
            test_pipeline_basic()
        with Timer("test Pipeline Dynamic"):
            test_pipeline_dynamic()
@pytest.mark.skipif(not pytorch_loaded(), reason="requires pytorch to be loaded")
def test_to_pytorch_bug():
    ds = hub.Dataset("activeloop/mnist", mode="r")
    data = ds.to_pytorch()


@pytest.mark.skipif(
    not tensorflow_loaded(), reason="requires tensorflow to be loaded"
)
def test_to_tensorflow_bug():
    ds = hub.Dataset("activeloop/coco_train")
    data = ds.to_tensorflow()


if __name__ == "__main__":
    with Timer("Test Converters"):
        with Timer("from MNIST"):
            test_from_tfds_mnist()
        with Timer("from COCO"):
            test_from_tfds_coco()
        with Timer("from TF"):
            test_from_tensorflow()
        with Timer("To From TF"):
            test_to_from_tensorflow()
        with Timer("To From PyTorch"):
            test_to_from_pytorch()
def bench_pil_compression(times=REPEAT_TIMES):
    with Timer("PIL compression"):
        for i in range(times):
            b = BytesIO()
            IMG.save(b, format="png")
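# `IMG` and `REPEAT_TIMES` are module-level globals assumed to be defined before
# the function above (REPEAT_TIMES is used as a default argument). A hypothetical
# setup could look like this sketch; the image contents and repeat count are
# assumptions, not values from the original benchmark:
import numpy as np
from io import BytesIO
from PIL import Image

REPEAT_TIMES = 100  # assumed repeat count
IMG = Image.fromarray(
    np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8)
)  # hypothetical in-memory test image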
import hub
from hub.utils import Timer
from hub import dev_mode

dev_mode()

if __name__ == "__main__":
    # path = "s3://snark-test/coco_dataset"
    path = "./data/test/coco"

    with Timer("Coco TFDS"):
        out_ds = hub.Dataset.from_tfds("coco", num=1000)
        res_ds = out_ds.store(path)
        ds = hub.load(path)
def benchmark(sample_size=100, width=1000, channels=4, dtype="int8"):
    numpy_arr = np.zeros((sample_size, width, width, channels), dtype=dtype)
    zarr_fs = zarr.zeros(
        (sample_size, width, width, channels),
        dtype=dtype,
        store=zarr.storage.FSStore("./data/test/array"),
        overwrite=True,
    )
    zarr_lmdb = zarr.zeros(
        (sample_size, width, width, channels),
        dtype=dtype,
        store=zarr.storage.LMDBStore("./data/test/array2"),
        overwrite=True,
    )

    my_schema = {
        "image": Tensor((width, width, channels), dtype, (width, width, channels)),
    }

    ds_fs = hub.Dataset(
        "./data/test/test_pipeline_basic_3",
        mode="w",
        shape=(sample_size,),
        schema=my_schema,
        cache=0,
    )
    ds_fs_cache = hub.Dataset(
        "./data/test/test_pipeline_basic_2",
        mode="w",
        shape=(sample_size,),
        schema=my_schema,
    )

    if False:  # sequential write benchmark, currently disabled
        print(
            f"~~~ Sequential write of {sample_size}x{width}x{width}x{channels} random arrays ~~~"
        )
        for name, arr in [
            ("Numpy", numpy_arr),
            ("Zarr FS", zarr_fs),
            ("Zarr LMDB", zarr_lmdb),
            ("Hub FS", ds_fs["image"]),
            ("Hub FS+Cache", ds_fs_cache["image"]),
        ]:
            with Timer(name):
                for i in range(sample_size):
                    arr[i] = (np.random.rand(width, width, channels) * 255).astype(
                        dtype
                    )

    print(
        f"~~~ Pipeline {sample_size}x{width}x{width}x{channels} random arrays ~~~"
    )
    for name, processes in [
        ("single", 1),
        ("processed", 10),
    ]:  # , ("ray", 10), ("green", 10), ("dask", 10)]:

        @hub.transform(schema=my_schema, scheduler=name, processes=processes)
        def my_transform(sample):
            return {
                "image": (np.random.rand(width, width, channels) * 255).astype(dtype),
            }

        with Timer(name):
            out_ds = my_transform(ds_fs)
            out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")
import hub
from hub.utils import Timer

import tensorflow_datasets as tfds


def benchmark_coco(num=5):
    with tfds.testing.mock_data(num_examples=num):
        ds = hub.Dataset.from_tfds("coco", num=num)
        res_ds = ds.store(
            "./data/test_tfds/coco", length=num
        )  # mock data doesn't have length, so explicitly provided


if __name__ == "__main__":
    nums = [5, 100, 1000, 10000, 100000]
    for num in nums:
        with Timer("Coco " + str(num) + " samples"):
            benchmark_coco(num)
def time_random_access(
    dataset_name="activeloop/mnist", offset=1000, span=1000, field="image"
):
    dset = Dataset(dataset_name, cache=False, storage_cache=False)
    with Timer(f"{dataset_name} read at offset {offset:03} of length {span:03}"):
        dset[field][offset : offset + span].compute()
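# A minimal driver sketch (an assumption -- the original module may wire this up
# differently); it assumes "activeloop/mnist" is reachable with read access:
if __name__ == "__main__":
    for offset in (0, 1000, 10000):
        time_random_access("activeloop/mnist", offset=offset, span=1000)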