def write_hub(arr, path, overwrite=True):
    """Write a hub dataset to disk"""
    if os.path.exists(path) and os.path.isdir(path) and overwrite:
        shutil.rmtree(path)
    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))
    if arr.ndim == 1:
        schema = {"value": hub.schema.Tensor(arr.shape[0])}
        dataset = hub.Dataset(path, shape=(1, ), schema=schema, mode='w')
        dataset["value", 0][:] = arr.astype(np.float32)
        dataset.flush()
        dataset.close()
    elif arr.ndim == 2:
        schema = {"value": hub.schema.Tensor(arr.shape[1])}
        dataset = hub.Dataset(path, shape=(arr.shape[0], ), schema=schema, mode='w')
        dataset["value"][:] = arr.astype(np.float32)
        dataset.flush()
        dataset.close()
    else:
        raise ValueError("hub backend only supports 1D or 2D arrays")
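# A minimal usage sketch for write_hub. The helper name, path, and array below
# are illustrative only; it assumes numpy (np), hub, os, and shutil are imported
# at module level as the function above requires.
def _example_write_hub():
    embeddings = np.random.rand(100, 64)  # 100 rows of 64-dim float vectors
    write_hub(embeddings, "./data/examples/embeddings_hub")
    # read the dataset back; each row was stored under the "value" key as float32
    ds = hub.Dataset("./data/examples/embeddings_hub", mode="r")
    first_row = ds["value", 0].compute()
    print(first_row.shape, first_row.dtype)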
def get_dataset_from_hub(samples=1, read_from_fs=False, pytorch=False):
    """ Build dataset and transform to pytorch or tensorflow """
    my_schema = {"img": Tensor(shape=(3, 256, 256)), "label": "uint8"}
    if not read_from_fs:
        ds = hub.Dataset(
            "kristina/benchmarking",
            shape=(samples, ),
            schema=my_schema,
            cache=False,
        )
    else:
        ds = hub.Dataset(
            "s3://snark-test/benchmarking",
            shape=(samples, ),
            schema=my_schema,
            cache=False,
        )
    for i in range(samples):
        ds["img", i] = np.random.rand(3, 256, 256)
        ds["label", i] = 0
    ds_hub = ds.to_pytorch() if pytorch else ds.to_tensorflow()
    ds = MyDataset(ds_hub)
    return ds
def create_large_dataset():
    sample_count = 60  # increase this to a larger number to stress-test

    # Decide the schema of the dataset
    schema = {
        "image": Tensor((1920, 1080, 3), chunks=(2, 1920, 1080, 3), dtype="float64")
    }
    array = np.random.random((10, 1920, 1080, 3))

    # Write the dataset
    ds = hub.Dataset(
        "./data/examples/large_dataset_build",
        shape=(sample_count, ),
        schema=schema,
    )
    for i in range(len(ds) // 10):
        ds["image", i * 10:i * 10 + 10] = i * array
    ds.commit()

    ds = hub.Dataset("./data/examples/large_dataset_build")
    print(ds.keys, ds["image"].shape, ds["image"].dtype)

    # Read the dataset back
    with hub.Dataset("./data/examples/large_dataset_build") as ds:
        for i in range(len(ds) // 10):
            assert (ds["image", i * 10, 0, 0, 0].compute() / array[0, 0, 0, 0]) == i
def infer_dataset(path, scheduler="single", workers=1):
    # TODO: handle s3 path
    if not os.path.isdir(path):
        raise Exception("input path must be a directory")

    hub_path = os.path.join("./", path, "hub")
    if os.path.isdir(hub_path):
        print('inferred dataset found in "%s", using that' % hub_path)
        return hub.Dataset(hub_path, mode="r")

    root = _find_root(path)

    ds = None
    directory_parsers = state.get_parsers()
    if len(directory_parsers) <= 0:
        raise Exception("directory parsers list was empty.")

    # go through all functions created using the `directory_parser` decorator in
    # `hub.schema.auto.directory_parsers`
    for parser in directory_parsers:
        ds = parser(root, scheduler, workers)
        if ds is not None:
            break

    if ds is None:
        raise Exception(
            'could not infer dataset for the root "%s". either add a new parser to ' % root
            + "`hub.schema.auto.directory_parsers` or write a custom transform + schema."
        )

    ds.store(hub_path)  # TODO: handle s3
    return hub.Dataset(hub_path, mode="r")
def test_read_mode():
    my_schema = {"abc": "uint8"}
    ds = hub.Dataset("./data/test_versioning/read_ds", schema=my_schema, shape=(10, ))
    ds.checkout("second", create=True)
    ds2 = hub.Dataset("./data/test_versioning/read_ds", mode="r")
    with pytest.raises(ReadModeException):
        ds2.commit("first")
    with pytest.raises(ReadModeException):
        ds2.checkout("third", create=True)
    with pytest.raises(ReadModeException):
        ds2["abc", 4] = 10
def test_stacked_transform():
    schema = {"test": Tensor((2, 2), dtype="uint8")}

    @hub.transform(schema=schema)
    def multiply_transform(sample, multiplier=1, times=1):
        if times == 1:
            return {"test": multiplier * sample["test"]}
        else:
            return [{"test": multiplier * sample["test"]} for i in range(times)]

    @hub.transform(schema=schema)
    def multiply_transform_2(sample, multiplier=1, times=1):
        if times == 1:
            return {"test": multiplier * sample["test"]}
        else:
            return [{"test": multiplier * sample["test"]} for i in range(times)]

    ds = hub.Dataset("./data/stacked_transform", mode="w", shape=(5, ), schema=schema)
    for i in range(5):
        ds["test", i] = np.ones((2, 2))

    ds1 = multiply_transform(ds, multiplier=2, times=5)
    ds2 = multiply_transform(ds1, multiplier=3, times=2)
    ds3 = multiply_transform_2(ds2, multiplier=5, times=3)
    ds4 = ds3.store("./data/stacked_transform_2")
    assert len(ds4) == 150
    assert (ds4["test", 0].compute() == 30 * np.ones((2, 2))).all()
def time_tiledb(dataset, batch_size=1):
    ds = hub.Dataset(dataset)
    if os.path.exists(dataset.split("/")[1] + "_tileDB"):
        ds_tldb = tiledb.open(dataset.split("/")[1] + "_tileDB")
    else:
        if not os.path.exists(dataset.split("/")[1] + "_tileDB"):
            os.makedirs(dataset.split("/")[1] + "_tileDB")
        ds_numpy = np.concatenate(
            (
                ds["image"].compute().reshape(ds.shape[0], -1),
                ds["label"].compute().reshape(ds.shape[0], -1),
            ),
            axis=1,
        )
        ds_tldb = tiledb.from_numpy(dataset.split("/")[1] + "_tileDB", ds_numpy)

    assert type(ds_tldb) == tiledb.array.DenseArray

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, :-1],
                ds_tldb[batch * batch_size:(batch + 1) * batch_size, -1],
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
def test_pipeline_basic():
    ds = hub.Dataset("./data/test/test_pipeline_basic",
                     mode="w",
                     shape=(100, ),
                     schema=my_schema)
    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence", i] = 0.2

    @hub.transform(schema=my_schema)
    def my_transform(sample, multiplier: int = 2):
        return {
            "image": sample["image"] * multiplier,
            "label": sample["label"],
            "confidence": sample["confidence"] * multiplier,
        }

    out_ds = my_transform(ds, multiplier=2)
    assert (out_ds["image", 0].compute() == 2).all()
    assert len(list(out_ds)) == 100

    res_ds = out_ds.store("./data/test/test_pipeline_basic_output")
    assert res_ds["label", 5].compute() == "hello 5"
    assert (res_ds["image", 4].compute() == 2 * np.ones(
        (28, 28, 4), dtype="int32")).all()
    assert len(res_ds) == len(out_ds)
    assert res_ds.shape[0] == out_ds.shape[0]
    assert "image" in res_ds.schema.dict_ and "label" in res_ds.schema.dict_
def test_pipeline_ray():
    ds = hub.Dataset(
        "./data/test/test_pipeline_basic",
        mode="w",
        shape=(100, ),
        schema=my_schema,
        cache=False,
    )
    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence/confidence", i] = 0.2

    @hub.transform(schema=my_schema, scheduler="ray")
    def my_transform(sample, multiplier: int = 2):
        return {
            "image": sample["image"] * multiplier,
            "label": sample["label"],
            "confidence": {
                "confidence": sample["confidence"]["confidence"] * multiplier
            },
        }

    out_ds = my_transform(ds, multiplier=2)
    assert (out_ds["image", 0].compute() == 2).all()
    assert len(list(out_ds)) == 100
    out_ds.store("./data/test/test_pipeline_basic_output")
def test_checkout_address_not_found():
    my_schema = {"abc": "uint8"}
    ds = hub.Dataset("./data/test_versioning/ds_address",
                     schema=my_schema,
                     shape=(10, ))
    with pytest.raises(AddressNotFound):
        ds.checkout("second")
def test_pipeline():
    ds = hub.Dataset("./data/test/test_pipeline_multiple2",
                     mode="w",
                     shape=(100, ),
                     schema=my_schema)
    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence", i] = 0.2

    with Timer("multiple pipes"):

        @hub.transform(schema=my_schema)
        def my_transform(sample, multiplier: int = 2):
            return {
                "image": sample["image"] * multiplier,
                "label": sample["label"],
                "confidence": sample["confidence"] * multiplier,
            }

        out_ds = my_transform(ds, multiplier=2)
        out_ds = my_transform(out_ds, multiplier=2)
        out_ds = out_ds.store("./data/test/test_pipeline_multiple_4")
        assert (out_ds["image", 0].compute() == 4).all()
def example_to_pytorch():
    ds = hub.Dataset("activeloop/fashion_mnist_train")
    torch_ds = ds.to_pytorch(output_type=list)
    torch_dataloader = torch.utils.data.DataLoader(
        torch_ds,
        batch_size=8,
    )
    return torch_dataloader
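# A minimal sketch of consuming the loader returned by example_to_pytorch.
# The helper name is illustrative; it assumes torch and hub are importable and
# the "activeloop/fashion_mnist_train" dataset is reachable. With
# output_type=list, each batch is expected to be a list of batched tensors,
# one entry per schema key.
def _example_iterate_fashion_mnist():
    dataloader = example_to_pytorch()
    for batch in dataloader:
        print(len(batch), [t.shape for t in batch])
        break  # inspect a single batch only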
def __iter__(self):
    if self.dataset is None:
        self.dataset = hub.Dataset(self.path, self.storage)
    for x in self._enumerate(self.dataset):
        x = self.transform(x)
        yield (*list(x), )
def test_dataset_with_objects():
    schema = {"images": Tensor(shape=(10,), dtype="object", chunks=(5,))}
    ds = hub.Dataset(
        "./data/test/test_dataset_with_objects", mode="w", shape=(100,), schema=schema
    )
    ds["images", 6, 5] = np.ones((20, 30, 4), dtype="uint8")
    ds.close()
def test_old_datasets():
    ds = hub.Dataset("activeloop/mnist")
    with pytest.raises(VersioningNotSupportedException):
        ds.checkout("third")
    with pytest.raises(VersioningNotSupportedException):
        ds.checkout("third", create=True)
    with pytest.raises(VersioningNotSupportedException):
        ds.log()
def generate_dataset(shape=(10,), size=(1024, 1024), chunksize=None):
    """ Generates a dataset with random tensors """
    # `shape` is the dataset shape (number of samples), `size` is the per-sample
    # tensor shape stored under the "img" key
    my_schema = {"img": Tensor(shape=size, chunks=chunksize)}
    ds = hub.Dataset("kristina/benchmarking", shape=shape, schema=my_schema)
    for i in range(shape[0]):
        ds["img", i] = np.random.rand(*size)
    return ds
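# Usage sketch for generate_dataset under the assumption stated above: `shape`
# is the number of samples and `size` the per-sample tensor shape. The helper
# name and the small sizes are illustrative only.
def _example_generate_dataset():
    ds = generate_dataset(shape=(4,), size=(256, 256))
    sample = ds["img", 0].compute()
    print(sample.shape)  # expected: (256, 256)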
def get_hub_dataset():
    schema = hub.schema.SchemaDict({
        'text': hub.schema.Tensor(shape=(None, ), dtype='int64', max_shape=(2049, ))
    })
    ds = hub.Dataset("snsi/pile_train0", schema=schema, shape=(100000, )).to_pytorch()
    # ds = hub.Dataset("interneuron/pile_train0", shape=(None,)).to_pytorch()
    return HubAdapter(ds)
def test_commit_checkout():
    my_schema = {"img": hub.schema.Tensor((1000, 1000, 3))}
    ds = hub.Dataset("./data/eg_1", shape=(10, ), schema=my_schema, mode="w")

    for i in range(10):
        ds["img", i] = np.ones((1000, 1000, 3))
    first_commit_id = ds.commit("stored all ones")

    for i in range(5):
        ds["img", i] = ds["img", i].compute() * 2
    second_commit_id = ds.commit("multiplied value of some images by 2")
    assert (ds["img", 4].compute() == 2 * np.ones((1000, 1000, 3))).all()

    ds.checkout(first_commit_id)  # now all images are ones again
    for i in range(10):
        assert (ds["img", i].compute() == np.ones((1000, 1000, 3))).all()

    # create a new branch, as we are currently not on the head of master
    ds.checkout("alternate", create=True)
    for i in range(5):
        ds["img", i] = ds["img", i].compute() * 3
    # had we not checked out to the "alternate" branch earlier, this commit would
    # auto-checkout to a new branch
    ds.commit("multiplied value of some images by 3")
    assert (ds["img", 4].compute() == 3 * np.ones((1000, 1000, 3))).all()

    ds.checkout(second_commit_id)  # first 5 images are 2s, rest are 1s now
    for i in range(5, 10):
        ds["img", i] = ds["img", i].compute() * 2
    # we are not at the head of master but rather at an earlier commit, so this
    # commit automatically checks us out to a new branch; this happens any time
    # we try to commit when we are not at the head of the branch
    ds.commit("multiplied value of remaining images by 2")
    for i in range(10):
        assert (ds["img", i].compute() == 2 * np.ones((1000, 1000, 3))).all()

    ds.checkout("alternate")
    for i in range(5, 10):
        ds["img", i] = ds["img", i].compute() * 3
    for i in range(10):
        assert (ds["img", i].compute() == 3 * np.ones((1000, 1000, 3))).all()
    # we are already at the head of alternate, so this does not check us out to a
    # new branch; the commit lands on the alternate branch itself
    ds.commit("multiplied value of remaining images by 3")
def test_hub_open():
    ds = hub.Dataset("./data/test/hub_open",
                     token=None,
                     shape=(10000, ),
                     mode="w",
                     schema=schema)
    ds["label/a", 5, 50, 50] = 9
    assert ds["label/a", 5, 50, 50].numpy() == 9
    ds["image", 5, 4, 120:200, 150:300, :] = 3 * np.ones((80, 150, 3), "uint8")
    assert (ds["image", 5, 4, 120:200, 150:300, :].numpy() == 3 * np.ones(
        (80, 150, 3), "uint8")).all()
def test_auto_checkout_bug():
    my_schema = {"abc": "uint8"}
    ds = hub.Dataset("./data/test_versioning/branch_bug",
                     shape=(10, ),
                     schema=my_schema,
                     mode="w")
    ds["abc", 0] = 1
    a = ds.commit("it is 1")
    ds["abc", 0] = 2
    b = ds.commit("it is 2")
    c = ds.checkout(a)
    d = ds.checkout("other", True)
    ds["abc", 0] = 3
    e = ds.commit("it is 3")
    ds.checkout(b)
    ds["abc", 0] = 4
    f = ds.commit("it is 4")
    g = ds.checkout(a)
    dsv = ds[0:3]
    dsv["abc", 0] = 5
    h = ds.commit("it is 5")
    i = ds.checkout(e)
    tsv = ds[0:5, "abc"]
    tsv[0] = 6
    j = ds.commit("it is 6")
    ds.log()
    ds.checkout(a)
    assert dsv["abc", 0].compute() == 1
    assert ds["abc", 0].compute() == 1
    ds.checkout(b)
    assert ds["abc", 0].compute() == 2
    ds.checkout(c)
    assert ds["abc", 0].compute() == 1
    ds.checkout(d)
    assert ds["abc", 0].compute() == 3
    ds.checkout(e)
    assert ds["abc", 0].compute() == 3
    ds.checkout(f)
    assert ds["abc", 0].compute() == 4
    ds.checkout(g)
    assert ds["abc", 0].compute() == 1
    ds.checkout(h)
    assert ds["abc", 0].compute() == 5
    ds.checkout(i)
    assert ds["abc", 0].compute() == 3
    ds.checkout(j)
    assert ds["abc", 0].compute() == 6
    ds.checkout("master")
    assert ds["abc", 0].compute() == 2
    ds["abc", 0] = 7
    ds.checkout("copy", True)
    assert ds["abc", 0].compute() == 7
    ds.checkout("other")
    assert ds["abc", 0].compute() == 3
def time_hub(dataset, batch_size=1, num_batches=1, local=True, user=None):
    my_schema = {
        "image": hub.schema.Image(shape=(28, 28, 1), dtype="uint8"),
        "label": hub.schema.ClassLabel(num_classes=10),
    }
    if local is True:
        ds = hub.Dataset(
            "./" + dataset + "_hub",
            shape=(batch_size * num_batches, ),
            schema=my_schema,
            mode="w",
        )
    else:
        ds = hub.Dataset(
            user + "/" + dataset,
            shape=(batch_size * num_batches, ),
            schema=my_schema,
            mode="w",
        )
    assert type(ds) == hub.api.dataset.Dataset
    time_batches(ds, batch_size, num_batches, hub=True)
def bench_hub_compression(times=REPEAT_TIMES):
    arr = np.array(IMG)
    ds = hub.Dataset(
        "./data/bench_png_compression",
        mode="w",
        shape=times,
        schema={"image": hub.schema.Image(arr.shape, compressor="png")},
    )

    batch = np.zeros((times, ) + arr.shape, dtype="uint8")
    for i in range(times):
        batch[i] = arr

    with Timer("Hub compression"):
        ds["image", :times] = batch
def benchmark_compress_hub_setup(
        times, image_path="./images/compression_benchmark_image.png"):
    img = Image.open(image_path)
    arr = np.array(img)
    ds = hub.Dataset(
        "./data/bench_png_compression",
        mode="w",
        shape=times,
        schema={"image": hub.schema.Image(arr.shape, compressor="png")},
    )

    batch = np.zeros((times, ) + arr.shape, dtype="uint8")
    for i in range(times):
        batch[i] = arr

    return (ds, times, batch)
def bench_hub_compression(img_path=img_path, count=count):
    img = Image.open(img_path)
    arr = np.array(img)
    print(arr.shape)
    ds = hub.Dataset(
        "./data/benchmarks/bench_png_compression",
        mode="w",
        shape=count,
        schema={"image": hub.schema.Image(arr.shape, compressor="png")},
    )
    print(ds._tensors["/image"].chunks)

    bigarr = np.zeros((count, ) + arr.shape, dtype="uint8")
    for i in range(count):
        bigarr[i] = arr

    with Timer("Hub compression"):
        ds["image", :count] = bigarr
def test_threaded():
    init_schema = {
        "image": Tensor(shape=(None, None, None),
                        max_shape=(4, 224, 224),
                        dtype="float32")
    }
    schema = {
        "image": Tensor(shape=(None, None, None),
                        max_shape=(4, 224, 224),
                        dtype="float32"),
        "label": Tensor(shape=(None, ), max_shape=(6, ), dtype="uint8"),
        "text_label": Text((None, ), "int64", (14, )),
        "flight_code": Text((None, ), "int64", (10, )),
    }
    ds_init = hub.Dataset(
        "./data/hub/new_pipeline_threaded2",
        mode="w",
        shape=(10, ),
        schema=init_schema,
        cache=False,
    )
    for i in range(len(ds_init)):
        ds_init["image", i] = np.ones((4, 220, 224))
        ds_init["image", i] = np.ones((4, 221, 224))

    @hub.transform(schema=schema, scheduler="threaded", workers=2)
    def create_classification_dataset(sample):
        ts = sample["image"]
        return [{
            "image": ts,
            "label": np.ones((6, )),
            "text_label": "PLANTED",
            "flight_code": "UYKNTHNXR",
        } for _ in range(5)]

    ds = create_classification_dataset(ds_init).store(
        "./data/hub/new_pipeline_threaded_final")
    assert ds["image", 0].shape[1] == 221
def time_hub(dataset, batch_size=1):
    ds = hub.Dataset(dataset, cache=False, storage_cache=False, mode="r")
    assert type(ds) == hub.api.dataset.Dataset

    with Timer("Time"):
        counter = 0
        t0 = time()
        for batch in range(ds.shape[0] // batch_size):
            x, y = (
                ds[batch * batch_size:(batch + 1) * batch_size]["image"].compute(),
                ds[batch * batch_size:(batch + 1) * batch_size]["label"].compute(),
            )
            counter += 1
            t1 = time()
            print("Batch", counter, f"dt: {t1 - t0}")
            t0 = t1
def test_commit():
    my_schema = {"abc": "uint32"}
    ds = hub.Dataset("./data/test_versioning/eg_1",
                     shape=(10, ),
                     schema=my_schema,
                     mode="w")
    ds["abc", 0] = 1
    a = ds.commit("first")
    ds["abc", 0] = 2
    b = ds.commit("second")
    ds["abc", 0] = 3
    c = ds.commit("third")
    assert ds["abc", 0].compute() == 3
    ds.checkout(a)
    assert ds["abc", 0].compute() == 1
    ds.checkout(b)
    assert ds["abc", 0].compute() == 2
    ds.checkout(c)
    assert ds["abc", 0].compute() == 3
def test_commit_checkout_2():
    my_schema = {
        "abc": "uint32",
        "img": Image((1000, 1000, 3), dtype="uint16"),
    }
    ds = hub.Dataset("./data/test_versioning/eg_3",
                     shape=(100, ),
                     schema=my_schema,
                     mode="w")
    for i in range(100):
        ds["img", i] = i * np.ones((1000, 1000, 3))
    a = ds.commit("first")

    # chunk 7.0.0.0 gets rewritten
    ds["img", 21] = 2 * ds["img", 21].compute()

    # the rest of the chunk stays intact
    assert (ds["img", 21].compute() == 2 * 21 * np.ones((1000, 1000, 3))).all()
    assert (ds["img", 22].compute() == 22 * np.ones((1000, 1000, 3))).all()
    assert (ds["img", 23].compute() == 23 * np.ones((1000, 1000, 3))).all()

    # other chunks are still read from the original chunks; e.g. chunk 11,
    # which contains the 35th sample, has a single copy
    assert (ds["img", 35].compute() == 35 * np.ones((1000, 1000, 3))).all()

    b = ds.commit("second")

    # going back to the first commit
    ds.checkout(a)

    # sanity check
    assert (ds["img", 21].compute() == 21 * np.ones((1000, 1000, 3))).all()

    ds.checkout("another", create=True)
    ds["img", 21] = 3 * ds["img", 21].compute()
    # 3 * 21, not 6 * 21 as it would have been had we checked out from b
    assert (ds["img", 21].compute() == 3 * 21 * np.ones((1000, 1000, 3))).all()
    ds.commit("first2")

    ds.checkout("master")
    assert (ds["img", 21].compute() == 2 * 21 * np.ones((1000, 1000, 3))).all()
    ds.log()
def main(): ds = hub.Dataset("eurosat/eurosat-rgb") # 26000 samples in dataset, accessing values print(ds["image"][10].numpy()) print( ds["label", 15].numpy() ) # alternate way to access, by specifying both key and sample number at once print(ds["filename", 20:22].numpy()) # accessing multiple elements at once # Splitting into train and test sets train_ds = ds[:13000] test_ds = ds[13000:] # Using hub with tensorflow train_tf_ds = train_ds.to_tensorflow().batch(2) for batch in train_tf_ds: print(batch["label"], batch["filename"], batch["image"]) break test_tf_ds = test_ds.to_tensorflow().batch(2) for batch in test_tf_ds: print(batch["label"], batch["filename"], batch["image"]) break # Using hub with pytorch train_pt_ds = train_ds.to_pytorch() train_loader = torch.utils.data.DataLoader(train_pt_ds, batch_size=2) for batch in train_loader: print(batch["label"], batch["image"] ) # pytorch tensors don't support text labels such as filename break test_pt_ds = test_ds.to_pytorch() test_loader = torch.utils.data.DataLoader(test_pt_ds, batch_size=2) for batch in test_loader: print(batch["label"], batch["image"] ) # pytorch tensors don't support text labels such as filename break
def main():
    sample_count = 70000
    step = 10
    with Timer("Time"):
        ds = hub.Dataset(
            "./data/examples/mnist_upload_speed_benchmark",
            mode="w",
            schema=schema,
            shape=(sample_count, ),
            cache=2**26,
        )
        arr = (np.random.rand(step, 28, 28) * 100).astype("uint8")
        for i in range(0, sample_count, step):
            # with Timer(f"Sample {i}"):
            ds["image", i:i + step] = arr
        ds.commit()