def test_dataset_name():
    """A dataset's name survives flush, and a rename persists across re-opens."""
    ds = Dataset(
        "./data/test_ds_name",
        shape=(10,),
        schema={"temp": "uint8"},
        name="my_dataset",
        mode="w",
    )
    ds.flush()
    assert ds.name == "my_dataset"

    # Rename through a second handle on the same store.
    reopened = Dataset("./data/test_ds_name")
    reopened.rename("my_dataset_2")
    assert reopened.name == "my_dataset_2"

    # A fresh handle must observe the persisted rename.
    assert Dataset("./data/test_ds_name").name == "my_dataset_2"
def test_dataset_copy_exception():
    """Copying onto a destination directory that is not empty must raise."""
    src = Dataset("./data/test_data_cp", shape=(100,), schema=simple_schema)
    dst = Dataset("./data/test_data_cp_2", shape=(100,), schema=simple_schema)
    for idx in range(100):
        src["num", idx] = idx
        dst["num", idx] = 2 * idx
    src.flush()
    dst.flush()

    # The target path already holds dst's data, so the copy is rejected.
    with pytest.raises(DirectoryNotEmptyException):
        src.copy("./data/test_data_cp_2")

    src.delete()
    dst.delete()
def test_dataset(url="./data/test/dataset", token=None, public=True):
    """End-to-end indexing checks: element views, slices, nested sub-schemas."""
    ds = Dataset(
        url, token=token, shape=(10000,), mode="w", schema=my_schema, public=public
    )

    # Single-row view, nested schema path written two different ways.
    row = ds[5]
    row["label/a", 50, 50] = 2
    assert row["label", 50, 50, "a"].numpy() == 2

    patch = np.ones((100, 150, 3), "uint8")
    ds["image", 5, 4, 100:200, 150:300, :] = patch
    assert (ds["image", 5, 4, 100:200, 150:300, :].numpy() == patch).all()

    # Writes are visible through nested slice views.
    ds["image", 8, 6, 500:550, 700:730] = np.ones((50, 30, 3))
    outer = ds[3:15]
    inner = outer[4:9]
    assert (
        inner["image", 1, 6, 500:550, 700:730].numpy() == np.ones((50, 30, 3))
    ).all()

    window = ds[5:7]
    fives = 5 * np.ones((2, 35, 20, 3))
    ds["image", 6, 3:5, 100:135, 700:720] = fives
    assert (window["image", 1, 3:5, 100:135, 700:720].numpy() == fives).all()

    # Whole-column assignment on a nested key.
    fours = 4 * np.ones((10000, 5, 3), "uint8")
    ds["label", "c"] = fours
    assert (ds["label/c"].numpy() == fours).all()

    ds["label", "c", 2, 4] = 6 * np.ones((3))
    assert (ds["label", "c"][1:3, 4][1].numpy() == 6 * np.ones((3))).all()

    ds.flush()

    # Leading-slash path syntax is equivalent to the bare key.
    tail = ds["/label", 5:15, "c"]
    tail[2:4, 4, :] = 98 * np.ones((2, 3))
    assert (ds[7:9, 4, "label", "/c"].numpy() == 98 * np.ones((2, 3))).all()

    # Chained sub-schema views share storage with the parent dataset.
    e = ds["label", 1:5]["d"]["e"]
    e[:] = 77 * np.ones((4, 5, 3))
    assert (e.numpy() == 77 * np.ones((4, 5, 3))).all()

    ds.close()
def test_dataset_schema_bug():
    """Re-opening a dataset with the identical schema must not raise."""
    flat = {"abc": Primitive("int32"), "def": "int64"}
    ds = Dataset("./data/schema_bug", schema=flat, shape=(100,))
    ds.flush()
    Dataset("./data/schema_bug", schema=flat, shape=(100,))

    nested = {
        "abc": "uint8",
        "def": {
            "ghi": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    ds = Dataset("./data/schema_bug_2", schema=nested, shape=(100,))
    ds.flush()
    Dataset("./data/schema_bug_2", schema=nested, shape=(100,))
def test_append_dataset():
    """append_shape must extend every tensor, in memory and after a reload."""
    url = "./data/test/model"
    ds = Dataset(
        schema={"first": Tensor(shape=(250, 300)), "second": "float"},
        shape=(100,),
        url=url,
        mode="w",
    )
    ds.append_shape(20)
    ds["first"][0] = np.ones((250, 300))
    assert len(ds) == 120

    def _check_shapes(handle):
        # Every tensor's leading dimension reflects the appended rows.
        assert handle["first"].shape[0] == 120
        assert handle["first", 5:10].shape[0] == 5
        assert handle["second"].shape[0] == 120

    _check_shapes(ds)
    ds.flush()
    _check_shapes(Dataset(url))
def test_minio_endpoint():
    """Round-trip writes and reads through an S3-compatible MinIO endpoint."""
    credentials = {
        "aws_access_key_id": os.getenv("ACTIVELOOP_MINIO_KEY"),
        "aws_secret_access_key": os.getenv("ACTIVELOOP_MINIO_SECRET_ACCESS_KEY"),
        "endpoint_url": "https://play.min.io:9000",
        "region": "us-east-1",
    }
    ds = Dataset(
        "s3://bucket/random_dataset",
        token=credentials,
        shape=(10,),
        schema={"abc": Tensor((100, 100, 3))},
        mode="w",
    )
    for row in range(10):
        ds["abc", row] = row * np.ones((100, 100, 3))
    ds.flush()
    for row in range(10):
        assert (ds["abc", row].compute() == row * np.ones((100, 100, 3))).all()
def main():
    """Example: create, fill, resize, commit, and reload a small dataset."""
    path = "./data/examples/new_api_intro2"
    ds = Dataset(
        path,
        shape=(10,),
        mode="w",
        schema={
            "image": Image(shape=(None, None), max_shape=(28, 28)),
            "label": ClassLabel(num_classes=10),
        },
    )
    print(len(ds))
    for idx in range(len(ds)):
        with Timer("writing single element"):
            ds["image", idx] = np.ones((28, 28), dtype="uint8")
            ds["label", idx] = 3

    # Grow the dataset; rows past the written range are uninitialized.
    ds.resize_shape(200)
    print(ds.shape)
    print(ds["label", 100:110].numpy())

    with Timer("Committing"):
        ds.flush()

    ds = Dataset(path)
    print(ds.schema)
    print(ds["image", 0].compute())
def main():
    """Example: write a tiny dataset to a hub tag and read it back."""
    # Tag format is {username}/{dataset_name}.
    tag = "davitb/basic11"

    ds = Dataset(
        tag,
        shape=(4,),
        mode="w+",
        schema={
            "image": schema.Tensor((512, 512), dtype="float"),
            "label": schema.Tensor((512, 512), dtype="float"),
        },
    )

    # Fill both tensors and persist.
    ds["image"][:] = np.ones((4, 512, 512))
    ds["label"][:] = np.ones((4, 512, 512))
    ds.flush()

    # Reload from the tag and inspect the first image.
    ds = Dataset(tag)
    print(ds["image"][0].compute())
def hub():
    """Example: build a labeled image dataset from local files via @transform."""
    from hub import Dataset, schema, transform
    from skimage.io import imread
    from pathlib import Path

    # NOTE(review): unused in this function — kept for parity with the original.
    USER = Path('rayos')
    dataset = './rayos/test'

    custom = {
        'image': schema.Image(shape=(None, None), dtype='uint8', max_shape=(512, 512)),
        'label': schema.ClassLabel(num_classes=2),
    }

    @transform(schema=custom)
    def load_transform(sample):
        # Read the image and derive the label from the filename's
        # second-to-last dot-separated component.
        image = imread(sample)
        label = int(sample.split('.')[-2])
        return {"image": image, "label": label}

    fnames = [
        r"C:\Users\Emc11\Dropbox\ん\エラティカ ニ\f0b2dbfa779195e0769a1ebaf7d22488.jpg",
        r"C:\Users\Emc11\Dropbox\ん\エラティカ 三\bfbf442331b996dcd3909080199df88d.jpg",
        r"C:\Users\Emc11\Dropbox\ん\エラティカ 三\90596a829d162455bd44759748b0e779.jpg",
        r"C:\Users\Emc11\Dropbox\ん\エラティカ ニ\5956d21f8b3ffa492669001f6be4d20c.jpg",
        r"C:\Users\Emc11\Dropbox\ん\エラティカ 三\8a360e1daa60742752da3a4ded7241fb.png",
        r"C:\Users\Emc11\Dropbox\ん\エラティカ ニ\c5504009cd88251533ea265b4fcf2ede.jpg",
    ]

    ds = Dataset(dataset, shape=(len(fnames), ), mode='w+', schema=custom)
    ds.flush()

    # Run the transform over the filenames and store the result.
    dase = load_transform(fnames)
    ds2 = dase.store(dataset)
    data = Dataset(dataset)
def upload(
    self,
    results,
    url: str,
    token: dict,
    progressbar: bool = True,
    public: bool = True,
):
    """Batchified upload of results.

    For each tensor batchify based on its chunk and upload. If tensor is
    dynamic then still upload element by element.

    Parameters
    ----------
    results: dict
        Output of the transform function: maps tensor key to the list of
        values for that tensor.
    url: str
        Destination path/URL the new dataset is written to.
    token: dict
        Credentials forwarded to Dataset for storage access.
    progressbar: bool
        NOTE(review): accepted for interface compatibility but currently
        unused in this implementation.
    public: bool, optional
        only applicable if using hub storage, ignored otherwise
        setting this to False allows only the user who created it to access
        the dataset and the dataset won't be visible in the visualizer to
        the public

    Returns
    ----------
    ds: hub.Dataset
        Uploaded dataset
    """
    # Row count comes from the first tensor's value list; an empty results
    # mapping produces an empty dataset.
    if not results:
        shape = (0,)
    else:
        shape = (len(next(iter(results.values()))),)

    ds = Dataset(
        url,
        mode="w",
        shape=shape,
        schema=self.schema,
        token=token,
        cache=False,
        public=public,
    )

    tasks = []
    for key, value in results.items():
        # Batch each tensor by its chunk size so every remote task writes
        # one whole chunk.
        length = ds[key].chunksize[0]
        value = get_value(value)
        value = str_to_int(value, ds.tokenizer)
        batched_values = batchify(value, length)
        chunk_ids = list(range(len(batched_values)))
        index_batched_values = list(zip(chunk_ids, batched_values))

        # Disable dynamicness during the bulk write; dynamic shapes are
        # restored below via set_dynamic_shapes.
        ds._tensors[f"/{key}"].disable_dynamicness()

        # FIX: the original rebound the `results` parameter here while the
        # for-loop was still iterating results.items() — use a distinct
        # name for the remote task handles instead.
        chunk_tasks = [
            self.upload_chunk.remote(el, key=key, ds=ds)
            for el in index_batched_values
        ]
        tasks.extend(chunk_tasks)

    task_results = ray.get(tasks)
    self.set_dynamic_shapes(task_results, ds)
    ds.flush()
    return ds