def test_text_dataset_tokenizer():
    schema = {
        "names": Text(shape=(None,), max_shape=(1000,), dtype="int64"),
    }
    ds = Dataset(
        "./data/test/testing_text",
        mode="w",
        schema=schema,
        shape=(10,),
        tokenizer=True,
    )
    text = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod "
        "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim "
        "veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea "
        "commodo consequat. Duis aute irure dolor in reprehenderit in voluptate "
        "velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint "
        "occaecat cupidatat non proident, sunt in culpa qui officia deserunt "
        "mollit anim id est laborum."
    )
    ds["names", 4] = text + " 4"
    assert ds["names", 4].numpy() == text + " 4"
    ds["names"][5] = text + " 5"
    assert ds["names"][5].numpy() == text + " 5"
    dsv = ds[7:9]
    dsv["names", 0] = text + " 7"
    assert dsv["names", 0].numpy() == text + " 7"
    dsv["names"][1] = text + " 8"
    assert dsv["names"][1].numpy() == text + " 8"

def test_dataset_view_lazy():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/dsv_lazy"
    ds = Dataset(schema=dt, shape=(4,), url=url, mode="w")
    ds["text", 3] = "hello world"
    ds["second", 2] = 3.14
    ds["first", 2] = np.array([5, 6])

    # dsv starts at ds row 2, so dsv index 0 is ds row 2 and index 1 is row 3.
    dsv = ds[2:]
    dsv.disable_lazy()
    assert dsv["text", 1] == "hello world"
    assert dsv["second", 0] == 3.14
    assert (dsv["first", 0] == np.array([5, 6])).all()
    dsv.enable_lazy()
    assert dsv["text", 1].compute() == "hello world"
    assert dsv["second", 0].compute() == 3.14
    assert (dsv["first", 0].compute() == np.array([5, 6])).all()

def test_dataset(url="./data/test/dataset", token=None, public=True):
    ds = Dataset(
        url, token=token, shape=(10000,), mode="w", schema=my_schema, public=public
    )

    sds = ds[5]
    sds["label/a", 50, 50] = 2
    assert sds["label", 50, 50, "a"].numpy() == 2

    ds["image", 5, 4, 100:200, 150:300, :] = np.ones((100, 150, 3), "uint8")
    assert (
        ds["image", 5, 4, 100:200, 150:300, :].numpy()
        == np.ones((100, 150, 3), "uint8")
    ).all()

    ds["image", 8, 6, 500:550, 700:730] = np.ones((50, 30, 3))
    subds = ds[3:15]
    subsubds = subds[4:9]
    assert (
        subsubds["image", 1, 6, 500:550, 700:730].numpy() == np.ones((50, 30, 3))
    ).all()

    subds = ds[5:7]
    ds["image", 6, 3:5, 100:135, 700:720] = 5 * np.ones((2, 35, 20, 3))
    assert (
        subds["image", 1, 3:5, 100:135, 700:720].numpy() == 5 * np.ones((2, 35, 20, 3))
    ).all()

    ds["label", "c"] = 4 * np.ones((10000, 5, 3), "uint8")
    assert (ds["label/c"].numpy() == 4 * np.ones((10000, 5, 3), "uint8")).all()

    ds["label", "c", 2, 4] = 6 * np.ones((3,))
    sds = ds["label", "c"]
    ssds = sds[1:3, 4]
    sssds = ssds[1]
    assert (sssds.numpy() == 6 * np.ones((3,))).all()
    ds.save()

    sds = ds["/label", 5:15, "c"]
    sds[2:4, 4, :] = 98 * np.ones((2, 3))
    assert (ds[7:9, 4, "label", "/c"].numpy() == 98 * np.ones((2, 3))).all()

    labels = ds["label", 1:5]
    d = labels["d"]
    e = d["e"]
    e[:] = 77 * np.ones((4, 5, 3))
    assert (e.numpy() == 77 * np.ones((4, 5, 3))).all()
    ds.close()

def test_dataset_enter_exit():
    with Dataset(
        "./data/test/dataset", token=None, shape=(10000,), mode="w", schema=my_schema
    ) as ds:
        sds = ds[5]
        sds["label/a", 50, 50] = 2
        assert sds["label", 50, 50, "a"].numpy() == 2

        ds["image", 5, 4, 100:200, 150:300, :] = np.ones((100, 150, 3), "uint8")
        assert (
            ds["image", 5, 4, 100:200, 150:300, :].numpy()
            == np.ones((100, 150, 3), "uint8")
        ).all()

        ds["image", 8, 6, 500:550, 700:730] = np.ones((50, 30, 3))
        subds = ds[3:15]
        subsubds = subds[4:9]
        assert (
            subsubds["image", 1, 6, 500:550, 700:730].numpy() == np.ones((50, 30, 3))
        ).all()

def test_dataset_view_compute():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/dsv_compute"
    ds = Dataset(schema=dt, shape=(4,), url=url, mode="w")
    ds["text", 3] = "hello world"
    ds["second", 2] = 3.14
    ds["first", 2] = np.array([5, 6])

    # The view starts at row 2, so comp[0] is ds row 2 and comp[1] is ds row 3.
    dsv = ds[2:]
    comp = dsv.compute()

    comp0 = comp[0]
    assert (comp0["first"] == np.array([5, 6])).all()
    assert comp0["second"] == 3.14
    assert comp0["text"] == ""

    comp1 = comp[1]
    assert (comp1["first"] == np.array([0, 0])).all()
    assert comp1["second"] == 0
    assert comp1["text"] == "hello world"

def main():
    schema = {
        "image": Image(shape=(None, None), max_shape=(28, 28)),
        "label": ClassLabel(num_classes=10),
    }
    path = "./data/examples/new_api_intro2"

    ds = Dataset(path, shape=(10,), mode="w", schema=schema)
    print(len(ds))
    for i in range(len(ds)):
        with Timer("writing single element"):
            ds["image", i] = np.ones((28, 28), dtype="uint8")
            ds["label", i] = 3

    ds.resize_shape(200)
    print(ds.shape)
    print(ds["label", 100:110].numpy())
    with Timer("Committing"):
        ds.flush()

    ds = Dataset(path)
    print(ds.schema)
    print(ds["image", 0].compute())

def test_pickleability(url="./data/test/test_dataset_dynamic_shaped"):
    schema = {
        "first": Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100,),
        )
    }
    ds = Dataset(
        url=url,
        token=None,
        shape=(1000,),
        mode="w",
        schema=schema,
    )
    ds["first"][0] = np.ones((10, 10))

    # Round-trip the dataset through (cloud)pickle and check the data survives.
    pickled_ds = cloudpickle.dumps(ds)
    new_ds = pickle.loads(pickled_ds)
    assert np.all(new_ds["first"][0].compute() == ds["first"][0].compute())

def test_sharded_dataset():
    dt = {"first": "float", "second": "float"}
    datasets = [
        Dataset(schema=dt, shape=(10,), url=f"./data/test/test_dataset/{i}", mode="w")
        for i in range(4)
    ]
    ds = ShardedDatasetView(datasets)

    ds[0]["first"] = 2.3
    assert ds[0]["second"].numpy() != 2.3
    assert ds[30]["first"].numpy() == 0
    assert len(ds) == 40
    assert ds.shape == (40,)
    assert type(ds.schema) == SchemaDict
    assert ds.__repr__() == "ShardedDatasetView(shape=(40,))"

    ds[4, "first"] = 3
    for _ in ds:  # smoke-test iteration across all shards
        pass

    # With no shards at all, identify_shard(5) falls back to (0, 0).
    ds2 = ShardedDatasetView([])
    assert ds2.identify_shard(5) == (0, 0)

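# A minimal sketch of the index arithmetic exercised above, assuming
# `identify_shard` resolves a global index by walking cumulative shard
# lengths. The helper below is illustrative only, not the library's code.
def _locate_in_shards(index, shard_lengths):
    """Map a global index to (shard_id, local_index). Hypothetical helper."""
    offset = 0
    for shard_id, length in enumerate(shard_lengths):
        if index < offset + length:
            return shard_id, index - offset
        offset += length
    raise IndexError(index)


# e.g. _locate_in_shards(30, [10, 10, 10, 10]) == (3, 0), matching ds[30]
# above landing in the fourth (untouched, all-zero) shard.
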
def test_datasetview_get_dictionary():
    ds = Dataset(
        schema=my_schema,
        shape=(20,),
        url="./data/test/datasetview_get_dictionary",
        mode="w",
    )
    ds["label", 5, "a"] = 5 * np.ones((100, 200))
    ds["label", 5, "d", "e"] = 3 * np.ones((5, 3))
    dsv = ds[2:10]
    dsv.disable_lazy()
    dic = dsv[3, "label"]
    assert (dic["a"] == 5 * np.ones((100, 200))).all()
    assert (dic["d"]["e"] == 3 * np.ones((5, 3))).all()
    dsv.enable_lazy()

    ds["label", "a"] = 9 * np.ones((20, 100, 200))
    ds["label", "d", "e"] = 11 * np.ones((20, 5, 3))
    dic2 = dsv["label"]
    assert (dic2["a"].compute() == 9 * np.ones((8, 100, 200))).all()
    assert (dic2["d"]["e"].compute() == 11 * np.ones((8, 5, 3))).all()

    dic3 = ds["label"]
    assert (dic3["a"].compute() == 9 * np.ones((20, 100, 200))).all()
    assert (dic3["d"]["e"].compute() == 11 * np.ones((20, 5, 3))).all()

def test_dataset_assign_value():
    schema = {"text": Text(shape=(None,), dtype="int64", max_shape=(10,))}
    url = "./data/test/text_data"
    ds = Dataset(schema=schema, shape=(7,), url=url, mode="w")
    slice_ = slice(0, 5, None)
    key = "text"
    # Assignment accepts a mixed batch of numpy string arrays and plain strings.
    batch = [
        np.array("THTMLY2F9"),
        np.array("QUUVEU2IU"),
        np.array("8ZUFCYWKD"),
        "H9EDFAGHB",
        "WDLDYN6XG",
    ]
    ds[key, slice_] = batch
    ds[key][5] = np.array("GHLSGBFF8")
    ds[key][6] = "YGFJN75NF"
    assert ds["text", 0].compute() == "THTMLY2F9"
    assert ds["text", 1].compute() == "QUUVEU2IU"
    assert ds["text", 2].compute() == "8ZUFCYWKD"
    assert ds["text", 3].compute() == "H9EDFAGHB"
    assert ds["text", 4].compute() == "WDLDYN6XG"
    assert ds["text", 5].compute() == "GHLSGBFF8"
    assert ds["text", 6].compute() == "YGFJN75NF"

def test_sharded_dataset_with_views():
    schema = {"first": "float", "second": "float"}
    ds = Dataset("./data/test_sharded_ds", shape=(10,), schema=schema, mode="w")
    for i in range(10):
        ds[i, "first"] = i
        ds[i, "second"] = 2 * i + 1

    dsv = ds[3:5]
    dsv2 = ds[1]
    dsv3 = ds[8:]
    # Shards: ds[3:5] (len 2), ds (len 10), ds[1] (len 1), ds[8:] (len 2).
    datasets = [dsv, ds, dsv2, dsv3]
    sharded_ds = ShardedDatasetView(datasets)

    for i in range(2):
        assert sharded_ds[i, "first"].compute() == i + 3
        assert sharded_ds[i, "second"].compute() == 2 * (i + 3) + 1
    for i in range(2, 12):
        assert sharded_ds[i, "first"].compute() == i - 2
        assert sharded_ds[i, "second"].compute() == 2 * (i - 2) + 1
    assert sharded_ds[12, "first"].compute() == 1
    assert sharded_ds[12, "second"].compute() == 3
    for i in range(13, 15):
        assert sharded_ds[i, "first"].compute() == i - 5
        assert sharded_ds[i, "second"].compute() == 2 * (i - 5) + 1

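# A hedged cross-check of the boundary arithmetic behind the assertions above:
# with shard lengths [2, 10, 1, 2], the cumulative offsets are [0, 2, 12, 13, 15],
# so global indexes 0-1 hit the first shard, 2-11 the second, 12 the third, and
# 13-14 the last. Illustrative only; the helper name is hypothetical.
def _cumulative_offsets(lengths=(2, 10, 1, 2)):
    offsets = [0]
    for n in lengths:
        offsets.append(offsets[-1] + n)
    return offsets  # -> [0, 2, 12, 13, 15]
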
def test_dataset_dynamic_shaped():
    schema = {
        "first": Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100,),
        )
    }
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(1000,),
        mode="w",
        schema=schema,
    )

    ds["first", 50, 50:60, 50:60] = np.ones((10, 10), "int32")
    assert (ds["first", 50, 50:60, 50:60].numpy() == np.ones((10, 10), "int32")).all()

    ds["first", 0, :10, :10] = np.ones((10, 10), "int32")
    ds["first", 0, 10:20, 10:20] = 5 * np.ones((10, 10), "int32")
    assert (ds["first", 0, 0:10, 0:10].numpy() == np.ones((10, 10), "int32")).all()

def test_dataset_dynamic_shaped_slicing():
    schema = {
        "first": Tensor(
            shape=(None, None),
            dtype="int32",
            max_shape=(100, 100),
            chunks=(100,),
        )
    }
    ds = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(100,),
        mode="w",
        schema=schema,
    )

    # Each row i holds an (i, i) array, so the slice returns variably shaped items.
    for i in range(100):
        ds["first", i] = i * np.ones((i, i))
    items = ds["first", 0:100].compute()
    for i in range(100):
        assert (items[i] == i * np.ones((i, i))).all()

    assert (ds["first", 1:2].compute()[0] == np.ones((1, 1))).all()

def main():
    # The tag has the form {username}/{dataset_name}
    tag = "davitb/basic11"

    # Create the dataset
    ds = Dataset(
        tag,
        shape=(4,),
        schema={
            "image": schema.Tensor((512, 512), dtype="float"),
            "label": schema.Tensor((512, 512), dtype="float"),
        },
    )

    # Upload the data
    ds["image"][:] = np.ones((4, 512, 512))
    ds["label"][:] = np.ones((4, 512, 512))
    ds.commit()

    # Load the data back
    ds = Dataset(tag)
    print(ds["image"][0].compute())

def test_dataset_copy_gcs_s3():
    ds = Dataset(
        "s3://snark-test/cp_original_ds_s3_2_a", shape=(100,), schema=simple_schema
    )
    DS2_PATH = "gcs://snark-test/cp_copy_dataset_gcs_2_a"
    DS3_PATH = "s3://snark-test/cp_copy_ds_s3_3_a"
    for i in range(100):
        ds["num", i] = 2 * i

    # If a previous run left a dataset behind, delete it and retry the copy.
    try:
        ds2 = ds.copy(DS2_PATH)
    except Exception:
        dsi = Dataset(DS2_PATH)
        dsi.delete()
        ds2 = ds.copy(DS2_PATH)

    try:
        ds3 = ds2.copy(DS3_PATH)
    except Exception:
        dsi = Dataset(DS3_PATH)
        dsi.delete()
        ds3 = ds2.copy(DS3_PATH)

    for i in range(100):
        assert ds2["num", i].compute() == 2 * i
        assert ds3["num", i].compute() == 2 * i
    ds.delete()
    ds2.delete()
    ds3.delete()

def test_dataset_copy_hub_local():
    password = os.getenv("ACTIVELOOP_HUB_PASSWORD")
    login_fn("testingacc", password)
    ds = Dataset("testingacc/cp_original_ds_hub_1", shape=(100,), schema=simple_schema)
    DS2_PATH = "./data/testing/cp_copy_ds_local_5"
    DS3_PATH = "testingacc/cp_copy_dataset_testing_2"
    for i in range(100):
        ds["num", i] = 2 * i

    # If a previous run left a dataset behind, delete it and retry the copy.
    try:
        ds2 = ds.copy(DS2_PATH)
    except Exception:
        dsi = Dataset(DS2_PATH)
        dsi.delete()
        ds2 = ds.copy(DS2_PATH)

    try:
        ds3 = ds2.copy(DS3_PATH)
    except Exception:
        dsi = Dataset(DS3_PATH)
        dsi.delete()
        ds3 = ds2.copy(DS3_PATH)

    for i in range(100):
        assert ds2["num", i].compute() == 2 * i
        assert ds3["num", i].compute() == 2 * i
    ds.delete()
    ds2.delete()
    ds3.delete()

def test_dataset_copy_azure_local():
    token = {"account_key": os.getenv("ACCOUNT_KEY")}
    ds = Dataset(
        "https://activeloop.blob.core.windows.net/activeloop-hub/cp_original_test_ds_azure_1",
        token=token,
        shape=(100,),
        schema=simple_schema,
    )
    DS2_PATH = "./data/testing/cp_copy_ds_local_4"
    DS3_PATH = (
        "https://activeloop.blob.core.windows.net/activeloop-hub/cp_copy_test_ds_azure_2"
    )
    for i in range(100):
        ds["num", i] = 2 * i

    # If a previous run left a dataset behind, delete it and retry the copy.
    try:
        ds2 = ds.copy(DS2_PATH)
    except Exception:
        dsi = Dataset(DS2_PATH)
        dsi.delete()
        ds2 = ds.copy(DS2_PATH)

    try:
        ds3 = ds2.copy(DS3_PATH, token=token)
    except Exception:
        dsi = Dataset(DS3_PATH, token=token)
        dsi.delete()
        ds3 = ds2.copy(DS3_PATH, token=token)

    for i in range(100):
        assert ds2["num", i].compute() == 2 * i
        assert ds3["num", i].compute() == 2 * i
    ds.delete()
    ds2.delete()
    ds3.delete()

def benchmark_iterate_hub_tensorflow_setup(dataset_name, batch_size, prefetch_factor):
    dset = Dataset(dataset_name, cache=False, storage_cache=False, mode="r")
    loader = dset.to_tensorflow().batch(batch_size).prefetch(prefetch_factor)
    return (loader,)

def test_dataset_copy_s3_local():
    ds = Dataset(
        "./data/testing/cp_original_data_local", shape=(100,), schema=simple_schema
    )
    DS2_PATH = "s3://snark-test/cp_copy_data_s3_1_a"
    DS3_PATH = "./data/testing/cp_copy_data_local_1"
    for i in range(100):
        ds["num", i] = 2 * i

    # If a previous run left a dataset behind, delete it and retry the copy.
    try:
        ds2 = ds.copy(DS2_PATH)
    except Exception:
        dsi = Dataset(DS2_PATH)
        dsi.delete()
        ds2 = ds.copy(DS2_PATH)

    try:
        ds3 = ds2.copy(DS3_PATH)
    except Exception:
        dsi = Dataset(DS3_PATH)
        dsi.delete()
        ds3 = ds2.copy(DS3_PATH)

    for i in range(100):
        assert ds2["num", i].compute() == 2 * i
        assert ds3["num", i].compute() == 2 * i
    ds.delete()
    ds2.delete()
    ds3.delete()

def test_dataset2():
    dt = {"first": "float", "second": "float"}
    ds = Dataset(schema=dt, shape=(2,), url="./data/test/test_dataset2", mode="w")

    ds["first"][0] = 2.3
    assert ds["second"][0].numpy() != 2.3

def test_dataset_append_and_read():
    dt = {"first": "float", "second": "float"}
    # Start from a clean directory (create it if missing, then remove it).
    os.makedirs("./data/test/test_dataset_append_and_read", exist_ok=True)
    shutil.rmtree("./data/test/test_dataset_append_and_read")

    ds = Dataset(
        schema=dt,
        shape=(2,),
        url="./data/test/test_dataset_append_and_read",
        mode="a",
    )

    ds["first"][0] = 2.3
    ds.meta_information["description"] = "This is my description"
    assert ds.meta_information["description"] == "This is my description"
    assert ds["second"][0].numpy() != 2.3
    ds.close()

    ds = Dataset(
        url="./data/test/test_dataset_append_and_read",
        mode="r",
    )
    assert ds.meta_information["description"] == "This is my description"
    ds.meta_information["hello"] = 5
    ds.delete()
    ds.close()

def test_dataset_change_schema():
    schema = {
        "abc": "uint8",
        "def": {
            "ghi": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    ds = Dataset("./data/test_schema_change", schema=schema, shape=(100,))

    # Each variant below differs from the original schema in exactly one way
    # and must be rejected when reopening the dataset.
    new_schema_1 = {  # changed shape of "ghi"
        "abc": "uint8",
        "def": {
            "ghi": Tensor((200, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    new_schema_2 = {  # renamed "abc" to "abrs"
        "abrs": "uint8",
        "def": {
            "ghi": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    new_schema_3 = {  # renamed "ghi" to "ghijk"
        "abc": "uint8",
        "def": {
            "ghijk": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    new_schema_4 = {  # changed dtype of "abc"
        "abc": "uint16",
        "def": {
            "ghi": Tensor((100, 100)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    new_schema_5 = {  # changed rank of "ghi"
        "abc": "uint8",
        "def": {
            "ghi": Tensor((100, 100, 3)),
            "rst": Tensor((100, 100, 100)),
        },
    }
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_1, shape=(100,))
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_2, shape=(100,))
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_3, shape=(100,))
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_4, shape=(100,))
    with pytest.raises(SchemaMismatchException):
        ds = Dataset("./data/test_schema_change", schema=new_schema_5, shape=(100,))

import numpy as np
import pytest

from hub import Dataset
from hub.api.datasetview import TensorView
from hub.exceptions import NoneValueException
from hub.schema import Tensor

my_schema = {
    "image": Tensor((None, None, None, None), "uint8", max_shape=(10, 1920, 1080, 4)),
    "label": float,
}

ds = Dataset("./data/test/dataset", shape=(100,), mode="w", schema=my_schema)


def test_tensorview_init():
    with pytest.raises(NoneValueException):
        tensorview_object = TensorView(ds, subpath=None)
    with pytest.raises(NoneValueException):
        tensorview_object_2 = TensorView(dataset=None, subpath="image")


def test_tensorview_getitem():
    images_tensorview = ds["image"]
    with pytest.raises(IndexError):
        # A string where an integer index is expected raises IndexError.
        images_tensorview["7", 0:1920, 0:1080, 0:3].compute()

def test_dataset_batch_write_2():
    schema = {"image": Image(shape=(None, None, 3), max_shape=(640, 640, 3))}
    ds = Dataset("./data/batch", shape=(100,), mode="w", schema=schema)

    ds["image", 0:14] = [np.ones((640 - i, 640, 3)) for i in range(14)]

def test_dataset_bug_2(url="./data/test/dataset", token=None):
    my_schema = {
        "image": Tensor((100, 100), "uint8"),
    }
    ds = Dataset(url, token=token, shape=(10000,), mode="w", schema=my_schema)
    ds["image", 0:1] = [np.zeros((100, 100))]

def test_dataset_filter_2():
    my_schema = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
    }
    ds = Dataset("./data/tests/filtering", shape=(100,), schema=my_schema, mode="w")
    for i in range(100):
        ds["fname", i] = "John"
        ds["lname", i] = "Doe"

    for i in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", i] = "Active"

    for i in [15, 31, 25, 75, 3, 6]:
        ds["lname", i] = "loop"

    dsv_combined = ds.filter(
        lambda x: x["fname"].compute() == "Active" and x["lname"].compute() == "loop"
    )
    tsv_combined_fname = dsv_combined["fname"]
    tsv_combined_lname = dsv_combined["lname"]
    for item in dsv_combined:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    for item in tsv_combined_fname:
        assert item.compute() == "Active"
    for item in tsv_combined_lname:
        assert item.compute() == "loop"

    dsv_1 = ds.filter(lambda x: x["fname"].compute() == "Active")
    dsv_2 = dsv_1.filter(lambda x: x["lname"].compute() == "loop")
    for item in dsv_1:
        assert item.compute()["fname"] == "Active"
    tsv_1 = dsv_1["fname"]
    tsv_2 = dsv_2["lname"]
    for item in tsv_1:
        assert item.compute() == "Active"
    for item in tsv_2:
        assert item.compute() == "loop"
    for item in dsv_2:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    assert dsv_combined.indexes == [3, 6, 15, 75]
    assert dsv_1.indexes == [1, 3, 6, 15, 63, 75, 96]
    assert dsv_2.indexes == [3, 6, 15, 75]

    # Filtering in the opposite order must yield the same combined indexes.
    dsv_3 = ds.filter(lambda x: x["lname"].compute() == "loop")
    dsv_4 = dsv_3.filter(lambda x: x["fname"].compute() == "Active")
    for item in dsv_3:
        assert item.compute()["lname"] == "loop"
    for item in dsv_4:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    assert dsv_3.indexes == [3, 6, 15, 25, 31, 75]
    assert dsv_4.indexes == [3, 6, 15, 75]

    my_schema2 = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
        "image": Image((1920, 1080, 3)),
    }
    ds = Dataset("./data/tests/filtering2", shape=(100,), schema=my_schema2, mode="w")
    with pytest.raises(KeyError):
        ds.filter(lambda x: (x["random"].compute() == np.ones((1920, 1080, 3))).all())
    for i in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", i] = "Active"
    dsv = ds.filter(lambda x: x["fname"].compute() == "Active")
    with pytest.raises(KeyError):
        dsv.filter(lambda x: (x["random"].compute() == np.ones((1920, 1080, 3))).all())

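# Illustrative cross-check (not part of the original test): the expected
# combined index lists asserted above are simply the set intersection of the
# rows assigned "Active" and the rows assigned "loop".
def _expected_combined_indexes():
    active = {1, 3, 6, 15, 63, 96, 75}
    loop = {15, 31, 25, 75, 3, 6}
    return sorted(active & loop)  # -> [3, 6, 15, 75]
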
def test_dataset_copy_exception():
    ds = Dataset("./data/test_data_cp", shape=(100,), schema=simple_schema)
    DS_PATH = "./data/test_data_cp_2"
    ds2 = Dataset(DS_PATH, shape=(100,), schema=simple_schema)
    for i in range(100):
        ds["num", i] = i
        ds2["num", i] = 2 * i
    ds.flush()
    ds2.flush()

    # Copying into a directory that already holds a dataset must fail.
    with pytest.raises(DirectoryNotEmptyException):
        ds3 = ds.copy(DS_PATH)
    ds.delete()
    ds2.delete()

def upload(
    self,
    results,
    url: str,
    token: dict,
    progressbar: bool = True,
    public: bool = True,
):
    """Batchified upload of results.

    For each tensor, batchify based on its chunk size and upload. If the
    tensor is dynamic, it is still uploaded element by element.

    Parameters
    ----------
    results:
        Output of the transform function
    url: str
        Path where the dataset should be written
    token: dict
        Credentials used to access the dataset
    progressbar: bool
    public: bool, optional
        Only applicable if using hub storage, ignored otherwise. Setting this
        to False allows only the user who created it to access the dataset,
        and the dataset won't be visible in the visualizer to the public.

    Returns
    -------
    ds: hub.Dataset
        Uploaded dataset
    """
    if len(list(results.values())) == 0:
        shape = (0,)
    else:
        shape = (len(list(results.values())[0]),)

    ds = Dataset(
        url,
        mode="w",
        shape=shape,
        schema=self.schema,
        token=token,
        cache=False,
        public=public,
    )

    tasks = []
    for key, value in results.items():
        length = ds[key].chunksize[0]
        value = get_value(value)
        value = str_to_int(value, ds.tokenizer)
        batched_values = batchify(value, length)
        chunk_id = list(range(len(batched_values)))
        index_batched_values = list(zip(chunk_id, batched_values))

        ds._tensors[f"/{key}"].disable_dynamicness()

        # Schedule one remote upload per (chunk_id, batch) pair; use a new
        # name rather than rebinding `results`, which is still being iterated.
        chunk_tasks = [
            self.upload_chunk.remote(el, key=key, ds=ds)
            for el in index_batched_values
        ]
        tasks.extend(chunk_tasks)

    results = ray.get(tasks)
    self.set_dynamic_shapes(results, ds)
    ds.commit()
    return ds

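# For illustration: `batchify` above is assumed to split a sequence into
# consecutive chunk-sized batches, roughly like this hypothetical sketch
# (not the library's implementation):
def _batchify_sketch(seq, batch_size):
    # Split seq into consecutive slices of at most batch_size elements.
    return [seq[i : i + batch_size] for i in range(0, len(seq), batch_size)]


# e.g. _batchify_sketch(list(range(5)), 2) -> [[0, 1], [2, 3], [4]]
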
def time_random_access(
    dataset_name="activeloop/mnist", offset=1000, span=1000, field="image"
):
    dset = Dataset(dataset_name, cache=False, storage_cache=False)
    with Timer(f"{dataset_name} read at offset {offset:03} of length {span:03}"):
        dset[field][offset : offset + span].compute()