def test_text_dataset_tokenizer():
    schema = {
        "names": Text(shape=(None,), max_shape=(1000,), dtype="int64"),
    }
    ds = Dataset(
        "./data/test/testing_text",
        mode="w",
        schema=schema,
        shape=(10,),
        tokenizer=True,
    )
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    ds["names", 4] = text + " 4"
    assert ds["names", 4].numpy() == text + " 4"
    ds["names"][5] = text + " 5"
    assert ds["names"][5].numpy() == text + " 5"

    dsv = ds[7:9]
    dsv["names", 0] = text + " 7"
    assert dsv["names", 0].numpy() == text + " 7"
    dsv["names"][1] = text + " 8"
    assert dsv["names"][1].numpy() == text + " 8"

    schema2 = {
        "id": Text(shape=(4,), dtype="int64"),
    }
    ds2 = Dataset(
        "./data/test/testing_text_2",
        mode="w",
        schema=schema2,
        shape=(10,),
        tokenizer=True,
    )
    ds2[0:5, "id"] = ["abcd", "abcd", "abcd", "abcd", "abcd"]
    assert ds2[2:4, "id"].compute() == ["abcd", "abcd"]
def dict_to_hub(dic, path=""):
    """Recursively convert a (possibly nested) sample dict into a hub SchemaDict."""
    d = {}
    for k, v in dic.items():
        k = k.replace("/", "_")
        cur_path = path + "/" + k
        if isinstance(v, dict):
            d[k] = dict_to_hub(v, path=cur_path)
        else:
            value_shape = v.shape if hasattr(v, "shape") else ()
            if isinstance(v, torch.Tensor):
                v = v.numpy()
            shape = tuple(None for it in value_shape)
            max_shape = (
                max_dict[cur_path] or tuple(10000 for it in value_shape)
                if not isinstance(v, str)
                else (10000,)
            )
            dtype = v.dtype.name if hasattr(v, "dtype") else type(v)
            dtype = "int64" if isinstance(v, str) else dtype
            d[k] = (
                Tensor(shape=shape, dtype=dtype, max_shape=max_shape)
                if not isinstance(v, str)
                else Text(shape=(None,), dtype=dtype, max_shape=max_shape)
            )
    return SchemaDict(d)
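# Hypothetical usage sketch (not repo code): `max_dict`, referenced above, is
# assumed to be a module-level mapping from nested key paths ("/image", ...) to
# optional per-key max_shape overrides; a falsy entry falls back to the
# (10000, ...) default, and string leaves never consult it.
import torch  # dict_to_hub converts torch.Tensor leaves via .numpy()

max_dict = {"/image": (3, 512, 512)}  # hypothetical per-key override
schema = dict_to_hub({"image": torch.zeros(3, 224, 224), "caption": "a cat"})
# schema == SchemaDict({
#     "image": Tensor(shape=(None, None, None), dtype="float32",
#                     max_shape=(3, 512, 512)),
#     "caption": Text(shape=(None,), dtype="int64", max_shape=(10000,)),
# })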
def test_meta_information():
    description = {"author": "testing", "description": "here goes the testing text"}
    description_changed = {
        "author": "changed author",
        "description": "now it's changed",
    }
    schema = {"text": Text((None,), max_shape=(1000,))}
    ds = Dataset(
        "./data/test_meta",
        shape=(10,),
        schema=schema,
        meta_information=description,
        mode="w",
    )
    some_text = ["hello world", "hello penguin", "hi penguin"]
    for i, text in enumerate(some_text):
        ds["text", i] = text
    assert type(ds.meta["meta_info"]) == dict
    assert ds.meta["meta_info"]["author"] == "testing"
    assert ds.meta["meta_info"]["description"] == "here goes the testing text"
    ds.close()
def test_threaded():
    init_schema = {
        "image": Tensor(
            shape=(None, None, None), max_shape=(4, 224, 224), dtype="float32"
        )
    }
    schema = {
        "image": Tensor(
            shape=(None, None, None), max_shape=(4, 224, 224), dtype="float32"
        ),
        "label": Tensor(shape=(None,), max_shape=(6,), dtype="uint8"),
        "text_label": Text((None,), "int64", (14,)),
        "flight_code": Text((None,), "int64", (10,)),
    }
    ds_init = hub.Dataset(
        "./data/hub/new_pipeline_threaded2",
        mode="w",
        shape=(10,),
        schema=init_schema,
        cache=False,
    )
    for i in range(len(ds_init)):
        ds_init["image", i] = np.ones((4, 220, 224))
        ds_init["image", i] = np.ones((4, 221, 224))

    @hub.transform(schema=schema, scheduler="threaded", workers=2)
    def create_classification_dataset(sample):
        ts = sample["image"]
        return [
            {
                "image": ts,
                "label": np.ones((6,)),
                "text_label": "PLANTED",
                "flight_code": "UYKNTHNXR",
            }
            for _ in range(5)
        ]

    ds = create_classification_dataset(ds_init).store(
        "./data/hub/new_pipeline_threaded_final"
    )
    assert ds["image", 0].shape[1] == 221
def test_text():
    my_schema = {"text": Text((None,), max_shape=(10,))}

    @hub.transform(schema=my_schema)
    def my_transform(sample):
        return {"text": np.array("abc")}

    ds = my_transform([i for i in range(10)])
    ds2 = ds.store("./data/test/transform_text")
    for i in range(10):
        assert ds2["text", i].compute() == "abc"
def tensor_to_hub(tf_dt, max_shape=None):
    """Convert a TensorFlow tensor spec into the matching hub schema type."""
    if tf_dt.dtype.name == "string":
        # Text features are stored as int64; honor a caller-supplied max_shape
        # instead of the previously hard-coded (100000,).
        max_shape = max_shape or (100000,)
        return Text(shape=(None,), dtype="int64", max_shape=max_shape)
    dt = tf_dt.dtype.name
    if max_shape and len(max_shape) > len(tf_dt.shape):
        # Drop leading dims so max_shape has the same rank as the tensor shape.
        max_shape = max_shape[(len(max_shape) - len(tf_dt.shape)):]
    max_shape = max_shape or tuple(
        10000 if dim is None else dim for dim in tf_dt.shape
    )
    return Tensor(shape=tf_dt.shape, dtype=dt, max_shape=max_shape)
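# Hypothetical usage sketch: tensor_to_hub only needs an object exposing
# `.shape` and `.dtype.name` (e.g. a tf.TensorSpec); a tiny stand-in is used
# here so the example carries no TensorFlow dependency.
from collections import namedtuple

_DType = namedtuple("_DType", "name")
_Spec = namedtuple("_Spec", ["shape", "dtype"])

spec = _Spec(shape=(None, 224, 224), dtype=_DType(name="float32"))
print(tensor_to_hub(spec))
# -> Tensor(shape=(None, 224, 224), dtype="float32", max_shape=(10000, 224, 224))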
def test_datasetview_repr():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/dsv_repr"
    ds = Dataset(schema=dt, shape=(9,), url=url, mode="w", lazy=False)
    dsv = ds[2:]
    print_text = "DatasetView(Dataset(schema=SchemaDict({'first': Tensor(shape=(2,), dtype='float64'), 'second': 'float64', 'text': Text(shape=(None,), dtype='int64', max_shape=(12,))}), url='./data/test/dsv_repr', shape=(9,), mode='w'))"
    assert dsv.__repr__() == print_text
def test_text_dataset():
    schema = {
        "names": Text(shape=(None,), max_shape=(1000,), dtype="int64"),
    }
    ds = Dataset("./data/test/testing_text", mode="w", schema=schema, shape=(10,))
    text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    ds["names", 4] = text
    assert ds["names", 4].numpy() == text
def test_dataset_filter():
    def abc_filter(sample):
        return sample["ab"].compute().startswith("abc")

    my_schema = {"img": Tensor((100, 100)), "ab": Text((None,), max_shape=(10,))}
    ds = Dataset("./data/new_filter", shape=(10,), schema=my_schema)
    for i in range(10):
        ds["img", i] = i * np.ones((100, 100))
        ds["ab", i] = "abc" + str(i) if i % 2 == 0 else "def" + str(i)

    ds2 = ds.filter(abc_filter)
    assert ds2.indexes == [0, 2, 4, 6, 8]
def test_dataset_lazy():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/ds_lazy"
    ds = Dataset(schema=dt, shape=(2,), url=url, mode="w", lazy=False)
    ds["text", 1] = "hello world"
    ds["second", 0] = 3.14
    ds["first", 0] = np.array([5, 6])
    assert ds["text", 1] == "hello world"
    assert ds["second", 0] == 3.14
    assert (ds["first", 0] == np.array([5, 6])).all()
def test_dataset_setting_shape():
    schema = {"text": Text(shape=(None,), dtype="int64", max_shape=(10,))}
    url = "./data/test/text_data"
    ds = Dataset(schema=schema, shape=(5,), url=url, mode="w")

    slice_ = slice(0, 5, None)
    key = "text"
    batch = [
        np.array("THTMLY2F9"),
        np.array("QUUVEU2IU"),
        np.array("8ZUFCYWKD"),
        "H9EDFAGHB",
        "WDLDYN6XG",
    ]
    shape = ds._tensors[f"/{key}"].get_shape_from_value([slice_], batch)
    assert shape[0][0] == [1]
def test_datasetview_2():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    ds = Dataset("./data/test/dsv_2/", schema=dt, shape=(9,), mode="w")
    dsv = ds[2:]
    with pytest.raises(ValueError):
        dsv[3] = np.ones((3, 5))
    with pytest.raises(KeyError):
        dsv["abc"] = np.ones((3, 5))

    dsv["second"] = np.array([0, 1, 2, 3, 4, 5, 6])
    for i in range(7):
        assert dsv[i, "second"].compute() == i
def dict_to_hub(d):
    """Recursively convert a sample dict into a hub SchemaDict."""
    # Build a new dict instead of mutating `d` while iterating: rewriting
    # "a/b" keys to "a_b" in place would change the dict's size mid-loop
    # (raising RuntimeError) and leave the old keys behind.
    out = {}
    for k, v in d.items():
        k = k.replace("/", "_")
        if isinstance(v, dict):
            out[k] = dict_to_hub(v)
        else:
            value_shape = v.shape if hasattr(v, "shape") else ()
            shape = tuple(None for it in value_shape)
            max_shape = tuple(10000 for it in value_shape)
            if isinstance(v, torch.Tensor):
                v = v.numpy()
            dtype = v.dtype.name if hasattr(v, "dtype") else type(v)
            dtype = "int64" if isinstance(v, str) else dtype
            out[k] = (
                Tensor(shape=shape, dtype=dtype, max_shape=max_shape)
                if not isinstance(v, str)
                else Text(shape=(None,), dtype=dtype, max_shape=(10000,))
            )
    return SchemaDict(out)
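# Hypothetical usage sketch for the variant above (assumes numpy imported as
# np): nested dicts recurse, "/" in keys becomes "_", and string leaves map to
# Text with a fixed (10000,) max_shape.
sample = {"image": np.zeros((4, 4), dtype="uint8"), "label/text": "cat"}
schema = dict_to_hub(sample)
# schema == SchemaDict({
#     "image": Tensor(shape=(None, None), dtype="uint8", max_shape=(10000, 10000)),
#     "label_text": Text(shape=(None,), dtype="int64", max_shape=(10000,)),
# })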
def test_dataset_view_lazy():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/dsv_lazy"
    ds = Dataset(schema=dt, shape=(4,), url=url, mode="w")
    ds["text", 3] = "hello world"
    ds["second", 2] = 3.14
    ds["first", 2] = np.array([5, 6])
    dsv = ds[2:]

    dsv.disable_lazy()
    assert dsv["text", 1] == "hello world"
    assert dsv["second", 0] == 3.14
    assert (dsv["first", 0] == np.array([5, 6])).all()

    dsv.enable_lazy()
    assert dsv["text", 1].compute() == "hello world"
    assert dsv["second", 0].compute() == 3.14
    assert (dsv["first", 0].compute() == np.array([5, 6])).all()
def test_dataset_compute():
    dt = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    url = "./data/test/ds_compute"
    ds = Dataset(schema=dt, shape=(2,), url=url, mode="w")
    ds["text", 1] = "hello world"
    ds["second", 0] = 3.14
    ds["first", 0] = np.array([5, 6])
    comp = ds.compute()
    comp0 = comp[0]
    assert (comp0["first"] == np.array([5, 6])).all()
    assert comp0["second"] == 3.14
    assert comp0["text"] == ""
    comp1 = comp[1]
    assert (comp1["first"] == np.array([0, 0])).all()
    assert comp1["second"] == 0
    assert comp1["text"] == "hello world"
def test_dataset_assign_value():
    schema = {"text": Text(shape=(None,), dtype="int64", max_shape=(10,))}
    url = "./data/test/text_data"
    ds = Dataset(schema=schema, shape=(7,), url=url, mode="w")
    slice_ = slice(0, 5, None)
    key = "text"
    batch = [
        np.array("THTMLY2F9"),
        np.array("QUUVEU2IU"),
        np.array("8ZUFCYWKD"),
        "H9EDFAGHB",
        "WDLDYN6XG",
    ]
    ds[key, slice_] = batch
    ds[key][5] = np.array("GHLSGBFF8")
    ds[key][6] = "YGFJN75NF"
    assert ds["text", 0].compute() == "THTMLY2F9"
    assert ds["text", 1].compute() == "QUUVEU2IU"
    assert ds["text", 2].compute() == "8ZUFCYWKD"
    assert ds["text", 3].compute() == "H9EDFAGHB"
    assert ds["text", 4].compute() == "WDLDYN6XG"
    assert ds["text", 5].compute() == "GHLSGBFF8"
    assert ds["text", 6].compute() == "YGFJN75NF"
def text_to_hub(tf_dt, max_shape=None):
    max_shape = max_shape or (100000,)
    dt = "int64"
    return Text(shape=(None,), dtype=dt, max_shape=max_shape)
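# Usage sketch: `tf_dt` (the TensorFlow dtype) only signals that the feature
# is text; the resulting Text schema is always stored with an int64 dtype.
# The None argument below merely stands in for a string dtype.
print(text_to_hub(None))
# -> Text(shape=(None,), dtype="int64", max_shape=(100000,))
print(text_to_hub(None, max_shape=(500,)))
# -> Text(shape=(None,), dtype="int64", max_shape=(500,))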
        )
        sentences = list(df.sentence.values)
        labels = list(df.label.values)
        data = list(zip(sentences, labels))

        @transform(schema=self.schema)
        def load_transform(sample):
            return {"sentence": sample[0], "labels": sample[1]}

        ds = load_transform(data)
        return ds.store(self.tag)


def main(url, tag, schema):
    R = Retrieve(url, tag, schema)
    R.fetch()
    R.unpack()
    R.push()


if __name__ == "__main__":
    url = "https://nyu-mll.github.io/CoLA/cola_public_1.1.zip"
    tag = "activeloop/CoLA"
    schema = {
        "sentence": Text(shape=(None,), max_shape=(500,)),
        "labels": Primitive(dtype="int64"),
    }
    main(url, tag, schema)
def test_dataset_filter_2():
    my_schema = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
    }
    ds = Dataset("./data/tests/filtering", shape=(100,), schema=my_schema, mode="w")
    for i in range(100):
        ds["fname", i] = "John"
        ds["lname", i] = "Doe"
    for i in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", i] = "Active"
    for i in [15, 31, 25, 75, 3, 6]:
        ds["lname", i] = "loop"

    dsv_combined = ds.filter(
        lambda x: x["fname"].compute() == "Active" and x["lname"].compute() == "loop"
    )
    tsv_combined_fname = dsv_combined["fname"]
    tsv_combined_lname = dsv_combined["lname"]
    for item in dsv_combined:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    for item in tsv_combined_fname:
        assert item.compute() == "Active"
    for item in tsv_combined_lname:
        assert item.compute() == "loop"

    dsv_1 = ds.filter(lambda x: x["fname"].compute() == "Active")
    dsv_2 = dsv_1.filter(lambda x: x["lname"].compute() == "loop")
    for item in dsv_1:
        assert item.compute()["fname"] == "Active"
    tsv_1 = dsv_1["fname"]
    tsv_2 = dsv_2["lname"]
    for item in tsv_1:
        assert item.compute() == "Active"
    for item in tsv_2:
        assert item.compute() == "loop"
    for item in dsv_2:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    assert dsv_combined.indexes == [3, 6, 15, 75]
    assert dsv_1.indexes == [1, 3, 6, 15, 63, 75, 96]
    assert dsv_2.indexes == [3, 6, 15, 75]

    dsv_3 = ds.filter(lambda x: x["lname"].compute() == "loop")
    dsv_4 = dsv_3.filter(lambda x: x["fname"].compute() == "Active")
    for item in dsv_3:
        assert item.compute()["lname"] == "loop"
    for item in dsv_4:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    assert dsv_3.indexes == [3, 6, 15, 25, 31, 75]
    assert dsv_4.indexes == [3, 6, 15, 75]

    my_schema2 = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
        "image": Image((1920, 1080, 3)),
    }
    ds = Dataset("./data/tests/filtering2", shape=(100,), schema=my_schema2, mode="w")
    with pytest.raises(KeyError):
        ds.filter(lambda x: (x["random"].compute() == np.ones((1920, 1080, 3))).all())
    for i in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", i] = "Active"
    dsv = ds.filter(lambda x: x["fname"].compute() == "Active")
    with pytest.raises(KeyError):
        dsv.filter(lambda x: (x["random"].compute() == np.ones((1920, 1080, 3))).all())
    of our dataset. We have different types of schemas for different types of
    data like image, tensor, and text. More info in the docs.
"""

mpii_schema = {
    # We specify 'shape' as None for variable image sizes and give the
    # 'max_shape' argument the maximum possible size of an image.
    "image": schema.Image(
        shape=(None, None, 3), max_shape=(1920, 1920, 3), dtype="uint8"
    ),
    "isValidation": "float64",
    "img_paths": Text(shape=(None,), max_shape=(15,)),
    "img_width": "int32",
    "img_height": "int32",
    "objpos": Tensor(max_shape=(100,), dtype="float64"),
    # 'joint_self' has a nested list structure.
    "joint_self": Tensor(
        shape=(None, None), max_shape=(100, 100), dtype="float64"
    ),
    "scale_provided": "float64",
    "annolist_index": "int32",
import numpy as np
import zarr

import hub
from hub.schema import Tensor, Image, Text
from hub.utils import Timer

my_schema = {
    "image": Tensor((28, 28, 4), "int32", (28, 28, 4)),
    "label": Text((None,), "int64", (20,)),
    "confidence": "float",
}

dynamic_schema = {
    "image": Tensor(shape=(None, None, None), dtype="int32", max_shape=(32, 32, 3)),
    "label": Text((None,), "int64", (20,)),
}


def test_pipeline_basic():
    ds = hub.Dataset(
        "./data/test/test_pipeline_basic", mode="w", shape=(100,), schema=my_schema
    )
    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence", i] = 0.2