Example #1
0
def test_text_dataset_tokenizer():
    """Round-trip text through datasets created with ``tokenizer=True``.

    Covers tuple-style and chained indexing on both a dataset and a sliced
    view of it, then a batched slice assignment on a fixed-shape text field.
    """
    lorem = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."

    names_schema = {
        "names": Text(shape=(None,), max_shape=(1000,), dtype="int64"),
    }
    names_ds = Dataset(
        "./data/test/testing_text",
        mode="w",
        schema=names_schema,
        shape=(10,),
        tokenizer=True,
    )

    # Direct access on the dataset: tuple key, then chained key.
    fourth = lorem + " 4"
    names_ds["names", 4] = fourth
    assert names_ds["names", 4].numpy() == fourth

    fifth = lorem + " 5"
    names_ds["names"][5] = fifth
    assert names_ds["names"][5].numpy() == fifth

    # Same two access styles through a view over samples 7 and 8.
    view = names_ds[7:9]
    seventh = lorem + " 7"
    view["names", 0] = seventh
    assert view["names", 0].numpy() == seventh

    eighth = lorem + " 8"
    view["names"][1] = eighth
    assert view["names"][1].numpy() == eighth

    # Fixed-length text field written with a single slice assignment.
    id_schema = {
        "id": Text(shape=(4,), dtype="int64"),
    }
    id_ds = Dataset(
        "./data/test/testing_text_2",
        mode="w",
        schema=id_schema,
        shape=(10,),
        tokenizer=True,
    )
    id_ds[0:5, "id"] = ["abcd", "abcd", "abcd", "abcd", "abcd"]
    assert id_ds[2:4, "id"].compute() == ["abcd", "abcd"]
Example #2
0
 def dict_to_hub(dic, path=""):
     """Recursively turn a sample dictionary into a ``SchemaDict``.

     Keys have "/" replaced by "_". Nested dicts recurse with ``path``
     extended by the sanitized key. String leaves become ``Text`` entries;
     every other leaf becomes a ``Tensor`` whose ``max_shape`` is taken from
     the enclosing scope's ``max_dict`` (looked up by path) when that entry
     is truthy, and otherwise defaults to 10000 per dimension.
     """
     converted = {}
     for raw_key, value in dic.items():
         key = raw_key.replace("/", "_")
         cur_path = path + "/" + key

         if isinstance(value, dict):
             converted[key] = dict_to_hub(value, path=cur_path)
             continue

         # Shape is read before any torch -> numpy conversion, as in the
         # original ordering.
         value_shape = value.shape if hasattr(value, "shape") else ()
         if isinstance(value, torch.Tensor):
             value = value.numpy()

         if isinstance(value, str):
             # max_dict is deliberately not consulted for strings.
             converted[key] = Text(shape=(None,), dtype="int64", max_shape=(10000,))
             continue

         max_shape = max_dict[cur_path] or tuple(10000 for _ in value_shape)
         if hasattr(value, "dtype"):
             dtype = value.dtype.name
         else:
             dtype = type(value)
         converted[key] = Tensor(
             shape=tuple(None for _ in value_shape),
             dtype=dtype,
             max_shape=max_shape,
         )
     return SchemaDict(converted)
Example #3
0
def test_meta_information():
    """Check that ``meta_information`` given to ``Dataset`` shows up in ``ds.meta``.

    The dataset is created with a description dict, a few text samples are
    written, and the description is then read back from
    ``ds.meta["meta_info"]``.
    """
    description = {"author": "testing", "description": "here goes the testing text"}
    schema = {"text": Text((None,), max_shape=(1000,))}

    ds = Dataset(
        "./data/test_meta",
        shape=(10,),
        schema=schema,
        meta_information=description,
        mode="w",
    )

    try:
        some_text = ["hello world", "hello penguin", "hi penguin"]
        for i, text in enumerate(some_text):
            ds["text", i] = text

        meta_info = ds.meta["meta_info"]
        assert isinstance(meta_info, dict)
        assert meta_info["author"] == "testing"
        assert meta_info["description"] == "here goes the testing text"
    finally:
        # Close the dataset even when an assertion above fails, so a failing
        # run does not leave it open.
        ds.close()
Example #4
0
def test_threaded():
    """Run a ``hub.transform`` with the "threaded" scheduler and check the stored image shape."""
    source_schema = {
        "image": Tensor(
            shape=(None, None, None), max_shape=(4, 224, 224), dtype="float32"
        ),
    }
    target_schema = {
        "image": Tensor(
            shape=(None, None, None), max_shape=(4, 224, 224), dtype="float32"
        ),
        "label": Tensor(shape=(None,), max_shape=(6,), dtype="uint8"),
        "text_label": Text((None,), "int64", (14,)),
        "flight_code": Text((None,), "int64", (10,)),
    }

    source = hub.Dataset(
        "./data/hub/new_pipeline_threaded2",
        mode="w",
        shape=(10,),
        schema=source_schema,
        cache=False,
    )

    # Every sample is written twice; the second write (height 221) is the
    # value the final assertion looks for.
    for idx in range(len(source)):
        source["image", idx] = np.ones((4, 220, 224))
        source["image", idx] = np.ones((4, 221, 224))

    @hub.transform(schema=target_schema, scheduler="threaded", workers=2)
    def create_classification_dataset(sample):
        image = sample["image"]
        outputs = []
        # Five output samples are produced per input sample.
        for _ in range(5):
            outputs.append(
                {
                    "image": image,
                    "label": np.ones((6,)),
                    "text_label": "PLANTED",
                    "flight_code": "UYKNTHNXR",
                }
            )
        return outputs

    pipeline = create_classification_dataset(source)
    stored = pipeline.store("./data/hub/new_pipeline_threaded_final")

    assert stored["image", 0].shape[1] == 221
Example #5
0
def test_text():
    """Store a transform that emits the text "abc" for each input and read it back."""
    text_schema = {"text": Text((None,), max_shape=(10,))}

    @hub.transform(schema=text_schema)
    def my_transform(sample):
        return {"text": np.array("abc")}

    pipeline = my_transform(list(range(10)))
    stored = pipeline.store("./data/test/transform_text")
    for idx in range(10):
        assert stored["text", idx].compute() == "abc"
Example #6
0
    def tensor_to_hub(tf_dt, max_shape=None):
        """Build the hub schema entry for one tensor description.

        Args:
            tf_dt: Object exposing ``dtype.name`` and ``shape`` (presumably a
                TensorFlow tensor spec -- confirm against the caller).
            max_shape: Optional upper bound for the shape. For non-string
                tensors, a ``max_shape`` longer than ``tf_dt.shape`` is
                trimmed to its trailing ``len(tf_dt.shape)`` entries.

        Returns:
            ``Text`` for a "string" dtype, otherwise ``Tensor``.
        """
        if tf_dt.dtype.name == "string":
            # Honour a caller-supplied max_shape. It used to be computed and
            # then discarded in favour of a hard-coded (100000,).
            text_max_shape = max_shape or (100000,)
            return Text(shape=(None,), dtype="int64", max_shape=text_max_shape)

        rank = len(tf_dt.shape)
        if max_shape and len(max_shape) > rank:
            # Keep only the trailing dimensions that match the tensor's rank.
            max_shape = max_shape[len(max_shape) - rank:]

        # Unknown (None) dimensions default to 10000 when no bound is given.
        max_shape = max_shape or tuple(
            10000 if dim is None else dim for dim in tf_dt.shape
        )
        return Tensor(shape=tf_dt.shape, dtype=tf_dt.dtype.name, max_shape=max_shape)
Example #7
0
def test_datasetview_repr():
    """Compare the repr of a sliced dataset view with its expected string."""
    expected = "DatasetView(Dataset(schema=SchemaDict({'first': Tensor(shape=(2,), dtype='float64'), 'second': 'float64', 'text': Text(shape=(None,), dtype='int64', max_shape=(12,))}), url='./data/test/dsv_repr', shape=(9,), mode='w'))"
    schema = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    dataset = Dataset(
        schema=schema,
        shape=(9,),
        url="./data/test/dsv_repr",
        mode="w",
        lazy=False,
    )
    view = dataset[2:]
    assert repr(view) == expected
Example #8
0
def test_text_dataset():
    """Write one long string into a text field and read it back unchanged."""
    lorem = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    names_schema = {
        "names": Text(shape=(None,), max_shape=(1000,), dtype="int64"),
    }
    names_ds = Dataset(
        "./data/test/testing_text",
        mode="w",
        schema=names_schema,
        shape=(10,),
    )
    names_ds["names", 4] = lorem
    assert names_ds["names", 4].numpy() == lorem
Example #9
0
def test_dataset_filter():
    """Filter on a text-prefix predicate and check which indexes survive."""
    my_schema = {"img": Tensor((100, 100)), "ab": Text((None,), max_shape=(10,))}
    ds = Dataset("./data/new_filter", shape=(10,), schema=my_schema)

    # Even samples get an "abc" prefix, odd samples a "def" prefix.
    for idx in range(10):
        ds["img", idx] = idx * np.ones((100, 100))
        prefix = "abc" if idx % 2 == 0 else "def"
        ds["ab", idx] = prefix + str(idx)

    def abc_filter(sample):
        value = sample["ab"].compute()
        return value.startswith("abc")

    filtered = ds.filter(abc_filter)
    assert filtered.indexes == [0, 2, 4, 6, 8]
Example #10
0
def test_dataset_lazy():
    """With ``lazy=False``, indexing yields values comparable without ``compute()``."""
    schema = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    dataset = Dataset(
        schema=schema,
        shape=(2,),
        url="./data/test/ds_lazy",
        mode="w",
        lazy=False,
    )

    dataset["text", 1] = "hello world"
    dataset["second", 0] = 3.14
    dataset["first", 0] = np.array([5, 6])

    assert dataset["text", 1] == "hello world"
    assert dataset["second", 0] == 3.14
    first_matches = dataset["first", 0] == np.array([5, 6])
    assert first_matches.all()
Example #11
0
def test_dataset_setting_shape():
    """Check ``get_shape_from_value`` for a mixed batch of array and plain strings."""
    text_schema = {"text": Text(shape=(None,), dtype="int64", max_shape=(10,))}
    dataset = Dataset(
        schema=text_schema, shape=(5,), url="./data/test/text_data", mode="w"
    )

    # First three entries are 0-d numpy string arrays, the rest plain str.
    batch = [np.array(word) for word in ("THTMLY2F9", "QUUVEU2IU", "8ZUFCYWKD")]
    batch += ["H9EDFAGHB", "WDLDYN6XG"]

    text_tensor = dataset._tensors["/text"]
    shape = text_tensor.get_shape_from_value([slice(0, 5, None)], batch)
    assert shape[0][0] == [1]
Example #12
0
def test_datasetview_2():
    """Exercise invalid and valid assignments on a dataset view."""
    schema = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    dataset = Dataset("./data/test/dsv_2/", schema=schema, shape=(9,), mode="w")
    view = dataset[2:]

    # Assigning by integer index is expected to raise ValueError.
    with pytest.raises(ValueError):
        view[3] = np.ones((3, 5))

    # Assigning to a key missing from the schema is expected to raise KeyError.
    with pytest.raises(KeyError):
        view["abc"] = np.ones((3, 5))

    # A whole-column write is readable back element by element.
    view["second"] = np.array([0, 1, 2, 3, 4, 5, 6])
    for position in range(7):
        assert view[position, "second"].compute() == position
Example #13
0
 def dict_to_hub(d):
     """Recursively turn a sample dictionary into a ``SchemaDict``.

     Keys have "/" replaced by "_". Nested dicts are converted recursively,
     string leaves become ``Text`` entries, and every other leaf becomes a
     ``Tensor`` with all dimensions dynamic (``None``) and capped at 10000.

     The result is built in a fresh dict. Writing renamed keys back into
     ``d`` while iterating over it raised ``RuntimeError`` (dictionary
     changed size during iteration) whenever a key contained "/", and also
     altered the caller's dictionary.
     """
     converted = {}
     for k, v in d.items():
         k = k.replace("/", "_")
         if isinstance(v, dict):
             converted[k] = dict_to_hub(v)
             continue
         if isinstance(v, str):
             converted[k] = Text(shape=(None,), dtype="int64", max_shape=(10000,))
             continue

         value_shape = v.shape if hasattr(v, "shape") else ()
         if isinstance(v, torch.Tensor):
             v = v.numpy()
         # Leaves without a dtype attribute fall back to their Python type.
         dtype = v.dtype.name if hasattr(v, "dtype") else type(v)
         converted[k] = Tensor(
             shape=tuple(None for _ in value_shape),
             dtype=dtype,
             max_shape=tuple(10000 for _ in value_shape),
         )
     return SchemaDict(converted)
Example #14
0
def test_dataset_view_lazy():
    """Toggle lazy mode on a dataset view and read the same values both ways."""
    schema = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    dataset = Dataset(
        schema=schema, shape=(4,), url="./data/test/dsv_lazy", mode="w"
    )
    dataset["text", 3] = "hello world"
    dataset["second", 2] = 3.14
    dataset["first", 2] = np.array([5, 6])

    # View offsets: view index 0 is dataset index 2, view index 1 is 3.
    view = dataset[2:]
    pair = np.array([5, 6])

    # Lazy disabled: values compare directly.
    view.disable_lazy()
    assert view["text", 1] == "hello world"
    assert view["second", 0] == 3.14
    assert (view["first", 0] == pair).all()

    # Lazy enabled: values are obtained through compute().
    view.enable_lazy()
    assert view["text", 1].compute() == "hello world"
    assert view["second", 0].compute() == 3.14
    assert (view["first", 0].compute() == pair).all()
Example #15
0
def test_dataset_compute():
    """Compute a whole dataset and check written values and untouched defaults."""
    schema = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    dataset = Dataset(
        schema=schema, shape=(2,), url="./data/test/ds_compute", mode="w"
    )
    dataset["text", 1] = "hello world"
    dataset["second", 0] = 3.14
    dataset["first", 0] = np.array([5, 6])

    rows = dataset.compute()

    # Row 0: "first" and "second" were written, "text" was not.
    row = rows[0]
    assert (row["first"] == np.array([5, 6])).all()
    assert row["second"] == 3.14
    assert row["text"] == ""

    # Row 1: only "text" was written.
    row = rows[1]
    assert (row["first"] == np.array([0, 0])).all()
    assert row["second"] == 0
    assert row["text"] == "hello world"
Example #16
0
def test_dataset_assign_value():
    """Assign text by slice and by single index, from numpy strings and plain str."""
    text_schema = {"text": Text(shape=(None,), dtype="int64", max_shape=(10,))}
    dataset = Dataset(
        schema=text_schema, shape=(7,), url="./data/test/text_data", mode="w"
    )

    # Batched write of the first five samples (mixed numpy / str values).
    dataset["text", slice(0, 5, None)] = [
        np.array("THTMLY2F9"),
        np.array("QUUVEU2IU"),
        np.array("8ZUFCYWKD"),
        "H9EDFAGHB",
        "WDLDYN6XG",
    ]
    # Single-sample writes through chained indexing.
    dataset["text"][5] = np.array("GHLSGBFF8")
    dataset["text"][6] = "YGFJN75NF"

    expected = [
        "THTMLY2F9",
        "QUUVEU2IU",
        "8ZUFCYWKD",
        "H9EDFAGHB",
        "WDLDYN6XG",
        "GHLSGBFF8",
        "YGFJN75NF",
    ]
    for position, word in enumerate(expected):
        assert dataset["text", position].compute() == word
Example #17
0
 def text_to_hub(tf_dt, max_shape=None):
     """Return a variable-length int64 ``Text`` schema entry.

     ``tf_dt`` is not used. ``max_shape`` falls back to ``(100000,)`` when
     it is falsy.
     """
     if not max_shape:
         max_shape = (100000,)
     return Text(shape=(None,), dtype="int64", max_shape=max_shape)
Example #18
0
        )

        sentences = list(df.sentence.values)
        labels = list(df.label.values)
        data = list(zip(sentences, labels))

        @transform(schema=self.schema)
        def load_transform(sample):
            return {"sentence": sample[0], "labels": sample[1]}

        ds = load_transform(data)
        return ds.store(self.tag)


def main(url, tag, schema):
    """Build a ``Retrieve`` from the arguments and run fetch, unpack and push in order."""
    retriever = Retrieve(url, tag, schema)
    retriever.fetch()
    retriever.unpack()
    retriever.push()


if __name__ == "__main__":
    # Schema for the uploaded dataset: a text sentence plus an integer label.
    schema = {
        "sentence": Text(shape=(None,), max_shape=(500,)),
        "labels": Primitive(dtype="int64"),
    }
    # Source archive and the tag the result is stored under.
    url = "https://nyu-mll.github.io/CoLA/cola_public_1.1.zip"
    tag = "activeloop/CoLA"

    main(url, tag, schema)
Example #19
0
def test_dataset_filter_2():
    """Check combined, chained and invalid-key filters on datasets and views."""

    def is_active(sample):
        return sample["fname"].compute() == "Active"

    def is_loop(sample):
        return sample["lname"].compute() == "loop"

    def is_active_loop(sample):
        return (
            sample["fname"].compute() == "Active"
            and sample["lname"].compute() == "loop"
        )

    def random_is_all_ones(sample):
        # "random" is not a schema key; used to provoke KeyError.
        return (sample["random"].compute() == np.ones((1920, 1080, 3))).all()

    active_loop = {"fname": "Active", "lname": "loop"}

    name_schema = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
    }
    ds = Dataset("./data/tests/filtering", shape=(100,), schema=name_schema, mode="w")
    for idx in range(100):
        ds["fname", idx] = "John"
        ds["lname", idx] = "Doe"

    for idx in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", idx] = "Active"

    for idx in [15, 31, 25, 75, 3, 6]:
        ds["lname", idx] = "loop"

    # One filter checking both fields at once.
    combined = ds.filter(is_active_loop)
    combined_fname = combined["fname"]
    combined_lname = combined["lname"]
    for sample in combined:
        assert sample.compute() == active_loop
    for value in combined_fname:
        assert value.compute() == "Active"
    for value in combined_lname:
        assert value.compute() == "loop"

    # fname filter first, then lname filter on the resulting view.
    active_view = ds.filter(is_active)
    active_then_loop = active_view.filter(is_loop)
    for sample in active_view:
        assert sample.compute()["fname"] == "Active"
    active_fname = active_view["fname"]
    chained_lname = active_then_loop["lname"]
    for value in active_fname:
        assert value.compute() == "Active"
    for value in chained_lname:
        assert value.compute() == "loop"
    for sample in active_then_loop:
        assert sample.compute() == active_loop
    assert combined.indexes == [3, 6, 15, 75]
    assert active_view.indexes == [1, 3, 6, 15, 63, 75, 96]
    assert active_then_loop.indexes == [3, 6, 15, 75]

    # Same chain in the opposite order: lname first, then fname.
    loop_view = ds.filter(is_loop)
    loop_then_active = loop_view.filter(is_active)
    for sample in loop_view:
        assert sample.compute()["lname"] == "loop"
    for sample in loop_then_active:
        assert sample.compute() == active_loop
    assert loop_view.indexes == [3, 6, 15, 25, 31, 75]
    assert loop_then_active.indexes == [3, 6, 15, 75]

    # Filtering on an unknown key raises KeyError on a dataset and on a view.
    image_schema = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
        "image": Image((1920, 1080, 3)),
    }
    ds = Dataset(
        "./data/tests/filtering2", shape=(100,), schema=image_schema, mode="w"
    )
    with pytest.raises(KeyError):
        ds.filter(random_is_all_ones)

    for idx in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", idx] = "Active"
    view = ds.filter(is_active)
    with pytest.raises(KeyError):
        view.filter(random_is_all_ones)
Example #20
0
of our dataset. We have different types of schemas for different
types of data like image, tensor, text. More info. on docs.
"""
mpii_schema = {
    """
    we specify 'shape' as None for variable image size, and we
    give 'max_shape' arguement a maximum possible size of image.
    """
    "image":
    schema.Image(shape=(None, None, 3),
                 max_shape=(1920, 1920, 3),
                 dtype="uint8"),
    "isValidation":
    "float64",
    "img_paths":
    Text(shape=(None, ), max_shape=(15, )),
    "img_width":
    "int32",
    "img_height":
    "int32",
    "objpos":
    Tensor(max_shape=(100, ), dtype="float64"),
    """
    'joint_self' has nested list structure
    """
    "joint_self":
    Tensor(shape=(None, None), max_shape=(100, 100), dtype="float64"),
    "scale_provided":
    "float64",
    "annolist_index":
    "int32",
Example #21
0
import numpy as np
import zarr

import hub
from hub.schema import Tensor, Image, Text
from hub.utils import Timer

# Schema used by test_pipeline_basic: an image Tensor declared with
# (28, 28, 4) and "int32", a Text label, and a plain "float" confidence.
# NOTE(review): the Tensor/Text arguments are positional; presumably the
# order is (shape, dtype, max_shape) -- confirm against hub.schema.
my_schema = {
    "image": Tensor((28, 28, 4), "int32", (28, 28, 4)),
    "label": Text((None, ), "int64", (20, )),
    "confidence": "float",
}

# Variant whose image has every dimension left as None, with max_shape
# (32, 32, 3); the label field is declared the same way as in my_schema.
dynamic_schema = {
    "image": Tensor(shape=(None, None, None),
                    dtype="int32",
                    max_shape=(32, 32, 3)),
    "label": Text((None, ), "int64", (20, )),
}


def test_pipeline_basic():
    ds = hub.Dataset("./data/test/test_pipeline_basic",
                     mode="w",
                     shape=(100, ),
                     schema=my_schema)

    for i in range(len(ds)):
        ds["image", i] = np.ones((28, 28, 4), dtype="int32")
        ds["label", i] = f"hello {i}"
        ds["confidence", i] = 0.2