Python Dataset.Dataset Examples

Programming Language: Python

Namespace/Package Name: hub

Class/Type: Dataset

Method/Function: Dataset

Examples at hotexamples.com: 30

`hub.Dataset` is a class in the Python `hub` library that represents a dataset. It is designed to provide a simple and convenient way to work with datasets in Python. The `Dataset` class allows users to easily load and manipulate various types of datasets, such as text, image, or audio data. It also provides methods for preprocessing, splitting, and accessing the dataset, making it a powerful tool for data analysis and machine learning tasks. With `hub.Dataset`, users can efficiently organize and process their data, saving time and effort in their Python projects.

Python Dataset.Dataset - 30 examples found. These are the top-rated real-world Python examples of hub.Dataset.Dataset extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Dataset(30)

flush(10)

delete(9)

close(7)

to_pytorch(7)

copy(7)

to_tensorflow(6)

filter(5)

commit(5)

from_pytorch(2)

from_tfds(2)

resize_shape(2)

compute(2)

disable_lazy(1)

enable_lazy(1)

append_shape(1)

rename(1)

save(1)

store(1)

from_directory(1)

Example #1

Show file

def test_check_label_name():
    """Check ``compute`` with and without ``label_name`` on a ClassLabel field."""
    label_schema = {"label": ClassLabel(names=["red", "green", "blue"])}
    dataset = Dataset(
        "./data/test/dataset2", shape=(5,), mode="w", schema=label_schema
    )

    # Samples 0 and 1 are written twice with the same values, then sample 2.
    for _ in range(2):
        dataset["label", 0] = 1
        dataset["label", 1] = 2
    dataset["label", 2] = 0

    # Samples 3 and 4 are never written; the assertions expect index 0 ("red").
    expected_names = ["green", "blue", "red", "red", "red"]
    expected_indices = [1, 2, 0, 0, 0]

    # Whole dataset.
    assert dataset.compute(label_name=True) == [
        {"label": name} for name in expected_names
    ]
    assert dataset.compute() == [{"label": index} for index in expected_indices]

    # Single sample.
    assert dataset[1].compute(label_name=True) == {"label": "blue"}
    assert dataset[1].compute() == {"label": 2}

    # Slice of samples.
    assert dataset[1:3].compute(label_name=True) == [
        {"label": "blue"},
        {"label": "red"},
    ]
    assert dataset[1:3].compute() == [{"label": 2}, {"label": 0}]

Example #2

Show file

File: test_dataset.py Project: thomascherickal/Hub

def test_tensorview_slicing():
    """Slice a tensor declared with ``None`` dimensions and check view shapes."""
    schema = {"first": Tensor(shape=(None, None), max_shape=(250, 300))}
    dataset = Dataset(
        schema=schema,
        shape=(20,),
        url="./data/test/model",
        mode="w",
    )

    # Slices on every axis: the expected shape keeps all three axes.
    sliced_view = dataset["first", 5:6, 7:10, 9:10]
    assert sliced_view.numpy().shape == tuple(sliced_view.shape) == (1, 3, 1)

    # Integer index on the last axis: the expected shape drops that axis.
    indexed_view = dataset["first", 5:6, 7:10, 9]
    assert indexed_view.numpy().shape == tuple(indexed_view.shape) == (1, 3)

Example #3

Show file

def test_meta_information():
    """Create a dataset with ``meta_information`` and read it back via ``ds.meta``.

    The dictionary passed as ``meta_information`` is expected to be available
    under the ``"meta_info"`` key of ``ds.meta``.
    """
    description = {
        "author": "testing",
        "description": "here goes the testing text",
    }
    schema = {"text": Text((None,), max_shape=(1000,))}

    ds = Dataset(
        "./data/test_meta",
        shape=(10,),
        schema=schema,
        meta_information=description,
        mode="w",
    )

    # Write a few samples before inspecting the metadata.
    some_text = ["hello world", "hello penguin", "hi penguin"]
    for i, text in enumerate(some_text):
        ds["text", i] = text

    meta_info = ds.meta["meta_info"]
    # isinstance is the idiomatic type check and also accepts dict subclasses.
    assert isinstance(meta_info, dict)
    assert meta_info["author"] == "testing"
    assert meta_info["description"] == "here goes the testing text"

    ds.close()

Example #4

Show file

File: test_dataset.py Project: thomascherickal/Hub

def test_dataset_dynamic_shaped():
    """Write sub-regions of an int32 tensor with ``None`` dims and read them back."""
    first_spec = Tensor(
        shape=(None, None),
        dtype="int32",
        max_shape=(100, 100),
        chunks=(100,),
    )
    dataset = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(1000,),
        mode="w",
        schema={"first": first_spec},
    )

    # A 10x10 block written into sample 50 must read back unchanged.
    dataset["first", 50, 50:60, 50:60] = np.ones((10, 10), "int32")
    block_50 = dataset["first", 50, 50:60, 50:60].numpy()
    assert (block_50 == np.ones((10, 10), "int32")).all()

    # Two separate blocks are written into sample 0; only the first is checked.
    dataset["first", 0, :10, :10] = np.ones((10, 10), "int32")
    dataset["first", 0, 10:20, 10:20] = 5 * np.ones((10, 10), "int32")
    block_0 = dataset["first", 0, 0:10, 0:10].numpy()
    assert (block_0 == np.ones((10, 10), "int32")).all()

Example #5

Show file

File: test_sharded_dataset.py Project: nosahama/Hub

def test_sharded_dataset_with_views():
    """Index a ShardedDatasetView built from a dataset and three of its views."""
    schema = {"first": "float", "second": "float"}
    base = Dataset(
        "./data/test_sharded_ds", shape=(10,), schema=schema, mode="w"
    )
    for row in range(10):
        base[row, "first"] = row
        base[row, "second"] = 2 * row + 1

    # Shard order: rows 3-4, the full dataset, row 1, rows 8-9 (15 rows total).
    shards = [base[3:5], base, base[1], base[8:]]
    sharded_ds = ShardedDatasetView(shards)

    # Each entry maps a range of sharded indices to the offset that converts
    # a sharded index into the row number of the underlying dataset.
    index_offsets = [
        (range(2), 3),
        (range(2, 12), -2),
        (range(12, 13), -11),
        (range(13, 15), -5),
    ]
    for indices, offset in index_offsets:
        for index in indices:
            source_row = index + offset
            assert sharded_ds[index, "first"].compute() == source_row
            assert sharded_ds[index, "second"].compute() == 2 * source_row + 1

Example #6

Show file

File: test_dataset.py Project: istranic/Hub

def test_dataset_dynamic_shaped_slicing():
    """Store an (i, i) array in sample i and read all samples back with one slice."""
    sample_count = 100
    first_spec = Tensor(
        shape=(None, None),
        dtype="int32",
        max_shape=(100, 100),
        chunks=(100,),
    )
    dataset = Dataset(
        "./data/test/test_dataset_dynamic_shaped",
        token=None,
        shape=(100,),
        mode="w",
        schema={"first": first_spec},
    )

    # Sample i holds an i-by-i array filled with the value i (sample 0 is empty).
    for index in range(sample_count):
        dataset["first", index] = index * np.ones((index, index))

    computed = dataset["first", 0:100].compute()
    for index in range(sample_count):
        expected = index * np.ones((index, index))
        assert (computed[index] == expected).all()

    # A one-element slice returns a sequence whose first item is sample 1.
    single = dataset["first", 1:2].compute()[0]
    assert (single == np.ones((1, 1))).all()

Example #7

Show file

File: test_sharded_dataset.py Project: nosahama/Hub

def test_sharded_dataset():
    """Exercise basic ShardedDatasetView behaviour over four 10-row datasets."""
    schema = {"first": "float", "second": "float"}
    shards = []
    for shard_index in range(4):
        shard = Dataset(
            schema=schema,
            shape=(10,),
            url=f"./data/test/test_dataset/{shard_index}",
            mode="w",
        )
        shards.append(shard)

    ds = ShardedDatasetView(shards)

    # Writing one field must not leak into the other field or another shard.
    ds[0]["first"] = 2.3
    assert ds[0]["second"].numpy() != 2.3
    assert ds[30]["first"].numpy() == 0

    # Size, schema type and textual representation of the combined view.
    assert len(ds) == 40
    assert ds.shape == (40,)
    assert type(ds.schema) == SchemaDict
    assert repr(ds) == "ShardedDatasetView(shape=(40,))"

    # Slicing the view by sample range is expected to be rejected here.
    with pytest.raises(AdvancedSlicingNotSupported):
        ds[5:8]

    ds[4, "first"] = 3

    # Iteration only has to complete without raising.
    for _ in ds:
        pass

    empty_view = ShardedDatasetView([])
    assert empty_view.identify_shard(5) == (0, 0)

Example #8

Show file

def test_dataset_bug_1(url="./data/test/dataset", token=None):
    """Assign a (2, 1920, 1080, 1) array to an image tensor with ``None`` dims.

    The test has no assertions; it passes if the assignment does not raise.
    """
    image_spec = Tensor(
        (None, 1920, 1080, None),
        "uint8",
        max_shape=(10, 1920, 1080, 4),
    )
    dataset = Dataset(
        url,
        token=token,
        shape=(10000,),
        mode="w",
        schema={"image": image_spec},
    )
    dataset["image", 1] = np.ones((2, 1920, 1080, 1))

Example #9

Show file

def test_append_dataset():
    """Grow a dataset with ``append_shape`` and check the size after reopening."""
    schema = {"first": Tensor(shape=(250, 300)), "second": "float"}
    url = "./data/test/model"

    def check_field_lengths(dataset):
        # Both fields must report 120 samples; a 5:10 slice must report 5.
        assert dataset["first"].shape[0] == 120
        assert dataset["first", 5:10].shape[0] == 5
        assert dataset["second"].shape[0] == 120

    ds = Dataset(schema=schema, shape=(100,), url=url, mode="w")
    ds.append_shape(20)
    ds["first"][0] = np.ones((250, 300))

    assert len(ds) == 120
    check_field_lengths(ds)
    ds.flush()

    # Reopen from the same url with default arguments and repeat the checks.
    ds = Dataset(url)
    check_field_lengths(ds)

Example #10

Show file

def test_dataset_2():
    """Set a ``meta_information`` entry and check it is still there after a write."""
    schema = {"first": "float", "second": "float"}
    dataset = Dataset(
        schema=schema,
        shape=(2,),
        url="./data/test/test_dataset2",
        mode="w",
    )
    dataset.meta_information["description"] = "This is my description"

    dataset["first"][0] = 2.3

    assert dataset.meta_information["description"] == "This is my description"
    # The value written to "first" must not appear in "second".
    assert dataset["second"][0].numpy() != 2.3

Example #11

Show file

def time_random_access(
    dataset_name="activeloop/mnist", offset=1000, span=1000, field="image"
):
    """Time reading ``span`` consecutive samples of ``field`` starting at ``offset``.

    The dataset is opened with ``cache=False`` and ``storage_cache=False``.
    Only the sliced ``compute()`` call runs inside the ``Timer`` block.
    """
    dset = Dataset(dataset_name, cache=False, storage_cache=False)
    timer_label = f"{dataset_name} read at offset {offset:03} of length {span:03}"
    with Timer(timer_label):
        dset[field][offset:offset + span].compute()

Example #12

Show file

def test_datasetview_slicing():
    """Check result shapes for integer vs. slice indexing, in either index order."""
    schema = {"first": Tensor((100, 100))}
    dataset = Dataset(
        schema=schema,
        shape=(20,),
        url="./data/test/model",
        mode="w",
    )

    # Field name first, sample index second.
    assert dataset["first", 0].numpy().shape == (100, 100)
    assert dataset["first", 0:1].numpy().shape == (1, 100, 100)

    # Sample index first, field name second.
    assert dataset[0]["first"].numpy().shape == (100, 100)
    assert dataset[0:1]["first"].numpy().shape == (1, 100, 100)

Example #13

Show file

def test_append_resize():
    """Check ``len`` after ``append_shape(20)`` and after ``resize_shape(150)``."""
    schema = {"first": Tensor(shape=(250, 300)), "second": "float"}
    dataset = Dataset(
        schema=schema,
        shape=(100,),
        url="./data/test/append_resize",
        mode="a",
    )

    # Starting from 100 samples, appending 20 must give 120.
    dataset.append_shape(20)
    assert len(dataset) == 120

    # Resizing sets the sample count to the given value.
    dataset.resize_shape(150)
    assert len(dataset) == 150

Example #14

Show file

def test_dataset2():
    """Write to one float field and check the other field does not hold that value."""
    schema = {"first": "float", "second": "float"}
    dataset = Dataset(
        schema=schema, shape=(2,), url="./data/test/test_dataset2", mode="w"
    )

    dataset["first"][0] = 2.3
    second_value = dataset["second"][0].numpy()
    assert second_value != 2.3

Example #15

Show file

def test_tensorview_slicing():
    """Check per-sample shapes of tensor views after toggling lazy mode."""
    schema = {"first": Tensor(shape=(None, None), max_shape=(250, 300))}
    dataset = Dataset(
        schema=schema,
        shape=(20,),
        url="./data/test/tensorivew_slicing",
        mode="w",
    )

    sliced_view = dataset["first", 5:6, 7:10, 9:10]
    # Switch lazy mode off and on again before computing.
    sliced_view.disable_lazy()
    sliced_view.enable_lazy()
    assert sliced_view.compute()[0].shape == tuple(sliced_view.shape[0]) == (3, 1)

    # An integer index on the last axis leaves a one-dimensional sample.
    indexed_view = dataset["first", 5:6, 7:10, 9]
    assert indexed_view.numpy()[0].shape == tuple(indexed_view.shape[0]) == (3,)

Example #16

Show file

def test_class_label_2():
    """Round-trip class names through three differently declared ClassLabel fields."""
    names = ["apple", "banana", "cat"]
    cl1 = ClassLabel(names=names)
    cl2 = ClassLabel((None,), (10,), names=["apple", "banana", "cat"])
    cl3 = ClassLabel((3,), names=["apple", "banana", "cat"])
    my_schema = {"cl1": cl1, "cl2": cl2, "cl3": cl3}

    # NOTE(review): shape=(10) is the integer 10, not a tuple; kept as in the original.
    ds = Dataset("./data/cl_2d_3d", schema=my_schema, shape=(10), mode="w")

    def encode(label, class_names):
        # Convert each class name to its integer index and pack into an array.
        return np.array([label.str2int(name) for name in class_names])

    # cl1: one label per sample.
    for index, name in enumerate(["cat", "apple", "apple"]):
        ds["cl1", index] = cl1.str2int(name)
    ds["cl1", 3:5] = [cl1.str2int("banana") for _ in range(2)]
    assert ds["cl1", 1].compute(True) == "apple"
    assert ds["cl1", 0:3].compute(True) == ["cat", "apple", "apple"]
    assert ds["cl1", 3:5].compute(True) == ["banana", "banana"]

    # cl2: samples of different lengths.
    cl2_rows = [
        ["cat", "cat", "apple"],
        ["apple", "banana"],
        ["cat", "apple", "banana", "apple", "banana"],
        ["cat"],
    ]
    for index, row in enumerate(cl2_rows):
        ds["cl2", index] = encode(cl2, row)
    for index, row in enumerate(cl2_rows):
        assert ds["cl2", index].compute(True) == row

    # cl3: three labels per sample.
    cl3_rows = [[name] * 3 for name in ("apple", "banana", "cat")]
    for index, row in enumerate(cl3_rows):
        ds["cl3", index] = encode(cl3, row)
    for index, row in enumerate(cl3_rows):
        assert ds["cl3", index].compute(True) == row
    assert ds["cl3", 0:3].compute(True) == cl3_rows

Example #17

Show file

def test_tensorview_iter():
    """Iterate over a tensor view of one sample and check every yielded item."""
    dataset = Dataset(
        schema={"abc": "int32"},
        shape=(20,),
        url="./data/test/tensorivew_slicing",
        mode="w",
    )
    # Sample i stores the value i.
    for index in range(20):
        dataset["abc", index] = index

    view = dataset["abc", 3]
    for element in view:
        assert element.compute() == 3

Example #18

Show file

def test_dataset_append_and_read():
    """Create a dataset in mode "a", then reopen it in mode "r" and delete it."""
    url = "./data/test/test_dataset_append_and_read"
    schema = {"first": "float", "second": "float"}

    ds = Dataset(schema=schema, shape=(2,), url=url, mode="a")
    ds["first"][0] = 2.3
    # The value written to "first" must not appear in "second".
    assert ds["second"][0].numpy() != 2.3
    ds.close()

    # Reopen with only the url and the read mode, then remove the dataset.
    ds = Dataset(url=url, mode="r")
    ds.delete()
    ds.close()

Example #19

Show file

File: benchmark_iterate_hub_local_tensorflow.py Project: stjordanis/Hub-1

def benchmark_iterate_hub_local_tensorflow_setup(
    dataset_name, dataset_split, batch_size, prefetch_factor
):
    """Store a TFDS dataset under ./hub_data/tfds and build a batched loader.

    Returns:
        A one-element tuple holding the batched, prefetching loader obtained
        from ``to_tensorflow()``.
    """
    local_path = os.path.join(".", "hub_data", "tfds")

    # Convert from TFDS and store it at the local path.
    Dataset.from_tfds(dataset_name, split=dataset_split).store(local_path)

    # Reopen the stored copy read-only with both caches disabled.
    stored = Dataset(local_path, cache=False, storage_cache=False, mode="r")
    batched = stored.to_tensorflow().batch(batch_size)
    loader = batched.prefetch(prefetch_factor)

    return (loader,)

Example #20

Show file

File: test_dataset.py Project: thomascherickal/Hub

def test_text_dataset():
    """Write a long string to a Text field and read the same string back."""
    names_spec = Text(shape=(None,), max_shape=(1000,), dtype="int64")
    dataset = Dataset(
        "./data/test/testing_text",
        mode="w",
        schema={"names": names_spec},
        shape=(10,),
    )
    # Adjacent literals are joined at compile time into one string.
    lorem = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. "
        "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. "
        "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. "
        "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    )
    dataset["names", 4] = lorem
    assert dataset["names", 4].numpy() == lorem

Example #21

Show file

def test_dataset_filter_4():
    """Filter on a ClassLabel field and slice the filtered result."""
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    dataset = Dataset(
        "./data/tests/filtering_4", shape=(100,), schema=schema, mode="w"
    )

    # The first ten samples get class 0, all remaining samples get class 1.
    for index in range(100):
        dataset["cl", index] = 0 if index < 10 else 1
        dataset["img", index] = index * np.ones((5, 6, 3))

    def is_class_zero(sample):
        return sample["cl"].compute() == 0

    filtered = dataset.filter(is_class_zero)
    labels = filtered[3:8, "cl"].compute()
    assert (labels == np.zeros((5,))).all()

Example #22

Show file

def test_datasetview_repr():
    """Compare the repr of a dataset view against the expected literal string."""
    schema = {
        "first": Tensor(shape=(2,)),
        "second": "float",
        "text": Text(shape=(None,), max_shape=(12,)),
    }
    dataset = Dataset(
        schema=schema,
        shape=(9,),
        url="./data/test/dsv_repr",
        mode="w",
        lazy=False,
    )
    view = dataset[2:]

    expected_repr = "DatasetView(Dataset(schema=SchemaDict({'first': Tensor(shape=(2,), dtype='float64'), 'second': 'float64', 'text': Text(shape=(None,), dtype='int64', max_shape=(12,))}), url='./data/test/dsv_repr', shape=(9,), mode='w'))"
    assert repr(view) == expected_repr

Example #23

Show file

def benchmark():
    """Iterate once over an S3-hosted dataset through a PyTorch DataLoader."""
    # ``schema`` and ``arr`` are used only by the disabled population code below.
    schema = {"image": Tensor((256, 256, 3), dtype="uint8")}
    arr = (np.random.rand(256, 256, 3) * 100).astype("uint8")

    # Disabled code that populates the dataset:
    # ds = Dataset("s3://snark-test/superficial_dataset", mode="w", schema=schema, shape=(5000,))
    # for i in tqdm(range(len(ds))):
    #     ds["image", i] = arr
    # ds.close()

    dataset = Dataset("s3://snark-test/superficial_dataset")
    torch_dataset = dataset.to_pytorch()
    loader = torch.utils.data.DataLoader(
        torch_dataset, batch_size=32, num_workers=16
    )

    # Batches are discarded; tqdm reports progress while the loader is drained.
    for _ in tqdm(loader):
        pass

Example #24

Show file

File: test_sharded_dataset.py Project: stjordanis/Hub-1

def test_sharded_dataset_advanced_slice():
    """Read and write slices of a ShardedDatasetView built from overlapping views."""
    schema = {"first": "float", "second": "float"}
    base = Dataset(
        "./data/test_sharded_ds", shape=(10,), schema=schema, mode="w"
    )
    for row in range(10):
        base[row, "first"] = row
        base[row, "second"] = 2 * row + 1

    # Shard order: rows 3-4, the full dataset, row 1, rows 8-9.
    shards = [base[3:5], base, base[1], base[8:]]
    sharded_ds = ShardedDatasetView(shards)

    # Expected "first" values in shard order: 3, 4, 0..9, 1, 8, 9.
    expected_first = [3, 4, *range(10), 1, 8, 9]
    assert sharded_ds["first", :].compute().tolist() == expected_first
    assert sharded_ds["first"].compute().tolist() == expected_first

    # Negative slice bounds and sample slices that cross a shard boundary.
    assert sharded_ds["first", -4:].compute().tolist() == [9, 1, 8, 9]
    assert sharded_ds[1:3].compute()[0] == {"first": 4.0, "second": 9.0}
    assert sharded_ds[1:3].compute()[1] == {"first": 0.0, "second": 1.0}

    # Slice assignment followed by a read of the same slice.
    replacement = [10, 11, 12, 13]
    sharded_ds["first", 1:5] = replacement
    assert sharded_ds["first", 1:5].compute().tolist() == [10, 11, 12, 13]

    # Single-index assignment followed by a read of the same index.
    sharded_ds["first", 12] = 50
    assert sharded_ds["first", 12].compute() == 50

Example #25

Show file

File: benchmark_dataset_comparison.py Project: zomglings/Hub

def time_iter_hub_wasabi_tensorflow(
    dataset_info, batch_size=BATCH_SIZE, prefetch_factor=PREFETCH_SIZE, process=None
):
    """Time one pass over a dataset's TensorFlow loader.

    The dataset named by ``dataset_info["hub_name"]`` is opened read-only with
    both caches disabled. When ``process`` is given, it is called with each
    batch's "image" and "label" entries inside the timed region.
    """
    dset = Dataset(
        dataset_info["hub_name"], cache=False, storage_cache=False, mode="r"
    )
    batched = dset.to_tensorflow().batch(batch_size)
    loader = batched.prefetch(prefetch_factor)

    with Timer("Hub (remote - Wasabi) `.to_tensorflow()`"):
        for batch in loader:
            # Both fields are read even when no callback is supplied.
            image, label = batch["image"], batch["label"]
            if process is None:
                continue
            process(image, label)

Example #26

Show file

def test_dataset_filter():
    """Filter samples by a text prefix and check the surviving indexes."""

    def starts_with_abc(sample):
        return sample["ab"].compute().startswith("abc")

    schema = {"img": Tensor((100, 100)), "ab": Text((None,), max_shape=(10,))}
    dataset = Dataset("./data/new_filter", shape=(10,), schema=schema)

    # Even samples get an "abc" prefix, odd samples a "def" prefix.
    for index in range(10):
        dataset["img", index] = index * np.ones((100, 100))
        prefix = "abc" if index % 2 == 0 else "def"
        dataset["ab", index] = prefix + str(index)

    filtered = dataset.filter(starts_with_abc)
    assert filtered.indexes == [0, 2, 4, 6, 8]

Example #27

Show file

def test_dataset_copy_azure_local():
    """Copy a dataset from Azure to local storage and back to Azure.

    The account key is read from the ``ACCOUNT_KEY`` environment variable.
    Both copies are checked sample by sample against the values written to
    the source, and all three datasets are deleted at the end.
    """
    token = {"account_key": os.getenv("ACCOUNT_KEY")}
    ds = Dataset(
        "https://activeloop.blob.core.windows.net/activeloop-hub/cp_original_test_ds_azure_1",
        token=token,
        shape=(100,),
        schema=simple_schema,
    )
    DS2_PATH = "./data/testing/cp_copy_ds_local_4"
    DS3_PATH = "https://activeloop.blob.core.windows.net/activeloop-hub/cp_copy_test_ds_azure_2"

    def copy_replacing_existing(source, destination, **dataset_kwargs):
        """Copy ``source`` to ``destination``, retrying once after a failure.

        If the first copy raises, the dataset at ``destination`` is opened and
        deleted (presumably a leftover from an earlier run) before retrying.
        """
        try:
            return source.copy(destination, **dataset_kwargs)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt and SystemExit must
            # still propagate instead of triggering a delete-and-retry.
            Dataset(destination, **dataset_kwargs).delete()
            return source.copy(destination, **dataset_kwargs)

    for i in range(100):
        ds["num", i] = 2 * i

    ds2 = copy_replacing_existing(ds, DS2_PATH)
    ds3 = copy_replacing_existing(ds2, DS3_PATH, token=token)

    for i in range(100):
        assert ds2["num", i].compute() == 2 * i
        assert ds3["num", i].compute() == 2 * i

    ds.delete()
    ds2.delete()
    ds3.delete()

Example #28

Show file

def test_datasetview_get_dictionary():
    """Fetch a nested field through a dataset view and read its members."""
    dataset = Dataset(
        schema=my_schema,
        shape=(20,),
        url="./data/test/datasetview_get_dictionary",
        mode="w",
    )
    # Write two members of the nested "label" field of sample 5.
    dataset["label", 5, "a"] = 5 * np.ones((100, 200))
    dataset["label", 5, "d", "e"] = 3 * np.ones((5, 3))

    # The view starts at sample 2, so its index 3 refers to sample 5.
    view = dataset[2:10]
    label = view[3, "label"]

    assert (label["a"].compute() == 5 * np.ones((100, 200))).all()
    assert (label["d"]["e"].compute() == 3 * np.ones((5, 3))).all()

Example #29

Show file

def test_dataset_with_chunks():
    """Write and read a scalar and an image region using ``my_schema_with_chunks``."""
    dataset = Dataset(
        "./data/test/dataset_with_chunks",
        token=None,
        shape=(10000,),
        mode="w",
        schema=my_schema_with_chunks,
    )

    # Single element of the nested field, addressed by the "label/a" path.
    dataset["label/a", 5, 50, 50] = 8
    assert dataset["label/a", 5, 50, 50].numpy() == 8

    # A 100x150x3 region of the image tensor.
    dataset["image", 5, 4, 100:200, 150:300, :] = np.ones((100, 150, 3), "uint8")
    region = dataset["image", 5, 4, 100:200, 150:300, :].numpy()
    assert (region == np.ones((100, 150, 3), "uint8")).all()

Example #30

Show file

def test_text_dataset():
    """Write Text samples through datasets and views, using both index forms."""
    names_spec = Text(shape=(None,), max_shape=(1000,), dtype="int64")
    dataset = Dataset(
        "./data/test/testing_text",
        mode="w",
        schema={"names": names_spec},
        shape=(10,),
    )
    # Adjacent literals are joined at compile time into one string.
    lorem = (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. "
        "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. "
        "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. "
        "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    )

    # Tuple-style and chained indexing on the dataset itself.
    dataset["names", 4] = lorem + "4"
    assert dataset["names", 4].numpy() == lorem + "4"
    dataset["names"][5] = lorem + "5"
    assert dataset["names"][5].numpy() == lorem + "5"

    # The same two index forms on a view covering samples 7 and 8.
    view = dataset[7:9]
    view["names", 0] = lorem + "7"
    assert view["names", 0].numpy() == lorem + "7"
    view["names"][1] = lorem + "8"
    assert view["names"][1].numpy() == lorem + "8"

    # A Text field declared with shape (4,), written with a slice assignment.
    id_schema = {
        "id": Text(shape=(4,), dtype="int64"),
    }
    id_dataset = Dataset(
        "./data/test/testing_text_2", mode="w", schema=id_schema, shape=(10,)
    )
    id_dataset[0:5, "id"] = ["abcd", "efgh", "ijkl", "mnop", "qrst"]
    assert id_dataset[2:4, "id"].compute() == ["ijkl", "mnop"]