import os

import numpy as np
import pytest

import hub
from hub import Dataset
from hub.schema import ClassLabel, Image, Tensor, Text
# assumption: Primitive and flatten live in hub.schema.features in the hub 1.x layout
from hub.schema.features import Primitive, flatten
from hub.utils import Timer


def create_image(path_to_directory):
    # write ten 512x512 all-ones RGB images into the target directory
    from PIL import Image

    shape = (512, 512, 3)
    for i in range(10):
        img = np.ones(shape, dtype="uint8")
        img = Image.fromarray(img)
        img.save(os.path.join(path_to_directory, str(i) + ".png"))
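# Hedged usage sketch (the output directory name below is hypothetical):
# create_image assumes the directory already exists, so create it first.
out_dir = "./data/generated_images"
os.makedirs(out_dir, exist_ok=True)
create_image(out_dir)  # writes 0.png ... 9.png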
def image_to_hub(tf_dt, max_shape=None):
    # map a TensorFlow feature spec onto a hub Image schema
    dt = tf_dt.dtype.name
    if max_shape and len(max_shape) > len(tf_dt.shape):
        # trim leading dimensions so max_shape matches the feature's rank
        max_shape = max_shape[(len(max_shape) - len(tf_dt.shape)):]
    # fall back to a generous upper bound for any dynamic (None) dimension
    max_shape = max_shape or tuple(
        10000 if dim is None else dim for dim in tf_dt.shape
    )
    return Image(shape=tf_dt.shape, dtype=dt, max_shape=max_shape)
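# Hedged usage sketch for image_to_hub: assumes the incoming feature behaves
# like a tf.TensorSpec, i.e. exposes `.dtype.name` and an iterable `.shape`
# (the real caller may pass a tfds feature object instead).
import tensorflow as tf

spec = tf.TensorSpec(shape=(None, None, 3), dtype=tf.uint8)
img_schema = image_to_hub(spec, max_shape=(100, 100, 3))
# expected: an Image schema with shape (None, None, 3), dtype "uint8",
# and max_shape (100, 100, 3)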
def test_dataset_filter_4():
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    ds = Dataset("./data/tests/filtering_4", shape=(100,), schema=schema, mode="w")
    for i in range(100):
        ds["cl", i] = 0 if i < 10 else 1
        ds["img", i] = i * np.ones((5, 6, 3))
    ds_filtered = ds.filter(lambda x: x["cl"].compute() == 0)
    assert (ds_filtered[3:8, "cl"].compute() == np.zeros((5,))).all()
def test_dataset_batch_write():
    schema = {"image": Image(shape=(None, None, 3), max_shape=(100, 100, 3))}
    ds = Dataset("./data/batch", shape=(10,), mode="w", schema=schema)
    ds["image", 0:4] = 4 * np.ones((4, 67, 65, 3))
    assert (ds["image", 0].numpy() == 4 * np.ones((67, 65, 3))).all()
    assert (ds["image", 1].numpy() == 4 * np.ones((67, 65, 3))).all()
    assert (ds["image", 2].numpy() == 4 * np.ones((67, 65, 3))).all()
    assert (ds["image", 3].numpy() == 4 * np.ones((67, 65, 3))).all()
    ds["image", 5:7] = [2 * np.ones((60, 65, 3)), 3 * np.ones((54, 30, 3))]
    assert (ds["image", 5].numpy() == 2 * np.ones((60, 65, 3))).all()
    assert (ds["image", 6].numpy() == 3 * np.ones((54, 30, 3))).all()
def test_dataset_filter_3():
    schema = {
        "img": Image((None, None, 3), max_shape=(100, 100, 3)),
        "cl": ClassLabel(names=["cat", "dog", "horse"]),
    }
    ds = Dataset("./data/tests/filtering_3", shape=(100,), schema=schema, mode="w")
    for i in range(100):
        ds["cl", i] = 0 if i % 5 == 0 else 1
        ds["img", i] = i * np.ones((5, 6, 3))
    ds["cl", 4] = 2
    ds_filtered = ds.filter(lambda x: x["cl"].compute() == 0)
    assert ds_filtered.indexes == [5 * i for i in range(20)]
    ds_filtered_2 = ds.filter(lambda x: x["cl"].compute() == 2)
    assert (ds_filtered_2["img"].compute() == 4 * np.ones((1, 5, 6, 3))).all()
    for item in ds_filtered_2:
        assert (item["img"].compute() == 4 * np.ones((5, 6, 3))).all()
        assert item["cl"].compute() == 2
def test_commit_checkout_2():
    my_schema = {
        "abc": "uint32",
        "img": Image((1000, 1000, 3), dtype="uint16"),
    }
    ds = hub.Dataset(
        "./data/test_versioning/eg_3", shape=(100,), schema=my_schema, mode="w"
    )
    for i in range(100):
        ds["img", i] = i * np.ones((1000, 1000, 3))
    a = ds.commit("first")

    # chunk 7.0.0.0 gets rewritten
    ds["img", 21] = 2 * ds["img", 21].compute()

    # the rest of the chunk stays intact
    assert (ds["img", 21].compute() == 2 * 21 * np.ones((1000, 1000, 3))).all()
    assert (ds["img", 22].compute() == 22 * np.ones((1000, 1000, 3))).all()
    assert (ds["img", 23].compute() == 23 * np.ones((1000, 1000, 3))).all()

    # other chunks are still read from the original commit; e.g. the chunk
    # holding the 35th sample keeps its single copy
    assert (ds["img", 35].compute() == 35 * np.ones((1000, 1000, 3))).all()
    b = ds.commit("second")

    # going back to the first commit
    ds.checkout(a)
    # sanity check
    assert (ds["img", 21].compute() == 21 * np.ones((1000, 1000, 3))).all()

    ds.checkout("another", create=True)
    ds["img", 21] = 3 * ds["img", 21].compute()
    # 3 * 21, not 6 * 21 as it would have been had we checked out from b
    assert (ds["img", 21].compute() == 3 * 21 * np.ones((1000, 1000, 3))).all()
    ds.commit("first2")

    ds.checkout("master")
    assert (ds["img", 21].compute() == 2 * 21 * np.ones((1000, 1000, 3))).all()
    ds.log()
def test_multiprocessing(sample_size=200, width=100, channels=4, dtype="uint8"):
    my_schema = {
        "image": Image(
            (width, width, channels),
            dtype,
            (width, width, channels),
            chunks=(sample_size // 20),
            compressor="LZ4",
        ),
    }
    with Timer("multiprocessing"):

        @hub.transform(schema=my_schema, scheduler="threaded", workers=4)
        def my_transform(x):
            # burn some CPU so the scheduler has real work to parallelize
            a = np.random.random((width, width, channels))
            for i in range(100):
                a *= np.random.random((width, width, channels))
            return {
                "image": (np.ones((width, width, channels), dtype=dtype) * 255),
            }

        ds = hub.Dataset(
            "./data/test/test_pipeline_basic_4",
            mode="w",
            shape=(sample_size,),
            schema=my_schema,
            cache=2**26,
        )
        ds_t = my_transform(ds).store("./data/test/test_pipeline_basic_4")
    assert (ds_t["image", :].compute() == 255).all()
def test_tensor_flattening():
    t = {
        "image": Image(shape=(300, 400, 3), dtype="uint8"),
        "label": Tensor(
            shape=(5000,),
            dtype="<U20",
        ),
        "gradient": {
            "x": "int32",
            "y": "int32",
        },
    }
    result = tuple(flatten(t))
    paths = [r[1] for r in result]
    dtypes = [r[0] for r in result]
    assert paths == ["/image", "/label", "/gradient/x", "/gradient/y"]
    assert isinstance(dtypes[0], Image)
    assert isinstance(dtypes[1], Tensor)
    assert isinstance(dtypes[2], Primitive)
    assert isinstance(dtypes[3], Primitive)
def test_dynamic_version_control():
    my_schema = {"img": Image((None, None, 3), max_shape=(1000, 1000, 3))}
    ds = hub.Dataset(
        "./data/dynamic_versioning", shape=(10,), schema=my_schema, mode="w"
    )
    for i in range(10):
        ds["img", i] = i * np.ones((100, 100, 3))
    a = ds.commit("first")
    for i in range(10):
        ds["img", i] = 2 * i * np.ones((150, 150, 3))
    ds.checkout(a)
    for i in range(10):
        assert (ds["img", i].compute() == i * np.ones((100, 100, 3))).all()
    ds.checkout("master")
    for i in range(10):
        assert (ds["img", i].compute() == 2 * i * np.ones((150, 150, 3))).all()
def main():
    with Timer("Time"):
        schema = {
            "image": Image(
                (None, None, 4),
                dtype="uint8",
                chunks=(1, 2048, 2048, 4),
                max_shape=(100000, 100000, 4),
            )
        }
        ds = hub.Dataset(
            "./data/examples/big_image", mode="w", schema=schema, shape=(10000,)
        )
        print(ds["image"].shape, ds["image"].dtype)

        # single chunk read/write
        ds["image", 3, 0:2048, 0:2048] = np.ones((2048, 2048, 4), dtype="uint8")
        print(ds._tensors["/image"].get_shape((3,)))
        ds.commit()
def main():
    schema = {
        "image": Image(shape=(None, None), max_shape=(28, 28)),
        "label": ClassLabel(num_classes=10),
    }
    path = "./data/examples/new_api_intro2"
    ds = Dataset(path, shape=(10,), mode="w", schema=schema)
    print(len(ds))
    for i in range(len(ds)):
        with Timer("writing single element"):
            ds["image", i] = np.ones((28, 28), dtype="uint8")
            ds["label", i] = 3

    ds.resize_shape(200)
    print(ds.shape)
    print(ds["label", 100:110].numpy())
    with Timer("Committing"):
        ds.flush()

    ds = Dataset(path)
    print(ds.schema)
    print(ds["image", 0].compute())
def test_dataset_batch_write_2():
    schema = {"image": Image(shape=(None, None, 3), max_shape=(640, 640, 3))}
    ds = Dataset("./data/batch", shape=(100,), mode="w", schema=schema)
    ds["image", 0:14] = [np.ones((640 - i, 640, 3)) for i in range(14)]
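# Hedged follow-up (not part of the original test): one way to verify the
# dynamically shaped writes above, assuming .numpy() on a single sample
# returns an array with that sample's actual shape.
def check_batch_write_2(ds):
    for i in range(14):
        assert ds["image", i].numpy().shape == (640 - i, 640, 3)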
def test_dataset_filter_2():
    my_schema = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
    }
    ds = Dataset("./data/tests/filtering", shape=(100,), schema=my_schema, mode="w")
    for i in range(100):
        ds["fname", i] = "John"
        ds["lname", i] = "Doe"

    for i in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", i] = "Active"

    for i in [15, 31, 25, 75, 3, 6]:
        ds["lname", i] = "loop"

    dsv_combined = ds.filter(
        lambda x: x["fname"].compute() == "Active" and x["lname"].compute() == "loop"
    )
    tsv_combined_fname = dsv_combined["fname"]
    tsv_combined_lname = dsv_combined["lname"]
    for item in dsv_combined:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    for item in tsv_combined_fname:
        assert item.compute() == "Active"
    for item in tsv_combined_lname:
        assert item.compute() == "loop"

    dsv_1 = ds.filter(lambda x: x["fname"].compute() == "Active")
    dsv_2 = dsv_1.filter(lambda x: x["lname"].compute() == "loop")
    for item in dsv_1:
        assert item.compute()["fname"] == "Active"
    tsv_1 = dsv_1["fname"]
    tsv_2 = dsv_2["lname"]
    for item in tsv_1:
        assert item.compute() == "Active"
    for item in tsv_2:
        assert item.compute() == "loop"
    for item in dsv_2:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    assert dsv_combined.indexes == [3, 6, 15, 75]
    assert dsv_1.indexes == [1, 3, 6, 15, 63, 75, 96]
    assert dsv_2.indexes == [3, 6, 15, 75]

    dsv_3 = ds.filter(lambda x: x["lname"].compute() == "loop")
    dsv_4 = dsv_3.filter(lambda x: x["fname"].compute() == "Active")
    for item in dsv_3:
        assert item.compute()["lname"] == "loop"
    for item in dsv_4:
        assert item.compute() == {"fname": "Active", "lname": "loop"}
    assert dsv_3.indexes == [3, 6, 15, 25, 31, 75]
    assert dsv_4.indexes == [3, 6, 15, 75]

    my_schema2 = {
        "fname": Text((None,), max_shape=(10,)),
        "lname": Text((None,), max_shape=(10,)),
        "image": Image((1920, 1080, 3)),
    }
    ds = Dataset("./data/tests/filtering2", shape=(100,), schema=my_schema2, mode="w")
    with pytest.raises(KeyError):
        ds.filter(lambda x: (x["random"].compute() == np.ones((1920, 1080, 3))).all())

    for i in [1, 3, 6, 15, 63, 96, 75]:
        ds["fname", i] = "Active"
    dsv = ds.filter(lambda x: x["fname"].compute() == "Active")
    with pytest.raises(KeyError):
        dsv.filter(
            lambda x: (x["random"].compute() == np.ones((1920, 1080, 3))).all()
        )
import numpy as np

import hub
from hub.schema import Image, ClassLabel
from hub.utils import Timer

schema = {
    "image": Image((28, 28), chunks=(1000, 28, 28)),
    "label": ClassLabel(num_classes=10),
}


def main():
    sample_count = 70000
    step = 10
    with Timer("Time"):
        ds = hub.Dataset(
            "./data/examples/mnist_upload_speed_benchmark",
            mode="w",
            schema=schema,
            shape=(sample_count,),
            cache=2**26,
        )
        arr = (np.random.rand(step, 28, 28) * 100).astype("uint8")
        for i in range(0, sample_count, step):
            # with Timer(f"Sample {i}"):
            ds["image", i:i + step] = arr
import hub
from hub.schema import Image
import numpy as np

my_schema = {
    "abc": "uint32",
    "img": Image((1000, 1000, 3), dtype="uint16"),
}
ds = hub.Dataset(
    "./data/test_versioning/eg_3", shape=(100,), schema=my_schema, mode="w"
)
for i in range(100):
    ds["img", i] = i * np.ones((1000, 1000, 3))
a = ds.commit("first")

# chunk 7.0.0.0 gets rewritten
ds["img", 21] = 2 * ds["img", 21].compute()

# the rest of the chunk stays intact
assert (ds["img", 21].compute() == 2 * 21 * np.ones((1000, 1000, 3))).all()
assert (ds["img", 22].compute() == 22 * np.ones((1000, 1000, 3))).all()
assert (ds["img", 23].compute() == 23 * np.ones((1000, 1000, 3))).all()

# other chunks are still read from the original commit; e.g. the chunk
# holding the 35th sample keeps its single copy
assert (ds["img", 35].compute() == 35 * np.ones((1000, 1000, 3))).all()
b = ds.commit("second")

# going back to the first commit
ds.checkout(a)