Ejemplo n.º 1
0
def test_image_feature_type_to_arrow():
    features = Features({"image": Image()})
    assert features.arrow_schema == pa.schema({"image": Image().pa_type})
    features = Features({"struct_containing_an_image": {"image": Image()}})
    assert features.arrow_schema == pa.schema({"struct_containing_an_image": pa.struct({"image": Image().pa_type})})
    features = Features({"sequence_of_images": Sequence(Image())})
    assert features.arrow_schema == pa.schema({"sequence_of_images": pa.list_(Image().pa_type)})
Ejemplo n.º 2
0
    def test_push_dataset_to_hub_custom_features_image(self):
        image_path = os.path.join(os.path.dirname(__file__), "features",
                                  "data", "test_image_rgb.jpg")
        data = {"x": [image_path, None], "y": [0, -1]}
        features = Features({"x": Image(), "y": Value("int32")})
        ds = Dataset.from_dict(data, features=features)

        for embed_external_files in [True, False]:
            ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
            try:
                ds.push_to_hub(ds_name,
                               embed_external_files=embed_external_files,
                               token=self._token)
                hub_ds = load_dataset(ds_name,
                                      split="train",
                                      download_mode="force_redownload")

                self.assertListEqual(ds.column_names, hub_ds.column_names)
                self.assertListEqual(list(ds.features.keys()),
                                     list(hub_ds.features.keys()))
                self.assertDictEqual(ds.features, hub_ds.features)
                self.assertEqual(ds[:], hub_ds[:])
                hub_ds = hub_ds.cast_column("x", Image(decode=False))
                elem = hub_ds[0]["x"]
                path, bytes_ = elem["path"], elem["bytes"]
                self.assertTrue(bool(path) == (not embed_external_files))
                self.assertTrue(bool(bytes_) == embed_external_files)
            finally:
                self._api.delete_repo(ds_name.split("/")[1],
                                      organization=ds_name.split("/")[0],
                                      token=self._token,
                                      repo_type="dataset")
Ejemplo n.º 3
0
def test_dataset_with_image_feature_with_none():
    data = {"image": [None]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert item["image"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(item is None for item in batch["image"])
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # nested tests

    data = {"images": [[None]]}
    features = Features({"images": Sequence(Image())})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"images"}
    assert all(i is None for i in item["images"])

    data = {"nested": [{"image": None}]}
    features = Features({"nested": {"image": Image()}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"image"}
    assert item["nested"]["image"] is None
Ejemplo n.º 4
0
def test_image_decode_example(shared_datadir):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    image = Image()
    decoded_example = image.decode_example(image_path)

    assert isinstance(decoded_example, PIL.Image.Image)
    assert os.path.samefile(decoded_example.filename, image_path)
    assert decoded_example.size == (640, 480)
    assert decoded_example.mode == "RGB"
Ejemplo n.º 5
0
def test_image_feature_encode_example(shared_datadir, build_example):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    image = Image()
    encoded_example = image.encode_example(build_example(image_path))
    assert isinstance(encoded_example, dict)
    assert encoded_example.keys() == {"bytes", "path"}
    assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
    decoded_example = image.decode_example(encoded_example)
    assert isinstance(decoded_example, PIL.Image.Image)
Ejemplo n.º 6
0
def test_dataset_concatenate_image_features(shared_datadir):
    # we use a different data structure between 1 and 2 to make sure they are compatible with each other
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data1 = {"image": [image_path]}
    dset1 = Dataset.from_dict(data1, features=Features({"image": Image()}))
    data2 = {"image": [{"bytes": open(image_path, "rb").read()}]}
    dset2 = Dataset.from_dict(data2, features=Features({"image": Image()}))
    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert concatenated_dataset[0]["image"] == dset1[0]["image"]
    assert concatenated_dataset[1]["image"] == dset2[0]["image"]
Ejemplo n.º 7
0
def test_dataset_cast_to_image_features(shared_datadir, build_data):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = build_data(image_path)
    dset = Dataset.from_dict(data)
    item = dset.cast(Features({"image": Image()}))[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    item = dset.cast_column("image", Image())[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
Ejemplo n.º 8
0
def test_image_decode_example(shared_datadir):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    image = Image()
    decoded_example = image.decode_example({"path": image_path, "bytes": None})

    assert isinstance(decoded_example, PIL.Image.Image)
    assert os.path.samefile(decoded_example.filename, image_path)
    assert decoded_example.size == (640, 480)
    assert decoded_example.mode == "RGB"

    with pytest.raises(RuntimeError):
        Image(decode=False).decode_example(image_path)
Ejemplo n.º 9
0
def data_files_with_zip_archives(tmp_path, image_file):
    from PIL import Image, ImageOps

    data_dir = tmp_path / "imagefolder_data_dir_with_zip_archives"
    data_dir.mkdir(parents=True, exist_ok=True)
    archive_dir = data_dir / "archive"
    archive_dir.mkdir(parents=True, exist_ok=True)
    subdir = archive_dir / "subdir"
    subdir.mkdir(parents=True, exist_ok=True)

    image_filename = archive_dir / "image_rgb.jpg"
    shutil.copyfile(image_file, image_filename)
    image_filename2 = subdir / "image_rgb2.jpg"  # in subdir
    # make sure they're two different images
    # Indeed we won't be able to compare the image.filename, since the archive is not extracted in streaming mode
    ImageOps.flip(Image.open(image_file)).save(image_filename2)

    image_metadata_filename = archive_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent("""\
        {"file_name": "image_rgb.jpg", "caption": "Nice image"}
        {"file_name": "subdir/image_rgb2.jpg", "caption": "Nice second image"}
        """)
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)

    shutil.make_archive(archive_dir, "zip", archive_dir)
    shutil.rmtree(str(archive_dir))

    data_files_with_zip_archives = DataFilesDict.from_local_or_remote(
        get_patterns_locally(data_dir), data_dir)

    assert len(data_files_with_zip_archives) == 1
    assert len(data_files_with_zip_archives["train"]) == 1
    return data_files_with_zip_archives
Ejemplo n.º 10
0
def test_dataset_with_image_feature_tar_jpg(tar_jpg_path):
    import PIL.Image

    data = {"image": []}
    for file_path, file_obj in iter_archive(tar_jpg_path):
        data["image"].append({"path": file_path, "bytes": file_obj.read()})
        break

    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    assert item["image"].filename == ""
    assert item["image"].format == "JPEG"
    assert item["image"].size == (640, 480)
    assert item["image"].mode == "RGB"
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(item, PIL.Image.Image) for item in batch["image"])
    assert batch["image"][0].filename == ""
    assert batch["image"][0].format == "JPEG"
    assert batch["image"][0].size == (640, 480)
    assert batch["image"][0].mode == "RGB"
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(item, PIL.Image.Image) for item in column)
    assert column[0].filename == ""
    assert column[0].format == "JPEG"
    assert column[0].size == (640, 480)
    assert column[0].mode == "RGB"
Ejemplo n.º 11
0
def test_dataset_with_image_feature_from_np_array():
    import PIL.Image

    image_array = np.arange(640 * 480, dtype=np.uint8).reshape(480, 640)
    data = {"image": [image_array]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    np.testing.assert_array_equal(np.array(item["image"]), image_array)
    assert item["image"].filename == ""
    assert item["image"].format == "PNG"
    assert item["image"].size == (640, 480)
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(item, PIL.Image.Image) for item in batch["image"])
    np.testing.assert_array_equal(np.array(batch["image"][0]), image_array)
    assert batch["image"][0].filename == ""
    assert batch["image"][0].format == "PNG"
    assert batch["image"][0].size == (640, 480)
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(item, PIL.Image.Image) for item in column)
    np.testing.assert_array_equal(np.array(column[0]), image_array)
    assert column[0].filename == ""
    assert column[0].format == "PNG"
    assert column[0].size == (640, 480)
Ejemplo n.º 12
0
def test_dataset_with_image_feature(shared_datadir):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert isinstance(item["image"], PIL.Image.Image)
    assert os.path.samefile(item["image"].filename, image_path)
    assert item["image"].format == "JPEG"
    assert item["image"].size == (640, 480)
    assert item["image"].mode == "RGB"
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(item, PIL.Image.Image) for item in batch["image"])
    assert os.path.samefile(batch["image"][0].filename, image_path)
    assert batch["image"][0].format == "JPEG"
    assert batch["image"][0].size == (640, 480)
    assert batch["image"][0].mode == "RGB"
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(item, PIL.Image.Image) for item in column)
    assert os.path.samefile(column[0].filename, image_path)
    assert column[0].format == "JPEG"
    assert column[0].size == (640, 480)
    assert column[0].mode == "RGB"
Ejemplo n.º 13
0
def test_formatted_dataset_with_image_feature_undecoded(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path]}
    features = Features({"image": Image(decode=False)})
    dset = Dataset.from_dict(data, features=features)
    with dset.formatted_as("numpy"):
        item = dset[0]
        assert item.keys() == {"image"}
        assert item["image"] == {"path": image_path, "bytes": None}
        batch = dset[:1]
        assert batch.keys() == {"image"}
        assert len(batch["image"]) == 1
        assert batch["image"][0] == {"path": image_path, "bytes": None}
        column = dset["image"]
        assert len(column) == 1
        assert column[0] == {"path": image_path, "bytes": None}

    with dset.formatted_as("pandas"):
        item = dset[0]
        assert item.shape == (1, 1)
        assert item.columns == ["image"]
        assert item["image"][0] == {"path": image_path, "bytes": None}
        batch = dset[:1]
        assert batch.shape == (1, 1)
        assert batch.columns == ["image"]
        assert batch["image"][0] == {"path": image_path, "bytes": None}
        column = dset["image"]
        assert len(column) == 1
        assert column[0] == {"path": image_path, "bytes": None}
Ejemplo n.º 14
0
def test_dataset_with_image_feature_map(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path], "caption": ["cats sleeping"]}
    features = Features({"image": Image(), "caption": Value("string")})
    dset = Dataset.from_dict(data, features=features)

    for item in dset:
        assert item.keys() == {"image", "caption"}
        assert item == {
            "image": {
                "path": image_path,
                "bytes": None
            },
            "caption": "cats sleeping"
        }

    # no decoding

    def process_caption(example):
        example["caption"] = "Two " + example["caption"]
        return example

    processed_dset = dset.map(process_caption)
    for item in processed_dset:
        assert item.keys() == {"image", "caption"}
        assert item == {
            "image": {
                "path": image_path,
                "bytes": None
            },
            "caption": "Two cats sleeping"
        }

    # decoding example

    def process_image_by_example(example):
        example["mode"] = example["image"].mode
        return example

    decoded_dset = dset.map(process_image_by_example)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"

    # decoding batch

    def process_image_by_batch(batch):
        batch["mode"] = [image.mode for image in batch["image"]]
        return batch

    decoded_dset = dset.map(process_image_by_batch, batched=True)
    for item in decoded_dset:
        assert item.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(item["image"]["path"], image_path)
        assert item["caption"] == "cats sleeping"
        assert item["mode"] == "RGB"
Ejemplo n.º 15
0
def test_image_instantiation():
    image = Image()
    assert image.id is None
    assert image.dtype == "PIL.Image.Image"
    assert image.pa_type == pa.struct({
        "bytes": pa.binary(),
        "path": pa.string()
    })
    assert image._type == "Image"
Ejemplo n.º 16
0
def test_generate_examples_with_metadata_in_wrong_location(
        image_file, image_file_with_metadata, drop_metadata):
    _, image_metadata_file = image_file_with_metadata
    if not drop_metadata:
        features = Features({"image": Image(), "caption": Value("string")})
    else:
        features = Features({"image": Image()})
    imagefolder = ImageFolder(drop_metadata=drop_metadata, features=features)
    generator = imagefolder._generate_examples(
        [(image_file, image_file)],
        {"train": [(image_metadata_file, image_metadata_file)]}, "train")
    if not drop_metadata:
        with pytest.raises(ValueError):
            list(generator)
    else:
        assert all(
            example.keys() == {"image"} and all(val is not None
                                                for val in example.values())
            for _, example in generator)
Ejemplo n.º 17
0
def test_dataset_with_image_feature_map_undecoded(shared_datadir):
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data = {"image": [image_path]}
    features = Features({"image": Image(decode=False)})
    dset = Dataset.from_dict(data, features=features)

    def assert_image_example_undecoded(example):
        assert example["image"] == {"path": image_path, "bytes": None}

    dset.map(assert_image_example_undecoded)

    def assert_image_batch_undecoded(batch):
        for image in batch["image"]:
            assert image == {"path": image_path, "bytes": None}

    dset.map(assert_image_batch_undecoded, batched=True)
Ejemplo n.º 18
0
def test_image_instantiation():
    image = Image()
    assert image.id is None
    assert image.dtype == "dict"
    assert image.pa_type is None
    assert image._type == "Image"
Ejemplo n.º 19
0
def test_dataset_with_image_feature_map_change_image(shared_datadir):
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    pil_image = Image().decode_example({"path": image_path, "bytes": None})
    data = {"image": [image_path]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)

    for item in dset._iter(decoded=False):
        assert item.keys() == {"image"}
        assert item == {
            "image": {
                "bytes": None,
                "path": image_path,
            }
        }

    # return pil image

    def process_image_resize_by_example(example):
        example["image"] = example["image"].resize((100, 100))
        return example

    decoded_dset = dset.map(process_image_resize_by_example)
    for item in decoded_dset._iter(decoded=False):
        assert item.keys() == {"image"}
        assert item == {"image": {"bytes": image_to_bytes(pil_image.resize((100, 100))), "path": None}}

    def process_image_resize_by_batch(batch):
        batch["image"] = [image.resize((100, 100)) for image in batch["image"]]
        return batch

    decoded_dset = dset.map(process_image_resize_by_batch, batched=True)
    for item in decoded_dset._iter(decoded=False):
        assert item.keys() == {"image"}
        assert item == {"image": {"bytes": image_to_bytes(pil_image.resize((100, 100))), "path": None}}

    # return np.ndarray (e.g. when using albumentations)

    def process_image_resize_by_example_return_np_array(example):
        example["image"] = np.array(example["image"].resize((100, 100)))
        return example

    decoded_dset = dset.map(process_image_resize_by_example_return_np_array)
    for item in decoded_dset._iter(decoded=False):
        assert item.keys() == {"image"}
        assert item == {
            "image": {
                "bytes": image_to_bytes(PIL.Image.fromarray(np.array(pil_image.resize((100, 100))))),
                "path": None,
            }
        }

    def process_image_resize_by_batch_return_np_array(batch):
        batch["image"] = [np.array(image.resize((100, 100))) for image in batch["image"]]
        return batch

    decoded_dset = dset.map(process_image_resize_by_batch_return_np_array, batched=True)
    for item in decoded_dset._iter(decoded=False):
        assert item.keys() == {"image"}
        assert item == {
            "image": {
                "bytes": image_to_bytes(PIL.Image.fromarray(np.array(pil_image.resize((100, 100))))),
                "path": None,
            }
        }