def test_image_feature_type_to_arrow():
    """Check the Arrow schema produced by ``Features`` that contain ``Image``.

    Three layouts are compared against a hand-built ``pa.schema``: a
    top-level image column, an image nested inside a struct, and a
    sequence of images.
    """
    cases = [
        (
            {"image": Image()},
            {"image": Image().pa_type},
        ),
        (
            {"struct_containing_an_image": {"image": Image()}},
            {"struct_containing_an_image": pa.struct({"image": Image().pa_type})},
        ),
        (
            {"sequence_of_images": Sequence(Image())},
            {"sequence_of_images": pa.list_(Image().pa_type)},
        ),
    ]
    for feature_spec, expected_fields in cases:
        assert Features(feature_spec).arrow_schema == pa.schema(expected_fields)
def test_push_dataset_to_hub_custom_features_image(self):
    """Round-trip a dataset with an ``Image`` column through ``push_to_hub``.

    For ``embed_external_files`` set to ``True`` and then ``False``, the
    dataset is pushed, reloaded with ``load_dataset`` and compared with the
    local one. The reloaded image column is then cast to
    ``Image(decode=False)`` to check which of ``path`` / ``bytes`` is
    populated. The repository is deleted in ``finally`` on every iteration.
    """
    fixture_dir = os.path.join(os.path.dirname(__file__), "features", "data")
    image_path = os.path.join(fixture_dir, "test_image_rgb.jpg")
    features = Features({"x": Image(), "y": Value("int32")})
    ds = Dataset.from_dict({"x": [image_path, None], "y": [0, -1]}, features=features)

    for embed_external_files in (True, False):
        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        name_parts = ds_name.split("/")
        try:
            ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
            hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

            self.assertListEqual(ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
            self.assertDictEqual(ds.features, hub_ds.features)
            self.assertEqual(ds[:], hub_ds[:])

            # Inspect the raw storage of the first image without decoding it.
            hub_ds = hub_ds.cast_column("x", Image(decode=False))
            first_image = hub_ds[0]["x"]
            has_path = bool(first_image["path"])
            has_bytes = bool(first_image["bytes"])
            self.assertTrue(has_path == (not embed_external_files))
            self.assertTrue(has_bytes == embed_external_files)
        finally:
            self._api.delete_repo(
                name_parts[1],
                organization=name_parts[0],
                token=self._token,
                repo_type="dataset",
            )
def test_dataset_with_image_feature_with_none():
    """Check that ``None`` stored in an ``Image`` column is read back as ``None``.

    Row, batch and column access are checked on a flat column, followed by
    ``None`` nested in a sequence of images and in a struct.
    """
    dset = Dataset.from_dict({"image": [None]}, features=Features({"image": Image()}))

    row = dset[0]
    assert row.keys() == {"image"}
    assert row["image"] is None

    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(value is None for value in batch["image"])

    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(value is None for value in column)

    # nested tests
    sequence_dset = Dataset.from_dict(
        {"images": [[None]]},
        features=Features({"images": Sequence(Image())}),
    )
    row = sequence_dset[0]
    assert row.keys() == {"images"}
    assert all(value is None for value in row["images"])

    struct_dset = Dataset.from_dict(
        {"nested": [{"image": None}]},
        features=Features({"nested": {"image": Image()}}),
    )
    row = struct_dset[0]
    assert row.keys() == {"nested"}
    assert row["nested"].keys() == {"image"}
    assert row["nested"]["image"] is None
def test_image_decode_example(shared_datadir):
    """Decode an image from a plain path string and check the resulting PIL image.

    NOTE(review): a second ``test_image_decode_example`` is defined later in
    this file; if both live in the same module the later definition shadows
    this one - confirm which is intended.
    """
    import PIL.Image

    fixture_path = str(shared_datadir / "test_image_rgb.jpg")
    decoded = Image().decode_example(fixture_path)

    assert isinstance(decoded, PIL.Image.Image)
    assert os.path.samefile(decoded.filename, fixture_path)
    assert decoded.size == (640, 480)
    assert decoded.mode == "RGB"
def test_image_feature_encode_example(shared_datadir, build_example):
    """Encode an example with ``Image.encode_example`` and decode it back.

    ``build_example`` receives the fixture image path and returns the value
    handed to ``encode_example``. The encoded form must be a dict with
    exactly the keys ``bytes`` and ``path``, at least one of them set.
    """
    import PIL.Image

    source_path = str(shared_datadir / "test_image_rgb.jpg")
    feature = Image()

    encoded = feature.encode_example(build_example(source_path))
    assert isinstance(encoded, dict)
    assert encoded.keys() == {"bytes", "path"}
    # At least one of the two storage fields has to be populated.
    assert not (encoded["bytes"] is None and encoded["path"] is None)

    decoded = feature.decode_example(encoded)
    assert isinstance(decoded, PIL.Image.Image)
def test_dataset_concatenate_image_features(shared_datadir):
    """Concatenate two image datasets built from different input representations.

    The first dataset stores the image as a path string and the second as a
    ``{"bytes": ...}`` dict. The concatenated dataset must have the combined
    length and yield the same images as the two sources.
    """
    # we use a different data structure between 1 and 2 to make sure they are compatible with each other
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    data1 = {"image": [image_path]}
    dset1 = Dataset.from_dict(data1, features=Features({"image": Image()}))

    # Read the bytes inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()
    data2 = {"image": [{"bytes": image_bytes}]}
    dset2 = Dataset.from_dict(data2, features=Features({"image": Image()}))

    concatenated_dataset = concatenate_datasets([dset1, dset2])
    assert len(concatenated_dataset) == len(dset1) + len(dset2)
    assert concatenated_dataset[0]["image"] == dset1[0]["image"]
    assert concatenated_dataset[1]["image"] == dset2[0]["image"]
def test_dataset_cast_to_image_features(shared_datadir, build_data):
    """Cast an untyped dataset to ``Image`` features and check rows decode to PIL.

    ``build_data`` receives the fixture image path and returns the dict passed
    to ``Dataset.from_dict``. Both ``Dataset.cast`` and
    ``Dataset.cast_column`` are exercised.
    """
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    dset = Dataset.from_dict(build_data(image_path))

    cast_variants = (
        lambda d: d.cast(Features({"image": Image()})),
        lambda d: d.cast_column("image", Image()),
    )
    for cast in cast_variants:
        row = cast(dset)[0]
        assert row.keys() == {"image"}
        assert isinstance(row["image"], PIL.Image.Image)
def test_image_decode_example(shared_datadir):
    """Decode an image from a ``{"path", "bytes"}`` dict and check the PIL image.

    Also checks that calling ``decode_example`` on ``Image(decode=False)``
    raises ``RuntimeError``.

    NOTE(review): an earlier ``test_image_decode_example`` is defined in this
    file; if both live in the same module this definition shadows it -
    confirm which is intended.
    """
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    encoded = {"path": image_path, "bytes": None}

    decoded = Image().decode_example(encoded)
    assert isinstance(decoded, PIL.Image.Image)
    assert os.path.samefile(decoded.filename, image_path)
    assert decoded.size == (640, 480)
    assert decoded.mode == "RGB"

    with pytest.raises(RuntimeError):
        Image(decode=False).decode_example(image_path)
def data_files_with_zip_archives(tmp_path, image_file):
    """Build a ``DataFilesDict`` that points at one zip archive of image data.

    The archive holds ``image_file`` copied as ``image_rgb.jpg``, a
    vertically flipped copy under ``subdir/image_rgb2.jpg``, and a
    ``metadata.jsonl`` with a caption for each. The unpacked directory is
    removed after zipping so only the archive remains under the data dir.

    Presumably registered as a pytest fixture (the decorator is not visible
    here) - TODO confirm.

    Args:
        tmp_path: Directory under which the data dir is created; used as a
            ``pathlib``-style path (``/`` operator, ``mkdir``).
        image_file: Path to the source image.

    Returns:
        The ``DataFilesDict`` resolved from the data dir, holding a single
        file under the ``"train"`` key.
    """
    from PIL import Image, ImageOps

    data_dir = tmp_path / "imagefolder_data_dir_with_zip_archives"
    data_dir.mkdir(parents=True, exist_ok=True)
    archive_dir = data_dir / "archive"
    archive_dir.mkdir(parents=True, exist_ok=True)
    subdir = archive_dir / "subdir"
    subdir.mkdir(parents=True, exist_ok=True)

    image_filename = archive_dir / "image_rgb.jpg"
    shutil.copyfile(image_file, image_filename)
    image_filename2 = subdir / "image_rgb2.jpg"  # in subdir
    # make sure they're two different images
    # Indeed we won't be able to compare the image.filename, since the archive is not extracted in streaming mode
    # Open the source inside a context manager so its file handle is closed
    # as soon as the flipped copy has been written.
    with Image.open(image_file) as source_image:
        ImageOps.flip(source_image).save(image_filename2)

    image_metadata_filename = archive_dir / "metadata.jsonl"
    image_metadata = textwrap.dedent(
        """\
        {"file_name": "image_rgb.jpg", "caption": "Nice image"}
        {"file_name": "subdir/image_rgb2.jpg", "caption": "Nice second image"}
        """
    )
    with open(image_metadata_filename, "w", encoding="utf-8") as f:
        f.write(image_metadata)

    # Zip the directory, then drop the unpacked copy.
    shutil.make_archive(archive_dir, "zip", archive_dir)
    shutil.rmtree(str(archive_dir))

    data_files = DataFilesDict.from_local_or_remote(get_patterns_locally(data_dir), data_dir)

    assert len(data_files) == 1
    assert len(data_files["train"]) == 1
    return data_files
def test_dataset_with_image_feature_tar_jpg(tar_jpg_path):
    """Load the first member of a tar archive as image bytes and check decoding.

    The decoded image is checked through row, batch and column access. It is
    expected to have an empty ``filename`` and to be a 640x480 RGB JPEG.
    """
    import PIL.Image

    def check_decoded(img):
        assert img.filename == ""
        assert img.format == "JPEG"
        assert img.size == (640, 480)
        assert img.mode == "RGB"

    data = {"image": []}
    # Only the first archive member is needed.
    for member_path, member_file in iter_archive(tar_jpg_path):
        data["image"].append({"path": member_path, "bytes": member_file.read()})
        break

    dset = Dataset.from_dict(data, features=Features({"image": Image()}))

    row = dset[0]
    assert row.keys() == {"image"}
    assert isinstance(row["image"], PIL.Image.Image)
    check_decoded(row["image"])

    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(img, PIL.Image.Image) for img in batch["image"])
    check_decoded(batch["image"][0])

    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(img, PIL.Image.Image) for img in column)
    check_decoded(column[0])
def test_dataset_with_image_feature_from_np_array():
    """Build an image dataset from a NumPy array and check decoding.

    The decoded image is checked through row, batch and column access. Its
    pixels must equal the source array, its ``filename`` must be empty, and
    it must be a 640x480 PNG.
    """
    import PIL.Image

    image_array = np.arange(640 * 480, dtype=np.uint8).reshape(480, 640)

    def check_decoded(img):
        np.testing.assert_array_equal(np.array(img), image_array)
        assert img.filename == ""
        assert img.format == "PNG"
        assert img.size == (640, 480)

    dset = Dataset.from_dict({"image": [image_array]}, features=Features({"image": Image()}))

    row = dset[0]
    assert row.keys() == {"image"}
    assert isinstance(row["image"], PIL.Image.Image)
    check_decoded(row["image"])

    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(img, PIL.Image.Image) for img in batch["image"])
    check_decoded(batch["image"][0])

    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(img, PIL.Image.Image) for img in column)
    check_decoded(column[0])
def test_dataset_with_image_feature(shared_datadir):
    """Build an image dataset from a file path and check decoding.

    The decoded image is checked through row, batch and column access. Its
    ``filename`` must refer to the source file, and it must be a 640x480 RGB
    JPEG.
    """
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")

    def check_decoded(img):
        assert os.path.samefile(img.filename, image_path)
        assert img.format == "JPEG"
        assert img.size == (640, 480)
        assert img.mode == "RGB"

    dset = Dataset.from_dict({"image": [image_path]}, features=Features({"image": Image()}))

    row = dset[0]
    assert row.keys() == {"image"}
    assert isinstance(row["image"], PIL.Image.Image)
    check_decoded(row["image"])

    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(isinstance(img, PIL.Image.Image) for img in batch["image"])
    check_decoded(batch["image"][0])

    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(isinstance(img, PIL.Image.Image) for img in column)
    check_decoded(column[0])
def test_formatted_dataset_with_image_feature_undecoded(shared_datadir):
    """Check ``Image(decode=False)`` values under the numpy and pandas formats.

    In both formats, row, batch and column access must return the raw
    ``{"path", "bytes"}`` dict with ``bytes`` set to ``None``.
    """
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    undecoded = {"path": image_path, "bytes": None}
    dset = Dataset.from_dict({"image": [image_path]}, features=Features({"image": Image(decode=False)}))

    with dset.formatted_as("numpy"):
        row = dset[0]
        assert row.keys() == {"image"}
        assert row["image"] == undecoded

        batch = dset[:1]
        assert batch.keys() == {"image"}
        assert len(batch["image"]) == 1
        assert batch["image"][0] == undecoded

        column = dset["image"]
        assert len(column) == 1
        assert column[0] == undecoded

    with dset.formatted_as("pandas"):
        row = dset[0]
        assert row.shape == (1, 1)
        assert row.columns == ["image"]
        assert row["image"][0] == undecoded

        batch = dset[:1]
        assert batch.shape == (1, 1)
        assert batch.columns == ["image"]
        assert batch["image"][0] == undecoded

        column = dset["image"]
        assert len(column) == 1
        assert column[0] == undecoded
def test_dataset_with_image_feature_map(shared_datadir):
    """Exercise ``Dataset.map`` on a dataset with an image and a caption column.

    Covers a map function that only touches the caption, and per-example and
    batched map functions that read ``.mode`` from the image values.
    """
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    stored_image = {"path": image_path, "bytes": None}
    features = Features({"image": Image(), "caption": Value("string")})
    dset = Dataset.from_dict({"image": [image_path], "caption": ["cats sleeping"]}, features=features)

    for row in dset:
        assert row.keys() == {"image", "caption"}
        assert row == {"image": stored_image, "caption": "cats sleeping"}

    # no decoding

    def process_caption(example):
        example["caption"] = "Two " + example["caption"]
        return example

    captioned = dset.map(process_caption)
    for row in captioned:
        assert row.keys() == {"image", "caption"}
        assert row == {"image": stored_image, "caption": "Two cats sleeping"}

    # decoding example

    def process_image_by_example(example):
        example["mode"] = example["image"].mode
        return example

    by_example = dset.map(process_image_by_example)
    for row in by_example:
        assert row.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(row["image"]["path"], image_path)
        assert row["caption"] == "cats sleeping"
        assert row["mode"] == "RGB"

    # decoding batch

    def process_image_by_batch(batch):
        batch["mode"] = [img.mode for img in batch["image"]]
        return batch

    by_batch = dset.map(process_image_by_batch, batched=True)
    for row in by_batch:
        assert row.keys() == {"image", "caption", "mode"}
        assert os.path.samefile(row["image"]["path"], image_path)
        assert row["caption"] == "cats sleeping"
        assert row["mode"] == "RGB"
def test_image_instantiation():
    """Check the default attributes of a freshly constructed ``Image`` feature.

    NOTE(review): a second ``test_image_instantiation`` with conflicting
    expectations (``dtype == "dict"``, ``pa_type is None``) is defined later
    in this file; if both live in the same module the later definition
    shadows this one - confirm which is intended.
    """
    feature = Image()
    expected_storage = pa.struct({"bytes": pa.binary(), "path": pa.string()})

    assert feature.id is None
    assert feature.dtype == "PIL.Image.Image"
    assert feature.pa_type == expected_storage
    assert feature._type == "Image"
def test_generate_examples_with_metadata_in_wrong_location(image_file, image_file_with_metadata, drop_metadata):
    """Run ``ImageFolder._generate_examples`` with the metadata file passed under ``"train"``.

    When metadata is kept, consuming the generator must raise ``ValueError``.
    When ``drop_metadata`` is truthy, every example must contain only a
    non-``None`` ``image`` value.
    """
    _, image_metadata_file = image_file_with_metadata

    feature_spec = {"image": Image()}
    if not drop_metadata:
        feature_spec["caption"] = Value("string")
    features = Features(feature_spec)

    imagefolder = ImageFolder(drop_metadata=drop_metadata, features=features)
    generator = imagefolder._generate_examples(
        [(image_file, image_file)],
        {"train": [(image_metadata_file, image_metadata_file)]},
        "train",
    )

    if drop_metadata:
        for _, example in generator:
            assert example.keys() == {"image"} and all(val is not None for val in example.values())
    else:
        with pytest.raises(ValueError):
            list(generator)
def test_dataset_with_image_feature_map_undecoded(shared_datadir):
    """Check that ``map`` passes raw dicts when the column is ``Image(decode=False)``.

    Both the per-example and the batched map function assert that each image
    value is the ``{"path", "bytes"}`` dict with ``bytes`` set to ``None``.
    """
    image_path = str(shared_datadir / "test_image_rgb.jpg")
    undecoded = {"path": image_path, "bytes": None}
    dset = Dataset.from_dict({"image": [image_path]}, features=Features({"image": Image(decode=False)}))

    def assert_image_example_undecoded(example):
        assert example["image"] == undecoded

    def assert_image_batch_undecoded(batch):
        for stored in batch["image"]:
            assert stored == undecoded

    dset.map(assert_image_example_undecoded)
    dset.map(assert_image_batch_undecoded, batched=True)
def test_image_instantiation():
    """Check the default attributes of a freshly constructed ``Image`` feature.

    NOTE(review): an earlier ``test_image_instantiation`` with conflicting
    expectations (``dtype == "PIL.Image.Image"``, struct ``pa_type``) is
    defined in this file; if both live in the same module this definition
    shadows it - confirm which is intended.
    """
    feature = Image()

    assert feature.id is None
    assert feature.dtype == "dict"
    assert feature.pa_type is None
    assert feature._type == "Image"
def test_dataset_with_image_feature_map_change_image(shared_datadir):
    """Check how ``map`` stores images that the map function replaces.

    The map functions return either a resized PIL image or a NumPy array,
    per example and per batch. In each case the undecoded rows must hold the
    expected encoded bytes and a ``None`` path.
    """
    import PIL.Image

    image_path = str(shared_datadir / "test_image_rgb.jpg")
    pil_image = Image().decode_example({"path": image_path, "bytes": None})
    dset = Dataset.from_dict({"image": [image_path]}, features=Features({"image": Image()}))

    def check_undecoded_rows(dataset, expected_image):
        for row in dataset._iter(decoded=False):
            assert row.keys() == {"image"}
            assert row == {"image": expected_image}

    check_undecoded_rows(dset, {"bytes": None, "path": image_path})

    # return pil image

    def process_image_resize_by_example(example):
        example["image"] = example["image"].resize((100, 100))
        return example

    def process_image_resize_by_batch(batch):
        batch["image"] = [img.resize((100, 100)) for img in batch["image"]]
        return batch

    expected_from_pil = {"bytes": image_to_bytes(pil_image.resize((100, 100))), "path": None}

    decoded_dset = dset.map(process_image_resize_by_example)
    check_undecoded_rows(decoded_dset, expected_from_pil)

    decoded_dset = dset.map(process_image_resize_by_batch, batched=True)
    check_undecoded_rows(decoded_dset, expected_from_pil)

    # return np.ndarray (e.g. when using albumentations)

    def process_image_resize_by_example_return_np_array(example):
        example["image"] = np.array(example["image"].resize((100, 100)))
        return example

    def process_image_resize_by_batch_return_np_array(batch):
        batch["image"] = [np.array(img.resize((100, 100))) for img in batch["image"]]
        return batch

    expected_from_array = {
        "bytes": image_to_bytes(PIL.Image.fromarray(np.array(pil_image.resize((100, 100))))),
        "path": None,
    }

    decoded_dset = dset.map(process_image_resize_by_example_return_np_array)
    check_undecoded_rows(decoded_dset, expected_from_array)

    decoded_dset = dset.map(process_image_resize_by_batch_return_np_array, batched=True)
    check_undecoded_rows(decoded_dset, expected_from_array)