def test_imagefolder_drop_labels(image_file, drop_labels):
    """Check that ``drop_labels`` decides whether examples carry a ``label`` key.

    Args:
        image_file: Fixture value used as both elements of the single file
            pair handed to ``_generate_examples`` (presumably an image path).
        drop_labels: Value forwarded to ``ImageFolder(drop_labels=...)``.
    """
    imagefolder = ImageFolder(drop_labels=drop_labels)
    generator = imagefolder._generate_examples([(image_file, image_file)])
    # Materialize the examples first: ``all()`` over an empty iterable is
    # True, so checking the generator directly would pass even if nothing
    # had been generated at all.
    examples = [example for _, example in generator]
    assert examples, "no examples were generated"
    expected_keys = {"image"} if drop_labels else {"image", "label"}
    assert all(example.keys() == expected_keys for example in examples)
Example #2
0
def test_generate_examples_drop_labels(image_file, drop_labels):
    """Check the keys and values of generated examples for ``drop_labels``.

    Without ``drop_labels`` every example must have exactly the keys
    ``image`` and ``label``; with it, only ``image``. In both cases no value
    may be ``None``.

    Args:
        image_file: Fixture value used as both elements of the single file
            pair handed to ``_generate_examples`` (presumably an image path).
        drop_labels: Value forwarded to ``ImageFolder(drop_labels=...)``.
    """
    imagefolder = ImageFolder(drop_labels=drop_labels)
    generator = imagefolder._generate_examples([(image_file, image_file)],
                                               None, "train")
    # Materialize the examples first: ``all()`` over an empty iterable is
    # True, so checking the generator directly would pass even if nothing
    # had been generated at all.
    examples = [example for _, example in generator]
    assert examples, "no examples were generated"
    expected_keys = {"image"} if drop_labels else {"image", "label"}
    assert all(example.keys() == expected_keys for example in examples)
    assert all(val is not None
               for example in examples
               for val in example.values())
Example #3
0
def test_generate_examples_with_metadata_in_wrong_location(
        image_file, image_file_with_metadata, drop_metadata):
    """Check example generation when the metadata file does not belong to the image.

    The image comes from the ``image_file`` fixture while the metadata file
    comes from the separate ``image_file_with_metadata`` fixture (presumably
    stored in a different location than ``image_file``).

    * ``drop_metadata`` falsy: consuming the generator must raise
      ``ValueError``.
    * ``drop_metadata`` truthy: examples contain only a non-``None``
      ``image``.

    Args:
        image_file: Fixture value used as both elements of the file pair.
        image_file_with_metadata: Fixture yielding a pair whose second
            element is the metadata file.
        drop_metadata: Value forwarded to ``ImageFolder(drop_metadata=...)``.
    """
    _, image_metadata_file = image_file_with_metadata
    if not drop_metadata:
        features = Features({"image": Image(), "caption": Value("string")})
    else:
        features = Features({"image": Image()})
    imagefolder = ImageFolder(drop_metadata=drop_metadata, features=features)
    generator = imagefolder._generate_examples(
        [(image_file, image_file)],
        {"train": [(image_metadata_file, image_metadata_file)]}, "train")
    if not drop_metadata:
        # The generator is lazy, so the error only surfaces on consumption.
        with pytest.raises(ValueError):
            list(generator)
        return

    # Materialize the examples first: ``all()`` over an empty iterable is
    # True, so checking the generator directly would pass even if nothing
    # had been generated at all.
    examples = [example for _, example in generator]
    assert examples, "no examples were generated"
    assert all(example.keys() == {"image"} for example in examples)
    assert all(val is not None
               for example in examples
               for val in example.values())
Example #4
0
def test_data_files_with_wrong_image_file_name_column_in_metadata_file(
        cache_dir, tmp_path, image_file):
    """Preparing a dataset whose metadata lacks ``file_name`` must raise ValueError.

    A directory is built with one image and a ``metadata.jsonl`` whose only
    record uses the key ``bad_file_name`` instead of ``file_name``; the
    resulting error message must mention that ``file_name`` must be present.
    """
    bad_dir = tmp_path / "data_dir_with_bad_metadata"
    bad_dir.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(image_file, bad_dir / "image_rgb.jpg")

    # Metadata record with the wrong column name ("bad_file_name").
    metadata_text = textwrap.dedent(
        """\
        {"bad_file_name": "image_rgb.jpg", "caption": "Nice image"}
        """)
    metadata_path = bad_dir / "metadata.jsonl"
    metadata_path.write_text(metadata_text, encoding="utf-8")

    patterns = get_patterns_locally(bad_dir)
    data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(
        patterns, bad_dir)
    builder = ImageFolder(data_files=data_files_with_bad_metadata,
                          cache_dir=cache_dir)
    with pytest.raises(ValueError) as exc_info:
        builder.download_and_prepare()
    error_message = str(exc_info.value)
    assert "`file_name` must be present" in error_message
Example #5
0
def test_data_files_with_wrong_metadata_file_name(cache_dir, tmp_path,
                                                  image_file):
    """A metadata file named ``bad_metadata.jsonl`` must not add a ``caption`` column.

    A directory is built with one image and a well-formed metadata record
    stored under the wrong file name; after preparing the dataset, the
    ``train`` split must not expose a ``caption`` column.
    """
    bad_dir = tmp_path / "data_dir_with_bad_metadata"
    bad_dir.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(image_file, bad_dir / "image_rgb.jpg")

    # Valid record content, but written to a wrongly named metadata file.
    metadata_text = textwrap.dedent("""\
        {"file_name": "image_rgb.jpg", "caption": "Nice image"}
        """)
    metadata_path = bad_dir / "bad_metadata.jsonl"
    metadata_path.write_text(metadata_text, encoding="utf-8")

    patterns = get_patterns_locally(bad_dir)
    data_files_with_bad_metadata = DataFilesDict.from_local_or_remote(
        patterns, bad_dir)
    builder = ImageFolder(data_files=data_files_with_bad_metadata,
                          cache_dir=cache_dir)
    builder.download_and_prepare()
    train_split = builder.as_dataset(split="train")
    # The metadata file has the wrong name, so no metadata column is expected.
    assert "caption" not in train_split.column_names
Example #6
0
def test_data_files_with_metadata_and_splits(
        streaming, cache_dir, n_splits, data_files_with_one_split_and_metadata,
        data_files_with_two_splits_and_metadata):
    """Every split must expose one distinct image and caption per image file.

    The one-split fixture is used when ``n_splits == 1`` and the two-split
    fixture otherwise. The dataset is read in streaming or regular mode
    depending on ``streaming``.
    """
    if n_splits == 1:
        data_files = data_files_with_one_split_and_metadata
    else:
        data_files = data_files_with_two_splits_and_metadata

    builder = ImageFolder(data_files=data_files, cache_dir=cache_dir)
    builder.download_and_prepare()
    if streaming:
        datasets = builder.as_streaming_dataset()
    else:
        datasets = builder.as_dataset()

    for split, split_files in data_files.items():
        # One of the split's files is the metadata file, not an image.
        expected_num_of_images = len(split_files) - 1
        assert split in datasets
        samples = list(datasets[split])
        assert len(samples) == expected_num_of_images
        # Each sample must have its own image and its own metadata.
        image_filenames = {sample["image"].filename for sample in samples}
        assert len(image_filenames) == expected_num_of_images
        captions = {sample["caption"] for sample in samples}
        assert len(captions) == expected_num_of_images
        assert all(sample["caption"] is not None for sample in samples)
Example #7
0
def test_data_files_with_metadata_and_archives(streaming, cache_dir,
                                               data_files_with_zip_archives):
    """Every split built from zip archives must expose distinct images and captions.

    Two images are expected per archive; the metadata file lives inside the
    archive, so each entry of a split's data files counts as an archive.
    """
    if streaming:
        extend_module_for_streaming(ImageFolder.__module__)

    builder = ImageFolder(data_files=data_files_with_zip_archives,
                          cache_dir=cache_dir)
    builder.download_and_prepare()
    if streaming:
        datasets = builder.as_streaming_dataset()
    else:
        datasets = builder.as_dataset()

    for split, archives in data_files_with_zip_archives.items():
        # The metadata file is inside the archive, so nothing is subtracted.
        num_of_archives = len(archives)
        expected_num_of_images = 2 * num_of_archives
        assert split in datasets
        samples = list(datasets[split])
        assert len(samples) == expected_num_of_images
        # Each sample must have its own image (told apart by the value at
        # index [0, 0, 0] of the image array) and its own metadata.
        first_values = {
            np.array(sample["image"])[0, 0, 0]
            for sample in samples
        }
        assert len(first_values) == expected_num_of_images
        captions = {sample["caption"] for sample in samples}
        assert len(captions) == expected_num_of_images
        assert all(sample["caption"] is not None for sample in samples)