def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "caltech101" images = root / "101_ObjectCategories" annotations = root / "Annotations" categories = (("Faces", "Faces_2"), ("helicopter", "helicopter"), ("ying_yang", "ying_yang")) num_images_per_category = 2 for image_category, annotation_category in categories: datasets_utils.create_image_folder( root=images, name=image_category, file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg", num_examples=num_images_per_category, ) self._create_annotation_folder( root=annotations, name=annotation_category, file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat", num_examples=num_images_per_category, ) # This is included in the original archive, but is removed by the dataset. Thus, an empty directory suffices. os.makedirs(images / "BACKGROUND_Google") return num_images_per_category * len(categories)
def country211(info, root, config):
    split_name_mapper = {
        "train": "train",
        "val": "valid",
        "test": "test",
    }
    split_folder = pathlib.Path(root, "country211", split_name_mapper[config["split"]])
    split_folder.mkdir(parents=True, exist_ok=True)

    num_examples = {
        "train": 3,
        "val": 4,
        "test": 5,
    }[config["split"]]

    classes = ("AD", "BS", "GR")
    for cls in classes:
        create_image_folder(
            split_folder,
            name=cls,
            file_name_fn=lambda idx: f"{idx}.jpg",
            num_examples=num_examples,
        )

    make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz")

    return num_examples * len(classes)
def inject_fake_data(self, tmpdir, config):
    year, is_test_set = (
        ("2007", True)
        if config["year"] == "2007-test" or config["image_set"] == "test"
        else (config["year"], False)
    )
    image_set = config["image_set"]

    base_dir = pathlib.Path(tmpdir)
    if year == "2011":
        base_dir /= "TrainVal"
    base_dir = base_dir / "VOCdevkit" / f"VOC{year}"
    os.makedirs(base_dir)

    num_images, num_images_per_image_set = self._create_image_set_files(base_dir, "ImageSets", is_test_set)

    datasets_utils.create_image_folder(base_dir, "JPEGImages", lambda idx: f"{idx:06d}.jpg", num_images)
    datasets_utils.create_image_folder(base_dir, "SegmentationClass", lambda idx: f"{idx:06d}.png", num_images)
    annotation = self._create_annotation_files(base_dir, "Annotations", num_images)

    return dict(num_examples=num_images_per_image_set[image_set], annotation=annotation)
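
# `_create_annotation_files` is referenced above but not shown. Pascal VOC detection
# annotations are per-image XML files, so a minimal sketch of producing one parseable
# file (helper name and field choices are assumptions, not the suite's actual helper):
def _create_voc_annotation_file_sketch(anns_dir, idx):
    import xml.etree.ElementTree as ET

    annotation = ET.Element("annotation")
    obj = ET.SubElement(annotation, "object")
    ET.SubElement(obj, "name").text = "dog"
    bndbox = ET.SubElement(obj, "bndbox")
    # A fixed dummy box; the dataset only needs well-formed XML, not meaningful geometry.
    for tag, value in (("xmin", "1"), ("ymin", "1"), ("xmax", "2"), ("ymax", "2")):
        ET.SubElement(bndbox, tag).text = value
    ET.ElementTree(annotation).write(str(pathlib.Path(anns_dir) / f"{idx:06d}.xml"))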
def imagenet(info, root, config):
    from scipy.io import savemat

    categories = info.categories
    wnids = [info.extra.category_to_wnid[category] for category in categories]

    if config.split == "train":
        num_samples = len(wnids)
        archive_name = "ILSVRC2012_img_train.tar"

        files = []
        for wnid in wnids:
            create_image_folder(
                root=root,
                name=wnid,
                file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
                num_examples=1,
            )
            files.append(make_tar(root, f"{wnid}.tar"))
    elif config.split == "val":
        num_samples = 3
        archive_name = "ILSVRC2012_img_val.tar"
        files = [create_image_file(root, f"ILSVRC2012_val_{idx + 1:08d}.JPEG") for idx in range(num_samples)]

        devkit_root = root / "ILSVRC2012_devkit_t12"
        data_root = devkit_root / "data"
        data_root.mkdir(parents=True)

        with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
            for label in torch.randint(0, len(wnids), (num_samples,)).tolist():
                file.write(f"{label}\n")

        # The devkit parser only keeps leaf synsets (num_children == 0), so pad the
        # meta file with a few non-leaf entries that the dataset is expected to skip.
        num_children = 0
        synsets = [
            (idx, wnid, category, "", num_children, [], 0, 0)
            for idx, (category, wnid) in enumerate(zip(categories, wnids), 1)
        ]
        num_children = 1
        synsets.extend((0, "", "", "", num_children, [], 0, 0) for _ in range(5))
        savemat(data_root / "meta.mat", dict(synsets=synsets))

        make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz")
    else:  # config.split == "test"
        num_samples = 5
        archive_name = "ILSVRC2012_img_test_v10102019.tar"
        files = [create_image_file(root, f"ILSVRC2012_test_{idx + 1:08d}.JPEG") for idx in range(num_samples)]

    make_tar(root, archive_name, *files)

    return num_samples
def generate(self, root):
    classification_anns_meta = (
        dict(cls="Abyssinian", label=0, species="cat"),
        dict(cls="Keeshond", label=18, species="dog"),
        dict(cls="Yorkshire Terrier", label=36, species="dog"),
    )
    split_and_classification_anns = [
        self._meta_to_split_and_classification_ann(meta, idx)
        for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10))
    ]
    image_ids, *_ = zip(*split_and_classification_anns)

    image_files = create_image_folder(
        root, "images", file_name_fn=lambda idx: f"{image_ids[idx]}.jpg", num_examples=len(image_ids)
    )

    anns_folder = root / "annotations"
    anns_folder.mkdir()
    random.shuffle(split_and_classification_anns)
    splits = ("trainval", "test")
    num_samples_map = {}
    for offset, split in enumerate(splits):
        split_and_classification_anns_in_split = split_and_classification_anns[offset :: len(splits)]
        with open(anns_folder / f"{split}.txt", "w") as file:
            writer = csv.writer(file, delimiter=" ")
            for split_and_classification_ann in split_and_classification_anns_in_split:
                writer.writerow(split_and_classification_ann)
        num_samples_map[split] = len(split_and_classification_anns_in_split)

    segmentation_files = create_image_folder(
        anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids)
    )

    # The dataset has some rogue files
    for path in image_files[:3]:
        path.with_suffix(".mat").touch()
    for path in segmentation_files:
        path.with_name(f".{path.name}").touch()

    make_tar(root, "images.tar.gz", compression="gz")
    make_tar(root, anns_folder.with_suffix(".tar.gz").name, compression="gz")

    return num_samples_map
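
# `_meta_to_split_and_classification_ann` is referenced above but not shown. In the real
# Oxford-IIIT Pet annotation files each row is `<image_id> <class_id> <species_id> <breed_id>`,
# where cat image stems are Title_Case and dog stems lower_case. A plausible sketch
# (not necessarily the suite's actual helper) is:
def _meta_to_split_and_classification_ann_sketch(meta, idx):
    case = str.title if meta["species"] == "cat" else str.lower
    image_id = "_".join([*[case(part) for part in meta["cls"].split()], str(idx)])
    class_id = str(meta["label"] + 1)  # the annotation files use 1-based class ids
    species_id = "1" if meta["species"] == "cat" else "2"
    breed_id = "-1"  # not consumed by the tests, so a dummy value suffices
    return (image_id, class_id, species_id, breed_id)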
def caltech101(info, root, config):
    def create_ann_file(root, name):
        import scipy.io

        box_coord = make_tensor((1, 4), dtype=torch.int32, low=0).numpy().astype(np.uint16)
        obj_contour = make_tensor((2, int(torch.randint(3, 6, size=()))), dtype=torch.float64, low=0).numpy()

        scipy.io.savemat(str(pathlib.Path(root) / name), dict(box_coord=box_coord, obj_contour=obj_contour))

    def create_ann_folder(root, name, file_name_fn, num_examples):
        root = pathlib.Path(root) / name
        root.mkdir(parents=True)

        for idx in range(num_examples):
            create_ann_file(root, file_name_fn(idx))

    images_root = root / "101_ObjectCategories"
    anns_root = root / "Annotations"

    # Maps an image category to its differently named annotation folder.
    ann_category_map = {
        "Faces": "Faces_2",
        "Faces_easy": "Faces_3",
        "Motorbikes": "Motorbikes_16",
        "airplanes": "Airplanes_Side_2",
    }

    num_images_per_category = 2
    for category in info.categories:
        create_image_folder(
            root=images_root,
            name=category,
            file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg",
            num_examples=num_images_per_category,
        )
        create_ann_folder(
            root=anns_root,
            name=ann_category_map.get(category, category),
            file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat",
            num_examples=num_images_per_category,
        )

    (images_root / "BACKGROUND_Google").mkdir()

    make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz")
    make_tar(root, f"{anns_root.name}.tar", anns_root)

    return num_images_per_category * len(info.categories)
def generate(cls, root):
    archive_folder = root / "benchmark_RELEASE"
    dataset_folder = archive_folder / "dataset"
    dataset_folder.mkdir(parents=True, exist_ok=True)

    ids, num_samples_map = cls._make_split_files(defaultdict(lambda: dataset_folder, {"train_noval": root}))
    sizes = cls._make_anns_folder(dataset_folder, "cls", ids)
    create_image_folder(
        dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx]
    )

    make_tar(root, "benchmark.tgz", archive_folder, compression="gz")

    return num_samples_map
def imagenet(info, root, config):
    wnids = tuple(info.extra.wnid_to_category.keys())

    if config.split == "train":
        images_root = root / "ILSVRC2012_img_train"

        num_samples = len(wnids)

        for wnid in wnids:
            files = create_image_folder(
                root=images_root,
                name=wnid,
                file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
                num_examples=1,
            )
            make_tar(images_root, f"{wnid}.tar", files[0].parent)
    elif config.split == "val":
        num_samples = 3
        files = create_image_folder(
            root=root,
            name="ILSVRC2012_img_val",
            file_name_fn=lambda image_idx: f"ILSVRC2012_val_{image_idx + 1:08d}.JPEG",
            num_examples=num_samples,
        )
        images_root = files[0].parent
    else:  # config.split == "test"
        images_root = root / "ILSVRC2012_img_test_v10102019"

        num_samples = 3

        create_image_folder(
            root=images_root,
            name="test",
            file_name_fn=lambda image_idx: f"ILSVRC2012_test_{image_idx + 1:08d}.JPEG",
            num_examples=num_samples,
        )

    make_tar(root, f"{images_root.name}.tar", images_root)

    devkit_root = root / "ILSVRC2012_devkit_t12"
    devkit_root.mkdir()
    data_root = devkit_root / "data"
    data_root.mkdir()
    with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
        for label in torch.randint(0, len(wnids), (num_samples,)).tolist():
            file.write(f"{label}\n")
    make_tar(root, f"{devkit_root.name}.tar.gz", devkit_root, compression="gz")

    return num_samples
def eurosat(info, root, config):
    data_folder = pathlib.Path(root, "eurosat", "2750")
    data_folder.mkdir(parents=True)

    num_examples_per_class = 3
    classes = ("AnnualCrop", "Forest")
    for cls in classes:
        create_image_folder(
            root=data_folder,
            name=cls,
            file_name_fn=lambda idx: f"{cls}_{idx}.jpg",
            num_examples=num_examples_per_class,
        )

    make_zip(root, "EuroSAT.zip", data_folder)

    return len(classes) * num_examples_per_class
def _create_lmdb(self, root, cls):
    lmdb = datasets_utils.lazy_importer.lmdb
    hexdigits_lowercase = string.digits + string.ascii_lowercase[:6]

    folder = f"{cls}_lmdb"

    num_images = torch.randint(1, 4, size=()).item()
    format = "webp"
    files = datasets_utils.create_image_folder(root, folder, lambda idx: f"{idx}.{format}", num_images)

    with lmdb.open(str(root / folder)) as env, env.begin(write=True) as txn:
        for file in files:
            key = "".join(random.choice(hexdigits_lowercase) for _ in range(40)).encode()

            buffer = io.BytesIO()
            Image.open(file).save(buffer, format)
            buffer.seek(0)
            value = buffer.read()

            txn.put(key, value)

            os.remove(file)

    return num_images
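
# For reference, reading the images back out of the LMDB created above -- e.g. to check
# the mock by hand -- could look like the following sketch. It only uses standard
# `lmdb` and Pillow APIs; the helper name is an assumption, and the decode step relies
# on the webp-encoded values written above.
def _read_lmdb_images_sketch(path):
    import io

    import lmdb
    from PIL import Image

    images = []
    with lmdb.open(str(path), readonly=True) as env, env.begin() as txn:
        for _key, value in txn.cursor():
            # Each value is an encoded image byte string; decode it with Pillow and
            # copy so the image no longer depends on the open buffer.
            images.append(Image.open(io.BytesIO(value)).copy())
    return images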
def inject_fake_data(self, tmpdir, config):
    base_folder = pathlib.Path(tmpdir) / "celeba"
    os.makedirs(base_folder)

    num_images, num_images_per_split = self._create_split_txt(base_folder)

    datasets_utils.create_image_folder(
        base_folder, "img_align_celeba", lambda idx: f"{idx + 1:06d}.jpg", num_images
    )
    attr_names = self._create_attr_txt(base_folder, num_images)
    self._create_identity_txt(base_folder, num_images)
    self._create_bbox_txt(base_folder, num_images)
    self._create_landmarks_txt(base_folder, num_images)

    return dict(num_examples=num_images_per_split[config["split"]], attr_names=attr_names)
def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) num_images_per_category = 2 for idx, category in categories: datasets_utils.create_image_folder( tmpdir, name=f"{idx:03d}.{category}", file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg", num_examples=num_images_per_category, ) return num_images_per_category * len(categories)
def _make_images_archive(cls, root, name, *, num_samples):
    image_paths = create_image_folder(
        root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
    )

    images_meta = []
    for path in image_paths:
        with PIL.Image.open(path) as image:
            width, height = image.size
        images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height))

    make_zip(root, f"{name}.zip")

    return images_meta
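
# The `images_meta` records above follow the layout of the `"images"` section of a
# COCO-style annotation file. A minimal sketch of wrapping them into such a file --
# the surrounding keys follow the public COCO format, while the helper name and the
# output file name are assumptions:
def _write_coco_style_annotations_sketch(anns_folder, images_meta):
    import json

    with open(anns_folder / "instances_sketch.json", "w") as file:
        # Empty "annotations" and "categories" sections keep the file structurally valid.
        json.dump({"images": images_meta, "annotations": [], "categories": []}, file)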
def dtd(info, root, _):
    data_folder = root / "dtd"

    num_images_per_class = 3
    image_folder = data_folder / "images"
    categories = {"banded", "marbled", "zigzagged"}
    image_ids_per_category = {
        category: [
            str(path.relative_to(path.parents[1]).as_posix())
            for path in create_image_folder(
                image_folder,
                category,
                file_name_fn=lambda idx: f"{category}_{idx:04d}.jpg",
                num_examples=num_images_per_class,
            )
        ]
        for category in categories
    }

    meta_folder = data_folder / "labels"
    meta_folder.mkdir()

    with open(meta_folder / "labels_joint_anno.txt", "w") as file:
        for cls, image_ids in image_ids_per_category.items():
            for image_id in image_ids:
                joint_categories = random.choices(
                    list(categories - {cls}), k=int(torch.randint(len(categories) - 1, ()))
                )
                file.write(" ".join([image_id, *sorted([cls, *joint_categories])]) + "\n")

    image_ids = list(itertools.chain(*image_ids_per_category.values()))
    splits = ("train", "val", "test")
    num_samples_map = {}
    for fold in range(1, 11):
        random.shuffle(image_ids)
        for offset, split in enumerate(splits):
            image_ids_in_config = image_ids[offset :: len(splits)]
            with open(meta_folder / f"{split}{fold}.txt", "w") as file:
                file.write("\n".join(image_ids_in_config) + "\n")

            num_samples_map[info.make_config(split=split, fold=str(fold))] = len(image_ids_in_config)

    make_tar(root, "dtd-r1.0.1.tar.gz", data_folder, compression="gz")

    return num_samples_map
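
# The fold files above are produced with a stride split: after shuffling, taking
# `image_ids[offset::len(splits)]` for offsets 0, 1, 2 partitions the ids into three
# disjoint, near-equal groups. A tiny self-contained illustration:
#
#     ids = list(range(9))
#     train, val, test = ids[0::3], ids[1::3], ids[2::3]
#     # train == [0, 3, 6], val == [1, 4, 7], test == [2, 5, 8]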
def caltech256(info, root, config):
    dir = root / "256_ObjectCategories"
    num_images_per_category = 2

    for idx, category in enumerate(info.categories, 1):
        files = create_image_folder(
            dir,
            name=f"{idx:03d}.{category}",
            file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg",
            num_examples=num_images_per_category,
        )
        if category == "spider":
            open(files[0].parent / "RENAME2", "w").close()

    make_tar(root, f"{dir.name}.tar", dir)

    return num_images_per_category * len(info.categories)
def _make_images(cls, images_folder):
    image_files = []
    for category_idx, category in [
        (1, "Black_footed_Albatross"),
        (100, "Brown_Pelican"),
        (200, "Common_Yellowthroat"),
    ]:
        image_files.extend(
            create_image_folder(
                images_folder,
                cls._category_folder(category, category_idx),
                lambda image_idx: f"{cls._file_stem(category, image_idx)}.jpg",
                num_examples=5,
            )
        )
    return image_files
def generate(cls, root):
    image_file_names, num_samples_map = cls._make_split_file(root)

    image_files = create_image_folder(
        root, "img_align_celeba", file_name_fn=lambda idx: image_file_names[idx], num_examples=len(image_file_names)
    )
    make_zip(root, image_files[0].parent.with_suffix(".zip").name)

    for make_ann_file_fn in (
        cls._make_identity_file,
        cls._make_attributes_file,
        cls._make_bounding_boxes_file,
        cls._make_landmarks_file,
    ):
        make_ann_file_fn(root, image_file_names)

    return num_samples_map
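
# `_make_split_file` is referenced above but not shown. CelebA's `list_eval_partition.txt`
# maps each image file name to a split id (0 = train, 1 = val, 2 = test), one pair per
# line. A minimal sketch (the helper name and per-split count are assumptions):
def _make_split_file_sketch(root, num_images_per_split=3):
    file_names = []
    num_samples_map = {}
    with open(root / "list_eval_partition.txt", "w") as file:
        idx = 0
        for split_id, split in enumerate(("train", "val", "test")):
            for _ in range(num_images_per_split):
                idx += 1
                name = f"{idx:06d}.jpg"
                file.write(f"{name} {split_id}\n")
                file_names.append(name)
            num_samples_map[split] = num_images_per_split
    return file_names, num_samples_map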
def clevr(info, root, config):
    data_folder = root / "CLEVR_v1.0"

    num_samples_map = {
        "train": 3,
        "val": 2,
        "test": 1,
    }

    images_folder = data_folder / "images"
    image_files = {
        split: create_image_folder(
            images_folder,
            split,
            file_name_fn=lambda idx: f"CLEVR_{split}_{idx:06d}.jpg",
            num_examples=num_samples,
        )
        for split, num_samples in num_samples_map.items()
    }

    scenes_folder = data_folder / "scenes"
    scenes_folder.mkdir()
    for split in ["train", "val"]:
        with open(scenes_folder / f"CLEVR_{split}_scenes.json", "w") as file:
            json.dump(
                {
                    "scenes": [
                        {
                            "image_filename": image_file.name,
                            # We currently only return the number of objects in a scene.
                            # Thus, it is sufficient for now to only mock the number of elements.
                            "objects": [None] * int(torch.randint(1, 5, ())),
                        }
                        for image_file in image_files[split]
                    ]
                },
                file,
            )

    make_zip(root, f"{data_folder.name}.zip")

    return {config_: num_samples_map[config_.split] for config_ in info._configs}
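
# The scenes files above only carry `image_filename` and the number of `objects` per
# scene. Reading one back, e.g. to verify the mock, is a short stdlib round-trip
# (the path assumes the layout created above):
#
#     with open(data_folder / "scenes" / "CLEVR_train_scenes.json") as file:
#         num_objects = [len(scene["objects"]) for scene in json.load(file)["scenes"]]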
def inject_fake_data(self, tmpdir, config):
    tmpdir = pathlib.Path(tmpdir)

    num_images = 3
    num_annotations_per_image = 2

    image_folder = tmpdir / "images"
    files = datasets_utils.create_image_folder(
        tmpdir, name="images", file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_images
    )
    file_names = [file.relative_to(image_folder) for file in files]

    annotation_folder = tmpdir / "annotations"
    os.makedirs(annotation_folder)
    annotation_file, info = self._create_annotation_file(
        annotation_folder, file_names, num_annotations_per_image
    )

    info["num_examples"] = num_images

    return (str(image_folder), str(annotation_file)), info
def inject_fake_data(self, tmpdir, config):
    tmpdir = pathlib.Path(tmpdir)

    num_images = 3
    num_annotations_per_image = 2

    files = datasets_utils.create_image_folder(
        tmpdir, name=self._IMAGE_FOLDER, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_images
    )
    file_names = [file.relative_to(tmpdir / self._IMAGE_FOLDER) for file in files]

    annotation_folder = tmpdir / self._ANNOTATIONS_FOLDER
    os.makedirs(annotation_folder)
    info = self._create_annotation_file(
        annotation_folder, self._ANNOTATIONS_FILE, file_names, num_annotations_per_image
    )

    info["num_examples"] = num_images

    return info
def gtsrb(info, root, config):
    num_examples_per_class = 5 if config.split == "train" else 3
    classes = ("00000", "00042", "00012")
    num_examples = num_examples_per_class * len(classes)

    csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"]

    def _make_ann_file(path, num_examples, class_idx):
        if class_idx == "random":
            class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item()

        with open(path, "w") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";")
            writer.writeheader()
            for image_idx in range(num_examples):
                writer.writerow(
                    {
                        "Filename": f"{image_idx:05d}.ppm",
                        "Width": torch.randint(1, 100, size=()).item(),
                        "Height": torch.randint(1, 100, size=()).item(),
                        "Roi.X1": torch.randint(1, 100, size=()).item(),
                        "Roi.Y1": torch.randint(1, 100, size=()).item(),
                        "Roi.X2": torch.randint(1, 100, size=()).item(),
                        "Roi.Y2": torch.randint(1, 100, size=()).item(),
                        "ClassId": class_idx,
                    }
                )

    if config["split"] == "train":
        train_folder = root / "GTSRB" / "Training"
        train_folder.mkdir(parents=True)

        for class_idx in classes:
            create_image_folder(
                train_folder,
                name=class_idx,
                file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm",
                num_examples=num_examples_per_class,
            )
            _make_ann_file(
                path=train_folder / class_idx / f"GT-{class_idx}.csv",
                num_examples=num_examples_per_class,
                class_idx=int(class_idx),
            )
        make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
    else:
        test_folder = root / "GTSRB" / "Final_Test"
        test_folder.mkdir(parents=True)

        create_image_folder(
            test_folder,
            name="Images",
            file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm",
            num_examples=num_examples,
        )

        make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)

        _make_ann_file(
            path=root / "GT-final_test.csv",
            num_examples=num_examples,
            class_idx="random",
        )

        make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv")

    return num_examples
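
# To sanity-check one of the semicolon-delimited annotation files created above, a
# standard `csv.DictReader` round-trip works. This is all stdlib; the helper name is
# an assumption.
def _read_gtsrb_ann_file_sketch(path):
    import csv

    with open(path, newline="") as csv_file:
        # The ground-truth files use ";" as the delimiter, matching the writer above.
        return [row["Filename"] for row in csv.DictReader(csv_file, delimiter=";")]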