def _make_archive(cls, root):
    """Generate the fake CUB_200_2011 archive: images plus metadata text files.

    Writes images.txt, train_test_split.txt, and bounding_boxes.txt next to the
    generated images and packs everything into CUB_200_2011.tgz.

    Returns (image_files, num_samples_map) where num_samples_map maps
    "train"/"test" to the number of samples assigned to each split.
    """
    archive_folder = root / "CUB_200_2011"
    images_folder = archive_folder / "images"
    image_files = cls._make_images(images_folder)
    # Image ids are 1-based, matching the real dataset's metadata files.
    image_ids = list(range(1, len(image_files) + 1))

    # images.txt: one "<id> <path relative to images/>" entry per line.
    with open(archive_folder / "images.txt", "w") as file:
        file.write("\n".join(
            f"{id} {path.relative_to(images_folder).as_posix()}"
            for id, path in zip(image_ids, image_files)))

    # Random 0/1 flag per image: 1 -> train, 0 -> test.
    split_ids = torch.randint(2, (len(image_ids), )).tolist()
    counts = Counter(split_ids)
    num_samples_map = {"train": counts[1], "test": counts[0]}
    with open(archive_folder / "train_test_split.txt", "w") as file:
        file.write("\n".join(
            f"{image_id} {split_id}"
            for image_id, split_id in zip(image_ids, split_ids)))

    # bounding_boxes.txt: image id followed by four float coordinates
    # (random non-negative ints cast to float, as in the real files).
    with open(archive_folder / "bounding_boxes.txt", "w") as file:
        file.write("\n".join(" ".join(
            str(item) for item in [
                image_id, *make_tensor((
                    4, ), dtype=torch.int, low=0).to(torch.float).tolist()
            ]) for image_id in image_ids))

    make_tar(root, archive_folder.with_suffix(".tgz").name, compression="gz")
    return image_files, num_samples_map
def _make_anns(cls, root, image_files):
    """Write one random .mat annotation per image and pack them into annotations.tgz.

    Each .mat file carries a random "seg" mask and a "bbox" struct; a README.txt
    plus its hidden rogue companion are added before archiving.
    """
    from scipy.io import savemat

    anns_folder = root / "annotations-mat"
    for image_file in image_files:
        target = anns_folder / image_file.with_suffix(".mat").relative_to(image_file.parents[1])
        target.parent.mkdir(parents=True, exist_ok=True)
        # Random uint8 segmentation mask with a random spatial size (>= 3 per side).
        seg = torch.randint(
            256,
            make_tensor((2, ), low=3, dtype=torch.int).tolist(),
            dtype=torch.uint8,
        ).numpy()
        # Bounding box stored as a struct with the four edge coordinates.
        sides = ("left", "top", "right", "bottom")
        bbox = dict(zip(sides, make_tensor((4, ), dtype=torch.uint8).tolist()))
        savemat(target, {"seg": seg, "bbox": bbox})

    readme_file = anns_folder / "README.txt"
    readme_file.touch()
    cls._make_hidden_rouge_file(readme_file)

    make_tar(root, "annotations.tgz", anns_folder, compression="gz")
def _make_splits(cls, root, image_files):
    """Create the lists/{train,test}.txt split files and pack them into lists.tgz.

    Shuffles *image_files* in place, deals them round-robin into the splits,
    writes one sorted "<parent>/<name>" path per line, and marks each split
    file with a hidden rogue companion.

    Returns a dict mapping split name -> number of samples in that split.
    """
    split_folder = root / "lists"
    split_folder.mkdir()
    random.shuffle(image_files)
    splits = ("train", "test")
    num_samples_map = {}
    for offset, split in enumerate(splits):
        # Round-robin assignment: every len(splits)-th file starting at offset.
        image_files_in_split = image_files[offset::len(splits)]

        split_file = split_folder / f"{split}.txt"
        with open(split_file, "w") as file:
            file.write(
                "\n".join(
                    sorted(
                        # as_posix() already returns str; the redundant str()
                        # wrapper was removed.
                        image_file.relative_to(image_file.parents[1]).as_posix()
                        for image_file in image_files_in_split
                    )
                )
            )
        cls._make_hidden_rouge_file(split_file)

        num_samples_map[split] = len(image_files_in_split)

    make_tar(root, split_folder.with_suffix(".tgz").name, compression="gz")

    return num_samples_map
def country211(info, root, config):
    """Build a fake Country211 archive for the configured split.

    Returns the number of examples generated for that split.
    """
    # On-disk folder names differ from the config values for "val".
    folder_names = {"train": "train", "val": "valid", "test": "test"}
    split_folder = pathlib.Path(root, "country211", folder_names[config["split"]])
    split_folder.mkdir(parents=True, exist_ok=True)

    # Each split deliberately has a different number of examples per class.
    examples_per_split = {"train": 3, "val": 4, "test": 5}
    num_examples = examples_per_split[config["split"]]

    classes = ("AD", "BS", "GR")
    for cls in classes:
        create_image_folder(
            split_folder,
            name=cls,
            file_name_fn=lambda idx: f"{idx}.jpg",
            num_examples=num_examples,
        )

    make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz")
    return num_examples * len(classes)
def imagenet(info, root, config):
    """Generate fake ImageNet (ILSVRC2012) archives for the requested split.

    For "train": one inner tar per wnid inside the outer archive.
    For "val": plain image files plus the devkit archive (ground truth and
    meta.mat synset table).
    For "test": plain image files.

    Returns the number of samples available in that split.
    """
    from scipy.io import savemat

    categories = info.categories
    wnids = [info.extra.category_to_wnid[category] for category in categories]
    if config.split == "train":
        num_samples = len(wnids)
        archive_name = "ILSVRC2012_img_train.tar"

        # One image per wnid; each wnid folder is packed into its own tar.
        files = []
        for wnid in wnids:
            create_image_folder(
                root=root,
                name=wnid,
                file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
                num_examples=1,
            )
            files.append(make_tar(root, f"{wnid}.tar"))
    elif config.split == "val":
        num_samples = 3
        archive_name = "ILSVRC2012_img_val.tar"
        files = [
            create_image_file(root, f"ILSVRC2012_val_{idx + 1:08d}.JPEG")
            for idx in range(num_samples)
        ]

        # The devkit carries the validation ground truth and synset metadata.
        devkit_root = root / "ILSVRC2012_devkit_t12"
        data_root = devkit_root / "data"
        data_root.mkdir(parents=True)

        # One random label index per validation image.
        with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
            for label in torch.randint(0, len(wnids), (num_samples, )).tolist():
                file.write(f"{label}\n")

        # meta.mat synsets: leaf rows (num_children == 0) for the real
        # categories, followed by dummy non-leaf rows (num_children == 1).
        num_children = 0
        synsets = [(idx, wnid, category, "", num_children, [], 0, 0)
                   for idx, (category, wnid) in enumerate(zip(categories, wnids), 1)]
        num_children = 1
        synsets.extend(
            (0, "", "", "", num_children, [], 0, 0) for _ in range(5))
        savemat(data_root / "meta.mat", dict(synsets=synsets))

        make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz")
    else:  # config.split == "test"
        num_samples = 5
        archive_name = "ILSVRC2012_img_test_v10102019.tar"
        files = [
            create_image_file(root, f"ILSVRC2012_test_{idx + 1:08d}.JPEG")
            for idx in range(num_samples)
        ]

    make_tar(root, archive_name, *files)

    return num_samples
def generate(cls, root):
    """Create images, split files, and annotations for the fixture rooted at *root*.

    Returns a mapping from split name to number of samples.
    """
    img_dir = root / "images"
    files = cls._make_images(img_dir)
    # One hidden rogue companion per image, then pack the image tree.
    cls._make_hidden_rouge_file(*files)
    make_tar(root, img_dir.with_suffix(".tgz").name, compression="gz")

    split_counts = cls._make_splits(root, files)
    cls._make_anns(root, files)

    return split_counts
def generate(self, root):
    """Create a fake Oxford-IIIT Pet dataset (images + annotations archives).

    Generates three categories with multiple images each, shuffles the
    classification annotations into "trainval"/"test" split files, creates the
    trimap segmentations, and adds the rogue files present in the real archive.

    Returns a dict mapping split name to sample count.
    """
    classification_anns_meta = (
        dict(cls="Abyssinian", label=0, species="cat"),
        dict(cls="Keeshond", label=18, species="dog"),
        dict(cls="Yorkshire Terrier", label=36, species="dog"),
    )
    split_and_classification_anns = [
        self._meta_to_split_and_classification_ann(meta, idx)
        for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10))
    ]
    image_ids, *_ = zip(*split_and_classification_anns)

    image_files = create_image_folder(
        root, "images",
        file_name_fn=lambda idx: f"{image_ids[idx]}.jpg",
        num_examples=len(image_ids))

    anns_folder = root / "annotations"
    anns_folder.mkdir()
    random.shuffle(split_and_classification_anns)
    splits = ("trainval", "test")
    num_samples_map = {}
    for offset, split in enumerate(splits):
        # Round-robin deal of the shuffled annotations into the splits.
        split_and_classification_anns_in_split = split_and_classification_anns[
            offset::len(splits)]
        with open(anns_folder / f"{split}.txt", "w") as file:
            writer = csv.writer(file, delimiter=" ")
            for split_and_classification_ann in split_and_classification_anns_in_split:
                writer.writerow(split_and_classification_ann)
        num_samples_map[split] = len(
            split_and_classification_anns_in_split)

    segmentation_files = create_image_folder(
        anns_folder, "trimaps",
        file_name_fn=lambda idx: f"{image_ids[idx]}.png",
        num_examples=len(image_ids))

    # The dataset has some rogue files
    for path in image_files[:3]:
        # Stray .mat files next to a few of the images.
        path.with_suffix(".mat").touch()
    for path in segmentation_files:
        # Hidden dot-files next to every trimap.
        path.with_name(f".{path.name}").touch()

    make_tar(root, "images.tar.gz", compression="gz")
    make_tar(root, anns_folder.with_suffix(".tar.gz").name, compression="gz")

    return num_samples_map
def _make_segmentations(cls, root, image_files):
    """Create one random-sized .png segmentation per image and tar the folder."""
    seg_root = root / "segmentations"
    for src in image_files:
        # Mirror the image's "<parent>/<name>" layout under segmentations/.
        dest_dir = seg_root.joinpath(src.relative_to(src.parents[1]))
        dest_dir.mkdir(exist_ok=True, parents=True)
        # Single-channel image; height/width are random but at least 3 px each.
        height_width = make_tensor((2,), low=3, dtype=torch.int).tolist()
        create_image_file(
            dest_dir,
            src.with_suffix(".png").name,
            size=[1] + height_width,
        )
    make_tar(root, seg_root.with_suffix(".tgz").name, compression="gz")
def caltech101(info, root, config):
    """Create fake Caltech-101 archives (images tree + .mat annotations tree).

    For every category in *info.categories* generates two images and two
    matching annotation files, then tars both trees separately.

    Returns the total number of samples.
    """

    def create_ann_file(root, name):
        # Write a single annotation .mat with a random box and contour.
        import scipy.io

        # box_coord is stored as uint16 in the real dataset files.
        box_coord = make_tensor((1, 4), dtype=torch.int32, low=0).numpy().astype(np.uint16)
        obj_contour = make_tensor((2, int(torch.randint(3, 6, size=()))), dtype=torch.float64, low=0).numpy()

        scipy.io.savemat(str(pathlib.Path(root) / name), dict(box_coord=box_coord, obj_contour=obj_contour))

    def create_ann_folder(root, name, file_name_fn, num_examples):
        # Create a folder of num_examples annotation files.
        root = pathlib.Path(root) / name
        root.mkdir(parents=True)

        for idx in range(num_examples):
            create_ann_file(root, file_name_fn(idx))

    images_root = root / "101_ObjectCategories"
    anns_root = root / "Annotations"

    # A few annotation folders are named differently from their image folders.
    ann_category_map = {
        "Faces_2": "Faces",
        "Faces_3": "Faces_easy",
        "Motorbikes_16": "Motorbikes",
        "Airplanes_Side_2": "airplanes",
    }

    num_images_per_category = 2
    for category in info.categories:
        create_image_folder(
            root=images_root,
            name=category,
            file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg",
            num_examples=num_images_per_category,
        )
        create_ann_folder(
            root=anns_root,
            name=ann_category_map.get(category, category),
            file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat",
            num_examples=num_images_per_category,
        )

    # BUG FIX: the real archive's clutter folder is "BACKGROUND_Google"; the
    # previous "BACKGROUND_Goodle" typo meant the dataset's filter for that
    # folder would never match the fixture.
    (images_root / "BACKGROUND_Google").mkdir()
    make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz")

    make_tar(root, f"{anns_root.name}.tar", anns_root)

    return num_images_per_category * len(info.categories)
def generate(cls, root):
    """Assemble the benchmark_RELEASE (SBD) fixture archive under *root*.

    Returns a mapping from split name to number of samples.
    """
    dataset_dir = root / "benchmark_RELEASE" / "dataset"
    dataset_dir.mkdir(parents=True, exist_ok=True)

    # The "train_noval" split file lives next to the archive; all other split
    # files go inside the dataset folder.
    split_dirs = defaultdict(lambda: dataset_dir, {"train_noval": root})
    ids, num_samples_map = cls._make_split_files(split_dirs)

    sizes = cls._make_anns_folder(dataset_dir, "cls", ids)
    create_image_folder(
        dataset_dir,
        "img",
        lambda idx: f"{ids[idx]}.jpg",
        num_examples=len(ids),
        size=lambda idx: sizes[idx],
    )

    make_tar(root, "benchmark.tgz", dataset_dir.parent, compression="gz")

    return num_samples_map
def dtd(info, root, _):
    """Generate the DTD fixture archive with joint labels and 10 split folds.

    Creates three texture categories, a labels_joint_anno.txt with random
    extra categories per image, and train/val/test files for folds 1-10.

    Returns a dict mapping each (split, fold) config to its sample count.
    """
    data_folder = root / "dtd"

    num_images_per_class = 3
    image_folder = data_folder / "images"
    categories = {"banded", "marbled", "zigzagged"}
    # Map category -> list of "<category>/<file>.jpg" ids.
    image_ids_per_category = {
        category: [
            str(path.relative_to(path.parents[1]).as_posix())
            for path in create_image_folder(
                image_folder,
                category,
                file_name_fn=lambda idx: f"{category}_{idx:04d}.jpg",
                num_examples=num_images_per_class,
            )
        ]
        for category in categories
    }

    meta_folder = data_folder / "labels"
    meta_folder.mkdir()

    # labels_joint_anno.txt: image id followed by its sorted labels, where
    # each image gets a random subset of additional ("joint") categories.
    with open(meta_folder / "labels_joint_anno.txt", "w") as file:
        for cls, image_ids in image_ids_per_category.items():
            for image_id in image_ids:
                joint_categories = random.choices(
                    list(categories - {cls}),
                    k=int(torch.randint(len(categories) - 1, ())))
                file.write(
                    " ".join([image_id, *sorted([cls, *joint_categories])]) + "\n")

    image_ids = list(itertools.chain(*image_ids_per_category.values()))
    splits = ("train", "val", "test")
    num_samples_map = {}
    for fold in range(1, 11):
        random.shuffle(image_ids)
        for offset, split in enumerate(splits):
            # Round-robin assignment of the shuffled ids to the splits.
            image_ids_in_config = image_ids[offset::len(splits)]
            with open(meta_folder / f"{split}{fold}.txt", "w") as file:
                file.write("\n".join(image_ids_in_config) + "\n")

            num_samples_map[info.make_config(
                split=split, fold=str(fold))] = len(image_ids_in_config)

    make_tar(root, "dtd-r1.0.1.tar.gz", data_folder, compression="gz")

    return num_samples_map
def _make_tar(self, root, *, name="archive.tar", remove=True):
    """Create a folder of files, tar it as *name*, and return (archive, mapping).

    The mapping keys are the member file paths as they appear inside the
    archive; the values are the original file contents.
    """
    stem = name.split(".")[0]
    folder, contents = self._make_folder(root, name=stem)
    archive = make_tar(root, name, folder, remove=remove)

    remapped = {}
    for path, data in contents.items():
        # Rebase each file path from the source tree onto the archive path.
        member = archive / pathlib.Path(path).relative_to(root)
        remapped[str(member)] = data

    return archive, remapped
def generate(cls, root, *, year, trainval):
    """Build a fake VOC archive for *year*; returns split -> sample counts."""
    # The 2011 release nests everything inside an extra TrainVal directory.
    base = root / "TrainVal" if year == "2011" else root
    data_folder = base / "VOCdevkit" / f"VOC{year}"
    data_folder.mkdir(parents=True, exist_ok=True)

    ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)

    # Images, segmentation masks, and detection annotations share the ids.
    folder_specs = (
        ("JPEGImages", ".jpg", create_image_folder),
        ("SegmentationClass", ".png", create_image_folder),
        ("Annotations", ".xml", cls._make_detection_anns_folder),
    )
    for name, suffix, builder in folder_specs:
        builder(
            data_folder,
            name,
            file_name_fn=lambda idx, suffix=suffix: ids[idx] + suffix,
            num_examples=len(ids),
        )

    archive_names = cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES
    make_tar(root, archive_names[year], data_folder)

    return num_samples_map
def caltech256(info, root, config):
    """Create a fake 256_ObjectCategories archive; returns total sample count."""
    dataset_dir = root / "256_ObjectCategories"
    num_images_per_category = 2

    # Category folders are 1-based and named "<idx>.<category>".
    for idx, category in enumerate(info.categories, 1):
        files = create_image_folder(
            dataset_dir,
            name=f"{idx:03d}.{category}",
            file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg",
            num_examples=num_images_per_category,
        )
        if category == "spider":
            # The real archive ships a rogue empty RENAME2 file in this folder.
            (files[0].parent / "RENAME2").touch()

    make_tar(root, f"{dataset_dir.name}.tar", dataset_dir)

    return num_images_per_category * len(info.categories)
def generate(
    cls,
    root,
    name,
    *,
    folder,
    train_files,
    test_files,
    num_categories,
    labels_key,
):
    """Create all train/test batch files in *folder* and gzip-tar them as *name*."""
    batch_dir = root / folder
    batch_dir.mkdir()

    for batch_file in list(train_files) + list(test_files):
        cls._create_batch_file(
            batch_dir,
            batch_file,
            num_categories=num_categories,
            labels_key=labels_key,
        )

    make_tar(root, name, batch_dir, compression="gz")
def imagenet(info, root, config):
    """Generate fake ImageNet (ILSVRC2012) image and devkit archives.

    For "train" each wnid folder is packed into its own inner tar before the
    outer archive is created; "val" and "test" get plain image files. The
    devkit (validation ground truth) is generated for every split.

    Returns the number of samples in the requested split.
    """
    wnids = tuple(info.extra.wnid_to_category.keys())
    if config.split == "train":
        images_root = root / "ILSVRC2012_img_train"

        num_samples = len(wnids)

        for wnid in wnids:
            files = create_image_folder(
                root=images_root,
                name=wnid,
                file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
                num_examples=1,
            )
            # Each wnid folder becomes its own inner tar inside the archive.
            make_tar(images_root, f"{wnid}.tar", files[0].parent)
    elif config.split == "val":
        num_samples = 3
        files = create_image_folder(
            root=root,
            name="ILSVRC2012_img_val",
            file_name_fn=lambda image_idx: f"ILSVRC2012_val_{image_idx + 1:08d}.JPEG",
            num_examples=num_samples,
        )
        images_root = files[0].parent
    else:  # config.split == "test"
        images_root = root / "ILSVRC2012_img_test_v10102019"

        num_samples = 3

        create_image_folder(
            root=images_root,
            name="test",
            file_name_fn=lambda image_idx: f"ILSVRC2012_test_{image_idx + 1:08d}.JPEG",
            num_examples=num_samples,
        )
    make_tar(root, f"{images_root.name}.tar", images_root)

    devkit_root = root / "ILSVRC2012_devkit_t12"
    devkit_root.mkdir()
    data_root = devkit_root / "data"
    data_root.mkdir()
    # One random label index per sample.
    with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
        for label in torch.randint(0, len(wnids), (num_samples, )).tolist():
            file.write(f"{label}\n")

    # BUG FIX: the archive name must be just the folder name; the previous
    # f"{devkit_root}.tar.gz" formatted the *full path* into the archive file
    # name (cf. the sibling fixture's devkit_root.with_suffix(".tar.gz").name).
    make_tar(root, f"{devkit_root.name}.tar.gz", devkit_root, compression="gz")

    return num_samples