def load(name: str, *, root: Optional[Union[str, pathlib.Path]] = None, **config: Any) -> Dataset:
    dataset_cls = find(BUILTIN_DATASETS, name)
    if root is None:
        root = pathlib.Path(home()) / name
    return dataset_cls(root, **config)
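# Hedged usage sketch (not from the source): this first iteration returns a
# map-style Dataset directly; "caltech101" and the indexing below are assumptions
# for illustration.
dataset = load("caltech101")
first_sample = dataset[0]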
def load(
    name: str,
    *,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]] = pil,
    split: str = "train",
    **options: Any,
) -> IterDataPipe[Dict[str, Any]]:
    dataset = find(name)
    config = dataset.info.make_config(split=split, **options)
    root = home() / name
    return dataset.to_datapipe(root, config=config, decoder=decoder)
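# Hedged usage sketch, assuming "caltech101" is a registered dataset whose samples
# carry an "image" key: the returned IterDataPipe is consumed like any iterable.
for sample in load("caltech101", split="train"):
    image = sample["image"]  # decoded to a torch.Tensor by the default `pil` decoder
    break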
def load(
    name: str,
    *,
    skip_integrity_check: bool = False,
    **options: Any,
) -> IterDataPipe[Dict[str, Any]]:
    dataset = find(name)
    config = dataset.info.make_config(**options)
    root = os.path.join(home(), dataset.name)
    return dataset.load(root, config=config, skip_integrity_check=skip_integrity_check)
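# Hedged usage sketch: skipping checksum verification for local data that has
# already been validated once; "caltech101" is an illustrative assumption.
dp = load("caltech101", skip_integrity_check=True)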
def main(*names, force=False):
    home = pathlib.Path(datasets.home())
    for name in names:
        path = BUILTIN_DIR / f"{name}.categories"
        if path.exists() and not force:
            continue

        dataset = find(name)
        try:
            categories = dataset._generate_categories(home / name)
        except NotImplementedError:
            continue

        with open(path, "w") as file:
            writer = csv.writer(file, lineterminator="\n")
            for category in categories:
                writer.writerow((category,) if isinstance(category, str) else category)
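# Hedged usage sketch: regenerate the .categories files for both Caltech datasets,
# overwriting any existing files via force=True.
main("caltech101", "caltech256", force=True)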
def load(
    name: str,
    *,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]] = DEFAULT_DECODER,  # type: ignore[assignment]
    skip_integrity_check: bool = False,
    **options: Any,
) -> IterDataPipe[Dict[str, Any]]:
    dataset = find(name)

    # Resolve the sentinel to a decoder that matches the dataset type.
    if decoder is DEFAULT_DECODER:
        decoder = DEFAULT_DECODER_MAP.get(dataset.info.type)

    config = dataset.info.make_config(**options)
    root = os.path.join(home(), dataset.name)
    return dataset.load(root, config=config, decoder=decoder, skip_integrity_check=skip_integrity_check)
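# Hedged usage sketch: the Optional annotation suggests decoder=None yields raw,
# undecoded samples, while leaving the sentinel default resolves a decoder from
# the dataset type; "caltech101" is an illustrative assumption.
raw = load("caltech101", decoder=None)
decoded = load("caltech101")  # decoder looked up in DEFAULT_DECODER_MAP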
def legacy_root(self, temp_root):
    new_root = pathlib.Path(new_datasets.home()) / self.name
    legacy_root = pathlib.Path(tempfile.mkdtemp(dir=temp_root))

    if os.stat(new_root).st_dev != os.stat(legacy_root).st_dev:
        warnings.warn(
            "The temporary root directory for the legacy dataset was created on a different storage device than "
            "the raw data that is used by the new dataset. If the devices have different I/O stats, this will "
            "distort the benchmark. You can use the '--temp-root' flag to relocate the root directory of the "
            "temporary directories.",
            RuntimeWarning,
        )

    try:
        # Expose the raw data to the legacy dataset through symlinks instead of copies.
        for file_name in self._find_resource_file_names():
            (legacy_root / file_name).symlink_to(new_root / file_name)

        if self.prepare_legacy_root:
            self.prepare_legacy_root(self, legacy_root)

        with self.patch_download_and_integrity_checks():
            yield legacy_root
    finally:
        shutil.rmtree(legacy_root)
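# Hedged usage sketch, assuming the benchmark harness wraps this generator method
# with contextlib.contextmanager (the yield/finally structure implies context-manager
# semantics); `suite`, `temp_root`, and the legacy Caltech101 call are illustrative.
with suite.legacy_root(temp_root) as root:
    legacy_dataset = legacy_datasets.Caltech101(root)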
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> IterDataPipe[Dict[str, Any]]:
    dp = resource_dps[0]
    dp = TarArchiveReader(dp)
    dp = Filter(dp, self._is_not_rogue_file)
    dp = Shuffler(dp, buffer_size=INFINITE_BUFFER_SIZE)
    return Mapper(dp, self._collate_and_decode_sample, fn_kwargs=dict(decoder=decoder))


def generate_categories_file(self, root: Union[str, pathlib.Path]) -> None:
    dp = self.resources(self.default_config)[0].to_datapipe(pathlib.Path(root) / self.name)
    dp = TarArchiveReader(dp)
    dir_names = {pathlib.Path(path).parent.name for path, _ in dp}
    categories = [name.split(".")[1] for name in sorted(dir_names)]
    create_categories_file(HERE, self.name, categories)


if __name__ == "__main__":
    from torchvision.prototype.datasets import home

    root = home()
    Caltech101().generate_categories_file(root)
    Caltech256().generate_categories_file(root)