Example #1
def load(name: str,
         *,
         root: Optional[Union[str, pathlib.Path]] = None,
         **config: Any) -> Dataset:
    dataset_cls = find(BUILTIN_DATASETS, name)

    if root is None:
        root = pathlib.Path(home()) / name

    return dataset_cls(root, **config)
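A quick usage sketch for this variant; the dataset name "mnist" is hypothetical and assumes such a dataset is registered in BUILTIN_DATASETS:

# Hypothetical usage sketch: "mnist" stands in for any registered dataset name.
dataset = load("mnist")                      # stored under pathlib.Path(home()) / "mnist"
dataset = load("mnist", root="/data/mnist")  # explicit root; extra kwargs reach the dataset class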
Example #2
def load(
    name: str,
    *,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]] = pil,
    split: str = "train",
    **options: Any,
) -> IterDataPipe[Dict[str, Any]]:
    dataset = find(name)

    config = dataset.info.make_config(split=split, **options)
    root = home() / name

    return dataset.to_datapipe(root, config=config, decoder=decoder)
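Unlike Example #1, this variant returns an IterDataPipe of sample dictionaries, so consumption is plain iteration. A hedged sketch; the name "cifar10" and the "image" key are assumptions, not confirmed by the snippet:

# Hypothetical usage sketch: the dataset name and sample keys are assumptions.
datapipe = load("cifar10", split="train")
for sample in datapipe:       # each sample is a Dict[str, Any]
    image = sample["image"]   # decoded by the default `pil` decoder
    break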
Example #3
def load(
    name: str,
    *,
    skip_integrity_check: bool = False,
    **options: Any,
) -> IterDataPipe[Dict[str, Any]]:
    dataset = find(name)

    config = dataset.info.make_config(**options)
    root = os.path.join(home(), dataset.name)

    return dataset.load(root,
                        config=config,
                        skip_integrity_check=skip_integrity_check)
Example #4
def main(*names, force=False):
    home = pathlib.Path(datasets.home())

    for name in names:
        path = BUILTIN_DIR / f"{name}.categories"
        if path.exists() and not force:
            continue

        dataset = find(name)
        try:
            categories = dataset._generate_categories(home / name)
        except NotImplementedError:
            continue

        with open(path, "w") as file:
            writer = csv.writer(file, lineterminator="\n")
            for category in categories:
                writer.writerow((category,) if isinstance(category, str) else category)
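csv.writer expects each row to be a sequence, which is why a plain string is wrapped into a one-element tuple before writerow. A read-back sketch under the same assumption about the file layout (read_categories is a hypothetical helper, not part of the snippet):

# Hypothetical read-back sketch for a "<name>.categories" file written above.
import csv
import pathlib

def read_categories(path: pathlib.Path):
    with open(path, newline="") as file:
        rows = [tuple(row) for row in csv.reader(file)]
    # Single-column rows were written from plain strings; unwrap them again.
    return [row[0] if len(row) == 1 else row for row in rows]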
Example #5
def load(
    name: str,
    *,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]] = DEFAULT_DECODER,  # type: ignore[assignment]
    skip_integrity_check: bool = False,
    **options: Any,
) -> IterDataPipe[Dict[str, Any]]:
    dataset = find(name)

    if decoder is DEFAULT_DECODER:
        decoder = DEFAULT_DECODER_MAP.get(dataset.info.type)

    config = dataset.info.make_config(**options)
    root = os.path.join(home(), dataset.name)

    return dataset.load(root,
                        config=config,
                        decoder=decoder,
                        skip_integrity_check=skip_integrity_check)
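DEFAULT_DECODER works as a sentinel default: the identity check `decoder is DEFAULT_DECODER` lets the function tell "argument omitted" apart from an explicit `decoder=None`, which disables decoding. A self-contained sketch of that pattern with illustrative names:

# Illustrative sketch of the sentinel-default pattern used above.
_DEFAULT = object()

def decode(data, decoder=_DEFAULT):
    if decoder is _DEFAULT:   # argument omitted: fall back to a default decoder
        decoder = bytes.decode
    elif decoder is None:     # explicit None: skip decoding entirely
        return data
    return decoder(data)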
Example #6
    # Yields the prepared legacy root, so this is meant to run as a context
    # manager: the cleanup in the `finally` block executes on exit.
    def legacy_root(self, temp_root):
        new_root = pathlib.Path(new_datasets.home()) / self.name
        legacy_root = pathlib.Path(tempfile.mkdtemp(dir=temp_root))

        if os.stat(new_root).st_dev != os.stat(legacy_root).st_dev:
            warnings.warn(
                "The temporary root directory for the legacy dataset was created on a different storage device than "
                "the raw data that is used by the new dataset. If the devices have different I/O stats, this will "
                "distort the benchmark. You can use the '--temp-root' flag to relocate the root directory of the "
                "temporary directories.",
                RuntimeWarning,
            )

        try:
            for file_name in self._find_resource_file_names():
                (legacy_root / file_name).symlink_to(new_root / file_name)

            if self.prepare_legacy_root:
                self.prepare_legacy_root(self, legacy_root)

            with self.patch_download_and_integrity_checks():
                yield legacy_root
        finally:
            shutil.rmtree(legacy_root)
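The warning hinges on os.stat().st_dev, which identifies the storage device backing a path; if the two roots sit on different devices, the symlinked benchmark data would mix different I/O characteristics. A standalone sketch with placeholder paths:

# Standalone sketch: st_dev identifies the device backing a path.
import os

def same_device(path_a, path_b):
    return os.stat(path_a).st_dev == os.stat(path_b).st_dev

# e.g. same_device("/tmp", "/home") is False when /tmp is a separate tmpfs.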
Example #7
    def _make_datapipe(
        self,
        resource_dps: List[IterDataPipe],
        *,
        config: DatasetConfig,
        decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
    ) -> IterDataPipe[Dict[str, Any]]:
        dp = resource_dps[0]
        dp = TarArchiveReader(dp)                            # extract entries from the tar archive
        dp = Filter(dp, self._is_not_rogue_file)             # drop entries that are not dataset files
        dp = Shuffler(dp, buffer_size=INFINITE_BUFFER_SIZE)  # shuffle with an unbounded buffer
        return Mapper(dp,
                      self._collate_and_decode_sample,
                      fn_kwargs=dict(decoder=decoder))

    def generate_categories_file(self, root: Union[str, pathlib.Path]) -> None:
        dp = self.resources(self.default_config)[0].to_datapipe(
            pathlib.Path(root) / self.name)
        dp = TarArchiveReader(dp)
        dir_names = {pathlib.Path(path).parent.name for path, _ in dp}
        categories = [name.split(".")[1] for name in sorted(dir_names)]
        create_categories_file(HERE, self.name, categories)


if __name__ == "__main__":
    from torchvision.prototype.datasets import home

    root = home()
    Caltech101().generate_categories_file(root)
    Caltech256().generate_categories_file(root)