Example #1
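The `run` method of the dataset test command from an older release of the HuggingFace `datasets` library: `prepare_module` resolves the dataset script to a module path and hash, and the configs under test are sharded round-robin across processes via `_num_proc` and `_proc_rank`.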
    def run(self):
        fl_logger().setLevel(ERROR)
        if self._name is not None and self._all_configs:
            print(
                "Both parameters `config` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        module_path, hash = prepare_module(path)
        builder_cls = import_main_class(module_path)

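        # Builders handled by this rank: an even share of the configs, plus
        # one extra for the first (len % num_proc) ranks; with a single
        # (default) config, only rank 0 tests anything.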
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            n_builders = len(builder_cls.BUILDER_CONFIGS) // self._num_proc
            n_builders += (len(builder_cls.BUILDER_CONFIGS) %
                           self._num_proc) > self._proc_rank
        else:
            n_builders = 1 if self._proc_rank == 0 else 0

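        # Yield only the builders assigned to this process: round-robin over
        # the config index, or a single builder on rank 0 when there is just
        # one (default) config.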
        def get_builders() -> Generator[DatasetBuilder, None, None]:
            if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
                for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                    if i % self._num_proc == self._proc_rank:
                        yield builder_cls(name=config.name,
                                          hash=hash,
                                          cache_dir=self._cache_dir,
                                          data_dir=self._data_dir)
            else:
                if self._proc_rank == 0:
                    yield builder_cls(name=name,
                                      hash=hash,
                                      cache_dir=self._cache_dir,
                                      data_dir=self._data_dir)

        for j, builder in enumerate(get_builders()):
            print(
                f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})"
            )
            builder.download_and_prepare(
                download_mode=REUSE_CACHE_IF_EXISTS
                if not self._force_redownload else FORCE_REDOWNLOAD,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )
            builder.as_dataset()
            if self._save_infos:
                builder._save_infos()

            # If save_infos=True, the dataset infos file is created next to the loaded module file.
            # Let's move it to the original directory of the dataset script, to allow the user to
            # upload them on S3 at the same time afterwards.
            if self._save_infos:
                dataset_infos_path = os.path.join(
                    builder_cls.get_imported_module_dir(),
                    DATASET_INFOS_DICT_FILE_NAME)
                name = Path(path).name + ".py"
                combined_path = os.path.join(path, name)
                if os.path.isfile(path):
                    dataset_dir = os.path.dirname(path)
                elif os.path.isfile(combined_path):
                    dataset_dir = path
                else:  # in case of a remote dataset
                    dataset_dir = None
                    print("Dataset Infos file saved at {}".format(
                        dataset_infos_path))

                # Move dataset_info back to the user
                if dataset_dir is not None:
                    user_dataset_infos_path = os.path.join(
                        dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
                    copyfile(dataset_infos_path, user_dataset_infos_path)
                    print("Dataset Infos file saved at {}".format(
                        user_dataset_infos_path))

            # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
            if self._clear_cache:
                if os.path.isdir(builder._cache_dir):
                    logger.warning(f"Clearing cache at {builder._cache_dir}")
                    rmtree(builder._cache_dir)
                download_dir = os.path.join(self._cache_dir, "downloads")
                if os.path.isdir(download_dir):
                    logger.warning(f"Clearing cache at {download_dir}")
                    rmtree(download_dir)

        print("Test successful.")
Example #2
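A later revision of the same method: `dataset_module_factory` replaces `prepare_module`, download modes come from the `DownloadMode` enum, file names are read from `datasets.config`, and local directories containing only data files (no script) are handled when relocating the infos file.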
    def run(self):
        fl_logger().setLevel(ERROR)
        if self._name is not None and self._all_configs:
            print(
                "Both parameters `config` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        module = dataset_module_factory(path)
        builder_cls = import_main_class(module.module_path)
        n_builders = len(
            builder_cls.BUILDER_CONFIGS
        ) if self._all_configs and builder_cls.BUILDER_CONFIGS else 1

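        # builder_kwargs returned by the module factory may already pin a
        # `name`; passing name= again alongside **module.builder_kwargs would
        # raise a duplicate-keyword TypeError, hence the branches below.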
        def get_builders() -> Generator[DatasetBuilder, None, None]:
            if self._all_configs and builder_cls.BUILDER_CONFIGS:
                for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                    if "name" in module.builder_kwargs:
                        yield builder_cls(
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir,
                            **module.builder_kwargs,
                        )
                    else:
                        yield builder_cls(
                            name=config.name,
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir,
                            **module.builder_kwargs,
                        )
            else:
                if "name" in module.builder_kwargs:
                    yield builder_cls(cache_dir=self._cache_dir,
                                      data_dir=self._data_dir,
                                      **module.builder_kwargs)
                else:
                    yield builder_cls(name=name,
                                      cache_dir=self._cache_dir,
                                      data_dir=self._data_dir,
                                      **module.builder_kwargs)

        for j, builder in enumerate(get_builders()):
            print(
                f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})"
            )
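            # Internal builder flag (assumption: records split/size infos
            # while preparing, so _save_infos() below has data to write).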
            builder._record_infos = True
            builder.download_and_prepare(
                download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS if
                not self._force_redownload else DownloadMode.FORCE_REDOWNLOAD,
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )
            builder.as_dataset()
            if self._save_infos:
                builder._save_infos()

            # If save_infos=True, the dataset infos file is created next to the loaded module file.
            # Let's move it to the original directory of the dataset script, to allow the user to
            # upload them on S3 at the same time afterwards.
            if self._save_infos:
                dataset_infos_path = os.path.join(
                    builder_cls.get_imported_module_dir(),
                    datasets.config.DATASETDICT_INFOS_FILENAME)
                name = Path(path).name + ".py"
                combined_path = os.path.join(path, name)
                if os.path.isfile(path):
                    dataset_dir = os.path.dirname(path)
                elif os.path.isfile(combined_path):
                    dataset_dir = path
                elif os.path.isdir(
                        path
                ):  # for local directories containing only data files
                    dataset_dir = path
                else:  # in case of a remote dataset
                    dataset_dir = None
                    print(f"Dataset Infos file saved at {dataset_infos_path}")

                # Move dataset_info back to the user
                if dataset_dir is not None:
                    user_dataset_infos_path = os.path.join(
                        dataset_dir,
                        datasets.config.DATASETDICT_INFOS_FILENAME)
                    copyfile(dataset_infos_path, user_dataset_infos_path)
                    print(
                        f"Dataset Infos file saved at {user_dataset_infos_path}"
                    )

            # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
            if self._clear_cache:
                if os.path.isdir(builder._cache_dir):
                    logger.warning(f"Clearing cache at {builder._cache_dir}")
                    rmtree(builder._cache_dir)
                download_dir = os.path.join(
                    self._cache_dir, datasets.config.DOWNLOADED_DATASETS_DIR)
                if os.path.isdir(download_dir):
                    logger.warning(f"Clearing cache at {download_dir}")
                    rmtree(download_dir)

        print("Test successful.")