def load_builder_class(self, dataset_name, is_local=False):
    # Download/copy dataset script
    if is_local is True:
        dataset_module = dataset_module_factory(os.path.join("datasets", dataset_name))
    else:
        dataset_module = dataset_module_factory(dataset_name, download_config=DownloadConfig(force_download=True))
    # Get dataset builder class
    builder_cls = import_main_class(dataset_module.module_path)
    return builder_cls
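For context (not part of the original helpers), the same two calls can be exercised outside a test class to inspect what a local dataset script exposes; a minimal sketch, assuming a checkout of the `datasets` repository with a `datasets/squad` script and a library version that still ships `dataset_module_factory` and `import_main_class`:

import os

from datasets.load import dataset_module_factory, import_main_class

# Hypothetical standalone check: resolve a local dataset script and list its configs.
dataset_module = dataset_module_factory(os.path.join("datasets", "squad"))
builder_cls = import_main_class(dataset_module.module_path)

print(builder_cls.__name__)
print([cfg.name for cfg in builder_cls.BUILDER_CONFIGS] or "no configs")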
def test_load_real_dataset(self, dataset_name):
    path = "./datasets/" + dataset_name
    dataset_module = dataset_module_factory(path, download_config=DownloadConfig(local_files_only=True))
    builder_cls = import_main_class(dataset_module.module_path)
    name = builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None
    with tempfile.TemporaryDirectory() as temp_cache_dir:
        dataset = load_dataset(path, name=name, cache_dir=temp_cache_dir, download_mode=GenerateMode.FORCE_REDOWNLOAD)
        for split in dataset.keys():
            self.assertTrue(len(dataset[split]) > 0)
        del dataset
def run(self):
    set_verbosity_warning()
    dataset_module = dataset_module_factory(self._path_to_dataset)
    builder_cls = import_main_class(dataset_module.module_path)

    # use `None` as config if no configs
    builder_configs = builder_cls.BUILDER_CONFIGS or [None]
    auto_generate_results = []
    with tempfile.TemporaryDirectory() as tmp_dir:
        for builder_config in builder_configs:
            if builder_config is None:
                name = None
                version = builder_cls.VERSION
            else:
                version = builder_config.version
                name = builder_config.name

            dataset_builder = builder_cls(name=name, hash=dataset_module.hash, cache_dir=tmp_dir)
            mock_dl_manager = MockDownloadManager(
                dataset_name=self._dataset_name,
                config=builder_config,
                version=version,
                use_local_dummy_data=True,
                load_existing_dummy_data=False,
            )

            if self._auto_generate:
                auto_generate_results.append(
                    self._autogenerate_dummy_data(
                        dataset_builder=dataset_builder,
                        mock_dl_manager=mock_dl_manager,
                        keep_uncompressed=self._keep_uncompressed,
                    )
                )
            else:
                self._print_dummy_data_instructions(
                    dataset_builder=dataset_builder, mock_dl_manager=mock_dl_manager
                )

    if self._auto_generate and not self._keep_uncompressed:
        if all(auto_generate_results):
            print(f"Automatic dummy data generation succeeded for all configs of '{self._path_to_dataset}'")
        else:
            print(f"Automatic dummy data generation failed for some configs of '{self._path_to_dataset}'")
def test_load_real_dataset_all_configs(self, dataset_name):
    path = "./datasets/" + dataset_name
    dataset_module = dataset_module_factory(path, download_config=DownloadConfig(local_files_only=True))
    builder_cls = import_main_class(dataset_module.module_path)
    config_names = (
        [config.name for config in builder_cls.BUILDER_CONFIGS] if len(builder_cls.BUILDER_CONFIGS) > 0 else [None]
    )
    for name in config_names:
        with tempfile.TemporaryDirectory() as temp_cache_dir:
            dataset = load_dataset(
                path, name=name, cache_dir=temp_cache_dir, download_mode=DownloadMode.FORCE_REDOWNLOAD
            )
            for split in dataset.keys():
                self.assertTrue(len(dataset[split]) > 0)
            del dataset
def test_dataset_info_available(self, dataset, config_name):
    with TemporaryDirectory() as tmp_dir:
        dataset_module = dataset_module_factory(
            os.path.join("datasets", dataset), cache_dir=tmp_dir, local_files_only=True
        )
        builder_cls = import_main_class(dataset_module.module_path, dataset=True)
        builder_instance: DatasetBuilder = builder_cls(
            cache_dir=tmp_dir,
            name=config_name,
            hash=dataset_module.hash,
        )
        dataset_info_url = os.path.join(
            HF_GCP_BASE_URL, builder_instance._relative_data_dir(with_hash=False), config.DATASET_INFO_FILENAME
        ).replace(os.sep, "/")
        dataset_info_path = cached_path(dataset_info_url, cache_dir=tmp_dir)
        self.assertTrue(os.path.exists(dataset_info_path))
def run(self):
    fl_logger().setLevel(ERROR)
    if self._name is not None and self._all_configs:
        print("Both parameters `config` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name
    module = dataset_module_factory(path)
    builder_cls = import_main_class(module.module_path)

    if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
        n_builders = len(builder_cls.BUILDER_CONFIGS) // self._num_proc
        n_builders += (len(builder_cls.BUILDER_CONFIGS) % self._num_proc) > self._proc_rank
    else:
        n_builders = 1 if self._proc_rank == 0 else 0

    def get_builders() -> Generator[DatasetBuilder, None, None]:
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                if i % self._num_proc == self._proc_rank:
                    if "name" in module.builder_kwargs:
                        yield builder_cls(
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir,
                            **module.builder_kwargs,
                        )
                    else:
                        yield builder_cls(
                            name=config.name,
                            cache_dir=self._cache_dir,
                            data_dir=self._data_dir,
                            **module.builder_kwargs,
                        )
        else:
            if self._proc_rank == 0:
                if "name" in module.builder_kwargs:
                    yield builder_cls(cache_dir=self._cache_dir, data_dir=self._data_dir, **module.builder_kwargs)
                else:
                    yield builder_cls(
                        name=name, cache_dir=self._cache_dir, data_dir=self._data_dir, **module.builder_kwargs
                    )

    for j, builder in enumerate(get_builders()):
        print(f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})")
        builder._record_infos = True
        builder.download_and_prepare(
            download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS
            if not self._force_redownload
            else DownloadMode.FORCE_REDOWNLOAD,
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )
        builder.as_dataset()
        if self._save_infos:
            builder._save_infos()

        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(
                builder_cls.get_imported_module_dir(), datasets.config.DATASETDICT_INFOS_FILENAME
            )
            name = Path(path).name + ".py"
            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            elif os.path.isdir(path):  # for local directories containing only data files
                dataset_dir = path
            else:  # in case of a remote dataset
                dataset_dir = None
                print(f"Dataset Infos file saved at {dataset_infos_path}")

            # Move dataset_info back to the user
            if dataset_dir is not None:
                user_dataset_infos_path = os.path.join(dataset_dir, datasets.config.DATASETDICT_INFOS_FILENAME)
                copyfile(dataset_infos_path, user_dataset_infos_path)
                print(f"Dataset Infos file saved at {user_dataset_infos_path}")

        # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
        if self._clear_cache:
            if os.path.isdir(builder._cache_dir):
                logger.warning(f"Clearing cache at {builder._cache_dir}")
                rmtree(builder._cache_dir)
            download_dir = os.path.join(self._cache_dir, datasets.config.DOWNLOADED_DATASETS_DIR)
            if os.path.isdir(download_dir):
                logger.warning(f"Clearing cache at {download_dir}")
                rmtree(download_dir)

    print("Test successful.")
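The `i % self._num_proc == self._proc_rank` check above (together with the `n_builders` arithmetic) shards configs across parallel test processes in round-robin order. A small self-contained illustration with made-up config names and a made-up process count:

# Made-up config names and process layout, to illustrate the round-robin sharding
# used by the test command above.
builder_config_names = ["en", "fr", "de", "es", "it"]
num_proc = 2

for proc_rank in range(num_proc):
    assigned = [name for i, name in enumerate(builder_config_names) if i % num_proc == proc_rank]
    # Same count that the `n_builders` arithmetic above computes for this rank.
    n_builders = len(builder_config_names) // num_proc + ((len(builder_config_names) % num_proc) > proc_rank)
    print(f"process {proc_rank} tests {n_builders} configs: {assigned}")
# process 0 tests 3 configs: ['en', 'de', 'it']
# process 1 tests 2 configs: ['fr', 'es']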
def run(self):
    import apache_beam as beam

    if self._name is not None and self._all_configs:
        print("Both parameters `name` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name

    dataset_module = dataset_module_factory(path)
    builder_cls = import_main_class(dataset_module.module_path)
    builders: List[DatasetBuilder] = []
    if self._beam_pipeline_options:
        beam_options = beam.options.pipeline_options.PipelineOptions(
            flags=[f"--{opt.strip()}" for opt in self._beam_pipeline_options.split(",") if opt]
        )
    else:
        beam_options = None

    if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
        for builder_config in builder_cls.BUILDER_CONFIGS:
            builders.append(
                builder_cls(
                    name=builder_config.name,
                    data_dir=self._data_dir,
                    hash=dataset_module.hash,
                    beam_options=beam_options,
                    cache_dir=self._cache_dir,
                    base_path=dataset_module.builder_kwargs.get("base_path"),
                    namespace=dataset_module.builder_kwargs.get("namespace"),
                )
            )
    else:
        builders.append(
            builder_cls(
                name=name,
                data_dir=self._data_dir,
                beam_options=beam_options,
                cache_dir=self._cache_dir,
                base_path=dataset_module.builder_kwargs.get("base_path"),
                namespace=dataset_module.builder_kwargs.get("namespace"),
            )
        )

    for builder in builders:
        builder.download_and_prepare(
            download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS
            if not self._force_redownload
            else DownloadMode.FORCE_REDOWNLOAD,
            download_config=DownloadConfig(cache_dir=config.DOWNLOADED_DATASETS_PATH),
            save_infos=self._save_infos,
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )

    print("Apache beam run successful.")

    # If save_infos=True, the dataset infos file is created next to the loaded module file.
    # Let's move it to the original directory of the dataset script, to allow the user to
    # upload them on S3 at the same time afterwards.
    if self._save_infos:
        dataset_infos_path = os.path.join(
            builder_cls.get_imported_module_dir(), config.DATASETDICT_INFOS_FILENAME
        )
        name = Path(path).name + ".py"
        combined_path = os.path.join(path, name)
        if os.path.isfile(path):
            dataset_dir = os.path.dirname(path)
        elif os.path.isfile(combined_path):
            dataset_dir = path
        else:  # in case of a remote dataset
            print(f"Dataset Infos file saved at {dataset_infos_path}")
            exit(1)

        # Move dataset_infos back to the user
        user_dataset_infos_path = os.path.join(dataset_dir, config.DATASETDICT_INFOS_FILENAME)
        copyfile(dataset_infos_path, user_dataset_infos_path)
        print(f"Dataset Infos file saved at {user_dataset_infos_path}")
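For reference, the comma-separated `--beam_pipeline_options` string is split into individual Beam flags before `PipelineOptions` is built. A minimal sketch of just that step, with a made-up option string (requires `apache_beam` to be installed):

import apache_beam as beam

# Made-up value a user might pass as --beam_pipeline_options.
raw_options = "runner=DirectRunner,direct_num_workers=2"

# Same transformation the run() method above applies to self._beam_pipeline_options.
flags = [f"--{opt.strip()}" for opt in raw_options.split(",") if opt]
print(flags)  # ['--runner=DirectRunner', '--direct_num_workers=2']

beam_options = beam.options.pipeline_options.PipelineOptions(flags=flags)
print(beam_options.get_all_options(drop_default=True))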