def test_loaded_streaming_dataset_has_use_auth_token(dataset_loading_script_dir, data_dir):
    from datasets.utils.streaming_download_manager import xopen

    use_auth_token = "foo"
    load_dataset(dataset_loading_script_dir, streaming=True, data_dir=data_dir, use_auth_token=use_auth_token)
    module_path, _ = prepare_module(dataset_loading_script_dir)
    module = importlib.import_module(module_path)
    assert isinstance(module.open, partial)
    assert module.open.func is xopen
    assert module.open.keywords == {"use_auth_token": use_auth_token}
def run(self):
    set_verbosity_warning()
    module_path, hash = prepare_module(self._path_to_dataset)
    builder_cls = import_main_class(module_path)

    # use `None` as config if no configs
    configs = builder_cls.BUILDER_CONFIGS or [None]
    auto_generate_results = []
    with tempfile.TemporaryDirectory() as tmp_dir:
        for config in configs:
            if config is None:
                name = None
                version = builder_cls.VERSION
            else:
                version = config.version
                name = config.name
            dataset_builder = builder_cls(name=name, hash=hash, cache_dir=tmp_dir)
            mock_dl_manager = MockDownloadManager(
                dataset_name=self._dataset_name,
                config=config,
                version=version,
                is_local=True,
                load_existing_dummy_data=False,
            )
            if self._auto_generate:
                auto_generate_results.append(
                    self._autogenerate_dummy_data(
                        dataset_builder=dataset_builder,
                        mock_dl_manager=mock_dl_manager,
                        keep_uncompressed=self._keep_uncompressed,
                    )
                )
            else:
                self._print_dummy_data_instructions(
                    dataset_builder=dataset_builder, mock_dl_manager=mock_dl_manager
                )
        if self._auto_generate and not self._keep_uncompressed:
            if all(auto_generate_results):
                print(f"Automatic dummy data generation succeeded for all configs of '{self._path_to_dataset}'")
            else:
                print(f"Automatic dummy data generation failed for some configs of '{self._path_to_dataset}'")
def get_compatible_task_template(task: str, dataset: str, config: str = None):
    module, module_hash = prepare_module(dataset)
    builder_cls = import_main_class(module)
    builder = builder_cls(hash=module_hash, name=config)
    templates = builder.info.task_templates
    if templates:
        compatible_templates = [template for template in templates if template.task == task]
        if not compatible_templates:
            raise ValueError(f"❌ Task `{task}` is not compatible with dataset `{dataset}`!")
        if len(compatible_templates) > 1:
            raise ValueError(
                f"❌ Expected 1 task template but found {len(compatible_templates)}! Please ensure that"
                " `datasets.DatasetInfo.task_templates` contains a unique set of task types."
            )
        return compatible_templates[0]
    else:
        return None
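# Hedged usage sketch, not part of the original source: "imdb" is an illustrative dataset
# name assumed to declare a `text-classification` task template; any dataset name works.
template = get_compatible_task_template(task="text-classification", dataset="imdb")
print(template.task if template is not None else "dataset declares no task templates")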
def test_dataset_info_available(self, dataset, config_name):
    with TemporaryDirectory() as tmp_dir:
        local_module_path, local_hash = prepare_module(
            os.path.join("datasets", dataset), dataset=True, cache_dir=tmp_dir, local_files_only=True
        )
        builder_cls = import_main_class(local_module_path, dataset=True)
        builder_instance: DatasetBuilder = builder_cls(
            cache_dir=tmp_dir,
            name=config_name,
            hash=local_hash,
        )
        dataset_info_url = os.path.join(
            HF_GCP_BASE_URL, builder_instance._relative_data_dir(with_hash=False), DATASET_INFO_FILENAME
        ).replace(os.sep, "/")
        dataset_info_path = cached_path(dataset_info_url, cache_dir=tmp_dir)
        self.assertTrue(os.path.exists(dataset_info_path))
def run(self):
    fl_logger().setLevel(ERROR)
    if self._name is not None and self._all_configs:
        print("Both parameters `config` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name
    module_path, hash = prepare_module(path)
    builder_cls = import_main_class(module_path)

    if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
        n_builders = len(builder_cls.BUILDER_CONFIGS) // self._num_proc
        n_builders += (len(builder_cls.BUILDER_CONFIGS) % self._num_proc) > self._proc_rank
    else:
        n_builders = 1 if self._proc_rank == 0 else 0

    def get_builders() -> Generator[DatasetBuilder, None, None]:
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            # shard the configs across the `num_proc` processes: this rank handles every `num_proc`-th config
            for i, config in enumerate(builder_cls.BUILDER_CONFIGS):
                if i % self._num_proc == self._proc_rank:
                    yield builder_cls(
                        name=config.name, hash=hash, cache_dir=self._cache_dir, data_dir=self._data_dir
                    )
        else:
            if self._proc_rank == 0:
                yield builder_cls(name=name, hash=hash, cache_dir=self._cache_dir, data_dir=self._data_dir)

    for j, builder in enumerate(get_builders()):
        print(f"Testing builder '{builder.config.name}' ({j + 1}/{n_builders})")
        builder.download_and_prepare(
            download_mode=REUSE_CACHE_IF_EXISTS if not self._force_redownload else FORCE_REDOWNLOAD,
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )
        builder.as_dataset()
        if self._save_infos:
            builder._save_infos()

        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME)
            name = Path(path).name + ".py"
            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            else:  # in case of a remote dataset
                dataset_dir = None
                print("Dataset Infos file saved at {}".format(dataset_infos_path))

            # Move dataset_info back to the user
            if dataset_dir is not None:
                user_dataset_infos_path = os.path.join(dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
                copyfile(dataset_infos_path, user_dataset_infos_path)
                print("Dataset Infos file saved at {}".format(user_dataset_infos_path))

        # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
        if self._clear_cache:
            if os.path.isdir(builder._cache_dir):
                logger.warning(f"Clearing cache at {builder._cache_dir}")
                rmtree(builder._cache_dir)
            download_dir = os.path.join(self._cache_dir, "downloads")
            if os.path.isdir(download_dir):
                logger.warning(f"Clearing cache at {download_dir}")
                rmtree(download_dir)

    print("Test successful.")
def run(self):
    import apache_beam as beam

    if self._name is not None and self._all_configs:
        print("Both parameters `name` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name
    module_path, hash = prepare_module(path)
    builder_cls = import_main_class(module_path)
    builders: List[DatasetBuilder] = []
    if self._beam_pipeline_options:
        beam_options = beam.options.pipeline_options.PipelineOptions(
            flags=["--%s" % opt.strip() for opt in self._beam_pipeline_options.split(",") if opt]
        )
    else:
        beam_options = None
    if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
        for config in builder_cls.BUILDER_CONFIGS:
            builders.append(
                builder_cls(
                    name=config.name,
                    data_dir=self._data_dir,
                    hash=hash,
                    beam_options=beam_options,
                    cache_dir=self._cache_dir,
                )
            )
    else:
        builders.append(
            builder_cls(name=name, data_dir=self._data_dir, beam_options=beam_options, cache_dir=self._cache_dir)
        )

    for builder in builders:
        builder.download_and_prepare(
            download_mode=REUSE_CACHE_IF_EXISTS if not self._force_redownload else FORCE_REDOWNLOAD,
            download_config=DownloadConfig(cache_dir=os.path.join(HF_DATASETS_CACHE, "downloads")),
            save_infos=self._save_infos,
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )

    print("Apache beam run successful.")

    # If save_infos=True, the dataset infos file is created next to the loaded module file.
    # Let's move it to the original directory of the dataset script, to allow the user to
    # upload them on S3 at the same time afterwards.
    if self._save_infos:
        dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME)
        name = list(filter(lambda x: x, path.split("/")))[-1] + ".py"
        combined_path = os.path.join(path, name)
        if os.path.isfile(path):
            dataset_dir = os.path.dirname(path)
        elif os.path.isfile(combined_path):
            dataset_dir = path
        else:  # in case of a remote dataset
            print("Dataset Infos file saved at {}".format(dataset_infos_path))
            exit(1)

        # Move dataset_info back to the user
        user_dataset_infos_path = os.path.join(dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
        copyfile(dataset_infos_path, user_dataset_infos_path)
        print("Dataset Infos file saved at {}".format(user_dataset_infos_path))
def get_dataset_splits(dataset: str, config: str = None):
    module, module_hash = prepare_module(dataset)
    builder_cls = import_main_class(module)
    builder = builder_cls(hash=module_hash, name=config)
    splits = builder.info.splits.keys()
    return list(splits)
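# Hedged usage sketch, not part of the original source: "squad" is an illustrative dataset
# name; the call only relies on `prepare_module` / `import_main_class` exactly as used above.
print(get_dataset_splits("squad"))  # e.g. ["train", "validation"] for a typical dataset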
def run(self):
    module_path, hash = prepare_module(self._path_to_dataset)
    builder_cls = import_main_class(module_path)

    # use `None` as config if no configs
    configs = builder_cls.BUILDER_CONFIGS or [None]

    for config in configs:
        if config is None:
            name = None
            version = builder_cls.VERSION
        else:
            version = config.version
            name = config.name

        dataset_builder = builder_cls(name=name, hash=hash)
        mock_dl_manager = MockDownloadManager(
            dataset_name=self._dataset_name, config=config, version=version, is_local=True
        )

        dummy_data_folder = os.path.join(self._path_to_dataset, mock_dl_manager.dummy_data_folder)
        logger.info(f"Creating dummy folder structure for {dummy_data_folder}... ")
        os.makedirs(dummy_data_folder, exist_ok=True)

        try:
            generator_splits = dataset_builder._split_generators(mock_dl_manager)
        except FileNotFoundError as e:
            print(
                f"Dataset {self._dataset_name} with config {config} seems to already open files in the method"
                " `_split_generators(...)`. You might consider opening files only in the method"
                f" `_generate_examples(...)` instead. If this is not possible the dummy data has to be created"
                f" with less guidance. Make sure you create the file {e.filename}."
            )

        files_to_create = set()
        split_names = []
        dummy_file_name = mock_dl_manager.dummy_file_name

        for split in generator_splits:
            logger.info(f"Collecting dummy data file paths to create for {split.name}")
            split_names.append(split.name)
            gen_kwargs = split.gen_kwargs
            generator = dataset_builder._generate_examples(**gen_kwargs)

            try:
                dummy_data_guidance_print = "\n" + 30 * "=" + "DUMMY DATA INSTRUCTIONS" + 30 * "=" + "\n"
                config_string = f"config {config.name} of " if config is not None else ""
                dummy_data_guidance_print += (
                    "- In order to create the dummy data for "
                    + config_string
                    + f"{self._dataset_name}, please go into the folder '{dummy_data_folder}' with"
                    f" `cd {dummy_data_folder}` . \n\n"
                )

                # trigger generate function
                for key, record in generator:
                    pass

                dummy_data_guidance_print += (
                    f"- It appears that the function `_generate_examples(...)` expects one or more files in the"
                    f" folder {dummy_file_name} using the function `glob.glob(...)`. In this case, please refer to"
                    " the `_generate_examples(...)` method to see under which filename the dummy data files should"
                    " be created. \n\n"
                )

            except FileNotFoundError as e:
                files_to_create.add(e.filename)

        split_names = ", ".join(split_names)
        if len(files_to_create) > 0:
            # no glob.glob(...) in `_generate_examples(...)`
            if len(files_to_create) == 1 and next(iter(files_to_create)) == dummy_file_name:
                dummy_data_guidance_print += (
                    f"- Please create a single dummy data file called '{next(iter(files_to_create))}' from the"
                    f" folder '{dummy_data_folder}'. Make sure that the dummy data file provides at least one"
                    f" example for the split(s) '{split_names}' \n\n"
                )
                files_string = dummy_file_name
            else:
                files_string = ", ".join(files_to_create)
                dummy_data_guidance_print += (
                    f"- Please create the following dummy data files '{files_string}' from the folder"
                    f" '{dummy_data_folder}'\n\n"
                )

                dummy_data_guidance_print += (
                    f"- For each of the splits '{split_names}', make sure that one or more of the dummy data files"
                    " provide at least one example \n\n"
                )

            dummy_data_guidance_print += (
                f"- If the method `_generate_examples(...)` includes multiple `open()` statements, you might have"
                f" to create other files in addition to '{files_string}'. In this case please refer to the"
                " `_generate_examples(...)` method \n\n"
            )

        if len(files_to_create) == 1 and next(iter(files_to_create)) == dummy_file_name:
            dummy_data_guidance_print += (
                f"- After the dummy data file is created, it should be zipped to '{dummy_file_name}.zip' with the"
                f" command `zip {dummy_file_name}.zip {dummy_file_name}` \n\n"
            )

            dummy_data_guidance_print += (
                f"- You can now delete the file '{dummy_file_name}' with the command `rm {dummy_file_name}` \n\n"
            )

            dummy_data_guidance_print += (
                f"- To get the file '{dummy_file_name}' back for further changes to the dummy data, simply unzip"
                f" {dummy_file_name}.zip with the command `unzip {dummy_file_name}.zip` \n\n"
            )
        else:
            dummy_data_guidance_print += (
                f"- After all dummy data files are created, they should be zipped recursively to"
                f" '{dummy_file_name}.zip' with the command `zip -r {dummy_file_name}.zip {dummy_file_name}/` \n\n"
            )

            dummy_data_guidance_print += (
                f"- You can now delete the folder '{dummy_file_name}' with the command `rm -r {dummy_file_name}` \n\n"
            )

            dummy_data_guidance_print += (
                f"- To get the folder '{dummy_file_name}' back for further changes to the dummy data, simply unzip"
                f" {dummy_file_name}.zip with the command `unzip {dummy_file_name}.zip` \n\n"
            )

        dummy_data_guidance_print += (
            f"- Make sure you have created the file '{dummy_file_name}.zip' in '{dummy_data_folder}' \n"
        )

        dummy_data_guidance_print += 83 * "=" + "\n"

        print(dummy_data_guidance_print)
def run(self):
    if self._name is not None and self._all_configs:
        print("Both parameters `config` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name
    module_path, hash = prepare_module(path)
    builder_cls = import_main_class(module_path)
    builders: List[DatasetBuilder] = []
    if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
        for config in builder_cls.BUILDER_CONFIGS:
            builders.append(
                builder_cls(name=config.name, hash=hash, cache_dir=self._cache_dir, data_dir=self._data_dir)
            )
    else:
        builders.append(builder_cls(name=name, hash=hash, cache_dir=self._cache_dir, data_dir=self._data_dir))

    for builder in builders:
        builder.download_and_prepare(
            download_mode=REUSE_CACHE_IF_EXISTS if not self._force_redownload else FORCE_REDOWNLOAD,
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )
        builder.as_dataset()
        if self._save_infos:
            builder._save_infos()

    print("Test successful.")

    # If save_infos=True, the dataset infos file is created next to the loaded module file.
    # Let's move it to the original directory of the dataset script, to allow the user to
    # upload them on S3 at the same time afterwards.
    if self._save_infos:
        dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME)
        name = Path(path).name + ".py"
        combined_path = os.path.join(path, name)
        if os.path.isfile(path):
            dataset_dir = os.path.dirname(path)
        elif os.path.isfile(combined_path):
            dataset_dir = path
        else:  # in case of a remote dataset
            print("Dataset Infos file saved at {}".format(dataset_infos_path))
            exit(1)

        # Move dataset_info back to the user
        user_dataset_infos_path = os.path.join(dataset_dir, DATASET_INFOS_DICT_FILE_NAME)
        copyfile(dataset_infos_path, user_dataset_infos_path)
        print("Dataset Infos file saved at {}".format(user_dataset_infos_path))
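# Hedged sketch, not part of the original commands: the same prepare/import/build flow the
# test command wraps, driven directly from Python. The local script path "./datasets/my_dataset"
# is an illustrative assumption; only calls already used above are relied upon.
module_path, hash = prepare_module("./datasets/my_dataset")
builder_cls = import_main_class(module_path)
builder = builder_cls(name=None, hash=hash)  # `name=None` assumes the script defines no configs
builder.download_and_prepare(try_from_hf_gcs=False)
dataset = builder.as_dataset()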