# Imports needed by the snippets below (module paths follow the datasets 1.x layout):
import json
import os
import shutil
from pathlib import Path
from shutil import copyfile
from tempfile import TemporaryDirectory
from typing import List

import pytest

import datasets
from datasets import config
from datasets.builder import DatasetBuilder
from datasets.commands.dummy_data import DummyDataGeneratorDownloadManager
from datasets.load import import_main_class, prepare_module
from datasets.utils.download_manager import DownloadManager, GenerateMode
from datasets.utils.file_utils import DownloadConfig, hash_url_to_filename
from datasets.utils.mock_download_manager import MockDownloadManager
from datasets.utils.version import Version


def test_dummy_data_autogenerate(self):
    n_lines = 5

    with TemporaryDirectory() as tmp_dir:
        # Create the "real" data files that the builder would normally download.
        with open(os.path.join(tmp_dir, "train.txt"), "w", encoding="utf-8") as f:
            f.write("foo\nbar\n" * 10)
        with open(os.path.join(tmp_dir, "test.txt"), "w", encoding="utf-8") as f:
            f.write("foo\nbar\n" * 10)

        class MockDownloadManagerWithCustomDatasetsScriptsDir(MockDownloadManager):
            datasets_scripts_dir = os.path.join(tmp_dir, "datasets")

        cache_dir = os.path.join(tmp_dir, "cache")
        os.makedirs(cache_dir, exist_ok=True)
        dataset_builder = DummyBuilder(tmp_test_dir=tmp_dir, cache_dir=cache_dir)
        mock_dl_manager = MockDownloadManagerWithCustomDatasetsScriptsDir(
            dataset_name=dataset_builder.name,
            config=None,
            version=Version("0.0.0"),
            use_local_dummy_data=True,
            cache_dir=cache_dir,
            load_existing_dummy_data=False,  # the dummy data doesn't exist yet
        )
        download_config = DownloadConfig(cache_dir=os.path.join(tmp_dir, datasets.config.DOWNLOADED_DATASETS_DIR))
        dl_manager = DummyDataGeneratorDownloadManager(
            dataset_name=dataset_builder.name,
            mock_download_manager=mock_dl_manager,
            download_config=download_config,
        )

        # First pass: prepare the dataset with the generator download manager so that it
        # records which files are downloaded, then discard the prepared cache.
        dataset_builder.download_and_prepare(dl_manager=dl_manager, try_from_hf_gcs=False)
        shutil.rmtree(dataset_builder._cache_dir)

        # Auto-generate dummy data files truncated to n_lines and zip them where the
        # mock download manager expects to find them.
        dl_manager.auto_generate_dummy_data_folder(n_lines=n_lines)
        path_to_dataset = os.path.join(mock_dl_manager.datasets_scripts_dir, mock_dl_manager.dataset_name)
        dl_manager.compress_autogenerated_dummy_data(path_to_dataset)

        # Second pass: rebuild the dataset from the freshly generated dummy data.
        mock_dl_manager.load_existing_dummy_data = True
        dataset_builder.download_and_prepare(
            dl_manager=mock_dl_manager, ignore_verifications=True, try_from_hf_gcs=False
        )
        dataset = dataset_builder.as_dataset(split="train")
        self.assertEqual(len(dataset), n_lines)
        del dataset
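# The `DummyBuilder` referenced above is defined elsewhere in the test suite. A minimal
# sketch of what such a builder could look like, assuming a GeneratorBasedBuilder that
# "downloads" the local train.txt/test.txt written by the test (the class body below is
# an illustration, not the real implementation):
class DummyBuilderSketch(datasets.GeneratorBasedBuilder):
    def __init__(self, tmp_test_dir=None, **kwargs):
        self.tmp_test_dir = tmp_test_dir  # directory holding train.txt / test.txt
        super().__init__(**kwargs)

    def _info(self):
        return datasets.DatasetInfo(features=datasets.Features({"text": datasets.Value("string")}))

    def _split_generators(self, dl_manager):
        # Going through dl_manager.download() is what lets DummyDataGeneratorDownloadManager
        # record the files and later substitute the auto-generated dummy versions.
        train_path = dl_manager.download(os.path.join(self.tmp_test_dir, "train.txt"))
        test_path = dl_manager.download(os.path.join(self.tmp_test_dir, "test.txt"))
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": train_path}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": test_path}),
        ]

    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as f:
            for i, line in enumerate(f):
                yield i, {"text": line.strip()}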
@pytest.mark.parametrize("urls_type", [str, list, dict])
def test_download_manager_download(urls_type, tmp_path, monkeypatch):
    import requests

    # Patch requests so that no real HTTP call is made; mock_request serves CONTENT.
    monkeypatch.setattr(requests, "request", mock_request)

    # The manager accepts a single URL, a list of URLs, or a dict of URLs.
    url = URL
    if issubclass(urls_type, str):
        urls = url
    elif issubclass(urls_type, list):
        urls = [url]
    elif issubclass(urls_type, dict):
        urls = {"train": url}

    dataset_name = "dummy"
    cache_subdir = "downloads"
    cache_dir_root = str(tmp_path)
    download_config = DownloadConfig(
        cache_dir=os.path.join(cache_dir_root, cache_subdir),
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config)
    downloaded_paths = dl_manager.download(urls)

    # The output shape mirrors the input shape; normalize all three cases to flat
    # lists so the checks below are uniform.
    input_urls = urls
    if isinstance(urls, str):
        downloaded_paths = [downloaded_paths]
        input_urls = [urls]
    elif isinstance(urls, dict):
        assert "train" in downloaded_paths.keys()
        downloaded_paths = downloaded_paths.values()
        input_urls = urls.values()

    assert downloaded_paths
    for downloaded_path, input_url in zip(downloaded_paths, input_urls):
        assert downloaded_path == dl_manager.downloaded_paths[input_url]
        downloaded_path = Path(downloaded_path)
        parts = downloaded_path.parts
        assert parts[-1] == HASH  # the cached file is named after the URL hash
        assert parts[-2] == cache_subdir
        assert downloaded_path.exists()
        content = downloaded_path.read_text()
        assert content == CONTENT
        # Each download also gets a sidecar .json file with its url/etag metadata.
        metadata_downloaded_path = downloaded_path.with_suffix(".json")
        assert metadata_downloaded_path.exists()
        metadata_content = json.loads(metadata_downloaded_path.read_text())
        assert metadata_content == {"url": URL, "etag": None}
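# `URL`, `HASH`, `CONTENT`, and `mock_request` live at module level in the real test
# file. A plausible minimal version for self-containedness (the values are illustrative;
# HASH is simply the cache filename that hash_url_to_filename computes for URL without
# an etag, which is what the test asserts against):
URL = "https://example.com/file1.txt"
CONTENT = "foo bar"
HASH = hash_url_to_filename(URL, etag=None)


class MockResponse:
    status_code = 200
    headers = {"Content-Length": str(len(CONTENT))}
    cookies = {}

    def iter_content(self, **kwargs):
        yield CONTENT.encode("utf-8")


def mock_request(*args, **kwargs):
    # Stands in for requests.request so the test never touches the network.
    return MockResponse()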
@pytest.mark.parametrize("paths_type", [str, list, dict])
def test_download_manager_extract(paths_type, xz_file, text_file):
    # Like download(), extract() accepts a single path, a list, or a dict of paths.
    filename = str(xz_file)
    if issubclass(paths_type, str):
        paths = filename
    elif issubclass(paths_type, list):
        paths = [filename]
    elif issubclass(paths_type, dict):
        paths = {"train": filename}

    dataset_name = "dummy"
    cache_dir = xz_file.parent
    extracted_subdir = "extracted"
    download_config = DownloadConfig(
        cache_dir=cache_dir,
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config)
    extracted_paths = dl_manager.extract(paths)

    # Normalize the three input shapes to flat lists so the checks below are uniform.
    input_paths = paths
    if isinstance(paths, str):
        extracted_paths = [extracted_paths]
        input_paths = [paths]
    elif isinstance(paths, dict):
        assert "train" in extracted_paths.keys()
        extracted_paths = extracted_paths.values()
        input_paths = paths.values()

    assert extracted_paths
    for extracted_path, input_path in zip(extracted_paths, input_paths):
        assert extracted_path == dl_manager.extracted_paths[input_path]
        extracted_path = Path(extracted_path)
        parts = extracted_path.parts
        assert parts[-1] == hash_url_to_filename(input_path, etag=None)
        assert parts[-2] == extracted_subdir
        assert extracted_path.exists()
        # The extracted text must match the uncompressed reference file.
        extracted_file_content = extracted_path.read_text()
        expected_file_content = text_file.read_text()
        assert extracted_file_content == expected_file_content
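# `xz_file` and `text_file` are pytest fixtures (defined in a conftest.py in the real
# repo). A minimal sketch under that assumption: a plain text file plus an xz-compressed
# copy with identical content, so the extraction result can be compared against it.
import lzma

FILE_CONTENT = "Text data.\nSecond line of data."


@pytest.fixture(scope="session")
def text_file(tmp_path_factory):
    path = tmp_path_factory.mktemp("data") / "file.txt"
    path.write_text(FILE_CONTENT)
    return path


@pytest.fixture(scope="session")
def xz_file(tmp_path_factory):
    path = tmp_path_factory.mktemp("data") / "file.txt.xz"
    with lzma.open(path, "wb") as f:
        f.write(FILE_CONTENT.encode("utf-8"))
    return path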
def run(self):
    import apache_beam as beam

    if self._name is not None and self._all_configs:
        print("Both parameters `name` and `all_configs` can't be used at once.")
        exit(1)
    path, name = self._dataset, self._name
    module_path, hash = prepare_module(path)
    builder_cls = import_main_class(module_path)
    builders: List[DatasetBuilder] = []
    if self._beam_pipeline_options:
        # Turn the comma-separated option string into argv-style "--key=value" flags.
        beam_options = beam.options.pipeline_options.PipelineOptions(
            flags=["--%s" % opt.strip() for opt in self._beam_pipeline_options.split(",") if opt]
        )
    else:
        beam_options = None

    # Instantiate one builder per config if --all_configs was passed, else a single one.
    if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
        for builder_config in builder_cls.BUILDER_CONFIGS:
            builders.append(
                builder_cls(
                    name=builder_config.name,
                    data_dir=self._data_dir,
                    hash=hash,
                    beam_options=beam_options,
                    cache_dir=self._cache_dir,
                )
            )
    else:
        builders.append(
            builder_cls(name=name, data_dir=self._data_dir, beam_options=beam_options, cache_dir=self._cache_dir)
        )

    for builder in builders:
        builder.download_and_prepare(
            download_mode=GenerateMode.REUSE_CACHE_IF_EXISTS
            if not self._force_redownload
            else GenerateMode.FORCE_REDOWNLOAD,
            download_config=DownloadConfig(cache_dir=os.path.join(config.HF_DATASETS_CACHE, "downloads")),
            save_infos=self._save_infos,
            ignore_verifications=self._ignore_verifications,
            try_from_hf_gcs=False,
        )

    print("Apache Beam run successful.")

    # If save_infos=True, the dataset infos file is created next to the loaded module
    # file. Move it back to the original directory of the dataset script, so the user
    # can upload both to S3 at the same time afterwards.
    if self._save_infos:
        dataset_infos_path = os.path.join(builder_cls.get_imported_module_dir(), config.DATASETDICT_INFOS_FILENAME)

        name = Path(path).name + ".py"
        combined_path = os.path.join(path, name)
        if os.path.isfile(path):
            dataset_dir = os.path.dirname(path)
        elif os.path.isfile(combined_path):
            dataset_dir = path
        else:  # remote dataset: nothing local to copy to, just report the location
            print("Dataset Infos file saved at {}".format(dataset_infos_path))
            exit(1)

        # Copy the dataset infos file back into the user's dataset directory.
        user_dataset_infos_path = os.path.join(dataset_dir, config.DATASETDICT_INFOS_FILENAME)
        copyfile(dataset_infos_path, user_dataset_infos_path)
        print("Dataset Infos file saved at {}".format(user_dataset_infos_path))
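# For reference, run() backs the `datasets-cli run_beam` command; an invocation might
# look like this (flag names inferred from the attributes used above, so treat the exact
# spelling as illustrative):
#
#   datasets-cli run_beam datasets/my_dataset --name my_config \
#       --save_infos --beam_pipeline_options "runner=DirectRunner"
#
# A quick, self-contained check of the option-string parsing performed in run():
def parse_beam_options_demo(opts_str):
    # Mirrors the list comprehension in run(): split on commas, strip, prefix with "--".
    return ["--%s" % opt.strip() for opt in opts_str.split(",") if opt]


assert parse_beam_options_demo("runner=DirectRunner, direct_num_workers=2") == [
    "--runner=DirectRunner",
    "--direct_num_workers=2",
]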