Example 1
def setUp(self):
    self.hf_modules_cache = tempfile.mkdtemp()
    self.cache_dir = tempfile.mkdtemp()
    self.download_config = DownloadConfig(cache_dir=self.cache_dir)
    self.dynamic_modules_path = datasets.load.init_dynamic_modules(
        name="test_datasets_modules_" + os.path.basename(self.hf_modules_cache),
        hf_modules_cache=self.hf_modules_cache,
    )
Example 2
def test_extracted_datasets_path(default_extracted, default_cache_dir, xz_file, tmp_path, monkeypatch):
    custom_cache_dir = "custom_cache"
    custom_extracted_dir = "custom_extracted_dir"
    custom_extracted_path = tmp_path / "custom_extracted_path"
    if default_extracted:
        expected = ("downloads" if default_cache_dir else custom_cache_dir, "extracted")
    else:
        monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_DIR", custom_extracted_dir)
        monkeypatch.setattr("datasets.config.EXTRACTED_DATASETS_PATH", str(custom_extracted_path))
        expected = custom_extracted_path.parts[-2:] if default_cache_dir else (custom_cache_dir, custom_extracted_dir)

    filename = xz_file
    download_config = (
        DownloadConfig(extract_compressed_file=True)
        if default_cache_dir
        else DownloadConfig(cache_dir=tmp_path / custom_cache_dir, extract_compressed_file=True)
    )
    extracted_file_path = cached_path(filename, download_config=download_config)
    assert Path(extracted_file_path).parent.parts[-2:] == expected
Example 3
def test_cached_path_extract(compression_format, gz_file, xz_file, zstd_path, tmp_path, text_file):
    input_paths = {"gzip": gz_file, "xz": xz_file, "zstd": zstd_path}
    input_path = str(input_paths[compression_format])
    cache_dir = tmp_path / "cache"
    download_config = DownloadConfig(cache_dir=cache_dir, extract_compressed_file=True)
    extracted_path = cached_path(input_path, download_config=download_config)
    with open(extracted_path) as f:
        extracted_file_content = f.read()
    with open(text_file) as f:
        expected_file_content = f.read()
    assert extracted_file_content == expected_file_content
Example 4
def load_builder_class(self, dataset_name, is_local=False):
    # Download/copy dataset script
    if is_local is True:
        dataset_module = dataset_module_factory(os.path.join("datasets", dataset_name))
    else:
        dataset_module = dataset_module_factory(
            dataset_name, download_config=DownloadConfig(force_download=True)
        )
    # Get dataset builder class
    builder_cls = import_main_class(dataset_module.module_path)
    return builder_cls
Example 5
    def test_dummy_data_autogenerate(self):
        n_lines = 5

        with TemporaryDirectory() as tmp_dir:
            with open(os.path.join(tmp_dir, "train.txt"),
                      "w",
                      encoding="utf-8") as f:
                f.write("foo\nbar\n" * 10)
            with open(os.path.join(tmp_dir, "test.txt"), "w",
                      encoding="utf-8") as f:
                f.write("foo\nbar\n" * 10)

            class MockDownloadManagerWithCustomDatasetsScriptsDir(
                    MockDownloadManager):
                datasets_scripts_dir = os.path.join(tmp_dir, "datasets")

            cache_dir = os.path.join(tmp_dir, "cache")
            os.makedirs(cache_dir, exist_ok=True)
            dataset_builder = DummyBuilder(tmp_test_dir=tmp_dir,
                                           cache_dir=cache_dir)
            mock_dl_manager = MockDownloadManagerWithCustomDatasetsScriptsDir(
                dataset_name=dataset_builder.name,
                config=None,
                version=Version("0.0.0"),
                use_local_dummy_data=True,
                cache_dir=cache_dir,
                load_existing_dummy_data=False,  # dummy data don't exist yet
            )
            download_config = DownloadConfig(cache_dir=os.path.join(
                tmp_dir, datasets.config.DOWNLOADED_DATASETS_DIR))
            dl_manager = DummyDataGeneratorDownloadManager(
                dataset_name=dataset_builder.name,
                mock_download_manager=mock_dl_manager,
                download_config=download_config,
            )
            dataset_builder.download_and_prepare(dl_manager=dl_manager,
                                                 try_from_hf_gcs=False)
            shutil.rmtree(dataset_builder._cache_dir)

            dl_manager.auto_generate_dummy_data_folder(n_lines=n_lines)
            path_to_dataset = os.path.join(
                mock_dl_manager.datasets_scripts_dir, mock_dl_manager.dataset_name
            )
            dl_manager.compress_autogenerated_dummy_data(path_to_dataset)

            mock_dl_manager.load_existing_dummy_data = True
            dataset_builder.download_and_prepare(dl_manager=mock_dl_manager,
                                                 ignore_verifications=True,
                                                 try_from_hf_gcs=False)
            dataset = dataset_builder.as_dataset(split="train")
            self.assertEqual(len(dataset), n_lines)
            del dataset
Example 6
def test_load_real_dataset(self, dataset_name):
    path = "./datasets/" + dataset_name
    dataset_module = dataset_module_factory(
        path, download_config=DownloadConfig(local_files_only=True)
    )
    builder_cls = import_main_class(dataset_module.module_path)
    name = builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None
    with tempfile.TemporaryDirectory() as temp_cache_dir:
        dataset = load_dataset(
            path,
            name=name,
            cache_dir=temp_cache_dir,
            download_mode=DownloadMode.FORCE_REDOWNLOAD,
        )
        for split in dataset.keys():
            self.assertTrue(len(dataset[split]) > 0)
        del dataset
Example 7
def test_download_manager_download(urls_type, tmp_path, monkeypatch):
    import requests

    monkeypatch.setattr(requests, "request", mock_request)

    url = URL
    if issubclass(urls_type, str):
        urls = url
    elif issubclass(urls_type, list):
        urls = [url]
    elif issubclass(urls_type, dict):
        urls = {"train": url}
    dataset_name = "dummy"
    cache_subdir = "downloads"
    cache_dir_root = str(tmp_path)
    download_config = DownloadConfig(
        cache_dir=os.path.join(cache_dir_root, cache_subdir),
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config)
    downloaded_paths = dl_manager.download(urls)
    input_urls = urls
    for downloaded_paths in [downloaded_paths]:
        if isinstance(urls, str):
            downloaded_paths = [downloaded_paths]
            input_urls = [urls]
        elif isinstance(urls, dict):
            assert "train" in downloaded_paths.keys()
            downloaded_paths = downloaded_paths.values()
            input_urls = urls.values()
        assert downloaded_paths
        for downloaded_path, input_url in zip(downloaded_paths, input_urls):
            assert downloaded_path == dl_manager.downloaded_paths[input_url]
            downloaded_path = Path(downloaded_path)
            parts = downloaded_path.parts
            assert parts[-1] == HASH
            assert parts[-2] == cache_subdir
            assert downloaded_path.exists()
            content = downloaded_path.read_text()
            assert content == CONTENT
            metadata_downloaded_path = downloaded_path.with_suffix(".json")
            assert metadata_downloaded_path.exists()
            metadata_content = json.loads(metadata_downloaded_path.read_text())
            assert metadata_content == {"url": URL, "etag": None}
Example 8
def test_load_dataset_deletes_extracted_files(deleted, jsonl_gz_path,
                                              tmp_path):
    data_files = jsonl_gz_path
    cache_dir = tmp_path / "cache"
    if deleted:
        download_config = DownloadConfig(delete_extracted=True,
                                         cache_dir=cache_dir / "downloads")
        ds = load_dataset("json",
                          split="train",
                          data_files=data_files,
                          cache_dir=cache_dir,
                          download_config=download_config)
    else:  # default
        ds = load_dataset("json",
                          split="train",
                          data_files=data_files,
                          cache_dir=cache_dir)
    assert ds[0] == {"col_1": "0", "col_2": 0, "col_3": 0.0}
    assert (sorted((cache_dir / "downloads" / "extracted").iterdir()) == []) is deleted
Example 9
def test_download_manager_extract(paths_type, xz_file, text_file):
    filename = str(xz_file)
    if issubclass(paths_type, str):
        paths = filename
    elif issubclass(paths_type, list):
        paths = [filename]
    elif issubclass(paths_type, dict):
        paths = {"train": filename}
    dataset_name = "dummy"
    cache_dir = xz_file.parent
    extracted_subdir = "extracted"
    download_config = DownloadConfig(
        cache_dir=cache_dir,
        use_etag=False,
    )
    dl_manager = DownloadManager(dataset_name=dataset_name, download_config=download_config)
    extracted_paths = dl_manager.extract(paths)
    input_paths = paths
    for extracted_paths in [extracted_paths]:
        if isinstance(paths, str):
            extracted_paths = [extracted_paths]
            input_paths = [paths]
        elif isinstance(paths, dict):
            assert "train" in extracted_paths.keys()
            extracted_paths = extracted_paths.values()
            input_paths = paths.values()
        assert extracted_paths
        for extracted_path, input_path in zip(extracted_paths, input_paths):
            assert extracted_path == dl_manager.extracted_paths[input_path]
            extracted_path = Path(extracted_path)
            parts = extracted_path.parts
            assert parts[-1] == hash_url_to_filename(input_path, etag=None)
            assert parts[-2] == extracted_subdir
            assert extracted_path.exists()
            extracted_file_content = extracted_path.read_text()
            expected_file_content = text_file.read_text()
            assert extracted_file_content == expected_file_content
Example 10
def _autogenerate_dummy_data(
    self, dataset_builder, mock_dl_manager, keep_uncompressed
) -> Optional[bool]:
    dl_cache_dir = (
        os.path.join(self._cache_dir, config.DOWNLOADED_DATASETS_DIR)
        if self._cache_dir
        else config.DOWNLOADED_DATASETS_PATH
    )
    download_config = DownloadConfig(cache_dir=dl_cache_dir)
    dl_manager = DummyDataGeneratorDownloadManager(
        dataset_name=self._dataset_name,
        mock_download_manager=mock_dl_manager,
        download_config=download_config,
    )
    dataset_builder._split_generators(dl_manager)
    mock_dl_manager.load_existing_dummy_data = False  # don't use real dummy data
    dl_manager.auto_generate_dummy_data_folder(
        n_lines=self._n_lines,
        json_field=self._json_field,
        xml_tag=self._xml_tag,
        match_text_files=self._match_text_files,
        encoding=self._encoding,
    )
    if not keep_uncompressed:
        path_to_dataset = os.path.join(
            mock_dl_manager.datasets_scripts_dir, mock_dl_manager.dataset_name
        )
        dl_manager.compress_autogenerated_dummy_data(path_to_dataset)
        # now test that the dummy_data.zip file actually works
        mock_dl_manager.load_existing_dummy_data = True  # use real dummy data
        n_examples_per_split = {}
        os.makedirs(dataset_builder._cache_dir, exist_ok=True)
        try:
            split_generators = dataset_builder._split_generators(mock_dl_manager)
            for split_generator in split_generators:
                dataset_builder._prepare_split(split_generator, check_duplicate_keys=False)
                n_examples_per_split[split_generator.name] = split_generator.split_info.num_examples
        except OSError as e:
            logger.error(
                f"Failed to load dummy data for config '{dataset_builder.config.name}'.\nOriginal error:\n"
                + str(e)
            )
            return False
        else:
            if all(n_examples > 0 for n_examples in n_examples_per_split.values()):
                logger.warning(
                    f"Dummy data generation done and dummy data test succeeded for config '{dataset_builder.config.name}'."
                )
                return True
            else:
                empty_splits = [
                    split_name
                    for split_name in n_examples_per_split
                    if n_examples_per_split[split_name] == 0
                ]
                logger.warning(
                    f"Dummy data generation done but dummy data test failed since splits {empty_splits} have 0 examples for config '{dataset_builder.config.name}'."
                )
                return False
    else:
        generated_dummy_data_dir = os.path.join(self._path_to_dataset, mock_dl_manager.dummy_data_folder)
        logger.info(
            f"Dummy data generated in directory '{generated_dummy_data_dir}' but kept uncompressed. "
            "Please compress this directory into a zip file to use it for dummy data tests."
        )
Example 11
    def run(self):
        import apache_beam as beam

        if self._name is not None and self._all_configs:
            print(
                "Both parameters `name` and `all_configs` can't be used at once."
            )
            exit(1)
        path, name = self._dataset, self._name
        dataset_module = dataset_module_factory(path)
        builder_cls = import_main_class(dataset_module.module_path)
        builders: List[DatasetBuilder] = []
        if self._beam_pipeline_options:
            beam_options = beam.options.pipeline_options.PipelineOptions(
                flags=[
                    f"--{opt.strip()}"
                    for opt in self._beam_pipeline_options.split(",") if opt
                ])
        else:
            beam_options = None
        if self._all_configs and len(builder_cls.BUILDER_CONFIGS) > 0:
            for builder_config in builder_cls.BUILDER_CONFIGS:
                builders.append(
                    builder_cls(
                        name=builder_config.name,
                        data_dir=self._data_dir,
                        hash=dataset_module.hash,
                        beam_options=beam_options,
                        cache_dir=self._cache_dir,
                        base_path=dataset_module.builder_kwargs.get(
                            "base_path"),
                    ))
        else:
            builders.append(
                builder_cls(
                    name=name,
                    data_dir=self._data_dir,
                    beam_options=beam_options,
                    cache_dir=self._cache_dir,
                    base_path=dataset_module.builder_kwargs.get("base_path"),
                    **self._config_kwargs,
                ))

        for builder in builders:
            builder.download_and_prepare(
                download_mode=DownloadMode.REUSE_CACHE_IF_EXISTS if
                not self._force_redownload else DownloadMode.FORCE_REDOWNLOAD,
                download_config=DownloadConfig(
                    cache_dir=config.DOWNLOADED_DATASETS_PATH),
                ignore_verifications=self._ignore_verifications,
                try_from_hf_gcs=False,
            )
            if self._save_infos:
                builder._save_infos()

        print("Apache beam run successful.")

        # If save_infos=True, the dataset infos file is created next to the loaded module file.
        # Let's move it to the original directory of the dataset script, to allow the user to
        # upload them on S3 at the same time afterwards.
        if self._save_infos:
            dataset_infos_path = os.path.join(
                builder_cls.get_imported_module_dir(),
                config.DATASETDICT_INFOS_FILENAME)

            name = Path(path).name + ".py"

            combined_path = os.path.join(path, name)
            if os.path.isfile(path):
                dataset_dir = os.path.dirname(path)
            elif os.path.isfile(combined_path):
                dataset_dir = path
            else:  # in case of a remote dataset
                print(f"Dataset Infos file saved at {dataset_infos_path}")
                exit(1)

            # Move the dataset infos file back to the user's dataset directory
            user_dataset_infos_path = os.path.join(
                dataset_dir, config.DATASETDICT_INFOS_FILENAME)
            copyfile(dataset_infos_path, user_dataset_infos_path)
            print(f"Dataset Infos file saved at {user_dataset_infos_path}")