def download_gcs_folder(
    gcs_folder: epath.Path,
    local_folder: epath.PathLike,
    max_simultaneous_downloads: int = 25,
) -> None:
  """Downloads prepared GCS folder to local folder."""
  if _is_gcs_disabled:
    raise AssertionError('Cannot download from GCS when _is_gcs_disabled')

  # Filter out the diffs folder if present
  paths_to_dl = [p for p in gcs_folder.iterdir() if p.name != 'diffs']

  with tqdm_utils.async_tqdm(
      total=len(paths_to_dl), desc='Dl Completed...', unit=' file'
  ) as pbar:

    def _copy(gcs_path_: epath.Path):
      # Copy 'gs://tfds-data/datasets/ds/1.0.0/file' -> `local_dir/file`
      tf.io.gfile.copy(
          os.fspath(gcs_path_),
          os.path.join(local_folder, gcs_path_.name),
      )
      pbar.update(1)

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=max_simultaneous_downloads
    ) as executor:
      futures = [executor.submit(_copy, path) for path in paths_to_dl]
      for future in concurrent.futures.as_completed(futures):
        future.result()
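
# Usage sketch (not part of the library; the bucket path and local directory
# below are assumptions for illustration): mirror one prepared dataset version
# from the TFDS GCS bucket into a local data dir, downloading up to 25 files
# concurrently.
def _example_download_gcs_folder(local_dir: str = '/tmp/tfds/mnist/3.0.1') -> None:
  epath.Path(local_dir).mkdir(parents=True, exist_ok=True)
  download_gcs_folder(
      gcs_folder=epath.Path('gs://tfds-data/datasets/mnist/3.0.1'),
      local_folder=local_dir,
  )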

def _compute_dir_hash(path: epath.Path) -> str:
  """Computes the checksum of the given directory deterministically."""
  all_files = sorted(path.iterdir())

  if any(f.is_dir() for f in all_files):
    raise ValueError('Installed package should only contain files.')

  # Concatenate the filenames and file contents to create the directory hash
  all_checksums = [f.name for f in all_files]
  all_checksums += [checksums.compute_url_info(f).checksum for f in all_files]
  return hashlib.sha256(''.join(all_checksums).encode()).hexdigest()
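
# Minimal sketch (not part of the library): writes two files into a temporary
# directory and hashes it twice to illustrate that the result is deterministic
# and depends only on the file names and contents.
def _example_compute_dir_hash() -> None:
  import tempfile

  with tempfile.TemporaryDirectory() as tmp_dir:
    pkg_dir = epath.Path(tmp_dir)
    (pkg_dir / 'my_dataset.py').write_text('# dataset code')
    (pkg_dir / 'labels.txt').write_text('cat\ndog')
    assert _compute_dir_hash(pkg_dir) == _compute_dir_hash(pkg_dir)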

def _maybe_iterdir(path: epath.Path) -> Iterator[epath.Path]:
  """Same as `path.iterdir()`, but doesn't fail if the path does not exist."""
  # Use try/except rather than `.exists()` to avoid an extra RPC call
  # per namespace.
  try:
    for f in path.iterdir():
      yield f
  except (
      FileNotFoundError,
      tf.errors.NotFoundError,
      tf.errors.PermissionDeniedError,
  ):
    pass

def list_ds_packages_for_namespace(
    namespace: str,
    path: epath.Path,
) -> List[DatasetPackage]:
  """Returns the dataset packages found in a specific directory.

  Directories that contain code should have the following structure:

  ```
  <path>/
      <dataset0>/
          <dataset0>.py
      <dataset1>/
          <dataset1>.py
      ...
  ```

  Additional files or folders which are not detected as datasets will be
  ignored (e.g. `__init__.py`).

  Args:
    namespace: Namespace of the datasets.
    path: The directory path containing the datasets.

  Returns:
    ds_packages: The dataset packages found in the directory (sorted for
      determinism).

  Raises:
    FileNotFoundError: If the path cannot be reached.
  """
  if not path.exists():
    # Should be fault-tolerant in the future
    raise FileNotFoundError(f'Could not find datasets at {path}')

  all_packages = []
  # Sort the directory entries so the returned packages are deterministic.
  for ds_path in sorted(path.iterdir()):
    source = get_dataset_source(ds_path)
    if source:
      pkg = DatasetPackage(
          name=naming.DatasetName(namespace=namespace, name=ds_path.name),
          source=source,
      )
      all_packages.append(pkg)
  return all_packages
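
# Usage sketch (the namespace and path are hypothetical): list every dataset
# package found under a community namespace directory.
def _example_list_ds_packages() -> List[DatasetPackage]:
  return list_ds_packages_for_namespace(
      namespace='my_namespace',
      path=epath.Path('/path/to/my_namespace/datasets'),
  )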

def _extract_split_files(data_dir: epath.Path) -> _SplitFilesDict:
  """Extracts the example files in `data_dir`, grouped by split."""
  files = sorted(data_dir.iterdir())
  file_infos = [
      naming.FilenameInfo.from_str(f.name)
      for f in files
      if naming.FilenameInfo.is_valid(f.name)
  ]
  if not file_infos:
    raise ValueError(
        f'No example files detected in {data_dir}. Make sure to follow the '
        'pattern: '
        '`<dataset_name>-<split_name>.<file-extension>-xxxxxx-of-yyyyyy`'
    )

  split_files = collections.defaultdict(list)
  for file_info in file_infos:
    split_files[file_info.split].append(file_info)
  return split_files
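
# Usage sketch (file names are hypothetical): for a prepared folder containing
#   mnist-train.tfrecord-00000-of-00002
#   mnist-train.tfrecord-00001-of-00002
#   mnist-test.tfrecord-00000-of-00001
# the call below returns a dict with a 'train' entry holding two
# `FilenameInfo`s and a 'test' entry holding one.
def _example_extract_split_files(data_dir: str) -> _SplitFilesDict:
  return _extract_split_files(epath.Path(data_dir))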

def get_dataset_source(
    ds_path: epath.Path,
) -> Optional[dataset_sources_lib.DatasetSource]:
  """Returns a `DatasetSource` instance if the given path corresponds to a dataset.

  To determine whether the given path contains a dataset, a simple heuristic
  is used that checks whether the path has the following structure:

  ```
  <ds_name>/
      <ds_name>.py
  ```

  If so, all `.py`, `.txt`, `.tsv`, `.json` files will be added to the package.

  Args:
    ds_path: Path of the dataset module

  Returns:
    A `DatasetSource` instance if the path matches the expected file structure.
  """
  filter_list = {'__init__.py'}
  suffixes_list = ('.txt', '.tsv', '.py', '.json')

  def is_interesting_file(fname: str) -> bool:
    return fname.endswith(suffixes_list) and fname not in filter_list

  if not ds_path.is_dir():
    return None
  all_filenames = set(f.name for f in ds_path.iterdir())
  if f'{ds_path.name}.py' not in all_filenames:
    return None

  return dataset_sources_lib.DatasetSource(
      root_path=ds_path,
      filenames=sorted(
          [fname for fname in all_filenames if is_interesting_file(fname)]
      ),
  )
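
# Usage sketch (the directory layout is hypothetical): a folder such as
#   my_dataset/
#     my_dataset.py
#     labels.txt
#     __init__.py
# is detected as a dataset; `my_dataset.py` and `labels.txt` are packaged,
# while `__init__.py` is filtered out.
def _example_get_dataset_source(
    ds_dir: str = '/path/to/my_dataset',
) -> Optional[dataset_sources_lib.DatasetSource]:
  return get_dataset_source(epath.Path(ds_dir))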