Example #1
def download_gcs_folder(
    gcs_folder: epath.Path,
    local_folder: epath.PathLike,
    max_simultaneous_downloads: int = 25,
) -> None:
    """Downloads prepared GCS folder to local folder."""
    if _is_gcs_disabled:
        raise AssertionError('Cannot download from GCS when _is_gcs_disabled')

    # Filter out the diffs folder if present
    paths_to_dl = [p for p in gcs_folder.iterdir() if p.name != 'diffs']

    with tqdm_utils.async_tqdm(total=len(paths_to_dl),
                               desc='Dl Completed...',
                               unit=' file') as pbar:

        def _copy(gcs_path_: epath.Path):
            # Copy 'gs://tfds-data/datasets/ds/1.0.0/file' -> `local_dir/file`
            tf.io.gfile.copy(
                os.fspath(gcs_path_),
                os.path.join(local_folder, gcs_path_.name),
            )
            pbar.update(1)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_simultaneous_downloads) as executor:
            futures = [executor.submit(_copy, path) for path in paths_to_dl]
            for future in concurrent.futures.as_completed(futures):
                future.result()
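
A minimal, self-contained sketch of the same pattern (thread-pool copy with a progress bar), using only the standard library and plain tqdm instead of the TFDS `tqdm_utils` / `tf.io.gfile` helpers; `copy_folder_parallel` and its defaults are illustrative, not part of the library.

import concurrent.futures
import pathlib
import shutil

from tqdm import tqdm


def copy_folder_parallel(
    src: pathlib.Path,
    dst: pathlib.Path,
    max_workers: int = 25,
) -> None:
    """Copies every file in `src` to `dst` using a thread pool."""
    paths = [p for p in src.iterdir() if p.is_file()]
    dst.mkdir(parents=True, exist_ok=True)
    with tqdm(total=len(paths), unit=' file') as pbar:

        def _copy(path: pathlib.Path) -> None:
            shutil.copy(path, dst / path.name)
            pbar.update(1)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_workers) as executor:
            futures = [executor.submit(_copy, p) for p in paths]
            # Surface the first worker exception instead of failing silently.
            for future in concurrent.futures.as_completed(futures):
                future.result()
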
Example #2
def _compute_dir_hash(path: epath.Path) -> str:
  """Computes the checksums of the given directory deterministically."""
  all_files = sorted(path.iterdir())

  if any(f.is_dir() for f in all_files):
    raise ValueError('Installed package should only contain files.')

  # Concatenate the filenames and files content to create the directory hash
  all_checksums = [f.name for f in all_files]
  all_checksums += [checksums.compute_url_info(f).checksum for f in all_files]
  return hashlib.sha256(''.join(all_checksums).encode()).hexdigest()
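
A rough stand-alone equivalent of the same idea using only hashlib and pathlib, hashing each file's name and raw bytes directly instead of going through `checksums.compute_url_info`; note it interleaves names and contents rather than concatenating them, so the resulting digest differs.

import hashlib
import pathlib


def compute_dir_hash(path: pathlib.Path) -> str:
    """Hashes the file names and contents of a flat directory, deterministically."""
    digest = hashlib.sha256()
    for f in sorted(path.iterdir()):
        if f.is_dir():
            raise ValueError(f'{path} should only contain files, found {f.name}/')
        digest.update(f.name.encode())
        digest.update(f.read_bytes())
    return digest.hexdigest()
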
Example #3
def _maybe_iterdir(path: epath.Path) -> Iterator[epath.Path]:
    """Same as `path.iterdir()`, but don't fail if path does not exist."""
    # Use try/except rather than `.exists()` to avoid an extra RPC call
    # per namespace
    try:
        for f in path.iterdir():
            yield f
    except (
            FileNotFoundError,
            tf.errors.NotFoundError,
            tf.errors.PermissionDeniedError,
    ):
        pass
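
For illustration, a plain-pathlib version of the same tolerant iterator and how it might be used to list files across several candidate namespace directories, only some of which exist; `candidate_dirs` is made up and the built-in exceptions stand in for the `tf.errors` ones.

import pathlib
from typing import Iterator


def maybe_iterdir(path: pathlib.Path) -> Iterator[pathlib.Path]:
    """Same as `path.iterdir()`, but yields nothing if the path is missing."""
    try:
        yield from path.iterdir()
    except (FileNotFoundError, NotADirectoryError, PermissionError):
        return


candidate_dirs = [pathlib.Path('configs'), pathlib.Path('does-not-exist')]
all_files = [f for d in candidate_dirs for f in maybe_iterdir(d)]
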
Example #4
def list_ds_packages_for_namespace(
    namespace: str,
    path: epath.Path,
) -> List[DatasetPackage]:
    """Returns the dataset names found in a specific directory.

  Directories that contain code should have the following structure:

  ```
  <path>/
      <dataset0>/
          <dataset0>.py
      <dataset1>/
          <dataset1>.py
      ...
  ```

  Additional files or folders which are not detected as datasets will be
  ignored (e.g. `__init__.py`).

  Args:
    namespace: Namespace of the datasets
    path: The directory path containing the datasets.

  Returns:
    ds_packages: The dataset packages found in the directory (sorted for
      determinism).

  Raises:
    FileNotFoundError: If the path cannot be reached.
  """
    if not path.exists():
        # Should be fault-tolerant in the future
        raise FileNotFoundError(f'Could not find datasets at {path}')

    all_packages = []
    for ds_path in path.iterdir():
        source = get_dataset_source(ds_path)
        if source:
            pkg = DatasetPackage(
                name=naming.DatasetName(namespace=namespace,
                                        name=ds_path.name),
                source=source,
            )
            all_packages.append(pkg)

    return all_packages
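
A self-contained sketch of the same discovery logic with a simplified package record, keeping only the `<name>/<name>.py` heuristic; `SimplePackage` and the temporary directory layout are illustrative assumptions, not the TFDS types.

import dataclasses
import pathlib
import tempfile
from typing import List


@dataclasses.dataclass(frozen=True)
class SimplePackage:
    namespace: str
    name: str
    path: pathlib.Path


def list_packages(namespace: str, path: pathlib.Path) -> List[SimplePackage]:
    """Returns one package per `<name>/<name>.py` sub-directory, sorted by name."""
    if not path.exists():
        raise FileNotFoundError(f'Could not find datasets at {path}')
    packages = []
    for ds_path in sorted(path.iterdir()):
        if ds_path.is_dir() and (ds_path / f'{ds_path.name}.py').exists():
            packages.append(SimplePackage(namespace, ds_path.name, ds_path))
    return packages


with tempfile.TemporaryDirectory() as tmp:
    root = pathlib.Path(tmp)
    (root / 'mnist').mkdir()
    (root / 'mnist' / 'mnist.py').touch()
    (root / '__init__.py').touch()  # Ignored: not a dataset directory.
    print(list_packages('my_namespace', root))  # One package named 'mnist'.
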
Example #5
def _extract_split_files(data_dir: epath.Path) -> _SplitFilesDict:
    """Extract the files."""
    files = sorted(data_dir.iterdir())
    file_infos = [
        naming.FilenameInfo.from_str(f.name) for f in files
        if naming.FilenameInfo.is_valid(f.name)
    ]
    if not file_infos:
        raise ValueError(
            f'No example files detected in {data_dir}. Make sure to follow the '
            'pattern: '
            '`<dataset_name>-<split_name>.<file-extension>-xxxxxx-of-yyyyyy`')

    split_files = collections.defaultdict(list)
    for file_info in file_infos:
        split_files[file_info.split].append(file_info)

    return split_files
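
A minimal stand-in for `naming.FilenameInfo` built around a regular expression for the `<dataset_name>-<split_name>.<file-extension>-xxxxx-of-yyyyy` pattern quoted in the error message; the exact regex is an assumption about the naming scheme, and non-matching names are skipped, mirroring the `is_valid` filter above.

import collections
import re
from typing import Dict, List, NamedTuple

_SHARD_RE = re.compile(
    r'^(?P<dataset>\w+)-(?P<split>\w+)\.(?P<ext>[\w-]+?)'
    r'-(?P<shard>\d{5,})-of-(?P<num_shards>\d{5,})$')


class ShardInfo(NamedTuple):
    dataset: str
    split: str
    ext: str
    shard: int
    num_shards: int


def group_shards_by_split(filenames: List[str]) -> Dict[str, List[ShardInfo]]:
    """Groups shard file names by split, skipping names that do not match."""
    split_files = collections.defaultdict(list)
    for name in filenames:
        match = _SHARD_RE.match(name)
        if match:
            split_files[match['split']].append(
                ShardInfo(match['dataset'], match['split'], match['ext'],
                          int(match['shard']), int(match['num_shards'])))
    return dict(split_files)


print(group_shards_by_split([
    'mnist-train.tfrecord-00000-of-00002',
    'mnist-train.tfrecord-00001-of-00002',
    'dataset_info.json',  # Skipped: not a shard file.
]))
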
Example #6
def get_dataset_source(
    ds_path: epath.Path,
) -> Optional[dataset_sources_lib.DatasetSource]:
    """Returns a `DatasetSource` instance if the given path corresponds to a dataset.

  To determine whether the given path contains a dataset, a simple heuristic is
  used that checks whether the path has the following structure:

  ```
  <ds_name>/
      <ds_name>.py
  ```

  If so, all `.py`, `.txt`, `.tsv`, `.json` files will be added to the package.

  Args:
    ds_path: Path of the dataset module

  Returns:
    A `DatasetSource` instance if the path matches the expected file structure.
  """
    filter_list = {'__init__.py'}
    suffixes_list = ('.txt', '.tsv', '.py', '.json')

    def is_interesting_file(fname: str) -> bool:
        return fname.endswith(suffixes_list) and fname not in filter_list

    if not ds_path.is_dir():
        return None
    all_filenames = set(f.name for f in ds_path.iterdir())
    if f'{ds_path.name}.py' not in all_filenames:
        return None

    return dataset_sources_lib.DatasetSource(
        root_path=ds_path,
        filenames=sorted(
            [fname for fname in all_filenames if is_interesting_file(fname)]),
    )