Example #1
def save_url_infos(
    path: epath.Path,
    url_infos: Dict[str, UrlInfo],
) -> None:
    """Store given checksums and sizes for specific dataset.

  The content of the file is never discarded, only updated. This is to ensure
  that if the process is killed right after the first download finishes,
  checksums registered during previous runs aren't lost.

  It is the responsibility of the caller not to call this function multiple
  times in parallel for a given dataset.

  Only the file's original content is updated (merged with the new values), so
  the entire set of new sizes and checksums must be given at every call.

  Args:
    path: Path to the resources.
    url_infos: Dict mapping each url to its `UrlInfo` (size, checksum,
      filename).
  """
    original_data = load_url_infos(path) if path.exists() else {}
    new_data = original_data.copy()
    new_data.update(url_infos)
    # Compare filenames separately, as the filename field has eq=False
    if original_data == new_data and _filenames_equal(original_data, new_data):
        return
    lines = [
        f'{url}\t{int(url_info.size)}\t{url_info.checksum}\t'
        f'{url_info.filename or ""}\n'
        for url, url_info in sorted(new_data.items())
    ]
    path.write_text(''.join(lines), encoding='UTF-8')
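
A minimal usage sketch for `save_url_infos`. The `UrlInfo` constructor call assumes the `size`, `checksum` and `filename` fields read by the write loop above; the paths and values are illustrative only.

from etils import epath

checksums_path = epath.Path('/tmp/my_dataset/checksums.tsv')  # illustrative path
new_infos = {
    'https://example.com/data.zip': UrlInfo(  # assumed dataclass constructor
        size=1_048_576,
        checksum='0123abcd...',  # sha256 hex digest of the downloaded file
        filename='data.zip',
    ),
}
# Entries already stored in the file are kept; the new ones are merged in and
# the file is rewritten as one tab-separated line per URL, in sorted URL order.
save_url_infos(checksums_path, new_infos)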
Example #2
def _get_cached_copy(file_path: epath.Path,
                     max_age_days: int) -> Optional[str]:
    """Returns the cached file content if it is newer than `max_age_days`."""
    if file_path.exists():
        stats = os.stat(file_path)
        modified_time = datetime.datetime.fromtimestamp(stats.st_mtime)
        if modified_time > datetime.datetime.now() - datetime.timedelta(
                days=max_age_days):
            return file_path.read_text()
    return None
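
A short usage sketch for `_get_cached_copy`; the cache path and age limit are illustrative.

from etils import epath

cache_file = epath.Path('/tmp/tfds_doc_cache/overview.md')  # illustrative path
cached = _get_cached_copy(cache_file, max_age_days=7)
if cached is None:
    # Cache file is missing or older than 7 days: fetch fresh content instead.
    ...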
Example #3
def build_and_save_community_catalog(catalog_dir: epath.Path) -> None:
    """Builds and saves the catalog of community datasets."""
    templates = DocumentationTemplates.load()
    formatter_per_namespace = _get_formatter_per_namespace(templates)

    overview = build_overview(formatter_per_namespace, templates)
    catalog_dir.joinpath('overview.md').write_text(overview)

    # Write the `_toc.yaml` TF documentation navigation bar
    toc_yaml = build_toc_yaml(formatter_per_namespace)
    with catalog_dir.joinpath('_toc.yaml').open('w') as f:
        yaml.dump(toc_yaml, f, default_flow_style=False)

    for namespace, details in build_namespace_details(formatter_per_namespace):
        namespace_file = catalog_dir / f'{namespace}.md'
        namespace_file.write_text(details)
Example #4
def download_gcs_folder(
    gcs_folder: epath.Path,
    local_folder: epath.PathLike,
    max_simultaneous_downloads: int = 25,
) -> None:
    """Downloads prepared GCS folder to local folder."""
    if _is_gcs_disabled:
        raise AssertionError('Cannot download from GCS when _is_gcs_disabled')

    # Filter out the diffs folder if present
    paths_to_dl = [p for p in gcs_folder.iterdir() if p.name != 'diffs']

    with tqdm_utils.async_tqdm(total=len(paths_to_dl),
                               desc='Dl Completed...',
                               unit=' file') as pbar:

        def _copy(gcs_path_: epath.Path):
            # Copy 'gs://tfds-data/datasets/ds/1.0.0/file' -> `local_dir/file`
            tf.io.gfile.copy(
                os.fspath(gcs_path_),
                os.path.join(local_folder, gcs_path_.name),
            )
            pbar.update(1)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_simultaneous_downloads) as executor:
            futures = [executor.submit(_copy, path) for path in paths_to_dl]
            for future in concurrent.futures.as_completed(futures):
                future.result()
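
A usage sketch for `download_gcs_folder`; the bucket path mirrors the comment above and the local folder is illustrative.

from etils import epath

download_gcs_folder(
    gcs_folder=epath.Path('gs://tfds-data/datasets/mnist/3.0.1'),  # illustrative
    local_folder='/tmp/tfds/mnist/3.0.1',
    max_simultaneous_downloads=10,
)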
Example #5
def list_ds_packages_for_namespace(
    namespace: str,
    path: epath.Path,
) -> List[DatasetPackage]:
    """Returns the dataset names found in a specific directory.

  Directories that contain code should have the following structure:

  ```
  <path>/
      <dataset0>/
          <dataset0>.py
      <dataset1>/
          <dataset1>.py
      ...
  ```

  Additional files or folders which are not detected as datasets will be
  ignored (e.g. `__init__.py`).

  Args:
    namespace: Namespace of the datasets
    path: The directory path containing the datasets.

  Returns:
    ds_packages: The dataset packages found in the directory (sorted for
      determinism).

  Raises:
    FileNotFoundError: If the path cannot be reached.
  """
    if not path.exists():
        # Should be fault-tolerant in the future
        raise FileNotFoundError(f'Could not find datasets at {path}')

    all_packages = []
    for ds_path in sorted(path.iterdir()):  # Sort for deterministic output
        source = get_dataset_source(ds_path)
        if source:
            pkg = DatasetPackage(
                name=naming.DatasetName(namespace=namespace,
                                        name=ds_path.name),
                source=source,
            )
            all_packages.append(pkg)

    return all_packages
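
A usage sketch for `list_ds_packages_for_namespace`, assuming a local directory laid out as in the docstring; the namespace and paths are made up.

from etils import epath

# Hypothetical layout:
#   /data/community/my_namespace/
#       mnist_like/
#           mnist_like.py
packages = list_ds_packages_for_namespace(
    namespace='my_namespace',
    path=epath.Path('/data/community/my_namespace'),
)
for pkg in packages:
    print(pkg.name)    # DatasetName built from 'my_namespace' and the folder name
    print(pkg.source)  # DatasetSource returned by get_dataset_source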
Example #6
    def _rename_and_get_final_dl_path(
        self,
        url: str,
        path: epath.Path,
        expected_url_info: Optional[checksums.UrlInfo],
        computed_url_info: Optional[checksums.UrlInfo],
        checksum_path: Optional[epath.Path],
        url_path: epath.Path,
    ) -> epath.Path:
        """Renames the downloaded file if checksums were recorded."""
        # `path` can be:
        # * Manually downloaded
        # * (cached) checksum_path
        # * (cached) url_path
        # * `tmp_dir/file` (downloaded path)
        if self._manual_dir and path.is_relative_to(self._manual_dir):
            return path  # Manually downloaded data
        elif path == checksum_path:  # Path already at final destination
            assert computed_url_info == expected_url_info  # Sanity check
            return checksum_path  # pytype: disable=bad-return-type
        elif path == url_path:
            if checksum_path:
                # Checksums were registered: Rename -> checksums_path
                resource_lib.rename_info_file(
                    path, checksum_path, overwrite=True)
                return path.replace(checksum_path)
            else:
                # Checksums not registered: -> do nothing
                return path
        else:  # Path was downloaded in tmp dir
            dst_path = checksum_path or url_path
            resource_lib.write_info_file(
                url=url,
                path=dst_path,
                dataset_name=self._dataset_name,
                original_fname=path.name,
                url_info=computed_url_info,
            )
            path.replace(dst_path)
            path.parent.rmdir()  # Cleanup tmp dir (will fail if dir not empty)
            return dst_path
Example #7
def _compute_dir_hash(path: epath.Path) -> str:
  """Computes the checksums of the given directory deterministically."""
  all_files = sorted(path.iterdir())

  if any(f.is_dir() for f in all_files):
    raise ValueError('Installed package should only contain files.')

  # Concatenate the filenames and files content to create the directory hash
  all_checksums = [f.name for f in all_files]
  all_checksums += [checksums.compute_url_info(f).checksum for f in all_files]
  return hashlib.sha256(''.join(all_checksums).encode()).hexdigest()
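
A small sketch showing how `_compute_dir_hash` could be used to detect changes in a flat directory of files; the directory path is illustrative.

from etils import epath

pkg_dir = epath.Path('/tmp/installed_packages/my_dataset')  # illustrative path
before = _compute_dir_hash(pkg_dir)
# ... files may be rewritten or re-downloaded here ...
after = _compute_dir_hash(pkg_dir)
if before != after:
    # Any change in file names or file contents changes the sha256 digest.
    print('Package content changed.')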
Example #8
def _maybe_iterdir(path: epath.Path) -> Iterator[epath.Path]:
    """Same as `path.iterdir()`, but don't fail if path does not exist."""
    # Use try/except rather than `.exists()` to avoid an extra RPC call
    # per namespace
    try:
        for f in path.iterdir():
            yield f
    except (
            FileNotFoundError,
            tf.errors.NotFoundError,
            tf.errors.PermissionDeniedError,
    ):
        pass
Example #9
def get_dataset_source(
    ds_path: epath.Path,
) -> Optional[dataset_sources_lib.DatasetSource]:
    """Returns a `DatasetSource` instance if the given path corresponds to a dataset.

  To determine whether the given path contains a dataset, a simple heuristic is
  used that checks whether the path has the following structure:

  ```
  <ds_name>/
      <ds_name>.py
  ```

  If so, all `.py`, `.txt`, `.tsv`, `.json` files will be added to the package.

  Args:
    ds_path: Path of the dataset module

  Returns:
    A `DatasetSource` instance if the path matches the expected file structure.
  """
    filter_list = {'__init__.py'}
    suffixes_list = ('.txt', '.tsv', '.py', '.json')

    def is_interesting_file(fname: str) -> bool:
        return fname.endswith(suffixes_list) and fname not in filter_list

    if not ds_path.is_dir():
        return None
    all_filenames = set(f.name for f in ds_path.iterdir())
    if f'{ds_path.name}.py' not in all_filenames:
        return None

    return dataset_sources_lib.DatasetSource(
        root_path=ds_path,
        filenames=sorted(
            [fname for fname in all_filenames if is_interesting_file(fname)]),
    )
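
A usage sketch for `get_dataset_source` against a hypothetical folder matching the `<ds_name>/<ds_name>.py` heuristic described in the docstring.

from etils import epath

# Hypothetical folder containing: mnist_like.py, labels.txt, __init__.py
source = get_dataset_source(epath.Path('/data/community/ns/mnist_like'))
if source is not None:
    print(source.root_path)  # .../mnist_like
    print(source.filenames)  # ['labels.txt', 'mnist_like.py'] (no __init__.py)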
Example #10
def _extract_split_files(data_dir: epath.Path) -> _SplitFilesDict:
    """Extract the files."""
    files = sorted(data_dir.iterdir())
    file_infos = [
        naming.FilenameInfo.from_str(f.name) for f in files
        if naming.FilenameInfo.is_valid(f.name)
    ]
    if not file_infos:
        raise ValueError(
            f'No example files detected in {data_dir}. Make sure to follow the '
            'pattern: '
            '`<dataset_name>-<split_name>.<file-extension>-xxxxxx-of-yyyyyy`')

    split_files = collections.defaultdict(list)
    for file_info in file_infos:
        split_files[file_info.split].append(file_info)

    return split_files
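
A sketch of the directory contents `_extract_split_files` expects and of its result, using made-up shard names that follow the pattern from the error message above.

from etils import epath

# Hypothetical contents of the data directory:
#   my_dataset-train.tfrecord-00000-of-00002
#   my_dataset-train.tfrecord-00001-of-00002
#   my_dataset-test.tfrecord-00000-of-00001
#   dataset_info.json  <- ignored: not a valid shard filename
split_files = _extract_split_files(epath.Path('/data/my_dataset/1.0.0'))
# split_files maps each split name ('train', 'test') to its FilenameInfo list.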
Example #11
  def _generate_examples(self,
                         annotations: epath.Path,
                         all_frames: epath.Path,
                         video_range_to_use: Optional[Tuple[int, int]] = None):
    beam = tfds.core.lazy_imports.apache_beam
    annotations = json.loads(annotations.read_text())
    video_id_to_tracks, videos = _build_annotations_index(annotations)
    height = self._builder_config.height
    width = self._builder_config.width
    only_frames_with_labels = self._builder_config.only_frames_with_labels

    def _frame_index(frame_filename):
      """Convert a video frame filename into a numerical index."""
      basename = os.path.basename(os.fspath(frame_filename))
      return int(basename.split('.')[0])

    def _process_example(video_id):
      """Process a single video into a data example."""
      video = videos[video_id]
      if only_frames_with_labels:
        frames_list = [all_frames / file for file in video['file_names']]
      else:
        video_dir = os.path.dirname(video['file_names'][0])
        video_directory = all_frames / video_dir
        frames_list = list(video_directory.glob('*'))
      frames_list = sorted(frames_list, key=_frame_index)
      data_example['metadata'] = _create_metadata(video, height, width,
                                                  len(frames_list))
      data_example['tracks'] = []
      track_annotations = video_id_to_tracks[video_id]
      for track in track_annotations:
        data_example['tracks'].append(
            _create_per_track_annotation(video, frames_list, track, height,
                                         width))
      data_example['video'] = self._maybe_resize_video(frames_list)
      return data_example['metadata']['video_name'], data_example

    video_keys = list(videos.keys())
    if video_range_to_use is not None:
      video_keys = video_keys[video_range_to_use[0]:video_range_to_use[1]]

    return beam.Create(video_keys) | beam.Map(_process_example)
Example #12
def exists(path: epath.Path) -> bool:
    """Checks if path exists. Returns False if issues occur connecting to GCS."""
    try:
        return path.exists()
    except GCS_UNAVAILABLE_EXCEPTIONS:  # pylint: disable=catching-non-exception
        return False