Example #1
from typing import Dict

from etils import epath

# `UrlInfo`, `load_url_infos` and `_filenames_equal` are assumed to be
# defined elsewhere in the same module.
def save_url_infos(
    path: epath.Path,
    url_infos: Dict[str, UrlInfo],
) -> None:
    """Store given checksums and sizes for specific dataset.

  Content of file is never disgarded, only updated. This is to ensure that if
  process is killed right after first download finishes, checksums registered
  during previous runs aren't lost.

  It is the responsibility of the caller not to call function multiple times in
  parallel for a given dataset.

  Only original file content is updated. This means the entire set of new sizes
  and checksums must be given at every call.

  Args:
    path: Path to the resources.
    url_infos: dict, {url: (size_in_bytes, checksum)}.
  """
    original_data = load_url_infos(path) if path.exists() else {}
    new_data = original_data.copy()
    new_data.update(url_infos)
    # Compare filenames separately, as the filename field is excluded from
    # equality (eq=False).
    if original_data == new_data and _filenames_equal(original_data, new_data):
        return
    lines = [
        f'{url}\t{int(url_info.size)}\t{url_info.checksum}\t'
        f'{url_info.filename or ""}\n'
        for url, url_info in sorted(new_data.items())
    ]
    path.write_text(''.join(lines), encoding='UTF-8')
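
A minimal usage sketch follows. The `UrlInfo` dataclass below is a hypothetical stand-in for the real class defined alongside `save_url_infos`; it only mirrors the detail the comment above relies on, namely that `filename` is excluded from equality:

import dataclasses
from typing import Optional

from etils import epath

@dataclasses.dataclass
class UrlInfo:  # Hypothetical stand-in for the module's real class.
    size: int
    checksum: str
    # compare=False excludes filename from ==, which is why a separate
    # _filenames_equal check is needed above.
    filename: Optional[str] = dataclasses.field(default=None, compare=False)

infos = {
    'https://example.com/data.zip': UrlInfo(
        size=1024, checksum='deadbeef', filename='data.zip'),
}
save_url_infos(epath.Path('/tmp/checksums.tsv'), infos)
# A second call merges with the existing file content, so entries
# registered by earlier runs are preserved.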
Example #2
import datetime
import os
from typing import Optional

from etils import epath


def _get_cached_copy(file_path: epath.Path,
                     max_age_days: int) -> Optional[str]:
    """Returns the file content if it was modified within `max_age_days`, else None."""
    if file_path.exists():
        stats = os.stat(file_path)
        modified_time = datetime.datetime.fromtimestamp(stats.st_mtime)
        if modified_time > datetime.datetime.now() - datetime.timedelta(
                days=max_age_days):
            return file_path.read_text()
    return None
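
A usage sketch, assuming the caller refreshes and rewrites the cache file when it is stale; the path and the `fetch_remote_listing` helper are hypothetical:

from etils import epath

cache_path = epath.Path('/tmp/tfds_cache/listing.json')  # illustrative path
content = _get_cached_copy(cache_path, max_age_days=1)
if content is None:
    content = fetch_remote_listing()  # hypothetical refresh helper
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    cache_path.write_text(content)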
Example #3
from typing import List

from etils import epath

# `DatasetPackage`, `naming` and `get_dataset_source` are assumed to be
# defined elsewhere in the module.
def list_ds_packages_for_namespace(
    namespace: str,
    path: epath.Path,
) -> List[DatasetPackage]:
    """Returns the dataset names found in a specific directory.

  Directories that contain code should have the following structure:

  ```
  <path>/
      <dataset0>/
          <dataset0>.py
      <dataset1>/
          <dataset1>.py
      ...
  ```

  Additional files or folders which are not detected as datasets will be
  ignored (e.g. `__init__.py`).

  Args:
    namespace: Namespace of the datasets
    path: The directory path containing the datasets.

  Returns:
    ds_packages: The dataset packages found in the directory (sorted for
      determinism).

  Raises:
    FileNotFoundError: If the path cannot be reached.
  """
    if not path.exists():
        # Should be fault-tolerant in the future
        raise FileNotFoundError(f'Could not find datasets at {path}')

    all_packages = []
    for ds_path in path.iterdir():
        source = get_dataset_source(ds_path)
        if source:
            pkg = DatasetPackage(
                name=naming.DatasetName(namespace=namespace,
                                        name=ds_path.name),
                source=source,
            )
            all_packages.append(pkg)

    # Sort by name for determinism, as promised in the docstring
    # (iterdir order is filesystem-dependent).
    return sorted(all_packages, key=lambda pkg: str(pkg.name))
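
A sketch of a call against a directory laid out as in the docstring; the namespace and path are illustrative:

from etils import epath

packages = list_ds_packages_for_namespace(
    namespace='my_namespace',
    path=epath.Path('/data/datasets/my_namespace'),
)
for pkg in packages:
    print(pkg.name)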
Example #4
from etils import epath

# `GCS_UNAVAILABLE_EXCEPTIONS` is assumed to be a tuple of exception classes
# defined elsewhere in the module.
def exists(path: epath.Path) -> bool:
    """Checks if path exists. Returns False if issues occur connecting to GCS."""
    try:
        return path.exists()
    except GCS_UNAVAILABLE_EXCEPTIONS:  # pylint: disable=catching-non-exception
        return False
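
A sketch of why the wrapper is useful: a bare `path.exists()` would raise on transient GCS failures, while this version degrades to `False`. The bucket path is illustrative:

from etils import epath

path = epath.Path('gs://my-bucket/dataset_info')  # illustrative GCS path
if exists(path):
    data = path.read_text()
else:
    data = None  # path missing, or GCS temporarily unreachable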