Beispiel #1
0
def save_url_infos(
    path: type_utils.ReadWritePath,
    url_infos: Dict[str, UrlInfo],
) -> None:
    """Merge the given URL infos into the checksums file at `path`.

    Existing file content is never discarded, only updated: the entries
    already stored are loaded (when the file exists), overlaid with
    `url_infos`, and the merged result is written back. This way, if the
    process is killed right after a first download finishes, checksums
    registered during previous runs are not lost.

    The file is left untouched when the merge changes nothing.

    The caller is responsible for not calling this function multiple times
    in parallel for the same dataset.

    Output format: one line per URL, sorted by URL, with tab-separated
    fields `url`, `size`, `checksum`, `filename` (empty when unset).

    Args:
      path: Path of the file holding the URL infos.
      url_infos: Mapping from URL to its `UrlInfo`; entries here are added
        to, or override, the ones already stored in the file.
    """
    if path.exists():
        previous = load_url_infos(path)
    else:
        previous = {}

    merged = previous.copy()
    merged.update(url_infos)

    # Filenames are compared separately, as the filename field is eq=False
    # and is therefore ignored by the `==` comparison.
    nothing_changed = previous == merged and _filenames_equal(previous, merged)
    if nothing_changed:
        return

    rows = []
    for url, info in sorted(merged.items()):
        rows.append(
            f'{url}\t{int(info.size)}\t{info.checksum}\t'
            f'{info.filename or ""}\n'
        )
    path.write_text(''.join(rows), encoding='UTF-8')
Beispiel #2
0
def exists(path: type_utils.ReadWritePath) -> bool:
    """Returns whether `path` exists, or False if GCS cannot be reached."""
    try:
        found = path.exists()
    except (
            # On windows, gs:// isn't supported.
            # TODO(tfds): Investigate why windows, gs:// isn't supported.
            # https://github.com/tensorflow/tensorflow/issues/38477
            tf.errors.UnimplementedError,
            # Raised by TF.
            tf.errors.FailedPreconditionError,
            # Some environments block GCS access.
            tf.errors.PermissionDeniedError,
            # All 10 retry attempts failed.
            tf.errors.AbortedError,
    ):
        # Any of the above is reported to the caller as "does not exist".
        return False
    else:
        return found
Beispiel #3
0
def exists(path: type_utils.ReadWritePath) -> bool:
    """Returns whether `path` exists, or False if GCS cannot be reached."""
    try:
        path_found = path.exists()
    except GCS_UNAVAILABLE_EXCEPTIONS:  # pylint: disable=catching-non-exception
        # Connection problems are reported to the caller as "does not exist".
        path_found = False
    return path_found