def _read_url_info(url_path: epath.PathLike) -> checksums.UrlInfo:
    """Loads the `UrlInfo` from the `.INFO` file."""
    file_info = resource_lib.read_info_file(url_path)
    if 'url_info' not in file_info:
        raise ValueError(
            f'Could not find `url_info` in {url_path}. This likely indicates '
            'that the files were downloaded with a previous version of TFDS '
            '(<=3.1.0).'
        )
    url_info = file_info['url_info']
    url_info.setdefault('filename', None)
    url_info['size'] = utils.Size(url_info['size'])
    return checksums.UrlInfo(**url_info)
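
For orientation, here is a plausible shape of the dict that `_read_url_info` expects back from `read_info_file`; only the keys the function actually touches are shown, and the concrete values are invented:

# Hypothetical `.INFO` payload (values are illustrative, not from a real file).
file_info = {
    'url_info': {
        'checksum': '04f38ebe...',  # hex digest of the downloaded file
        'size': 102,                # raw byte count, wrapped into utils.Size
        # 'filename' may be absent in older files; setdefault() fills in None.
    },
}
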
def test_compute_url_info():
  filepath = utils.tfds_path() / 'testing/test_data/6pixels.png'

  expected_url_info = checksums.UrlInfo(
      checksum='04f38ebed34d3b027d2683193766155912fba647158c583c3bdb4597ad8af34c',
      size=utils.Size(102),
      filename='6pixels.png',
  )
  url_info = checksums.compute_url_info(filepath, checksum_cls=hashlib.sha256)
  assert url_info == expected_url_info
  assert url_info.filename == expected_url_info.filename
Example #3
def compute_url_info(
    path: utils.PathLike,
    checksum_cls=hashlib.sha256,
) -> UrlInfo:
    """Locally compute size, checksums of the given file."""
    path = utils.as_path(path)

    checksum = checksum_cls()
    size = 0
    with path.open('rb') as f:
        while True:
            block = f.read(io.DEFAULT_BUFFER_SIZE)
            size += len(block)
            if not block:
                break
            checksum.update(block)

    return UrlInfo(
        checksum=checksum.hexdigest(),  # base64 digest would have been better.
        size=utils.Size(size),
        filename=path.name,
    )
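
As a side note, on Python 3.11+ the chunked read/update loop above can be expressed with `hashlib.file_digest`. A minimal sketch, assuming a local file whose size `os.stat` can report (the function name `quick_url_info` is made up for illustration):

import hashlib
import os

def quick_url_info(path):
    # hashlib.file_digest (Python 3.11+) streams the file in chunks internally.
    with open(path, 'rb') as f:
        digest = hashlib.file_digest(f, 'sha256')
    return digest.hexdigest(), os.stat(path).st_size
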
Example #4
def _parse_url_infos(checksums_file: Iterable[str]) -> Dict[str, UrlInfo]:
    """Returns {URL: (size, checksum)}s stored within given file."""
    url_infos = {}
    for line in checksums_file:
        line = line.strip()  # Remove the trailing '\r' on Windows OS.
        if not line or line.startswith('#'):
            continue
        values = line.split('\t')
        if len(values) == 1:  # not enough values to unpack (legacy files)
            # URL might have spaces inside, but size and checksum will not.
            values = line.rsplit(' ', 2)
        if len(values) == 4:
            url, size, checksum, filename = values
        elif len(values) == 3:
            url, size, checksum = values
            filename = None
        else:
            raise AssertionError(f'Error parsing checksums: {values}')
        url_infos[url] = UrlInfo(
            size=utils.Size(size),
            checksum=checksum,
            filename=filename,
        )
    return url_infos
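
To make the two accepted layouts concrete, here is a hedged round-trip sketch; the URLs, sizes, and digests are invented:

# Hypothetical checksums file: one tab-separated line with a filename,
# and one legacy space-separated line (the URL may itself contain spaces).
lines = [
    '# comments and blank lines are skipped',
    'https://example.com/data.zip\t1024\tabc123\tdata.zip',
    'https://example.com/old file.zip 2048 def456',
]
infos = _parse_url_infos(lines)
assert infos['https://example.com/data.zip'].filename == 'data.zip'
assert infos['https://example.com/old file.zip'].filename is None
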
Example #5
def download_size(self) -> utils.Size:
    """Downloaded files' size, in bytes."""
    # Fall back to the deprecated `size_in_bytes` if `download_size` is empty.
    return utils.Size(self.as_proto.download_size
                      or self.as_proto.size_in_bytes)
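
The fallback here is plain Python `or`: an unset numeric proto field reads as 0, which is falsy, so the deprecated `size_in_bytes` takes over. A minimal sketch with a stand-in object (using SimpleNamespace in place of the real proto is an assumption):

from types import SimpleNamespace

# Stand-in for `self.as_proto`; an unset proto int field reads as 0.
as_proto = SimpleNamespace(download_size=0, size_in_bytes=4096)
assert (as_proto.download_size or as_proto.size_in_bytes) == 4096
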
Example #6
def dataset_size(self) -> utils.Size:
    """Generated dataset files' size, in bytes."""
    # For old datasets, this may be empty.
    return utils.Size(
        sum(split.num_bytes for split in self.splits.values()))
Example #7
    def _sync_download(self,
                       url: str,
                       destination_path: str,
                       verify: bool = True) -> DownloadResult:
        """Synchronous version of `download` method.

    To download through a proxy, the `HTTP_PROXY`, `HTTPS_PROXY`,
    `REQUESTS_CA_BUNDLE`,... environment variables can be exported, as
    described in:
    https://requests.readthedocs.io/en/master/user/advanced/#proxies

    Args:
      url: url to download
      destination_path: path where to write it
      verify: whether to verify ssl certificates

    Returns:
      None

    Raises:
      DownloadError: when download fails.
    """
        try:
            # If url is on a filesystem that gfile understands, use copy. Otherwise,
            # use requests (http) or urllib (ftp).
            if not url.startswith('http'):
                return self._sync_file_copy(url, destination_path)
        except tf.errors.UnimplementedError:
            pass

        with _open_url(url, verify=verify) as (response, iter_content):
            fname = _get_filename(response)
            path = os.path.join(destination_path, fname)
            size = 0

            # Initialize the download size progress bar
            size_mb = 0
            unit_mb = units.MiB
            total_size = int(response.headers.get('Content-length',
                                                  0)) // unit_mb
            self._pbar_dl_size.update_total(total_size)
            with tf.io.gfile.GFile(path, 'wb') as file_:
                checksum = self._checksumer_cls()
                for block in iter_content:
                    size += len(block)
                    checksum.update(block)
                    file_.write(block)

                    # Update the download size progress bar
                    size_mb += len(block)
                    if size_mb > unit_mb:
                        self._pbar_dl_size.update(size_mb // unit_mb)
                        size_mb %= unit_mb
        self._pbar_url.update(1)
        return DownloadResult(
            path=utils.as_path(path),
            url_info=checksums_lib.UrlInfo(
                checksum=checksum.hexdigest(),
                size=utils.Size(size),
                filename=fname,
            ),
        )
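
The progress accounting above only ever reports whole-MiB increments and carries the sub-MiB remainder forward, which keeps the bar integer-valued. A self-contained sketch of that pattern (`emit` stands in for the progress-bar update; 1 MiB = 2**20 bytes):

MIB = 2 ** 20  # same role as units.MiB above

def report_progress(blocks, emit):
    """Feeds byte blocks through, emitting only whole-MiB increments."""
    pending = 0
    total = 0
    for block in blocks:
        total += len(block)
        pending += len(block)
        if pending > MIB:
            emit(pending // MIB)  # whole MiB received since the last report
            pending %= MIB        # keep the sub-MiB remainder for next time
    return total
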