Example 1
 def _rename_and_get_final_dl_path(
     self,
     url: str,
     path: epath.Path,
     expected_url_info: Optional[checksums.UrlInfo],
     computed_url_info: Optional[checksums.UrlInfo],
     checksum_path: Optional[epath.Path],
     url_path: epath.Path,
 ) -> epath.Path:
     """Eventually rename the downloaded file if checksums were recorded."""
     # `path` has one of four possible origins:
     #   * inside the manual dir (user supplied the file)
     #   * already at `checksum_path` (cache hit, checksums registered)
     #   * already at `url_path` (cache hit, no checksums)
     #   * `tmp_dir/file` (freshly downloaded)
     if self._manual_dir and path.is_relative_to(self._manual_dir):
         # Manually downloaded data: leave it where the user put it.
         return path
     if path == checksum_path:
         # Already at its final destination.
         assert computed_url_info == expected_url_info  # Sanity check
         return checksum_path  # pytype: disable=bad-return-type
     if path == url_path:
         if not checksum_path:
             # Checksums not registered: -> do nothing
             return path
         # Checksums were registered: Rename -> checksums_path
         resource_lib.rename_info_file(path,
                                       checksum_path,
                                       overwrite=True)
         return path.replace(checksum_path)
     # Otherwise the file sits in a tmp dir: move it to its destination.
     final_path = checksum_path or url_path
     resource_lib.write_info_file(
         url=url,
         path=final_path,
         dataset_name=self._dataset_name,
         original_fname=path.name,
         url_info=computed_url_info,
     )
     path.replace(final_path)
     path.parent.rmdir()  # Cleanup tmp dir (will fail if dir not empty)
     return final_path
Example 2
 def _handle_download_result(self, resource, tmp_dir_path, sha256, dl_size):
   """Store dled file to definitive place, write INFO file, return path."""
   # The downloader writes a single file into its private tmp dir.
   file_names = tf.io.gfile.listdir(tmp_dir_path)
   if len(file_names) > 1:
     raise AssertionError('More than one file in %s.' % tmp_dir_path)
   src_fname = file_names[0]
   src_path = os.path.join(tmp_dir_path, src_fname)
   # Remember the observed (size, sha256) pair for this url.
   self._recorded_sizes_checksums[resource.url] = (dl_size, sha256)
   if self._register_checksums:
     self._record_sizes_checksums()
   else:
     # Validate against the registered checksums.
     expected = self._sizes_checksums.get(resource.url, None)
     if (dl_size, sha256) != expected:
       raise NonMatchingChecksumError(resource.url, src_path)
   download_path = self._get_final_dl_path(resource.url, sha256)
   resource_lib.write_info_file(resource, download_path, self._dataset_name,
                                src_fname)
   # Unconditionally overwrite because either file doesn't exist or
   # FORCE_DOWNLOAD=true
   tf.io.gfile.rename(src_path, download_path, overwrite=True)
   tf.io.gfile.rmtree(tmp_dir_path)
   return download_path
Example 3
    def _handle_download_result(
        self,
        resource: resource_lib.Resource,
        tmp_dir_path: str,
        url_path: str,
        url_info: checksums.UrlInfo,
    ) -> str:
        """Post-processing of the downloaded file.

        * Write `.INFO` file
        * Rename `tmp_dir/file.xyz` -> `url_path`
        * Validate/record checksums
        * Eventually rename `url_path` -> `file_path` when
          `register_checksums=True`

        Args:
          resource: The url to download.
          tmp_dir_path: Temporary dir where the file was downloaded.
          url_path: Destination path.
          url_info: File checksums, size, computed during download.

        Returns:
          dst_path: `url_path` (or `file_path` when `register_checksums=True`)

        Raises:
          NonMatchingChecksumError: If the downloaded file's checksum does not
            match the one registered for the url.
        """
        # Extract the file name, path from the tmp_dir
        fnames = tf.io.gfile.listdir(tmp_dir_path)
        if len(fnames) != 1:
            raise ValueError(
                'Download not found for url {} in: {}. Found {} files, but expected '
                '1.'.format(resource.url, tmp_dir_path, len(fnames)))
        original_fname, = fnames  # Unpack list
        tmp_path = os.path.join(tmp_dir_path, original_fname)

        # Write `.INFO` file and rename `tmp_dir/file.xyz` -> `url_path`
        resource_lib.write_info_file(
            resource=resource,
            path=url_path,
            dataset_name=self._dataset_name,
            original_fname=original_fname,
            url_info=url_info,
        )
        # Unconditionally overwrite because either file doesn't exist or
        # FORCE_DOWNLOAD=true
        tf.io.gfile.rename(tmp_path, url_path, overwrite=True)
        tf.io.gfile.rmtree(tmp_dir_path)

        # After this checkpoint, the url file is cached, so should never be
        # downloaded again, even if there are errors when registering checksums.

        # Even if `_handle_download_result` is executed asynchronously, Python
        # built-in ops are atomic in CPython (and PyPy), so it should be safe
        # to update `_recorded_url_infos`.
        self._recorded_url_infos[resource.url] = url_info

        # Validate the download checksum, or register checksums
        dst_path = url_path
        if self._register_checksums:
            # Change `dst_path` from `url_path` -> `file_path`
            dst_path = self._save_url_info_and_rename(url=resource.url,
                                                      url_path=url_path,
                                                      url_info=url_info)
        elif resource.url not in self._url_infos:
            if self._force_checksums_validation:
                raise ValueError(f'Missing checksums url: {resource.url}, yet '
                                 '`force_checksums_validation=True`. '
                                 'Did you forget to register checksums?')
            # Otherwise, missing checksums, do nothing
        elif url_info != self._url_infos[resource.url]:
            # Bug fix: `tmp_path` no longer exists here (the file was renamed
            # to `url_path` and its tmp dir removed above), so the error must
            # report the path where the mismatching file actually lives.
            raise NonMatchingChecksumError(resource.url, url_path)

        return dst_path