Beispiel #1
0
    def _save_url_info_and_rename(
        self,
        url: str,
        url_path: str,
        url_info: checksums.UrlInfo,
    ) -> str:
        """Records the checksums, then moves `url_path` to its final location.

        The file is expected to already be present at `url_path` when this
        method is called.

        Args:
          url: Url the file was downloaded from. Must already be a key of
            `self._recorded_url_infos`.
          url_path: Current path of the downloaded file.
          url_info: Information on the downloaded file. Its `checksum` is used
            to compute the final path.

        Returns:
          The path of the downloaded file after renaming.
        """
        # The checksums/download sizes are recorded for every url. Because
        # downloads are cached even without checksum, recording them a single
        # time, globally at the end, would be a possible alternative.
        assert url in self._recorded_url_infos
        self._record_url_infos()

        # The file and its info file are only moved once the checksums were
        # saved successfully.
        final_path = self._get_final_dl_path(url, url_info.checksum)
        tf.io.gfile.rename(url_path, final_path, overwrite=True)
        resource_lib.rename_info_file(url_path, final_path, overwrite=True)
        return final_path
 def _rename_and_get_final_dl_path(
     self,
     url: str,
     path: epath.Path,
     expected_url_info: Optional[checksums.UrlInfo],
     computed_url_info: Optional[checksums.UrlInfo],
     checksum_path: Optional[epath.Path],
     url_path: epath.Path,
 ) -> epath.Path:
     """Eventually rename the downloaded file if checksums were recorded."""
     # `path` can be:
     # * Manually downloaded
     # * (cached) checksum_path
     # * (cached) url_path
     # * `tmp_dir/file` (downloaded path)
     if self._manual_dir and path.is_relative_to(self._manual_dir):
         return path  # Manually downloaded data
     elif path == checksum_path:  # Path already at final destination
         assert computed_url_info == expected_url_info  # Sanity check
         return checksum_path  # pytype: disable=bad-return-type
     elif path == url_path:
         if checksum_path:
             # Checksums were registered: Rename -> checksums_path
             resource_lib.rename_info_file(path,
                                           checksum_path,
                                           overwrite=True)
             return path.replace(checksum_path)
         else:
             # Checksums not registered: -> do nothing
             return path
     else:  # Path was downloaded in tmp dir
         dst_path = checksum_path or url_path
         resource_lib.write_info_file(
             url=url,
             path=dst_path,
             dataset_name=self._dataset_name,
             original_fname=path.name,
             url_info=computed_url_info,
         )
         path.replace(dst_path)
         path.parent.rmdir()  # Cleanup tmp dir (will fail if dir not empty)
         return dst_path