def _save_url_info_and_rename(
    self,
    url: str,
    url_path: str,
    url_info: checksums.UrlInfo,
) -> str:
    """Saves the recorded checksums, then moves the download to its final path.

    The file is expected to already be present at `url_path` when this method
    is called.

    Args:
      url: Url that was downloaded. Must already be present in
        `self._recorded_url_infos`.
      url_path: Current location of the downloaded file.
      url_info: Information about the downloaded file; its `checksum` is used
        to compute the final path.

    Returns:
      The path of the downloaded file after renaming.
    """
    # NOTE: Downloads are cached even when no checksum is available, so the
    # checksums could be recorded once globally at the end instead of once
    # per url.
    assert url in self._recorded_url_infos
    self._record_url_infos()

    # Only move the file (and its info file) once the checksums have been
    # saved successfully.
    checksum = url_info.checksum
    final_path = self._get_final_dl_path(url, checksum)
    tf.io.gfile.rename(url_path, final_path, overwrite=True)
    resource_lib.rename_info_file(url_path, final_path, overwrite=True)
    return final_path
def _rename_and_get_final_dl_path(
    self,
    url: str,
    path: epath.Path,
    expected_url_info: Optional[checksums.UrlInfo],
    computed_url_info: Optional[checksums.UrlInfo],
    checksum_path: Optional[epath.Path],
    url_path: epath.Path,
) -> epath.Path:
    """Moves the downloaded file to its final location when needed.

    `path` is expected to be one of:
      * A manually downloaded file (located under `self._manual_dir`).
      * The (cached) `checksum_path`.
      * The (cached) `url_path`.
      * A freshly downloaded `tmp_dir/file`.

    Args:
      url: Url of the resource; written to the info file when the file comes
        from a tmp dir.
      path: Current location of the file.
      expected_url_info: Url info which `computed_url_info` must equal when
        `path` is already `checksum_path`.
      computed_url_info: Url info written to the info file when the file comes
        from a tmp dir.
      checksum_path: Destination used when checksums are registered; falsy
        otherwise.
      url_path: Destination used when `checksum_path` is not available.

    Returns:
      The path at which the file is located once this method returns.
    """
    # Case 1: Manually downloaded data is left where it is.
    if self._manual_dir and path.is_relative_to(self._manual_dir):
        return path

    # Case 2: File is already at its final destination.
    if path == checksum_path:
        assert computed_url_info == expected_url_info  # Sanity check
        return checksum_path  # pytype: disable=bad-return-type

    # Case 3: File is cached at the url-based path.
    if path == url_path:
        if not checksum_path:
            # Checksums not registered: nothing to rename.
            return path
        # Checksums were registered: move the info file, then the file itself,
        # over to `checksum_path`.
        resource_lib.rename_info_file(path, checksum_path, overwrite=True)
        return path.replace(checksum_path)

    # Case 4: File was downloaded in a tmp dir.
    destination = checksum_path or url_path
    resource_lib.write_info_file(
        url=url,
        path=destination,
        dataset_name=self._dataset_name,
        original_fname=path.name,
        url_info=computed_url_info,
    )
    path.replace(destination)
    # Cleanup tmp dir (will fail if dir not empty)
    path.parent.rmdir()
    return destination