def _download(self, resource):
    """Download resource, returns Promise->path to downloaded file."""
    # Accept either a bare URL string or a Resource object.
    if isinstance(resource, six.string_types):
        resource = resource_lib.Resource(url=resource)
    url = resource.url

    # Fast path: if the checksum for this URL is known and the file is
    # already on disk, reuse it (unless a re-download was forced).
    if url in self._sizes_checksums:
        expected_sha256 = self._sizes_checksums[url][1]
        final_path = self._get_final_dl_path(url, expected_sha256)
        if not self._force_download and resource.exists_locally(final_path):
            logging.info('URL %s already downloaded: reusing %s.', url,
                         final_path)
            self._recorded_sizes_checksums[url] = self._sizes_checksums[url]
            return promise.Promise.resolve(final_path)

    # There is a slight difference between downloader and extractor here:
    # the extractor manages its own temp directory, while the
    # DownloadManager manages the temp directory of downloader.
    tmp_dir = os.path.join(
        self._download_dir,
        '%s.tmp.%s' % (resource_lib.get_dl_dirname(url), uuid.uuid4().hex))
    tf.io.gfile.makedirs(tmp_dir)
    logging.info('Downloading %s into %s...', url, tmp_dir)

    def on_downloaded(val):
        # Downloader resolves with a (checksum, size) pair.
        checksum, dl_size = val
        return self._handle_download_result(resource, tmp_dir, checksum,
                                            dl_size)

    return self._downloader.download(url, tmp_dir).then(on_downloaded)
def __init__(self, name, url=None):
    """Fake resource: derives URL, content and checksum fields from *name*."""
    if not url:
        url = 'http://foo-bar.ch/%s' % name
    content = 'content of %s' % name
    self.url = url
    self.content = content
    self.size = len(content)
    self.sha = _sha256(content)
    # Both tuple orderings are exposed for test convenience.
    self.size_checksum = (self.size, self.sha)
    self.checksum_size = (self.sha, self.size)
    self.dl_fname = resource_lib.get_dl_fname(url, self.sha)
    self.dl_tmp_dirname = resource_lib.get_dl_dirname(url)
def _download_and_extract_multipart(
    self,
    dl_manager: tfds.download.DownloadManager,
    url: str,
    parts: int,
    pwd: str = None):
  """Download and extract a multipart zip file.

  Args:
    dl_manager: Download manager used to fetch the individual parts.
    url: Base URL of the archive; part ``i`` is fetched from
      ``url + '.00i'`` (1-based, zero-padded to 3 digits).
    parts: Number of parts composing the archive.
    pwd: Optional password for the zip archive.

  Returns:
    Path to the directory the archive was extracted into.
  """
  # Write OpenPose disclaimer
  if self._builder_config.include_pose == "openpose":
    print(_OPENPOSE_DISCLAIMER)

  # NOTE(review): this reaches into DownloadManager private attributes
  # (_download_dir / _extract_dir) — confirm against the tfds version used.
  dirname = get_dl_dirname(url)
  output_path = os.path.join(dl_manager._download_dir, dirname)
  output_path_extracted = os.path.join(dl_manager._extract_dir, dirname)

  if not os.path.isfile(output_path):
    # Download every part, then concatenate them into a single archive.
    # (Was: the `parts` parameter was shadowed by this list of URLs.)
    part_urls = [url + f".{i + 1:03}" for i in range(parts)]
    files = dl_manager.download(part_urls)
    # "wb" rather than "ab": always start from an empty archive file.
    with open(output_path, "wb") as cat_file:
      for part_path in files:
        with open(part_path, "rb") as part_file:
          cat_file.write(part_file.read())

  if not os.path.isdir(output_path_extracted):
    # Extract the concatenated archive into its own directory.
    os.makedirs(output_path_extracted)
    pwd_bytes = bytes(pwd, "utf-8") if pwd is not None else None
    with ZipFile(output_path, "r") as zip_obj:
      members = zip_obj.namelist()
      for member in tqdm(iterable=members, total=len(members)):
        zip_obj.extract(member=member, path=output_path_extracted,
                        pwd=pwd_bytes)

  return output_path_extracted
def _download(self, resource: Union[str, resource_lib.Resource]):
  """Download resource, returns Promise->path to downloaded file.

  Args:
    resource: The URL to download.

  Returns:
    path: The path to the downloaded resource.
  """
  # Accept either a bare URL string or a Resource object.
  if isinstance(resource, six.string_types):
    resource = resource_lib.Resource(url=resource)
  url = resource.url

  # Path the file would have when keyed purely by the URL hash, plus any
  # already-downloaded copy found on disk.
  url_hash = hashlib.sha256(url.encode('utf-8')).hexdigest()
  url_path = self._get_final_dl_path(url, url_hash)
  existing_path = self._find_existing_path(url=url, url_path=url_path)

  # When registering checksums and the file already sits at its url-keyed
  # path: record its url_info and rename `url_path` -> `file_path`.
  if self._register_checksums and existing_path == url_path:
    logging.info(
        'URL %s already downloaded: Recording checksums from %s.',
        url,
        existing_path,
    )
    future = self._executor.submit(
        self._save_url_info_and_rename,
        url=url,
        url_path=url_path,
        url_info=self._recorded_url_infos[url],
    )
    return promise.Promise.resolve(future)

  # Otherwise url_infos are either registered already, or will be recorded
  # by the `_handle_download_result` callback. Reuse any cached file.
  if existing_path:
    logging.info('URL %s already downloaded: reusing %s.', url,
                 existing_path)
    return promise.Promise.resolve(existing_path)

  # No cached copy: download now, eventually computing the checksums.
  # There is a slight difference between downloader and extractor here:
  # the extractor manages its own temp directory, while the
  # DownloadManager manages the temp directory of downloader.
  tmp_dir = os.path.join(
      self._download_dir,
      '%s.tmp.%s' % (resource_lib.get_dl_dirname(url), uuid.uuid4().hex))
  tf.io.gfile.makedirs(tmp_dir)
  logging.info('Downloading %s into %s...', url, tmp_dir)

  def on_downloaded(url_info):
    return self._handle_download_result(
        resource=resource,
        tmp_dir_path=tmp_dir,
        url_path=url_path,
        url_info=url_info,
    )

  return self._downloader.download(
      url, tmp_dir, verify=self._verify_ssl).then(on_downloaded)
def test_(self):
    """Each URL in self.urls must map to its expected download dirname."""
    for url, want in zip(self.urls, self.expected):
        got = resource.get_dl_dirname(url)
        self.assertEqual(got, want)