def url_retrieve(url: str, outfile: str, chunk_size: int = 128,
                 timeout: float = None):
    """Improved urlretrieve with progressbar, timeout and chunker.

    This downloader has a built-in progress bar using tqdm and, via the
    `requests` package, improves standard `urllib` behavior by adding
    time-out capability.

    I tested different chunk_sizes and most of the time 128 was actually
    fastest, YMMV.

    Parameters
    ----------
    url : str, urlpath.URL
        The URL to download
    outfile : str, pathlib.Path
        The path where to store the downloaded file.
    chunk_size : int, optional
        The size of the chunk for the request.iter_content call. Default: 128
    timeout : float, optional
        Seconds to wait for the server's response before giving up, passed
        straight to `requests.get`. Default: None (wait indefinitely, the
        previous behavior).

    Raises
    ------
    ConnectionError
        If the server answers with a status code other than 200.

    See also
    --------
    Inspired by https://stackoverflow.com/a/61575758/680232
    """
    # Fixed: the docstring advertised time-out capability, but no timeout
    # was ever passed to requests.get.
    R = requests.get(url, stream=True, allow_redirects=True, timeout=timeout)
    if R.status_code != 200:
        raise ConnectionError(
            f"Could not download {url}\nError code: {R.status_code}")
    with tqdm.wrapattr(
        open(outfile, "wb"),
        "write",
        miniters=1,
        total=int(R.headers.get("content-length", 0)),
        desc=str(Path(outfile).name),
    ) as fd:
        for chunk in R.iter_content(chunk_size=chunk_size):
            fd.write(chunk)
def download_file(url, filename):
    """Stream *url* into *filename*, showing a tqdm progress bar."""
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    sink = open(filename, "wb")
    # wrapattr makes every write() call advance the progress bar.
    with tqdm.wrapattr(sink, "write", miniters=1, total=total,
                       desc=filename) as fout:
        for block in resp.iter_content(chunk_size=4096):
            fout.write(block)
def download_file(url: str, output_path: str, desc: str):
    """Downloads a file with progress shown."""
    # NOTE(review): `urllib.urlopen` implies either Python 2 or an alias
    # such as `from urllib import request as urllib` at file level — confirm.
    resp = urllib.urlopen(url)
    # Content-Length may be absent; tqdm accepts total=None (indeterminate).
    total = getattr(resp, "length", None)
    with tqdm.wrapattr(
        open(output_path, "wb"),
        "write",
        miniters=1,
        desc=desc,
        total=total,
    ) as sink:
        for block in resp:
            sink.write(block)
def download(in_args):
    """Download in_args[1] into file in_args[0] unless it already exists.

    Data is first written to a ``.part`` file and only renamed to its
    final name after a complete download, so an interrupted transfer
    never leaves a truncated file behind.
    """
    target, link = in_args[0], in_args[1]
    if os.path.isfile(target):
        return
    resp = requests.get(link, stream=True)
    partial = target + ".part"
    size = int(resp.headers.get('content-length', 0))
    with tqdm.wrapattr(open(partial, "wb"), "write", miniters=1,
                       total=size, desc=target[-20:]) as fout:
        for block in resp.iter_content(chunk_size=4096):
            fout.write(block)
    os.rename(partial, target)
def unzip(url: str, dest: PosixPath, chunk_size: int = 1024*1024,
          remove_zip: bool = False):
    """Download a zip file and extract it into *dest*.

    Parameters
    ----------
    url : str
        URI of the zip file.
    dest : PosixPath
        Destination folder; if a file path is given, its parent is used.
    chunk_size : int, optional
        Download chunk size in bytes, default 1 MB.
    remove_zip : bool, optional
        Unlink the zip file after the unzip operation. Default False.

    Raises
    ------
    Exception
        If the reported content length is below 1 byte or *dest* does
        not exist.
    """
    # SECURITY(review): verify=False disables TLS certificate checking;
    # kept for backward compatibility but should be reconsidered.
    stream = requests.get(url, stream=True, verify=False, allow_redirects=True)
    filename = stream.url.split(sep="/")[-1]
    length = int(stream.headers.get("content-length", -1))
    if length < 1:
        raise Exception("content length is less than 1 bytes")
    if not dest.exists():
        raise Exception(f"destination folder does not exist: {dest}")
    if dest.is_file():
        dest = dest.parent
    dest = dest.resolve()
    typer.echo("Downloading zip file...")
    with tqdm.wrapattr(
            open(dest.joinpath(filename), "wb"), "write",
            unit='B', unit_scale=True, unit_divisor=1024,
            miniters=1, desc=filename, total=length) as f:
        for chunk in stream.iter_content(chunk_size=chunk_size):
            if chunk:
                f.write(chunk)
                f.flush()
    typer.echo("Extracting zip file...")
    with zipfile.ZipFile(dest.joinpath(filename)) as zippo:
        for member in tqdm(zippo.infolist(), desc="Extracting zip file..."):
            zippo.extract(member, dest)
    if remove_zip:
        dest.joinpath(filename).unlink()
        # Fixed: these f-strings had no placeholders and reported nothing
        # useful; name the file that was processed.
        typer.secho(f"{filename} is removed.", bold=True, fg="red")
    else:
        typer.secho(f"{filename} is unzipped in {dest}.", bold=True, fg="green")
def download(in_args):
    """Download in_args[1] into file in_args[0], then record in_args[2] in the DB."""
    eg_file = in_args[0]  # local output path
    eg_link = in_args[1]  # remote URL
    response = requests.get(eg_link, stream=True)
    # Wrap the file handle so every write() advances the progress bar;
    # desc shows only the trailing 20 chars of the path to keep it short.
    with tqdm.wrapattr(open(eg_file, "wb"), "write", miniters=1,
                       total=int(response.headers.get('content-length', 0)),
                       desc=eg_file[-20:]) as fout:
        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)
    db = Database()
    db.set_download(in_args[2])
    # NOTE(review): calling __del__ directly is unusual — the interpreter may
    # invoke it again at garbage collection. A close() method or context
    # manager on Database would be safer; confirm __del__ is idempotent.
    db.__del__()
def get(url: str):
    """File download command: ``cit get <url>``."""
    # Resolve the real download URL via the project's URL helper.
    download_url = cit_url.main(url)
    print(f'下载地址是:{download_url}')
    file_name = download_url.split('/')[-1]
    typer.echo(f"开始下载文件:{file_name}")
    resp = requests.get(download_url, stream=True)
    total = int(resp.headers.get('content-length', 0))
    with tqdm.wrapattr(open(file_name, "wb"), "write", miniters=1,
                       total=total, desc=file_name) as sink:
        for block in resp.iter_content(chunk_size=4096):
            sink.write(block)
def install(library_name, library_install):
    """Run the ordered install steps for *library_name*.

    Each step is a dict with a 'type' key:
      - copy:        {'from', 'to'}         copy a file
      - download:    {'from', 'to'}         HTTP download with progress bar
      - extract:     {'from', 'to'}         unpack a tar archive
      - remove:      {'path'}               delete a file or directory tree
      - replaceText: {'file','old','new'}   in-place text substitution

    Raises
    ------
    ValueError
        For an unknown step type.
    """
    print(f'Install: {library_name}')
    for step in library_install:
        step_type = step['type']
        if step_type == "copy":
            copy_from = step['from']
            copy_to = step['to']
            shutil.copyfile(copy_from, copy_to)
            print(f"copy: {copy_to} success")
        elif step_type == "download":
            download_from = step['from']
            download_to = step['to']
            with requests.get(download_from, stream=True) as r:
                # Fixed: int(None) raised TypeError when the server sent no
                # Content-Length header; default to 0 (indeterminate bar).
                total_length = int(r.headers.get("Content-Length", 0))
                with tqdm.wrapattr(r.raw, "read", total=total_length, desc="") as raw:
                    with open(download_to, 'wb') as output:
                        shutil.copyfileobj(raw, output)
            print(f'download: {download_to} success')
        elif step_type == "extract":
            tar_file: str = step['from']
            tar_mode = tar_file.split('.')[-1]
            assert tar_mode in ['gz', 'bz2', 'xz', 'tar']
            print(f'extract: {tar_file}')
            # SECURITY(review): extractall on an untrusted archive can write
            # outside the target directory (path traversal) — confirm the
            # archives are trusted.
            with tarfile.open(tar_file, 'r') as tar:
                tar.extractall(path=step['to'], members=tar)
            print(f'extract: {tar_file} success')
        elif step_type == 'remove':
            remove_path = step['path']
            if os.path.isfile(remove_path):
                os.remove(remove_path)
            else:
                shutil.rmtree(remove_path)
            print(f'remove: {remove_path} success')
        elif step_type == "replaceText":
            filename = step['file']
            replace_old = step['old']
            replace_new = step['new']
            # Fixed: the original leaked both file handles (open() without
            # close or a context manager).
            with open(filename, encoding='utf-8') as fh:
                file_content = fh.read()
            file_content = file_content.replace(replace_old, replace_new)
            with open(filename, 'w', encoding='utf-8') as fh:
                fh.write(file_content)
            print(f"replaceText: (unknown) success")
        else:
            print(step)
            raise ValueError
    print(f'Install: {library_name} success')
def _get_dataset(ds_name, ds_url):
    """
    @purpose: Connect and Download the necessary files
    @input:
        ds_name: str: DataSet Names
        ds_url: str: link to the datasets
    """
    response = requests.get(ds_url, stream=True)
    # Fixed: iter_content yields bytes, so the output file must be opened in
    # binary mode ("wb"); text mode ("w") raised TypeError on the first write.
    with tqdm.wrapattr(open(f'{ds_name}.csv', "wb"), "write",
                       miniters=1,
                       total=int(response.headers.get('content-length', 0)),
                       desc=ds_name) as fout:
        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)
def _download_items(self, remote_items, local_items):
    '''Given a list of remote and local items, download the remote data
    if it is not already found locally.

    Inputs:
    - remote_items: list of tuples containing (remote url, remote filesize)
    - local_items: list of local paths where data will be saved

    Outputs:
    - local_items: same as input

    Assumptions:
    - length of remote_items and local_items must match
    - filenames in remote_items and local_items must be in sequence
    '''
    remote_urls = [f[0] for f in remote_items]
    remote_sizes = [f[1] for f in remote_items]
    for remote, expected_size, local in zip(remote_urls, remote_sizes, local_items):
        # if we have an existing local file, check the filesize against the manifest
        if os.path.exists(local):
            # if all-good, continue to next file
            if os.stat(local).st_size == expected_size:
                LOGGER.info('Local file exists: %s' % local)
                continue
            # otherwise, delete the incomplete/malformed local file and redownload
            else:
                # Fixed: Logger.warn is a deprecated alias of warning().
                LOGGER.warning(
                    'Filesize mismatch with %s. Re-downloading...'
                    % os.path.basename(local)
                )
                os.remove(local)
        # use streamed download so we can wrap nicely with tqdm
        with self._session.get(remote, stream=True) as stream:
            with open(local, 'wb') as pipe:
                with tqdm.wrapattr(
                    pipe, method='write', miniters=1,
                    total=expected_size, desc=os.path.basename(local)
                ) as file_out:
                    for chunk in stream.iter_content(chunk_size=1024):
                        file_out.write(chunk)
    return local_items
def __download(self, url, filename):
    """Download an .xz archive from *url* and decompress it to *filename*
    with the ".xz" suffix stripped, creating parent directories as needed.

    Returns the resolved pathlib.Path of the decompressed file.
    Raises requests.HTTPError (via raise_for_status) or RuntimeError for
    non-200 responses.
    """
    d = lzma.LZMADecompressor()
    r = requests.get(url, stream=True, allow_redirects=True)
    if r.status_code != 200:
        r.raise_for_status()  # raises for 4xx/5xx
        raise RuntimeError(   # fallback for any other non-200 code
            f"Request to {url} returned status code {r.status_code}")
    file_size = int(r.headers.get('Content-Length', 0))
    path = pathlib.Path(filename.replace(".xz", "")).expanduser().resolve()
    path.parent.mkdir(parents=True, exist_ok=True)
    desc = "(Unknown total file size)" if file_size == 0 else ""
    # Force urllib3 to decode any transport encoding before tqdm counts bytes.
    r.raw.read = functools.partial(r.raw.read, decode_content=True)
    with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
        with path.open("wb") as f:
            # Fixed: decompress in fixed-size chunks instead of reading the
            # entire compressed payload into memory at once.
            while True:
                chunk = r_raw.read(65536)
                if not chunk:
                    break
                f.write(d.decompress(chunk))
    return path
def download_dataset(download_path='dataset/'):
    '''
    An ideally hidden function that downloads the dataset in case its not
    locally available.
    @params <string> download_path, where to download the dataset
            defaults to 'dataset/'
    '''
    # Destination path = download dir + last segment of the dataset URL.
    target = '{}{}'.format(download_path, BASE_DATA_SET_LINK.split('/')[-1])
    resp = requests.get(BASE_DATA_SET_LINK, stream=True)
    total = int(resp.headers.get('content-length', 0))
    with tqdm.wrapattr(open(target, "wb"), "write", miniters=1,
                       total=total, desc=target) as sink:
        for block in resp.iter_content(chunk_size=4096):
            sink.write(block)
    print("Download Completed!")
    return target
def download_file(data, retries=0):
    """Download data[1] (url) into data[0] (local path), retrying up to 5
    times on HTTP errors.

    Returns the url on success or when the file already exists; returns
    None once retries are exhausted.
    """
    local_filename = data[0]
    url = data[1]
    if os.path.isfile(local_filename):
        return url
    try:
        response = requests.get(url, stream=True)
        # Fixed: requests never raises HTTPError for 4xx/5xx on its own, so
        # without raise_for_status() the retry branch was unreachable.
        response.raise_for_status()
        with tqdm.wrapattr(open(local_filename, "wb"), "write",
                           miniters=1,
                           total=int(response.headers.get('content-length', 0)),
                           desc=local_filename[-20:]) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)
    except requests.exceptions.HTTPError:
        if retries > 5:
            return
        print("retrying", url)
        retries += 1
        # Fixed: propagate the recursive attempt's result instead of falling
        # through and unconditionally reporting success.
        return download_file((local_filename, url), retries)
    return url
def download_file(url: str, filepath: Union[Path, str], chunk_size=None,
                  retries=2, backoff_factor=1) -> Path:
    """Download a file

    Args:
        url: URL of the file to download
        filepath: Location to place file into
        chunk_size: Size to chunk the download into; None downloads without
            streaming
        retries: Number of retries to attempt
        backoff_factor: Factor for calculating time between retries

    Returns:
        download_path: The path to the downloaded file
    """
    filepath = Path(filepath)
    retry_strategy = Retry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    # Fixed: the session leaked if an exception fired before session.close();
    # the with-block guarantees cleanup on every path.
    with requests.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retry_strategy))
        session.mount('http://', HTTPAdapter(max_retries=retry_strategy))
        stream = chunk_size is not None
        with session.get(url, stream=stream) as s:
            s.raise_for_status()
            tqdm = get_tqdm_progress_bar()
            with tqdm.wrapattr(open(filepath, "wb"), 'write', miniters=1,
                               desc=filepath.name,
                               total=int(s.headers.get('content-length', 0))) as f:
                for chunk in s.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
    return filepath
# Derive a short display name from the last path segment of the URL.
eg_file = eg_link.replace('/', ' ').split()[-1]
# Map "/dev/null" to the platform-appropriate null device (os.devnull).
eg_out = opts['--output'].replace("/dev/null", devnull)
response = requests.get(eg_link, stream=True)
# Manual variant: advance a tqdm bar explicitly after each chunk write.
with open(eg_out, "wb") as fout:
    with tqdm(  # all optional kwargs
            unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
            desc=eg_file,
            total=int(response.headers.get('content-length', 0))) as pbar:
        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)
            pbar.update(len(chunk))

# Even simpler progress by wrapping the output file's `write()`
response = requests.get(eg_link, stream=True)
with tqdm.wrapattr(open(eg_out, "wb"), "write",
                   unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
                   desc=eg_file,
                   total=int(response.headers.get('content-length', 0))) as fout:
    for chunk in response.iter_content(chunk_size=4096):
        fout.write(chunk)
        # (tail of TqdmUpTo.update_to — the method's def line is outside
        # this view; b/bsize/tsize come from urlretrieve's reporthook)
        if tsize is not None:
            self.total = tsize  # record total once the size is known
        return self.update(b * bsize - self.n)  # also sets self.n = b * bsize


opts = docopt(__doc__)
eg_link = opts['--url']
# Short display name: last path segment of the URL.
eg_file = eg_link.replace('/', ' ').split()[-1]
# Map "/dev/null" to the platform null device for portability.
eg_out = opts['--output'].replace("/dev/null", devnull)
# with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
#          desc=eg_file) as t:  # all optional kwargs
#     urllib.urlretrieve(eg_link, filename=eg_out,
#                        reporthook=my_hook(t), data=None)
with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
              desc=eg_file) as t:  # all optional kwargs
    urllib.urlretrieve(eg_link, filename=eg_out,
                       reporthook=t.update_to, data=None)
    # Pin the total to bytes actually received so the bar closes cleanly.
    t.total = t.n

# Even simpler progress by wrapping the output file's `write()`
with tqdm.wrapattr(open(eg_out, "wb"), "write",
                   miniters=1, desc=eg_file) as fout:
    for chunk in urllib.urlopen(eg_link):
        fout.write(chunk)
import yaml

import src.logger
from tqdm.auto import tqdm

# NOTE(review): logging, os and requests are referenced below but their
# imports are outside this view — presumably above; confirm.
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    # Pipeline parameters: source URL and local download path.
    params = yaml.safe_load(open("params.yaml"))
    logger.info("Downloading %s", params["download"]["data-url"])
    gz_path = os.path.join(params["common"]["download-path"])
    response = requests.get(params["download"]["data-url"], stream=True)

    ### Download with a progress bar
    with tqdm.wrapattr(
        open(gz_path, "wb"),
        "write",
        miniters=1,
        total=int(response.headers.get("content-length", 0)),
        desc=gz_path,
    ) as fout:
        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)
    logger.info("Wrote data to %s", gz_path)
        # (tail of TqdmUpTo.update_to — the method's def line is outside
        # this view; b/bsize come from urlretrieve's reporthook)
        return self.update(b * bsize - self.n)  # also sets self.n = b * bsize


opts = docopt(__doc__)
eg_link = opts['--url']
# Short display name: last path segment of the URL.
eg_file = eg_link.replace('/', ' ').split()[-1]
# Map "/dev/null" to the platform null device for portability.
eg_out = opts['--output'].replace("/dev/null", devnull)
# with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
#          desc=eg_file) as t:  # all optional kwargs
#     urllib.urlretrieve(eg_link, filename=eg_out,
#                        reporthook=my_hook(t), data=None)
with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
              desc=eg_file) as t:  # all optional kwargs
    urllib.urlretrieve(  # nosec
        eg_link, filename=eg_out, reporthook=t.update_to, data=None)
    # Pin the total to bytes actually received so the bar closes cleanly.
    t.total = t.n

# Even simpler progress by wrapping the output file's `write()`
response = urllib.urlopen(eg_link)  # nosec
with tqdm.wrapattr(open(eg_out, "wb"), "write", miniters=1, desc=eg_file,
                   total=getattr(response, 'length', None)) as fout:
    for chunk in response:
        fout.write(chunk)