Example #1
from pathlib import Path

import requests
from tqdm.auto import tqdm


def url_retrieve(url: str, outfile: str, chunk_size: int = 128):
    """Improved urlretrieve with progress bar, timeout and chunker.

    This downloader has a built-in progress bar using tqdm, and by using the
    `requests` package it improves on standard `urllib` behavior by adding
    time-out capability.

    I tested different chunk sizes and most of the time 128 was actually the
    fastest; YMMV.

    Parameters
    ----------
    url : str, urlpath.URL
        The URL to download.
    outfile : str, pathlib.Path
        The path where to store the downloaded file.
    chunk_size : int, optional
        The size of the chunk for the requests.iter_content call. Default: 128.

    See also
    --------
    Inspired by https://stackoverflow.com/a/61575758/680232
    """
    R = requests.get(url, stream=True, allow_redirects=True)
    if R.status_code != 200:
        raise ConnectionError(
            f"Could not download {url}\nError code: {R.status_code}")
    with tqdm.wrapattr(
            open(outfile, "wb"),
            "write",
            miniters=1,
            total=int(R.headers.get("content-length", 0)),
            desc=str(Path(outfile).name),
    ) as fd:
        for chunk in R.iter_content(chunk_size=chunk_size):
            fd.write(chunk)
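A minimal usage sketch; the URL and output filename here are hypothetical placeholders:

url_retrieve("https://example.com/data/archive.zip", "archive.zip")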
Example #2
import requests
from tqdm.auto import tqdm


def download_file(url, filename):
    response = requests.get(url, stream=True)
    with tqdm.wrapattr(open(filename, "wb"), "write", miniters=1,
                       total=int(response.headers.get('content-length', 0)),
                       desc=filename) as fout:
        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)
Example #3
import urllib.request

from tqdm.auto import tqdm


def download_file(url: str, output_path: str, desc: str):
    """Downloads a file with progress shown."""
    # urllib.urlopen is Python 2; on Python 3 use urllib.request.urlopen.
    response = urllib.request.urlopen(url)
    with tqdm.wrapattr(
        open(output_path, "wb"),
        "write",
        miniters=1,
        desc=desc,
        total=getattr(response, "length", None),
    ) as fout:
        for chunk in response:
            fout.write(chunk)
Example #4
import os

import requests
from tqdm.auto import tqdm


def download(in_args):
    eg_file = in_args[0]  # destination file path
    eg_link = in_args[1]  # source URL
    if not os.path.isfile(eg_file):
        response = requests.get(eg_link, stream=True)
        with tqdm.wrapattr(open(eg_file + ".part", "wb"),
                           "write",
                           miniters=1,
                           total=int(response.headers.get('content-length',
                                                          0)),
                           desc=eg_file[-20:]) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)
        os.rename(eg_file + ".part", eg_file)
Example #5
import zipfile
from pathlib import PosixPath

import requests
import typer
from tqdm.auto import tqdm


def unzip(url: str, dest: PosixPath, chunk_size: int = 1024*1024, remove_zip: bool = False):
    """
    Downloads and unzips a zip file.

    parameters:
        url: str, URL of the zip file
        dest: PosixPath, destination folder
        chunk_size: int, default 1 MB
        remove_zip: bool, default False; unlinks the zip file after the unzip operation

    returns:
        None; progress is reported via a tqdm bar and typer echo messages
    """
    # verify=False disables TLS certificate checks; only use for trusted hosts.
    stream = requests.get(url, stream=True, verify=False, allow_redirects=True)
    filename = stream.url.split(sep="/")[-1]
    length = int(stream.headers.get("content-length", -1))

    if length < 1:
        raise Exception("content length is less than 1 byte")
    
    if not dest.exists():
        raise Exception(f"destination folder does not exist: {dest}")
    
    if dest.is_file():
        dest = dest.parent
        
    dest = dest.resolve()

    typer.echo("Downloading zip file...")

    with tqdm.wrapattr(
            open(dest.joinpath(filename), "wb"), "write",
            unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
            desc=filename, total=length) as f:
        for chunk in stream.iter_content(chunk_size=chunk_size):
            if chunk:
                f.write(chunk)
                f.flush()
                
    typer.echo("Extracting zip file...")
    
    with zipfile.ZipFile(dest.joinpath(filename)) as zippo:
        for member in tqdm(zippo.infolist(), desc="Extracting zip file..."):
            zippo.extract(member, dest)
            
    if remove_zip:
        dest.joinpath(filename).unlink()
        typer.secho(f"{filename} is removed.", bold=True, fg="red")
    else:
        typer.secho(f"{filename} is unzipped in {dest}.", bold=True, fg="green")
Example #6
import requests
from tqdm.auto import tqdm


def download(in_args):
    eg_file = in_args[0]
    eg_link = in_args[1]
    response = requests.get(eg_link, stream=True)
    with tqdm.wrapattr(open(eg_file, "wb"),
                       "write",
                       miniters=1,
                       total=int(response.headers.get('content-length', 0)),
                       desc=eg_file[-20:]) as fout:
        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)
    db = Database()  # project-specific class, not shown here
    db.set_download(in_args[2])
    # Calling __del__ directly is unusual; an explicit close() method
    # (or a context manager) would be the more idiomatic cleanup.
    db.__del__()
Example #7
File: main.py Project: LecoLu/cit
import requests
import typer
from tqdm.auto import tqdm


def get(url: str):
    """
    Download a file: cit get <url>
    """
    # cit_url is a module from the LecoLu/cit project; the identifier
    # 下载地址 means "download URL" and is kept as in the source.
    下载地址 = cit_url.main(url)
    print(f'The download URL is: {下载地址}')
    file_name = 下载地址.split('/')[-1]
    typer.echo(f"Starting download of file: {file_name}")
    r = requests.get(下载地址, stream=True)
    with tqdm.wrapattr(open(file_name, "wb"), "write", miniters=1,
                       total=int(r.headers.get('content-length', 0)),
                       desc=file_name) as fout:
        for chunk in r.iter_content(chunk_size=4096):
            fout.write(chunk)
Example #8
import os
import shutil
import tarfile

import requests
from tqdm.auto import tqdm


def install(library_name, library_install):
    print(f'Install: {library_name}')
    for step in library_install:
        step_type = step['type']
        if step_type == "copy":
            copy_from = step['from']
            copy_to = step['to']
            shutil.copyfile(copy_from, copy_to)
            print(f"copy: {copy_to} success")
        elif step_type == "download":
            download_from = step['from']
            download_to = step['to']

            with requests.get(download_from, stream=True) as r:
                total_length = int(r.headers.get("Content-Length", 0))
                with tqdm.wrapattr(r.raw, "read", total=total_length,
                                   desc="") as raw:
                    with open(download_to, 'wb') as output:
                        shutil.copyfileobj(raw, output)
            print(f'download: {download_to} success')
        elif step_type == "extract":
            tar_file: str = step['from']
            tar_mode = tar_file.split('.')[-1]
            assert tar_mode in ['gz', 'bz2', 'xz', 'tar']
            print(f'extract: {tar_file}')
            with tarfile.open(tar_file, 'r') as tar:
                tar.extractall(path=step['to'])  # extracts all members
            print(f'extract: {tar_file} success')
        elif step_type == 'remove':
            remove_path = step['path']
            if os.path.isfile(remove_path):
                os.remove(remove_path)
            else:
                shutil.rmtree(remove_path)
            print(f'remove: {remove_path} success')
        elif step_type == "replaceText":
            filename = step['file']
            with open(filename, encoding='utf-8') as f:
                file_content = f.read()
            replace_old = step['old']
            replace_new = step['new']
            file_content = file_content.replace(replace_old, replace_new)
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(file_content)
            print(f"replaceText: {filename} success")
        else:
            print(step)
            raise ValueError(f"unknown step type: {step_type}")
    print(f'Install: {library_name} success')
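A hypothetical `library_install` step list showing the shape this function expects; all names and paths below are made up:

example_steps = [
    {'type': 'download', 'from': 'https://example.com/lib.tar.gz',
     'to': 'lib.tar.gz'},
    {'type': 'extract', 'from': 'lib.tar.gz', 'to': 'vendor/'},
    {'type': 'replaceText', 'file': 'vendor/lib/config.h',
     'old': 'DEBUG 1', 'new': 'DEBUG 0'},
    {'type': 'remove', 'path': 'lib.tar.gz'},
]
install('example-lib', example_steps)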
Example #9
import requests
from tqdm.auto import tqdm


def _get_dataset(ds_name, ds_url):
    """
        @purpose: Connect and download the necessary files
        @input:
            ds_name: str: dataset name
            ds_url: str: link to the dataset
    """
    response = requests.get(ds_url, stream=True)

    # The file must be opened in binary mode ("wb"), since iter_content
    # yields bytes.
    with tqdm.wrapattr(open(f'{ds_name}.csv', "wb"),
                       "write",
                       miniters=1,
                       total=int(response.headers.get('content-length', 0)),
                       desc=ds_name) as fout:
        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)
Example #10
    def _download_items(self, remote_items, local_items):
        '''
        Given a list of remote and local items, download the remote data if it is not already
        found locally

        Inputs:
          - remote_items: list of tuples containing (remote url, remote filesize)
          - local_items: list of local paths where data will be saved

        Outputs:
          - local_items: same as input 

        Assumptions:
          - length of remote_items and local_items must match
          - filenames in remote_items and local_items must be in sequence
        '''
        remote_urls = [f[0] for f in remote_items]
        remote_sizes = [f[1] for f in remote_items]
        for remote, expected_size, local in zip(remote_urls, remote_sizes, local_items):
            # if we have an existing local file, check the filesize against the manifest
            if os.path.exists(local):
                # if all-good, continue to next file
                if os.stat(local).st_size == expected_size:
                    LOGGER.info('Local file exists: %s' % local)
                    continue
                # otherwise, delete the incomplete/malformed local file and redownload
                else:
                    LOGGER.warning(
                        'Filesize mismatch with %s. Re-downloading...' % os.path.basename(local)
                    )
                    os.remove(local)
            # use streamed download so we can wrap nicely with tqdm
            with self._session.get(remote, stream=True) as stream:
                with open(local, 'wb') as pipe:
                    with tqdm.wrapattr(
                        pipe,
                        method='write',
                        miniters=1,
                        total=expected_size,
                        desc=os.path.basename(local)
                    ) as file_out:
                        for chunk in stream.iter_content(chunk_size=1024):
                            file_out.write(chunk)
        return local_items
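The expected inputs, sketched with made-up values; `downloader` stands in for an instance of the owning class, which must provide a `requests.Session` as `self._session`:

remote_items = [("https://example.com/a.bin", 1024),
                ("https://example.com/b.bin", 2048)]  # (url, size in bytes)
local_items = ["data/a.bin", "data/b.bin"]
downloader._download_items(remote_items, local_items)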
Example #11
    def __download(self, url, filename):
        d = lzma.LZMADecompressor()
        r = requests.get(url, stream=True, allow_redirects=True)
        if r.status_code != 200:
            # raise_for_status() only raises for 4xx/5xx responses; the
            # RuntimeError below covers any other non-200 status.
            r.raise_for_status()
            raise RuntimeError(
                f"Request to {url} returned status code {r.status_code}")
        file_size = int(r.headers.get('Content-Length', 0))

        path = pathlib.Path(filename.replace(".xz", "")).expanduser().resolve()
        path.parent.mkdir(parents=True, exist_ok=True)

        desc = "(Unknown total file size)" if file_size == 0 else ""
        r.raw.read = functools.partial(r.raw.read, decode_content=True)
        with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
            with path.open("wb") as f:
                # Reads the whole compressed body into memory before
                # decompressing; see the chunked variant after this example.
                f.write(d.decompress(r_raw.read()))

        return path
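For large archives, the final block can decompress in chunks instead of buffering the whole response; a minimal sketch reusing `r`, `file_size`, `desc`, and `path` from above:

d = lzma.LZMADecompressor()
with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
    with path.open("wb") as f:
        # Feed the decompressor one chunk at a time.
        for chunk in iter(lambda: r_raw.read(64 * 1024), b""):
            f.write(d.decompress(chunk))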
Example #12
import requests
from tqdm.auto import tqdm


def download_dataset(download_path='dataset/'):
    '''
        An internal helper that downloads the dataset in case it's not
        locally available.
        @params <string> download_path, where to download the dataset;
                        defaults to 'dataset/'
    '''
    # BASE_DATA_SET_LINK is a module-level constant (not shown here).
    filename = '{}{}'.format(download_path, BASE_DATA_SET_LINK.split('/')[-1])
    response = requests.get(BASE_DATA_SET_LINK, stream=True)

    with tqdm.wrapattr(open(filename, "wb"),
                       "write",
                       miniters=1,
                       total=int(response.headers.get('content-length', 0)),
                       desc=filename) as fout:

        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)

    print("Download Completed!")
    return filename
Example #13
import os

import requests
from tqdm.auto import tqdm


def download_file(data, retries=0):
    local_filename = data[0]
    url = data[1]
    if os.path.isfile(local_filename):
        return url

    try:
        response = requests.get(url, stream=True)
        # raise_for_status() is required here: requests does not raise
        # HTTPError for bad status codes on its own.
        response.raise_for_status()
        with tqdm.wrapattr(open(local_filename, "wb"), "write", miniters=1,
                           total=int(response.headers.get('content-length', 0)),
                           desc=local_filename[-20:]) as fout:
            for chunk in response.iter_content(chunk_size=4096):
                fout.write(chunk)
    except requests.exceptions.HTTPError:
        if retries > 5:
            return
        print("retrying", url)
        retries += 1
        return download_file((local_filename, url), retries)

    return url
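The (local_filename, url) tuple argument makes this easy to fan out over a pool; a hypothetical sketch:

from multiprocessing.pool import ThreadPool

jobs = [("a.bin", "https://example.com/a.bin"),  # made-up work list
        ("b.bin", "https://example.com/b.bin")]
with ThreadPool(4) as pool:
    pool.map(download_file, jobs)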
Example #14
from pathlib import Path
from typing import Union

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def download_file(url: str,
                  filepath: Union[Path, str],
                  chunk_size=None,
                  retries=2,
                  backoff_factor=1) -> Path:
    """Download a file
    Args:
        url: URL of the file to download
        filepath: Location to place file into
        chunk_size: Size to chunk the download into
        retries: Number of retries to attempt
        backoff_factor: Factor for calculating time between retries
    Returns:
        filepath: The path to the downloaded file
    """
    filepath = Path(filepath)
    session = requests.Session()
    retry_strategy = Retry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    session.mount('https://', HTTPAdapter(max_retries=retry_strategy))
    session.mount('http://', HTTPAdapter(max_retries=retry_strategy))
    stream = chunk_size is not None
    with session.get(url, stream=stream) as s:
        s.raise_for_status()
        tqdm = get_tqdm_progress_bar()  # project-specific helper returning a tqdm class
        with tqdm.wrapattr(open(filepath, "wb"),
                           'write',
                           miniters=1,
                           desc=filepath.name,
                           total=int(s.headers.get('content-length', 0))) as f:
            for chunk in s.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
    session.close()

    return filepath
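A hypothetical call; passing a chunk_size switches the request into streaming mode:

download_file("https://example.com/big.dat", "/tmp/big.dat", chunk_size=8192)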
Example #15
from os import devnull

import requests
from docopt import docopt
from tqdm import tqdm

opts = docopt(__doc__)

eg_link = opts['--url']
eg_file = eg_link.replace('/', ' ').split()[-1]
eg_out = opts['--output'].replace("/dev/null", devnull)

response = requests.get(eg_link, stream=True)
with open(eg_out, "wb") as fout:
    with tqdm(
            # all optional kwargs
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
            miniters=1,
            desc=eg_file,
            total=int(response.headers.get('content-length', 0))) as pbar:
        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)
            pbar.update(len(chunk))

# Even simpler progress by wrapping the output file's `write()`
response = requests.get(eg_link, stream=True)
with tqdm.wrapattr(open(eg_out, "wb"),
                   "write",
                   unit='B',
                   unit_scale=True,
                   unit_divisor=1024,
                   miniters=1,
                   desc=eg_file,
                   total=int(response.headers.get('content-length',
                                                  0))) as fout:
    for chunk in response.iter_content(chunk_size=4096):
        fout.write(chunk)
Example #16
from os import devnull
from urllib import request as urllib

from docopt import docopt
from tqdm import tqdm


class TqdmUpTo(tqdm):
    """Provides `update_to(b, bsize, tsize)` for use as an urlretrieve
    reporthook (this wrapper comes from tqdm's own documentation)."""

    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        return self.update(b * bsize - self.n)  # also sets self.n = b * bsize


opts = docopt(__doc__)

eg_link = opts['--url']
eg_file = eg_link.replace('/', ' ').split()[-1]
eg_out = opts['--output'].replace("/dev/null", devnull)
# with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
#           desc=eg_file) as t:  # all optional kwargs
#     urllib.urlretrieve(eg_link, filename=eg_out,
#                        reporthook=my_hook(t), data=None)
with TqdmUpTo(unit='B',
              unit_scale=True,
              unit_divisor=1024,
              miniters=1,
              desc=eg_file) as t:  # all optional kwargs
    urllib.urlretrieve(eg_link,
                       filename=eg_out,
                       reporthook=t.update_to,
                       data=None)
    t.total = t.n

# Even simpler progress by wrapping the output file's `write()`
with tqdm.wrapattr(open(eg_out, "wb"), "write", miniters=1,
                   desc=eg_file) as fout:
    for chunk in urllib.urlopen(eg_link):
        fout.write(chunk)
Example #17
import logging
import os

import requests
import yaml

import src.logger  # project module, presumably configures logging
from tqdm.auto import tqdm

logger = logging.getLogger(__name__)

if __name__ == "__main__":

    params = yaml.safe_load(open("params.yaml"))

    logger.info("Downloading %s", params["download"]["data-url"])

    gz_path = os.path.join(params["common"]["download-path"])  # single-arg join: just the configured path
    response = requests.get(params["download"]["data-url"], stream=True)

    ### Download with a progress bar

    with tqdm.wrapattr(
        open(gz_path, "wb"),
        "write",
        miniters=1,
        total=int(response.headers.get("content-length", 0)),
        desc=gz_path,
    ) as fout:

        for chunk in response.iter_content(chunk_size=4096):
            fout.write(chunk)

    logger.info("Wrote data to %s", gz_path)
Example #18
from os import devnull
from urllib import request as urllib

from docopt import docopt
from tqdm import tqdm


class TqdmUpTo(tqdm):
    """Provides `update_to(b, bsize, tsize)` for use as an urlretrieve
    reporthook (this wrapper comes from tqdm's own documentation)."""

    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        return self.update(b * bsize - self.n)  # also sets self.n = b * bsize


opts = docopt(__doc__)

eg_link = opts['--url']
eg_file = eg_link.replace('/', ' ').split()[-1]
eg_out = opts['--output'].replace("/dev/null", devnull)
# with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
#           desc=eg_file) as t:  # all optional kwargs
#     urllib.urlretrieve(eg_link, filename=eg_out,
#                        reporthook=my_hook(t), data=None)
with TqdmUpTo(unit='B',
              unit_scale=True,
              unit_divisor=1024,
              miniters=1,
              desc=eg_file) as t:  # all optional kwargs
    urllib.urlretrieve(  # nosec
        eg_link, filename=eg_out, reporthook=t.update_to, data=None)
    t.total = t.n

# Even simpler progress by wrapping the output file's `write()`
response = urllib.urlopen(eg_link)  # nosec
with tqdm.wrapattr(open(eg_out, "wb"),
                   "write",
                   miniters=1,
                   desc=eg_file,
                   total=getattr(response, 'length', None)) as fout:
    for chunk in response:
        fout.write(chunk)