def __init__(self, fname, cfgstr=None, dpath=None, appname='ubelt',
             ext='.pkl', meta=None, verbose=None, enabled=True, log=None,
             hasher='sha1', protocol=2):
    if verbose is None:
        verbose = self.VERBOSE
    if dpath is None:  # pragma: no branch
        dpath = util_platform.ensure_app_cache_dir(appname)
    util_path.ensuredir(dpath)
    self.dpath = dpath
    self.fname = fname
    self.cfgstr = cfgstr
    self.verbose = verbose
    self.ext = ext
    self.meta = meta
    self.enabled = enabled
    self.protocol = protocol
    self.hasher = hasher
    self.log = print if log is None else log

    if len(self.ext) > 0 and self.ext[0] != '.':
        raise ValueError('Please be explicit and use a dot in ext')
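# The signature above appears to be the constructor of ubelt's Cacher class.
# A minimal usage sketch of the intended cache-then-compute pattern, assuming
# the usual Cacher.tryload / Cacher.save API (the computation is a stand-in):
import ubelt as ub

cacher = ub.Cacher('my_result', cfgstr='param1=1')
data = cacher.tryload()  # returns None on a cache miss
if data is None:
    data = [i ** 2 for i in range(10)]  # stand-in for the expensive work
    cacher.save(data)  # persists under dpath, keyed by fname and cfgstr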
def grabdata(url, fpath=None, dpath=None, fname=None, redo=False,
             verbose=1, appname=None, **download_kw):
    """
    Downloads a file, caches it, and returns its local path.

    Args:
        url (str): url to the file to download

        fpath (str): The full path to download the file to. If
            unspecified, the arguments `dpath` and `fname` are used to
            determine this.

        dpath (str): where to download the file. If unspecified `appname`
            is used to determine this. Mutually exclusive with fpath.

        fname (str): What to name the downloaded file. Defaults to the url
            basename. Mutually exclusive with fpath.

        redo (bool): if True forces redownload of the file (default = False)

        verbose (bool): verbosity flag (default = True)

        appname (str): set dpath to `ub.get_app_cache_dir(appname)`.
            Mutually exclusive with dpath and fpath.

        **download_kw: additional kwargs to pass to ub.download

    Returns:
        str: fpath - file path string

    Example:
        >>> # xdoctest: +REQUIRES(--network)
        >>> import ubelt as ub
        >>> file_url = 'http://i.imgur.com/rqwaDag.png'
        >>> fpath = ub.grabdata(file_url, fname='mario.png')
        >>> result = basename(fpath)
        >>> print(result)
        mario.png
    """
    if appname and dpath:
        raise ValueError('Cannot specify appname with dpath')
    if fpath and (dpath or fname or appname):
        raise ValueError('Cannot specify fpath with dpath or fname')

    if fpath is None:
        if dpath is None:
            appname = appname or 'ubelt'
            dpath = util_platform.ensure_app_cache_dir(appname)
        if fname is None:
            fname = basename(url)
        fpath = join(dpath, fname)

    if redo or not exists(fpath):
        fpath = download(url, fpath, verbose=verbose, **download_kw)
    else:
        if verbose >= 2:
            print('Already have file %s' % fpath)
    return fpath
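# Because **download_kw is forwarded directly to ub.download, download-level
# options such as chunksize can be tuned from the grabdata call site.
# A small usage sketch (network access assumed):
import ubelt as ub

fpath = ub.grabdata('http://i.imgur.com/rqwaDag.png', fname='mario.png',
                    chunksize=2 ** 20)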
def _win32_can_symlink(verbose=0, force=0, testing=0):
    """
    Args:
        verbose (int, default=0): flag
        force (int, default=0): flag
        testing (int, default=0): flag

    Example:
        >>> # xdoctest: +REQUIRES(WIN32)
        >>> import ubelt as ub
        >>> _win32_can_symlink(verbose=1, force=1, testing=1)
    """
    global __win32_can_symlink__
    if verbose:
        print('__win32_can_symlink__ = {!r}'.format(__win32_can_symlink__))
    if __win32_can_symlink__ is not None and not force:
        return __win32_can_symlink__

    from ubelt import util_platform
    tempdir = util_platform.ensure_app_cache_dir('ubelt', '_win32_can_symlink')

    util_io.delete(tempdir)
    util_path.ensuredir(tempdir)

    dpath = join(tempdir, 'dpath')
    fpath = join(tempdir, 'fpath.txt')

    dlink = join(tempdir, 'dlink')
    flink = join(tempdir, 'flink.txt')

    util_path.ensuredir(dpath)
    util_io.touch(fpath)

    # Add broken variants of the links for testing purposes.
    # It's ugly, but so is all this windows code.
    if testing:
        broken_dpath = join(tempdir, 'broken_dpath')
        broken_fpath = join(tempdir, 'broken_fpath.txt')
        # Create files that we will delete after we link to them
        util_path.ensuredir(broken_dpath)
        util_io.touch(broken_fpath)

    try:
        _win32_symlink(dpath, dlink)
        if testing:
            _win32_symlink(broken_dpath, join(tempdir, 'broken_dlink'))
        can_symlink_directories = os.path.islink(dlink)
    except OSError:
        can_symlink_directories = False
    if verbose:
        print('can_symlink_directories = {!r}'.format(can_symlink_directories))

    try:
        _win32_symlink(fpath, flink)
        if testing:
            _win32_symlink(broken_fpath, join(tempdir, 'broken_flink'))
        can_symlink_files = os.path.islink(flink)
        # os.path.islink(flink)
    except OSError:
        can_symlink_files = False
    if verbose:
        print('can_symlink_files = {!r}'.format(can_symlink_files))

    if int(can_symlink_directories) + int(can_symlink_files) == 1:
        raise AssertionError(
            'can do one but not both. Unexpected {} {}'.format(
                can_symlink_directories, can_symlink_files))

    try:
        # test that we can create junctions, even if symlinks are disabled
        djunc = _win32_junction(dpath, join(tempdir, 'djunc'))
        fjunc = _win32_junction(fpath, join(tempdir, 'fjunc.txt'))
        if testing:
            _win32_junction(broken_dpath, join(tempdir, 'broken_djunc'))
            _win32_junction(broken_fpath, join(tempdir, 'broken_fjunc.txt'))
        if not _win32_is_junction(djunc):
            raise AssertionError('expected junction')
        if not _win32_is_hardlinked(fpath, fjunc):
            raise AssertionError('expected hardlink')
    except Exception:
        warnings.warn('We cannot create junctions either!')
        raise

    if testing:
        # break the links
        util_io.delete(broken_dpath)
        util_io.delete(broken_fpath)

        if verbose:
            from ubelt import util_links
            util_links._dirstats(tempdir)

    try:
        # Cleanup the test directory
        util_io.delete(tempdir)
    except Exception:
        print('ERROR IN DELETE')
        from ubelt import util_links
        util_links._dirstats(tempdir)
        raise

    can_symlink = can_symlink_directories and can_symlink_files
    __win32_can_symlink__ = can_symlink
    if not can_symlink:
        warnings.warn('Cannot make real symlink. Falling back to junction')

    if verbose:
        print('can_symlink = {!r}'.format(can_symlink))
        print('__win32_can_symlink__ = {!r}'.format(__win32_can_symlink__))
    return can_symlink
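# A hypothetical sketch of how a caller might use the capability check above
# to pick a link strategy; _win32_symlink and _win32_junction are the helpers
# already referenced in this module, and _make_link is an illustrative name:
def _make_link(path, link, verbose=0):
    # Prefer a true symlink when the current user has the privilege,
    # otherwise fall back to a junction (directories) / hardlink (files).
    if _win32_can_symlink(verbose=verbose):
        return _win32_symlink(path, link)
    else:
        return _win32_junction(path, link)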
def download(url, fpath=None, hash_prefix=None, chunksize=8192, verbose=1):
    """
    Downloads a url to a fpath.

    Args:
        url (str): url to download
        fpath (str): path to download to. Defaults to basename of url
        hash_prefix (str): not yet used; the hash verification logic below
            is commented out.
        chunksize (int): download chunksize
        verbose (bool): verbosity

    Notes:
        Original code taken from pytorch in torch/utils/model_zoo.py and
        slightly modified.

    References:
        http://blog.moleculea.com/2012/10/04/urlretrieve-progres-indicator/
        http://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
        http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py

    Example:
        >>> from ubelt.util_download import *  # NOQA
        >>> url = 'http://i.imgur.com/rqwaDag.png'
        >>> fpath = download(url)
        >>> print(basename(fpath))
        rqwaDag.png
    """
    if fpath is None:
        dpath = util_platform.ensure_app_cache_dir('ubelt')
        fname = basename(url)
        fpath = join(dpath, fname)

    urldata = urlopen(url)
    # if _have_requests:
    #     file_size = int(urldata.headers["Content-Length"])
    #     urldata = urldata.raw
    # else:
    meta = urldata.info()
    if hasattr(meta, 'getheaders'):  # nocover
        file_size = int(meta.getheaders("Content-Length")[0])
    else:
        file_size = int(meta.get_all("Content-Length")[0])

    if verbose:
        print('Downloading url=%r to fpath=%r' % (url, fpath))

    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        # if hash_prefix:
        #     sha256 = hashlib.sha256()
        with _tqdm(total=file_size, disable=not verbose) as pbar:
            while True:
                buffer = urldata.read(chunksize)
                if len(buffer) == 0:
                    break
                tmp.write(buffer)
                # if hash_prefix:
                #     sha256.update(buffer)
                pbar.update(len(buffer))
        tmp.close()
        # if hash_prefix:
        #     digest = sha256.hexdigest()
        #     if digest[:len(hash_prefix)] != hash_prefix:
        #         raise RuntimeError('invalid hash value (expected "{}", got "{}")'
        #                            .format(hash_prefix, digest))
        shutil.move(tmp.name, fpath)
    finally:
        tmp.close()
        # If for some reason the move failed, delete the temporary file
        if exists(tmp.name):  # nocover
            os.remove(tmp.name)
    return fpath
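# The hash check in this version of download is commented out, so a caller
# that needs integrity verification could hash the returned file itself.
# A minimal standard-library sketch; _verify_download and expected_prefix are
# illustrative names, not part of the module above:
import hashlib


def _verify_download(fpath, expected_prefix, blocksize=2 ** 20):
    # Stream the file through sha256 and compare against a known prefix.
    sha256 = hashlib.sha256()
    with open(fpath, 'rb') as file:
        for block in iter(lambda: file.read(blocksize), b''):
            sha256.update(block)
    digest = sha256.hexdigest()
    if not digest.startswith(expected_prefix):
        raise RuntimeError('invalid hash: expected "{}", got "{}"'.format(
            expected_prefix, digest))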
def grabdata(url, fpath=None, dpath=None, fname=None, redo=False, verbose=1,
             appname=None, hash_prefix=None, hasher='sha512', **download_kw):
    """
    Downloads a file, caches it, and returns its local path.

    Args:
        url (str): url to the file to download

        fpath (PathLike): The full path to download the file to. If
            unspecified, the arguments `dpath` and `fname` are used to
            determine this.

        dpath (PathLike): where to download the file. If unspecified `appname`
            is used to determine this. Mutually exclusive with fpath.

        fname (str): What to name the downloaded file. Defaults to the url
            basename. Mutually exclusive with fpath.

        redo (bool): if True forces redownload of the file (default = False)

        verbose (bool): verbosity flag (default = True)

        appname (str): set dpath to `ub.get_app_cache_dir(appname)`.
            Mutually exclusive with dpath and fpath.

        hash_prefix (None or str): If specified, grabdata verifies that this
            matches the hash of the file, and then saves the hash in an
            adjacent file to certify that the download was successful.
            Defaults to None.

        hasher (str or Hasher): If hash_prefix is specified, this indicates
            the hashing algorithm to apply to the file. Defaults to sha512.

        **download_kw: additional kwargs to pass to ub.download

    Returns:
        PathLike: fpath - file path string

    Example:
        >>> # xdoctest: +REQUIRES(--network)
        >>> import ubelt as ub
        >>> url = 'http://i.imgur.com/rqwaDag.png'
        >>> fpath = ub.grabdata(url, fname='mario.png')
        >>> result = basename(fpath)
        >>> print(result)
        mario.png

    Example:
        >>> # xdoctest: +REQUIRES(--network)
        >>> import ubelt as ub
        >>> fname = 'foo.bar'
        >>> url = 'http://i.imgur.com/rqwaDag.png'
        >>> prefix1 = '944389a39dfb8fa9'
        >>> fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
        >>> stamp_fpath = fpath + '.hash'
        >>> assert open(stamp_fpath, 'r').read() == prefix1
        >>> # Check that the download doesn't happen again
        >>> fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
        >>> # todo: check file timestamps have not changed
        >>> #
        >>> # Check redo works with hash
        >>> fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1, redo=True)
        >>> # todo: check file timestamps have changed
        >>> #
        >>> # Check that a redownload occurs when the stamp is changed
        >>> open(stamp_fpath, 'w').write('corrupt-stamp')
        >>> fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
        >>> assert open(stamp_fpath, 'r').read() == prefix1
        >>> #
        >>> # Check that a redownload occurs when the stamp is removed
        >>> ub.delete(stamp_fpath)
        >>> open(fpath, 'w').write('corrupt-data')
        >>> assert not ub.hash_file(fpath, base='hex', hasher='sha512').startswith(prefix1)
        >>> fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
        >>> assert ub.hash_file(fpath, base='hex', hasher='sha512').startswith(prefix1)
        >>> #
        >>> # Check that requesting new data causes redownload
        >>> url2 = 'https://data.kitware.com/api/v1/item/5b4039308d777f2e6225994c/download'
        >>> prefix2 = 'c98a46cb31205cf'
        >>> fpath = ub.grabdata(url2, fname=fname, hash_prefix=prefix2)
        >>> assert open(stamp_fpath, 'r').read() == prefix2
    """
    from ubelt import util_platform
    if appname and dpath:
        raise ValueError('Cannot specify appname with dpath')
    if fpath and (dpath or fname or appname):
        raise ValueError('Cannot specify fpath with dpath or fname')

    if fpath is None:
        if dpath is None:
            appname = appname or 'ubelt'
            dpath = util_platform.ensure_app_cache_dir(appname)
        if fname is None:
            fname = basename(url)
        fpath = join(dpath, fname)

    # Note that needs_download is never set to False after it becomes True.
    # This is the key to working through the logic of the following checks.
    needs_download = redo

    if not exists(fpath):
        # always download if we are missing the file
        needs_download = True

    if hash_prefix:
        stamp_fpath, needs_download = _check_hash_stamp(
            fpath, hash_prefix, hasher, verbose, needs_download)

    if needs_download:
        fpath = download(url, fpath, verbose=verbose,
                         hash_prefix=hash_prefix, hasher=hasher,
                         **download_kw)

        if hash_prefix:
            # If the file successfully downloaded then the hashes match.
            # Write out the expected prefix so we can check it later
            with open(stamp_fpath, 'w') as file:
                file.write(hash_prefix)
    else:
        if verbose >= 2:
            print('Already have file %s' % fpath)
    return fpath
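# The helper _check_hash_stamp is referenced above but not shown. A
# hypothetical sketch that is consistent with how it is called (it returns
# the stamp path plus an updated needs_download flag):
from os.path import exists
import ubelt as ub


def _check_hash_stamp(fpath, hash_prefix, hasher, verbose, needs_download):
    stamp_fpath = fpath + '.hash'
    if not needs_download:
        if exists(stamp_fpath):
            # The stamp records the hash prefix of the last verified download
            with open(stamp_fpath, 'r') as file:
                needs_download = file.read() != hash_prefix
        elif exists(fpath):
            # No stamp file: fall back to hashing the data itself
            hashstr = ub.hash_file(fpath, hasher=hasher, base='hex')
            needs_download = not hashstr.startswith(hash_prefix)
        else:
            needs_download = True
    return stamp_fpath, needs_download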
def download(url, fpath=None, hash_prefix=None, hasher='sha512',
             chunksize=8192, verbose=1):
    """
    Downloads a url to a fpath.

    Args:
        url (str): The url to download.

        fpath (PathLike | io.BytesIO): The path to download to. Defaults to
            basename of url and ubelt's application cache. If this is an
            io.BytesIO object then information is directly written to this
            object (note this prevents the use of temporary files).

        hash_prefix (None or str): If specified, download will retry / error
            if the file hash does not match this value. Defaults to None.

        hasher (str or Hasher): If hash_prefix is specified, this indicates
            the hashing algorithm to apply to the file. Defaults to sha512.

        chunksize (int): Download chunksize. Defaults to 2 ** 13.

        verbose (int): Verbosity level 0 or 1. Defaults to 1.

    Returns:
        PathLike: fpath - file path string

    Raises:
        URLError - if there is a problem downloading the url
        RuntimeError - if the hash does not match the hash_prefix

    Notes:
        Original code taken from pytorch in torch/utils/model_zoo.py and
        slightly modified.

    References:
        http://blog.moleculea.com/2012/10/04/urlretrieve-progres-indicator/
        http://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
        http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py

    CommandLine:
        python -m xdoctest ubelt.util_download download:1

    Example:
        >>> # xdoctest: +REQUIRES(--network)
        >>> from ubelt.util_download import *  # NOQA
        >>> url = 'http://i.imgur.com/rqwaDag.png'
        >>> fpath = download(url)
        >>> print(basename(fpath))
        rqwaDag.png

    Example:
        >>> # xdoctest: +REQUIRES(--network)
        >>> import ubelt as ub
        >>> import io
        >>> url = 'http://i.imgur.com/rqwaDag.png'
        >>> file = io.BytesIO()
        >>> fpath = download(url, file)
        >>> file.seek(0)
        >>> data = file.read()
        >>> assert ub.hash_data(data, hasher='sha1').startswith('f79ea24571')

    Example:
        >>> # xdoctest: +REQUIRES(--network)
        >>> url = 'http://i.imgur.com/rqwaDag.png'
        >>> fpath = download(url, hasher='sha1', hash_prefix='f79ea24571da6ddd2ba12e3d57b515249ecb8a35')
        Downloading url='http://i.imgur.com/rqwaDag.png' to fpath=...rqwaDag.png
        ...
        ...1233/1233... rate=... Hz, eta=..., total=..., wall=...

    Example:
        >>> # xdoctest: +REQUIRES(--network)
        >>> # test download from girder
        >>> import pytest
        >>> import ubelt as ub
        >>> url = 'https://data.kitware.com/api/v1/item/5b4039308d777f2e6225994c/download'
        >>> ub.download(url, hasher='sha512', hash_prefix='c98a46cb31205cf')
        >>> with pytest.raises(RuntimeError):
        >>>     ub.download(url, hasher='sha512', hash_prefix='BAD_HASH')
    """
    from progiter import ProgIter as Progress
    from ubelt import util_platform
    import shutil
    import tempfile
    import hashlib
    if six.PY2:  # nocover
        from urllib2 import urlopen  # NOQA
    else:
        from urllib.request import urlopen  # NOQA
    if fpath is None:
        dpath = util_platform.ensure_app_cache_dir('ubelt')
        fname = basename(url)
        fpath = join(dpath, fname)

    _dst_is_io_object = hasattr(fpath, 'write')

    if verbose:
        if _dst_is_io_object:
            print('Downloading url=%r to IO object' % (url,))
        else:
            print('Downloading url=%r to fpath=%r' % (url, fpath))

    urldata = urlopen(url)
    meta = urldata.info()
    try:
        if hasattr(meta, 'getheaders'):  # nocover
            file_size = int(meta.getheaders("Content-Length")[0])
        else:
            file_size = int(meta.get_all("Content-Length")[0])
    except Exception:  # nocover
        # sometimes the url does not contain content length metadata
        # TODO: find a public URL that exemplifies this or figure out how to
        # mock it locally.
        file_size = None

    if hash_prefix:
        if isinstance(hasher, six.string_types):
            if hasher == 'sha1':
                hasher = hashlib.sha1()
            elif hasher == 'sha512':
                hasher = hashlib.sha512()
            else:
                raise KeyError(hasher)

    if _dst_is_io_object:
        _file_write = fpath.write
    else:
        tmp = tempfile.NamedTemporaryFile(delete=False)
        _file_write = tmp.write

    # possible optimization (have not tested or timed)
    _urldata_read = urldata.read
    try:
        with Progress(total=file_size, disable=not verbose) as pbar:
            _pbar_update = pbar.update

            def _critical_loop():
                # Initialize the buffer to a non-empty object
                buffer = ' '
                if hash_prefix:
                    _hasher_update = hasher.update
                    while buffer:
                        buffer = _urldata_read(chunksize)
                        _file_write(buffer)
                        _hasher_update(buffer)
                        _pbar_update(len(buffer))
                else:
                    # Same code as above, just without the hasher update.
                    # (tight loop optimization: remove in-loop conditional)
                    while buffer:
                        buffer = _urldata_read(chunksize)
                        _file_write(buffer)
                        _pbar_update(len(buffer))
            _critical_loop()

        if not _dst_is_io_object:
            tmp.close()

            # We keep a potentially corrupted file if the hash doesn't match.
            # It could be the case that the user simply specified the wrong
            # hash_prefix.
            shutil.move(tmp.name, fpath)

        if hash_prefix:
            got = hasher.hexdigest()
            if got[:len(hash_prefix)] != hash_prefix:
                print('hash_prefix = {!r}'.format(hash_prefix))
                print('got = {!r}'.format(got))
                if _dst_is_io_object:
                    raise RuntimeError(
                        'invalid hash value '
                        '(expected "{}", got "{}")'.format(hash_prefix, got))
                else:
                    raise RuntimeError(
                        'invalid hash value for fpath={!r} '
                        '(expected "{}", got "{}")'.format(
                            fpath, hash_prefix, got))
    finally:
        if not _dst_is_io_object:  # nocover
            tmp.close()
            # If for some reason the move failed, delete the temporary file
            if exists(tmp.name):
                os.remove(tmp.name)
    return fpath