def test_hash_file():
    fpath = join(ub.ensure_app_cache_dir('ubelt'), 'tmp.txt')
    ub.writeto(fpath, 'foobar')
    hashid1_a = ub.hash_file(fpath, hasher='sha512', hashlen=8, stride=1, blocksize=1)
    hashid2_a = ub.hash_file(fpath, hasher='sha512', hashlen=8, stride=2, blocksize=1)
    hashid1_b = ub.hash_file(fpath, hasher='sha512', hashlen=8, stride=1, blocksize=10)
    hashid2_b = ub.hash_file(fpath, hasher='sha512', hashlen=8, stride=2, blocksize=10)
    assert hashid1_a == hashid1_b
    assert hashid2_a != hashid2_b, 'blocksize matters when stride is > 1'
    assert hashid1_a != hashid2_a
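# A minimal sketch of the plain ub.hash_file call that the test above builds
# on. The file contents and the 8-character truncation are illustrative; only
# the hasher name and the cache-dir setup are taken from the test itself.
import ubelt as ub
from os.path import join

fpath = join(ub.ensure_app_cache_dir('ubelt'), 'tmp.txt')
ub.writeto(fpath, 'foobar')
full_digest = ub.hash_file(fpath, hasher='sha512')   # hash the whole file
short_digest = full_digest[0:8]                       # truncate by slicing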
def _make_package_name2(info):
    """
    Construct a unique and descriptive name for the deployment
    """
    snap_fpath = info['snap_fpath']
    model_fpath = info['model_fpath']
    train_info_fpath = info['train_info_fpath']

    if train_info_fpath and exists(train_info_fpath):
        train_info = json.load(open(train_info_fpath, 'r'))
        model_name = train_info['hyper']['model'][0].split('.')[-1]
        train_hash = ub.hash_data(train_info['train_id'], hasher='sha512',
                                  base='abc', types=True)[0:8]
    else:
        model_name = os.path.splitext(os.path.basename(model_fpath))[0]
        train_hash = 'UNKNOWN-TRAINID'
        print('WARNING: Train info metadata does not exist')

    try:
        # netharn models contain epoch info in the weights file
        import torch
        state = torch.load(snap_fpath,
                           map_location=lambda storage, location: storage)
        epoch = '{:03d}'.format(state['epoch'])
    except Exception:
        epoch = 'UNKNOWN-EPOCH'

    weights_hash = ub.hash_file(snap_fpath, base='abc',
                                hasher='sha512')[0:6].upper()

    deploy_name = 'deploy_{model}_{trainid}_{epoch}_{weights}'.format(
        model=model_name, trainid=train_hash, epoch=epoch,
        weights=weights_hash)
    return deploy_name
def _autojit_cython(pyx_fpath, verbose=1):
    """
    The idea is that given a pyx file, we try to compile it. We write a stamp
    file so subsequent calls should be very fast as long as the source pyx has
    not changed.

    Parameters
    ----------
    pyx_fpath : str
        path to the pyx file

    verbose : int
        higher is more verbose.
    """
    import shutil
    # TODO: move necessary ubelt utilities to nx.utils?
    # Separate this into its own util?
    if shutil.which("cythonize"):
        pyx_dpath = dirname(pyx_fpath)

        # Check if the compiled library exists
        pyx_base = splitext(basename(pyx_fpath))[0]

        SO_EXTS = _platform_pylib_exts()
        so_fname = False
        for fname in os.listdir(pyx_dpath):
            if fname.startswith(pyx_base) and fname.endswith(SO_EXTS):
                so_fname = fname
                break

        try:
            # Currently this functionality depends on ubelt.
            # We could replace ub.cmd with subprocess.check_call and ub.augpath
            # with os.path operations, but hash_file and CacheStamp are harder
            # to replace. We can use "liberator" to statically extract these
            # and add them to nx.utils though.
            import ubelt as ub
        except Exception:
            return False
        else:
            if so_fname is False:
                # We can compute what the so_fname will be if it doesn't exist
                so_fname = pyx_base + SO_EXTS[0]

            so_fpath = join(pyx_dpath, so_fname)
            depends = [ub.hash_file(pyx_fpath, hasher="sha1")]
            stamp_fname = ub.augpath(so_fname, ext=".jit.stamp")
            stamp = ub.CacheStamp(
                stamp_fname,
                dpath=pyx_dpath,
                product=so_fpath,
                depends=depends,
                verbose=verbose,
            )
            if stamp.expired():
                ub.cmd("cythonize -i {}".format(pyx_fpath),
                       verbose=verbose, check=True)
                stamp.renew()
            return True
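# Hypothetical usage sketch for _autojit_cython: compile a .pyx file that sits
# next to this script on first use, then import the resulting extension. The
# file and module names are made up; the caching behavior (a CacheStamp keyed
# on the sha1 of the .pyx source) is what the function above implements.
from os.path import dirname, join

pyx_fpath = join(dirname(__file__), '_fast_ops.pyx')  # hypothetical file
if _autojit_cython(pyx_fpath, verbose=1):
    import _fast_ops  # noqa: F401  hypothetical compiled module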
def benchmark():
    import timerit
    import ubelt as ub
    from kwcoco.util.util_futures import JobPool  # NOQA
    ti = timerit.Timerit(3, bestof=1, verbose=2)

    max_workers = 4

    # Choose a path to an HDD
    dpath = ub.ensuredir('/raid/data/tmp')

    fpath_demodata = _demodata_files(dpath=dpath, num_files=1000,
                                     size_pool=[10, 20, 50], pool_size=8)

    for timer in ti.reset('hash_file(hasher=xx64)'):
        with timer:
            for fpath in fpath_demodata:
                ub.hash_file(fpath, hasher='xx64')

    for timer in ti.reset('hash_file(hasher=xxhash) - serial'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='serial', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xxhash')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('hash_file(hasher=xxhash) - thread'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='thread', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xx64')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('hash_file(hasher=xxhash) - process'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='process', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xx64')
            results = [job.result() for job in jobs.jobs]
def _check_integrity(pfile):
    for part in pfile._parts[1:]:
        # Ensure that each partial hash corresponds with the actual partial
        # hash.
        target = ub.hash_file(pfile.fpath, blocksize=int(2 ** 20),
                              maxbytes=part[1], hasher='xx64')
        if target != part[0]:
            raise AssertionError('The hashes do not match!')
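# A sketch of how the (expected_hash, maxbytes) pairs checked above could be
# produced. The maxbytes argument limits how much of the file is hashed; the
# _parts layout (hash first, byte limit second) is taken from the check above,
# while the helper name and the byte offsets are illustrative assumptions.
import ubelt as ub

def _build_parts(fpath, byte_offsets=(2 ** 20, 2 ** 24)):
    parts = []
    for maxbytes in byte_offsets:
        partial = ub.hash_file(fpath, blocksize=int(2 ** 20),
                               maxbytes=maxbytes, hasher='xx64')
        parts.append((partial, maxbytes))
    return parts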
def test_grabdata():
    # xdoctest: +REQUIRES(--network)
    import ubelt as ub
    # fname = 'foo.bar'
    # url = 'http://i.imgur.com/rqwaDag.png'
    # prefix1 = '944389a39dfb8fa9'
    fname = 'foo2.bar'
    url = _demo_url(128 * 11)
    prefix1 = 'b7fa848cd088ae842a89ef'
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
    stamp_fpath = fpath + '.sha512.hash'
    assert ub.readfrom(stamp_fpath) == prefix1
    # Check that the download doesn't happen again
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
    # todo: check file timestamps have not changed
    #
    # Check redo works with hash
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1, redo=True)
    # todo: check file timestamps have changed
    #
    # Check that a redownload occurs when the stamp is changed
    with open(stamp_fpath, 'w') as file:
        file.write('corrupt-stamp')
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
    assert ub.readfrom(stamp_fpath) == prefix1
    #
    # Check that a redownload occurs when the stamp is removed
    ub.delete(stamp_fpath)
    with open(fpath, 'w') as file:
        file.write('corrupt-data')
    assert not ub.hash_file(fpath, base='hex', hasher='sha512').startswith(prefix1)
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
    assert ub.hash_file(fpath, base='hex', hasher='sha512').startswith(prefix1)
    #
    # Check that requesting new data causes redownload
    # url2 = 'https://data.kitware.com/api/v1/item/5b4039308d777f2e6225994c/download'
    # prefix2 = 'c98a46cb31205cf'
    # hack SSL
    # url2 = 'http://i.imgur.com/rqwaDag.png'
    # prefix2 = '944389a39dfb8fa9'
    url2, prefix2 = url, prefix1
    fpath = ub.grabdata(url2, fname=fname, hash_prefix=prefix2)
    assert ub.readfrom(stamp_fpath) == prefix2
def _product_file_hash(self, product=None):
    """
    Get the hash of each product file
    """
    import xxhash  # much faster than builtin hashers
    products = self._rectify_products(product)
    product_file_hash = [
        ub.hash_file(p, hasher=xxhash.xxh64) for p in products
    ]
    return product_file_hash
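# The snippet above passes the xxhash.xxh64 class directly as the hasher.
# Other snippets in this collection use the equivalent string name instead;
# this sketch just shows both spellings side by side (fpath is hypothetical).
import ubelt as ub
import xxhash

digest_from_class = ub.hash_file(fpath, hasher=xxhash.xxh64)
digest_from_name = ub.hash_file(fpath, hasher='xx64')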
def _update_hashes():
    """
    for dev use to update hashes of the demo images

    CommandLine:
        xdoctest -m kwimage.im_demodata _update_hashes
        xdoctest -m kwimage.im_demodata _update_hashes --require-hashes
    """
    TEST_IMAGES = _TEST_IMAGES.copy()

    for key in TEST_IMAGES.keys():
        item = TEST_IMAGES[key]

        grabkw = {
            'appname': 'kwimage/demodata',
        }
        # item['sha512'] = 'not correct'

        # Wait until ubelt 9.1 is released to change hasher due to
        # issue in ub.grabdata
        # hasher_priority = ['sha512', 'sha1']
        hasher_priority = ['sha1']

        REQUIRE_EXISTING_HASH = ub.argflag('--require-hashes')
        if REQUIRE_EXISTING_HASH:
            for hasher in hasher_priority:
                if hasher in item:
                    grabkw.update({
                        'hash_prefix': item[hasher],
                        'hasher': hasher,
                    })
                    break

        if 'fname' in item:
            grabkw['fname'] = item['fname']

        item.pop('sha512', None)
        fpath = ub.grabdata(item['url'], **grabkw)
        if 'hasher' not in item:
            hasher = hasher_priority[0]
            prefix = ub.hash_file(fpath, hasher=hasher)
            item[hasher] = prefix[0:64]

    print('_TEST_IMAGES = ' + ub.repr2(TEST_IMAGES, nl=2))
def _build_file_hashid(root, suffix, hashid_mode):
    """
    Build a hashid for a specific file given as a path root and suffix.
    """
    gpath = join(root, suffix)
    if hashid_mode == 'PATH':
        # Hash the full path to the image data
        # NOTE: this logic is not machine independent
        hashid = ub.hash_data(suffix, hasher='sha1', base='hex')
    elif hashid_mode == 'PIXELS':
        # Hash the pixels in the image
        hashid = ub.hash_file(gpath, hasher='sha1', base='hex')
    elif hashid_mode == 'DVC':
        raise NotImplementedError('todo')
    elif hashid_mode == 'GIVEN':
        raise Exception('given mode no longer supported')
    else:
        raise KeyError(hashid_mode)
    return hashid
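# Hypothetical call sketch for _build_file_hashid: the root and suffix values
# are made up; the two modes shown are the ones implemented above. PATH hashes
# only the relative path string, PIXELS hashes the file contents on disk.
path_hashid = _build_file_hashid('/data/images', 'scene_001.png', 'PATH')
pixel_hashid = _build_file_hashid('/data/images', 'scene_001.png', 'PIXELS')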
def prepare_images(self, ext='.png', force=False):
    """
    If not already done, loads paths to images into memory and constructs a
    unique id for that set of im/gt images.

    If the paths are already set, then only the input-id is constructed.
    """
    if self.n_input is not None and not force:
        return

    self.prepare_image_paths()
    print('Preparing {} images'.format(self.tag))

    if self.aux_paths:
        # new way
        depends = sorted(self.paths.items())
    else:
        depends = []
        depends.append(self.im_paths)
        depends.append(self.gt_paths)
        if self.gt_paths:
            # HACK: We will assume image data depends only on the filename
            # HACK: be respectful of gt label changes (ignore aug)
            # stride>1 is faster but might break
            # stride=1 is the safest
            hashes = [
                ub.hash_file(p, stride=32)
                for p in ub.ProgIter(self.gt_paths, label='hashing')
                if 'aug' not in basename(p) and 'part' not in basename(p)
            ]
            label_hashid = ub.hash_data(hashes)
            depends.append(label_hashid)

    n_im = None if self.im_paths is None else len(self.im_paths)
    n_gt = None if self.gt_paths is None else len(self.gt_paths)
    self.n_input = n_im or n_gt

    hashid = ub.hash_data(depends)[:self.abbrev]
    self.input_id = '{}-{}'.format(self.n_input, hashid)

    print(' * n_images = {}'.format(n_im))
    print(' * n_groundtruth = {}'.format(n_gt))
    print(' * input_id = {}'.format(self.input_id))
def benchmark():
    """
    apt-get install xxhash
    """
    import timerit
    import ubelt as ub
    from kwcoco.util.util_futures import JobPool  # NOQA
    ti = timerit.Timerit(1, bestof=1, verbose=3)

    max_workers = 6

    fpath_demodata = _demodata_files()

    for timer in ti.reset('hash_file(hasher=xx32)'):
        with timer:
            for fpath in fpath_demodata:
                ub.hash_file(fpath, hasher='xx32')

    for timer in ti.reset('hash_file(hasher=xx64)'):
        with timer:
            for fpath in fpath_demodata:
                ub.hash_file(fpath, hasher='xx64')

    for timer in ti.reset('hash_file(hasher=xxhash) - serial'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='serial', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xxhash')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('hash_file(hasher=xxhash) - thread'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='thread', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xx64')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('hash_file(hasher=xxhash) - process'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='process', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xx64')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('cmd-xxh32sum'):
        with timer:
            for fpath in fpath_demodata:
                ub.cmd(['xxh32sum', fpath])['out'].split(' ')[0]

    for timer in ti.reset('cmd-xxh64sum'):
        with timer:
            for fpath in fpath_demodata:
                ub.cmd(['xxh64sum', fpath])['out'].split(' ')[0]

    for timer in ti.reset('cmd-xxh64sum-detatch'):
        with timer:
            jobs = [
                ub.cmd(['xxh64sum', fpath], detatch=True)
                for fpath in fpath_demodata
            ]
            results = [
                job['proc'].communicate()[0].split(' ')[0]
                for job in jobs
            ]

    for timer in ti.reset('cmd-sha1sum'):
        with timer:
            for fpath in fpath_demodata:
                ub.cmd(['sha1sum', fpath])['out'].split(' ')[0]

    for timer in ti.reset('hash_file(hasher=sha1)'):
        with timer:
            for fpath in fpath_demodata:
                ub.hash_file(fpath, hasher='sha1')
def _package_deploy(train_dpath):
    """
    Combine the model, weights, and info files into a single deployable file

    CommandLine:
        xdoctest -m netharn.export.deployer _package_deploy

    Args:
        train_dpath (PathLike): the netharn training directory

    Example:
        >>> dpath = ub.ensure_app_cache_dir('netharn', 'tests/_package_deploy')
        >>> train_dpath = ub.ensuredir((dpath, 'my_train_dpath'))
        >>> ub.touch(join(train_dpath, 'final_snapshot.pt'))
        >>> ub.touch(join(train_dpath, 'my_model.py'))
        >>> zipfpath = _package_deploy(train_dpath)
        ...
        >>> print(os.path.basename(zipfpath))
        deploy_UNKNOWN-ARCH_my_train_dpath_UNKNOWN-EPOCH_QOOEZT.zip
    """
    print('[DEPLOYER] Deploy to dpath={}'.format(train_dpath))

    snap_fpath = find_best_snapshot(train_dpath)

    model_fpaths = glob.glob(join(train_dpath, '*.py'))
    if len(model_fpaths) == 0:
        raise FileNotFoundError('The model topology cannot be found')
    elif len(model_fpaths) > 1:
        warnings.warn(
            'There are multiple models here: {}'.format(model_fpaths))

    if not snap_fpath:
        raise FileNotFoundError('No weights are associated with the model')

    weights_hash = ub.hash_file(snap_fpath, base='abc',
                                hasher='sha512')[0:6].upper()

    train_info_fpath = join(train_dpath, 'train_info.json')
    if exists(train_info_fpath):
        train_info = json.load(open(train_info_fpath, 'r'))
        model_name = train_info['hyper']['model'][0].split('.')[-1]
        train_hash = ub.hash_data(train_info['train_id'], hasher='sha512',
                                  base='abc', types=True)[0:8]
    else:
        model_name = 'UNKNOWN-ARCH'
        train_hash = os.path.basename(train_dpath)
        print('WARNING: Training metadata does not exist')

    try:
        import torch
        state = torch.load(snap_fpath)
        epoch = '{:03d}'.format(state['epoch'])
    except Exception:
        epoch = 'UNKNOWN-EPOCH'

    deploy_name = 'deploy_{model}_{trainid}_{epoch}_{weights}'.format(
        model=model_name, trainid=train_hash, epoch=epoch,
        weights=weights_hash)
    deploy_fname = deploy_name + '.zip'

    def zwrite(myzip, fpath, fname=None):
        if fname is None:
            fname = relpath(fpath, train_dpath)
        myzip.write(fpath, arcname=join(deploy_name, fname))

    zipfpath = join(train_dpath, deploy_fname)
    with zipfile.ZipFile(zipfpath, 'w') as myzip:
        if exists(train_info_fpath):
            zwrite(myzip, train_info_fpath)
        zwrite(myzip, snap_fpath, fname='deploy_snapshot.pt')
        for model_fpath in model_fpaths:
            zwrite(myzip, model_fpath)
        # Add some quick glanceable info
        # for bestacc_fpath in glob.glob(join(train_dpath, 'best_epoch_*')):
        #     zwrite(myzip, bestacc_fpath)
        for p in glob.glob(join(train_dpath, 'glance/*')):
            zwrite(myzip, p)

    print('[DEPLOYER] Deployed zipfpath={}'.format(zipfpath))
    return zipfpath
def benchmark_hash_file():
    """
    CommandLine:
        python ~/code/ubelt/dev/bench_hash.py --show
        python ~/code/ubelt/dev/bench_hash.py --show
    """
    import ubelt as ub
    import random

    # dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp'))
    dpath = ub.ensuredir(ub.expandpath('$HOME/tmp'))

    rng = random.Random(0)
    # Create a pool of random chunks of data
    chunksize = int(2 ** 20)
    pool_size = 8
    part_pool = [_random_data(rng, chunksize) for _ in range(pool_size)]

    # ITEM = 'JUST A STRING' * 100
    HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3']

    scales = list(range(5, 10))
    import os

    results = ub.AutoDict()
    # Using json is faster or at least as fast in most cases
    # xxhash is also significantly faster than sha512
    ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms')
    for s in ub.ProgIter(scales, desc='benchmark', verbose=3):
        N = 2 ** s
        print(' --- s={s}, N={N} --- '.format(s=s, N=N))
        # Write a big file
        size_pool = [N]
        fpath = _write_random_file(dpath, part_pool, size_pool, rng)

        megabytes = os.stat(fpath).st_size / (2 ** 20)
        print('megabytes = {!r}'.format(megabytes))

        for hasher in HASHERS:
            for timer in ti.reset(hasher):
                ub.hash_file(fpath, hasher=hasher)
            results[hasher].update({N: ti.mean()})

        col = {h: results[h][N] for h in HASHERS}
        sortx = ub.argsort(col)
        ranking = ub.dict_subset(col, sortx)
        print('walltime: ' + ub.repr2(ranking, precision=9, nl=0))
        best = next(iter(ranking))
        # pairs = list(ub.iter_window( 2))
        pairs = [(k, best) for k in ranking]
        ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs]
        nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs]
        relratios = ub.odict(zip(nicekeys, ratios))
        print('speedup: ' + ub.repr2(relratios, precision=4, nl=0))

    # xdoc +REQUIRES(--show)
    # import pytest
    # pytest.skip()

    import pandas as pd
    df = pd.DataFrame.from_dict(results)
    df.columns.name = 'hasher'
    df.index.name = 'N'
    ratios = df.copy().drop(columns=df.columns)
    for k1, k2 in [('sha512', 'xxh64'), ('sha1', 'xxh64'),
                   ('xxh32', 'xxh64'), ('blake3', 'xxh64')]:
        ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2]

    print()
    print('Seconds per iteration')
    print(df.to_string(float_format='%.9f'))
    print()
    print('Ratios of seconds')
    print(ratios.to_string(float_format='%.2f'))
    print()
    print('Average Ratio (over all N)')
    print(ratios.mean().sort_values())
    if ub.argflag('--show'):
        import kwplot
        kwplot.autompl()
        xdata = sorted(ub.peek(results.values()).keys())
        ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results)
        kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds')
        kwplot.show_if_requested()
def bench_find_optimal_blocksize():
    r"""
    This function can help find the optimal blocksize for your use case.

    Notes:
        # Usage
        cd ~/code/ubelt/dev
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --dpath <PATH-TO-HDD-OR-SSD> \
            --size <INT-IN-MB> \
            --hash_algo <ALGO_NAME> \

        # Benchmark on an HDD
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --size 500 \
            --dpath $HOME/raid/data/tmp \
            --hash_algo xx64

        # Benchmark on an SSD
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --size 500 \
            --dpath $HOME/.cache/ubelt/tmp \
            --hash_algo xx64

        # Test a small file
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --size 1 \
            --dpath $HOME/.cache/ubelt/tmp \
            --hash_algo xx64

        Throughout our tests on SSDs / HDDs with small and large files, we
        find a chunksize of 2 ** 20 consistently works best with xx64.

        # Test with a slower hash algo
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --size 500 \
            --dpath $HOME/raid/data/tmp \
            --hash_algo sha1

        Even that shows 2 ** 20 working well.
    """
    import os
    import numpy as np
    import timerit

    dpath = ub.argval('--dpath', default=None)
    if dpath is None:
        # dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp'))
        dpath = ub.ensure_app_cache_dir('ubelt/hash_test')
    else:
        ub.ensuredir(dpath)

    print('dpath = {!r}'.format(dpath))

    target_size = int(ub.argval('--size', default=600))
    hash_algo = ub.argval('--hash_algo', default='xx64')

    print('hash_algo = {!r}'.format(hash_algo))
    print('target_size = {!r}'.format(target_size))

    # Write a big file (~600 MB)
    MB = int(2 ** 20)
    size_pool = [target_size]
    rng = random.Random(0)
    # pool_size = max(target_size // 2, 1)
    # pool_size = max(1, target_size // 10)
    pool_size = 8
    part_pool = [_random_data(rng, MB) for _ in range(pool_size)]
    fpath = _write_random_file(dpath, part_pool, size_pool, rng)
    print('fpath = {!r}'.format(fpath))

    size_mb = os.stat(fpath).st_size / MB
    print('file size = {!r} MB'.format(size_mb))

    ti = timerit.Timerit(4, bestof=2, verbose=2)

    results = []

    # Find an optimal constant blocksize
    min_power = 16
    max_power = 24
    blocksize_candidates = [int(2 ** e) for e in range(min_power, max_power)]
    for blocksize in blocksize_candidates:
        for timer in ti.reset('constant blocksize=2 ** {} = {}'.format(
                np.log2(float(blocksize)), blocksize)):
            result = ub.hash_file(fpath, blocksize=blocksize, hasher=hash_algo)
            results.append(result)

    print('ti.rankings = {}'.format(ub.repr2(ti.rankings, nl=2, align=':')))
    assert ub.allsame(results)
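# Based on the finding noted in the docstring above (a 2 ** 20 blocksize
# consistently performing well with xx64 on both SSDs and HDDs), a reasonable
# default call looks like the following; the file path is hypothetical.
import ubelt as ub

digest = ub.hash_file('/path/to/large_file.bin', blocksize=int(2 ** 20),
                      hasher='xx64')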
def 取文件哈希(文件路径, 哈希算法='sha1'):
    # "取文件哈希" means "get file hash": return the hex digest of the file at
    # the given path using the requested hash algorithm.
    return ub.hash_file(文件路径, hasher=哈希算法, base='hex')
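# Usage sketch for the function above; the path is hypothetical.
digest = 取文件哈希('/path/to/data.bin', 哈希算法='sha1')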