def test_hash_file():
    fpath = join(ub.ensure_app_cache_dir('ubelt'), 'tmp.txt')
    ub.writeto(fpath, 'foobar')
    hashid1_a = ub.hash_file(fpath, hasher='sha512', hashlen=8, stride=1, blocksize=1)
    hashid2_a = ub.hash_file(fpath, hasher='sha512', hashlen=8, stride=2, blocksize=1)
    hashid1_b = ub.hash_file(fpath, hasher='sha512', hashlen=8, stride=1, blocksize=10)
    hashid2_b = ub.hash_file(fpath, hasher='sha512', hashlen=8, stride=2, blocksize=10)
    assert hashid1_a == hashid1_b
    assert hashid2_a != hashid2_b, 'blocksize matters when stride is > 1'
    assert hashid1_a != hashid2_a
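# A minimal sketch of the plain ub.hash_file call that the test above builds
# on. The file contents and the 8-character truncation are illustrative; only
# the hasher name and the cache-dir setup are taken from the test itself.
import ubelt as ub
from os.path import join

fpath = join(ub.ensure_app_cache_dir('ubelt'), 'tmp.txt')
ub.writeto(fpath, 'foobar')
full_digest = ub.hash_file(fpath, hasher='sha512')   # hash the whole file
short_digest = full_digest[0:8]                       # truncate by slicing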
def _make_package_name2(info):
    """
    Construct a unique and descriptive name for the deployment
    """
    snap_fpath = info['snap_fpath']
    model_fpath = info['model_fpath']
    train_info_fpath = info['train_info_fpath']

    if train_info_fpath and exists(train_info_fpath):
        train_info = json.load(open(train_info_fpath, 'r'))
        model_name = train_info['hyper']['model'][0].split('.')[-1]
        train_hash = ub.hash_data(train_info['train_id'], hasher='sha512',
                                  base='abc', types=True)[0:8]
    else:
        model_name = os.path.splitext(os.path.basename(model_fpath))[0]
        train_hash = 'UNKNOWN-TRAINID'
        print('WARNING: Train info metadata does not exist')

    try:
        # netharn models contain epoch info in the weights file
        import torch
        state = torch.load(snap_fpath,
                           map_location=lambda storage, location: storage)
        epoch = '{:03d}'.format(state['epoch'])
    except Exception:
        epoch = 'UNKNOWN-EPOCH'

    weights_hash = ub.hash_file(snap_fpath, base='abc',
                                hasher='sha512')[0:6].upper()

    deploy_name = 'deploy_{model}_{trainid}_{epoch}_{weights}'.format(
        model=model_name, trainid=train_hash, epoch=epoch,
        weights=weights_hash)
    return deploy_name
def _autojit_cython(pyx_fpath, verbose=1):
    """
    The idea is that given a pyx file, we try to compile it. We write a stamp
    file so subsequent calls should be very fast as long as the source pyx has
    not changed.

    Parameters
    ----------
    pyx_fpath : str
        path to the pyx file

    verbose : int
        higher is more verbose.
    """
    import shutil
    # TODO: move necessary ubelt utilities to nx.utils?
    # Separate this into its own util?
    if shutil.which("cythonize"):
        pyx_dpath = dirname(pyx_fpath)

        # Check if the compiled library exists
        pyx_base = splitext(basename(pyx_fpath))[0]

        SO_EXTS = _platform_pylib_exts()
        so_fname = False
        for fname in os.listdir(pyx_dpath):
            if fname.startswith(pyx_base) and fname.endswith(SO_EXTS):
                so_fname = fname
                break

        try:
            # Currently this functionality depends on ubelt.
            # We could replace ub.cmd with subprocess.check_call and ub.augpath
            # with os.path operations, but hash_file and CacheStamp are harder
            # to replace. We can use "liberator" to statically extract these
            # and add them to nx.utils though.
            import ubelt as ub
        except Exception:
            return False
        else:
            if so_fname is False:
                # We can compute what the so_fname will be if it doesn't exist
                so_fname = pyx_base + SO_EXTS[0]

            so_fpath = join(pyx_dpath, so_fname)
            depends = [ub.hash_file(pyx_fpath, hasher="sha1")]
            stamp_fname = ub.augpath(so_fname, ext=".jit.stamp")
            stamp = ub.CacheStamp(
                stamp_fname,
                dpath=pyx_dpath,
                product=so_fpath,
                depends=depends,
                verbose=verbose,
            )
            if stamp.expired():
                ub.cmd("cythonize -i {}".format(pyx_fpath),
                       verbose=verbose, check=True)
                stamp.renew()
            return True
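# Hypothetical usage sketch for _autojit_cython: compile a .pyx file that sits
# next to this script on first use, then import the resulting extension. The
# file and module names are made up; the caching behavior (a CacheStamp keyed
# on the sha1 of the .pyx source) is what the function above implements.
from os.path import dirname, join

pyx_fpath = join(dirname(__file__), '_fast_ops.pyx')  # hypothetical file
if _autojit_cython(pyx_fpath, verbose=1):
    import _fast_ops  # noqa: F401  hypothetical compiled module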
def benchmark():
    import timerit
    import ubelt as ub
    from kwcoco.util.util_futures import JobPool  # NOQA
    ti = timerit.Timerit(3, bestof=1, verbose=2)

    max_workers = 4

    # Choose a path to an HDD
    dpath = ub.ensuredir('/raid/data/tmp')

    fpath_demodata = _demodata_files(dpath=dpath, num_files=1000,
                                     size_pool=[10, 20, 50], pool_size=8)

    for timer in ti.reset('hash_file(hasher=xx64)'):
        with timer:
            for fpath in fpath_demodata:
                ub.hash_file(fpath, hasher='xx64')

    for timer in ti.reset('hash_file(hasher=xxhash) - serial'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='serial', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xxhash')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('hash_file(hasher=xxhash) - thread'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='thread', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xx64')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('hash_file(hasher=xxhash) - process'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='process', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xx64')
            results = [job.result() for job in jobs.jobs]
def _check_integrity(pfile):
    for part in pfile._parts[1:]:
        # Ensure that each partial hash corresponds with the actual partial
        # hash.
        target = ub.hash_file(pfile.fpath, blocksize=int(2 ** 20),
                              maxbytes=part[1], hasher='xx64')
        if target != part[0]:
            raise AssertionError('The hashes do not match!')
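# A sketch of how the (expected_hash, maxbytes) pairs checked above could be
# produced. The maxbytes argument limits how much of the file is hashed; the
# _parts layout (hash first, byte limit second) is taken from the check above,
# while the helper name and the byte offsets are illustrative assumptions.
import ubelt as ub

def _build_parts(fpath, byte_offsets=(2 ** 20, 2 ** 24)):
    parts = []
    for maxbytes in byte_offsets:
        partial = ub.hash_file(fpath, blocksize=int(2 ** 20),
                               maxbytes=maxbytes, hasher='xx64')
        parts.append((partial, maxbytes))
    return parts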
def test_grabdata():
    # xdoctest: +REQUIRES(--network)
    import ubelt as ub
    # fname = 'foo.bar'
    # url = 'http://i.imgur.com/rqwaDag.png'
    # prefix1 = '944389a39dfb8fa9'
    fname = 'foo2.bar'
    url = _demo_url(128 * 11)
    prefix1 = 'b7fa848cd088ae842a89ef'
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
    stamp_fpath = fpath + '.sha512.hash'
    assert ub.readfrom(stamp_fpath) == prefix1
    # Check that the download doesn't happen again
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
    # todo: check file timestamps have not changed
    #
    # Check redo works with hash
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1, redo=True)
    # todo: check file timestamps have changed
    #
    # Check that a redownload occurs when the stamp is changed
    with open(stamp_fpath, 'w') as file:
        file.write('corrupt-stamp')
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
    assert ub.readfrom(stamp_fpath) == prefix1
    #
    # Check that a redownload occurs when the stamp is removed
    ub.delete(stamp_fpath)
    with open(fpath, 'w') as file:
        file.write('corrupt-data')
    assert not ub.hash_file(fpath, base='hex', hasher='sha512').startswith(prefix1)
    fpath = ub.grabdata(url, fname=fname, hash_prefix=prefix1)
    assert ub.hash_file(fpath, base='hex', hasher='sha512').startswith(prefix1)
    #
    # Check that requesting new data causes redownload
    # url2 = 'https://data.kitware.com/api/v1/item/5b4039308d777f2e6225994c/download'
    # prefix2 = 'c98a46cb31205cf'
    # hack SSL
    # url2 = 'http://i.imgur.com/rqwaDag.png'
    # prefix2 = '944389a39dfb8fa9'
    url2, prefix2 = url, prefix1
    fpath = ub.grabdata(url2, fname=fname, hash_prefix=prefix2)
    assert ub.readfrom(stamp_fpath) == prefix2
def _product_file_hash(self, product=None):
    """
    Get the hash of each product file
    """
    import xxhash  # much faster than builtin hashers
    products = self._rectify_products(product)
    product_file_hash = [
        ub.hash_file(p, hasher=xxhash.xxh64) for p in products
    ]
    return product_file_hash
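# The snippet above passes the xxhash.xxh64 class directly as the hasher.
# Other snippets in this collection use the equivalent string name instead;
# this sketch just shows both spellings side by side (fpath is hypothetical).
import ubelt as ub
import xxhash

digest_from_class = ub.hash_file(fpath, hasher=xxhash.xxh64)
digest_from_name = ub.hash_file(fpath, hasher='xx64')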
def _update_hashes():
    """
    for dev use to update hashes of the demo images

    CommandLine:
        xdoctest -m kwimage.im_demodata _update_hashes
        xdoctest -m kwimage.im_demodata _update_hashes --require-hashes
    """
    TEST_IMAGES = _TEST_IMAGES.copy()

    for key in TEST_IMAGES.keys():
        item = TEST_IMAGES[key]

        grabkw = {
            'appname': 'kwimage/demodata',
        }
        # item['sha512'] = 'not correct'

        # Wait until ubelt 9.1 is released to change hasher due to
        # issue in ub.grabdata
        # hasher_priority = ['sha512', 'sha1']
        hasher_priority = ['sha1']

        REQUIRE_EXISTING_HASH = ub.argflag('--require-hashes')
        if REQUIRE_EXISTING_HASH:
            for hasher in hasher_priority:
                if hasher in item:
                    grabkw.update({
                        'hash_prefix': item[hasher],
                        'hasher': hasher,
                    })
                    break

        if 'fname' in item:
            grabkw['fname'] = item['fname']

        item.pop('sha512', None)
        fpath = ub.grabdata(item['url'], **grabkw)
        if 'hasher' not in item:
            hasher = hasher_priority[0]
            prefix = ub.hash_file(fpath, hasher=hasher)
            item[hasher] = prefix[0:64]

    print('_TEST_IMAGES = ' + ub.repr2(TEST_IMAGES, nl=2))
def _build_file_hashid(root, suffix, hashid_mode):
    """
    Build a hashid for a specific file given as a path root and suffix.
    """
    gpath = join(root, suffix)
    if hashid_mode == 'PATH':
        # Hash the full path to the image data
        # NOTE: this logic is not machine independent
        hashid = ub.hash_data(suffix, hasher='sha1', base='hex')
    elif hashid_mode == 'PIXELS':
        # Hash the pixels in the image
        hashid = ub.hash_file(gpath, hasher='sha1', base='hex')
    elif hashid_mode == 'DVC':
        raise NotImplementedError('todo')
    elif hashid_mode == 'GIVEN':
        raise Exception('given mode no longer supported')
    else:
        raise KeyError(hashid_mode)
    return hashid
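# Hypothetical call sketch for _build_file_hashid: the root and suffix values
# are made up; the two modes shown are the ones implemented above. PATH hashes
# only the relative path string, PIXELS hashes the file contents on disk.
path_hashid = _build_file_hashid('/data/images', 'scene_001.png', 'PATH')
pixel_hashid = _build_file_hashid('/data/images', 'scene_001.png', 'PIXELS')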
def prepare_images(self, ext='.png', force=False):
    """
    If not already done, loads paths to images into memory and constructs a
    unique id for that set of im/gt images.

    If the paths are already set, then only the input-id is constructed.
    """
    if self.n_input is not None and not force:
        return

    self.prepare_image_paths()
    print('Preparing {} images'.format(self.tag))

    if self.aux_paths:
        # new way
        depends = sorted(self.paths.items())
    else:
        depends = []
        depends.append(self.im_paths)
        depends.append(self.gt_paths)
        if self.gt_paths:
            # HACK: We will assume image data depends only on the filename
            # HACK: be respectful of gt label changes (ignore aug)
            # stride>1 is faster but might break
            # stride=1 is the safest
            hashes = [
                ub.hash_file(p, stride=32)
                for p in ub.ProgIter(self.gt_paths, label='hashing')
                if 'aug' not in basename(p) and 'part' not in basename(p)
            ]
            label_hashid = ub.hash_data(hashes)
            depends.append(label_hashid)

    n_im = None if self.im_paths is None else len(self.im_paths)
    n_gt = None if self.gt_paths is None else len(self.gt_paths)
    self.n_input = n_im or n_gt

    hashid = ub.hash_data(depends)[:self.abbrev]
    self.input_id = '{}-{}'.format(self.n_input, hashid)

    print(' * n_images = {}'.format(n_im))
    print(' * n_groundtruth = {}'.format(n_gt))
    print(' * input_id = {}'.format(self.input_id))
def benchmark():
    """
    apt-get install xxhash
    """
    import timerit
    import ubelt as ub
    from kwcoco.util.util_futures import JobPool  # NOQA
    ti = timerit.Timerit(1, bestof=1, verbose=3)

    max_workers = 6

    fpath_demodata = _demodata_files()

    for timer in ti.reset('hash_file(hasher=xx32)'):
        with timer:
            for fpath in fpath_demodata:
                ub.hash_file(fpath, hasher='xx32')

    for timer in ti.reset('hash_file(hasher=xx64)'):
        with timer:
            for fpath in fpath_demodata:
                ub.hash_file(fpath, hasher='xx64')

    for timer in ti.reset('hash_file(hasher=xxhash) - serial'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='serial', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xxhash')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('hash_file(hasher=xxhash) - thread'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='thread', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xx64')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('hash_file(hasher=xxhash) - process'):
        # jobs = JobPool(mode='thread', max_workers=2)
        jobs = JobPool(mode='process', max_workers=max_workers)
        with timer:
            for fpath in fpath_demodata:
                jobs.submit(ub.hash_file, fpath, hasher='xx64')
            results = [job.result() for job in jobs.jobs]

    for timer in ti.reset('cmd-xxh32sum'):
        with timer:
            for fpath in fpath_demodata:
                ub.cmd(['xxh32sum', fpath])['out'].split(' ')[0]

    for timer in ti.reset('cmd-xxh64sum'):
        with timer:
            for fpath in fpath_demodata:
                ub.cmd(['xxh64sum', fpath])['out'].split(' ')[0]

    for timer in ti.reset('cmd-xxh64sum-detatch'):
        with timer:
            jobs = [
                ub.cmd(['xxh64sum', fpath], detatch=True)
                for fpath in fpath_demodata
            ]
            results = [
                job['proc'].communicate()[0].split(' ')[0]
                for job in jobs
            ]

    for timer in ti.reset('cmd-sha1sum'):
        with timer:
            for fpath in fpath_demodata:
                ub.cmd(['sha1sum', fpath])['out'].split(' ')[0]

    for timer in ti.reset('hash_file(hasher=sha1)'):
        with timer:
            for fpath in fpath_demodata:
                ub.hash_file(fpath, hasher='sha1')
def _package_deploy(train_dpath):
    """
    Combine the model, weights, and info files into a single deployable file

    CommandLine:
        xdoctest -m netharn.export.deployer _package_deploy

    Args:
        train_dpath (PathLike): the netharn training directory

    Example:
        >>> dpath = ub.ensure_app_cache_dir('netharn', 'tests/_package_deploy')
        >>> train_dpath = ub.ensuredir((dpath, 'my_train_dpath'))
        >>> ub.touch(join(train_dpath, 'final_snapshot.pt'))
        >>> ub.touch(join(train_dpath, 'my_model.py'))
        >>> zipfpath = _package_deploy(train_dpath)
        ...
        >>> print(os.path.basename(zipfpath))
        deploy_UNKNOWN-ARCH_my_train_dpath_UNKNOWN-EPOCH_QOOEZT.zip
    """
    print('[DEPLOYER] Deploy to dpath={}'.format(train_dpath))

    snap_fpath = find_best_snapshot(train_dpath)

    model_fpaths = glob.glob(join(train_dpath, '*.py'))
    if len(model_fpaths) == 0:
        raise FileNotFoundError('The model topology cannot be found')
    elif len(model_fpaths) > 1:
        warnings.warn(
            'There are multiple models here: {}'.format(model_fpaths))

    if not snap_fpath:
        raise FileNotFoundError('No weights are associated with the model')

    weights_hash = ub.hash_file(snap_fpath, base='abc',
                                hasher='sha512')[0:6].upper()

    train_info_fpath = join(train_dpath, 'train_info.json')
    if exists(train_info_fpath):
        train_info = json.load(open(train_info_fpath, 'r'))
        model_name = train_info['hyper']['model'][0].split('.')[-1]
        train_hash = ub.hash_data(train_info['train_id'], hasher='sha512',
                                  base='abc', types=True)[0:8]
    else:
        model_name = 'UNKNOWN-ARCH'
        train_hash = os.path.basename(train_dpath)
        print('WARNING: Training metadata does not exist')

    try:
        import torch
        state = torch.load(snap_fpath)
        epoch = '{:03d}'.format(state['epoch'])
    except Exception:
        epoch = 'UNKNOWN-EPOCH'

    deploy_name = 'deploy_{model}_{trainid}_{epoch}_{weights}'.format(
        model=model_name, trainid=train_hash, epoch=epoch,
        weights=weights_hash)
    deploy_fname = deploy_name + '.zip'

    def zwrite(myzip, fpath, fname=None):
        if fname is None:
            fname = relpath(fpath, train_dpath)
        myzip.write(fpath, arcname=join(deploy_name, fname))

    zipfpath = join(train_dpath, deploy_fname)
    with zipfile.ZipFile(zipfpath, 'w') as myzip:
        if exists(train_info_fpath):
            zwrite(myzip, train_info_fpath)
        zwrite(myzip, snap_fpath, fname='deploy_snapshot.pt')
        for model_fpath in model_fpaths:
            zwrite(myzip, model_fpath)
        # Add some quick glanceable info
        # for bestacc_fpath in glob.glob(join(train_dpath, 'best_epoch_*')):
        #     zwrite(myzip, bestacc_fpath)
        for p in glob.glob(join(train_dpath, 'glance/*')):
            zwrite(myzip, p)

    print('[DEPLOYER] Deployed zipfpath={}'.format(zipfpath))
    return zipfpath
def benchmark_hash_file():
    """
    CommandLine:
        python ~/code/ubelt/dev/bench_hash.py --show
        python ~/code/ubelt/dev/bench_hash.py --show
    """
    import ubelt as ub
    import random

    # dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp'))
    dpath = ub.ensuredir(ub.expandpath('$HOME/tmp'))

    rng = random.Random(0)
    # Create a pool of random chunks of data
    chunksize = int(2 ** 20)
    pool_size = 8
    part_pool = [_random_data(rng, chunksize) for _ in range(pool_size)]

    # ITEM = 'JUST A STRING' * 100
    HASHERS = ['sha1', 'sha512', 'xxh32', 'xxh64', 'blake3']

    scales = list(range(5, 10))
    import os

    results = ub.AutoDict()
    # Using json is faster or at least as fast in most cases
    # xxhash is also significantly faster than sha512
    ti = ub.Timerit(9, bestof=3, verbose=1, unit='ms')
    for s in ub.ProgIter(scales, desc='benchmark', verbose=3):
        N = 2 ** s
        print(' --- s={s}, N={N} --- '.format(s=s, N=N))
        # Write a big file
        size_pool = [N]
        fpath = _write_random_file(dpath, part_pool, size_pool, rng)

        megabytes = os.stat(fpath).st_size / (2 ** 20)
        print('megabytes = {!r}'.format(megabytes))

        for hasher in HASHERS:
            for timer in ti.reset(hasher):
                ub.hash_file(fpath, hasher=hasher)
            results[hasher].update({N: ti.mean()})

        col = {h: results[h][N] for h in HASHERS}
        sortx = ub.argsort(col)
        ranking = ub.dict_subset(col, sortx)
        print('walltime: ' + ub.repr2(ranking, precision=9, nl=0))
        best = next(iter(ranking))
        # pairs = list(ub.iter_window( 2))
        pairs = [(k, best) for k in ranking]
        ratios = [ranking[k1] / ranking[k2] for k1, k2 in pairs]
        nicekeys = ['{}/{}'.format(k1, k2) for k1, k2 in pairs]
        relratios = ub.odict(zip(nicekeys, ratios))
        print('speedup: ' + ub.repr2(relratios, precision=4, nl=0))

    # xdoc +REQUIRES(--show)
    # import pytest
    # pytest.skip()

    import pandas as pd
    df = pd.DataFrame.from_dict(results)
    df.columns.name = 'hasher'
    df.index.name = 'N'
    ratios = df.copy().drop(columns=df.columns)
    for k1, k2 in [('sha512', 'xxh64'), ('sha1', 'xxh64'),
                   ('xxh32', 'xxh64'), ('blake3', 'xxh64')]:
        ratios['{}/{}'.format(k1, k2)] = df[k1] / df[k2]

    print()
    print('Seconds per iteration')
    print(df.to_string(float_format='%.9f'))
    print()
    print('Ratios of seconds')
    print(ratios.to_string(float_format='%.2f'))
    print()
    print('Average Ratio (over all N)')
    print(ratios.mean().sort_values())
    if ub.argflag('--show'):
        import kwplot
        kwplot.autompl()
        xdata = sorted(ub.peek(results.values()).keys())
        ydata = ub.map_vals(lambda d: [d[x] for x in xdata], results)
        kwplot.multi_plot(xdata, ydata, xlabel='N', ylabel='seconds')
        kwplot.show_if_requested()
def bench_find_optimal_blocksize():
    r"""
    This function can help find the optimal blocksize for your use case.

    Notes:
        # Usage
        cd ~/code/ubelt/dev
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --dpath <PATH-TO-HDD-OR-SSD> \
            --size <INT-IN-MB> \
            --hash_algo <ALGO_NAME> \

        # Benchmark on an HDD
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --size 500 \
            --dpath $HOME/raid/data/tmp \
            --hash_algo xx64

        # Benchmark on an SSD
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --size 500 \
            --dpath $HOME/.cache/ubelt/tmp \
            --hash_algo xx64

        # Test a small file
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --size 1 \
            --dpath $HOME/.cache/ubelt/tmp \
            --hash_algo xx64

        Throughout our tests on SSDs / HDDs with small and large files, we
        find a chunksize of 2 ** 20 consistently works best with xx64.

        # Test with a slower hash algo
        xdoctest bench_hash_file.py bench_find_optimal_blocksize \
            --size 500 \
            --dpath $HOME/raid/data/tmp \
            --hash_algo sha1

        Even that shows 2 ** 20 working well.
    """
    import os
    import numpy as np
    import timerit

    dpath = ub.argval('--dpath', default=None)
    if dpath is None:
        # dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp'))
        dpath = ub.ensure_app_cache_dir('ubelt/hash_test')
    else:
        ub.ensuredir(dpath)

    print('dpath = {!r}'.format(dpath))

    target_size = int(ub.argval('--size', default=600))
    hash_algo = ub.argval('--hash_algo', default='xx64')

    print('hash_algo = {!r}'.format(hash_algo))
    print('target_size = {!r}'.format(target_size))

    # Write a big file (~600 MB)
    MB = int(2 ** 20)
    size_pool = [target_size]
    rng = random.Random(0)
    # pool_size = max(target_size // 2, 1)
    # pool_size = max(1, target_size // 10)
    pool_size = 8
    part_pool = [_random_data(rng, MB) for _ in range(pool_size)]
    fpath = _write_random_file(dpath, part_pool, size_pool, rng)
    print('fpath = {!r}'.format(fpath))

    size_mb = os.stat(fpath).st_size / MB
    print('file size = {!r} MB'.format(size_mb))

    ti = timerit.Timerit(4, bestof=2, verbose=2)

    results = []

    # Find an optimal constant blocksize
    min_power = 16
    max_power = 24
    blocksize_candidates = [int(2 ** e) for e in range(min_power, max_power)]
    for blocksize in blocksize_candidates:
        for timer in ti.reset('constant blocksize=2 ** {} = {}'.format(
                np.log2(float(blocksize)), blocksize)):
            result = ub.hash_file(fpath, blocksize=blocksize, hasher=hash_algo)
            results.append(result)

    print('ti.rankings = {}'.format(ub.repr2(ti.rankings, nl=2, align=':')))
    assert ub.allsame(results)
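# Based on the finding noted in the docstring above (a 2 ** 20 blocksize
# consistently performing well with xx64 on both SSDs and HDDs), a reasonable
# default call looks like the following; the file path is hypothetical.
import ubelt as ub

digest = ub.hash_file('/path/to/large_file.bin', blocksize=int(2 ** 20),
                      hasher='xx64')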
def 取文件哈希(文件路径, 哈希算法='sha1'):
    # "取文件哈希" means "get file hash": return the hex digest of the file at
    # the given path using the requested hash algorithm.
    return ub.hash_file(文件路径, hasher=哈希算法, base='hex')
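# Usage sketch for the function above; the path is hypothetical.
digest = 取文件哈希('/path/to/data.bin', 哈希算法='sha1')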