Example 1
    def __init__(pfile, fpath):
        # ``pfile`` takes the role of ``self``; the state below supports
        # blockwise hashing of the file at ``fpath``
        from ubelt.util_hash import _rectify_hasher
        pfile.fpath = fpath
        pfile._hash = None
        pfile._size = None
        # pfile._hgen = pfile.hash_generator()

        pfile._parts = []
        pfile._hasher = _rectify_hasher('xx64')()
        pfile._curr_blocks = 1
        pfile._pos = 0
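The methods that consume this state are not shown in the snippet above. As a rough, self-contained sketch of the same blockwise-hashing idea (using hashlib.sha256 in place of the xx64 hasher, which needs the optional xxhash package; the function name here is hypothetical):

import hashlib

def hash_file_blockwise(fpath, blocksize=2 ** 20):
    # Read the file in fixed-size blocks, feeding each block to an
    # incremental hasher and tracking the current byte offset.
    hasher = hashlib.sha256()
    pos = 0
    with open(fpath, 'rb') as file:
        buf = file.read(blocksize)
        while buf:
            hasher.update(buf)
            pos += len(buf)
            buf = file.read(blocksize)
    return hasher.hexdigest(), pos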
Example 2
def _benchmark():
    """
    On 64-bit processors sha512 may be faster than sha256

    References:
        https://crypto.stackexchange.com/questions/26336/sha512-faster-than-sha256
    """
    result = ub.AutoOrderedDict()
    algos = ['sha1', 'sha256', 'sha512']
    for n in ub.ProgIter([1, 10, 100, 1000, 10000, 100000], desc='time'):
        # for key in hashlib.algorithms_guaranteed:
        for key in algos:
            hashtype = _rectify_hasher(key)
            t1 = ub.Timerit(100, bestof=10, label=key, verbose=0)
            for timer in t1:
                data = b'8' * n
                with timer:
                    hasher = hashtype()
                    hasher.update(data)
            result[key][n] = t1.min()
    import pandas as pd
    print(pd.DataFrame(result))

    result = ub.AutoOrderedDict()
    for n in ub.ProgIter([1, 10, 100, 1000, 10000, 100000], desc='time'):
        # for key in hashlib.algorithms_guaranteed:
        for key in algos:
            hashtype = _rectify_hasher(key)
            t1 = ub.Timerit(100, bestof=10, label=key, verbose=0)
            for timer in t1:
                data = b'8' * n
                hasher = hashtype()
                hasher.update(data)
                with timer:
                    hasher.hexdigest()
            result[key][n] = t1.min()
    import pandas as pd
    print(pd.DataFrame(result))
    """
Example 3
def bench_hashfile_blocksize():
    """
    Test speed of hashing with various blocksize strategies

    """
    dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp'))

    size_pool = [10000]

    rng = random.Random(0)
    # Create a pool of random chunks of data
    chunksize = int(2 ** 20)
    pool_size = 8
    part_pool = [_random_data(rng, chunksize) for _ in range(pool_size)]

    # Write a big file (~600 MB)
    fpath = _write_random_file(dpath, part_pool, size_pool, rng)

    import os
    size_mb = os.stat(fpath).st_size / 1e6
    print('file size = {!r} MB'.format(size_mb))

    from ubelt.util_hash import _rectify_hasher

    hasher_algo = 'xx64'

    import timerit
    ti = timerit.Timerit(4, bestof=2, verbose=2)
    # hasher = _rectify_hasher(hasher_algo)()
    # with timer:
    #     with open(fpath, 'rb') as file:
    #         buf = file.read(blocksize)
    #         while len(buf) > 0:
    #             hasher.update(buf)
    #             buf = file.read(blocksize)
    # result = hasher.hexdigest()

    results = []

    # Constant blocksize is the winner as long as it's chosen right.
    for timer in ti.reset('constant blocksize'):
        blocksize = int(2 ** 20)
        hasher = _rectify_hasher(hasher_algo)()
        with timer:
            with open(fpath, 'rb') as file:
                buf = file.read(blocksize)
                while len(buf) > 0:
                    hasher.update(buf)
                    buf = file.read(blocksize)
        result = hasher.hexdigest()
        results.append(result)

    for timer in ti.reset('double blocksize'):
        blocksize = int(2 ** 20)
        hasher = _rectify_hasher(hasher_algo)()
        with timer:
            with open(fpath, 'rb') as file:
                buf = file.read(blocksize)
                while len(buf) > 0:
                    hasher.update(buf)
                    blocksize *= 2
                    buf = file.read(blocksize)
        result = hasher.hexdigest()
        results.append(result)

    for timer in ti.reset('double blocksize + limit'):
        max_blocksize = int(2 ** 20) * 16
        blocksize = int(2 ** 20)
        hasher = _rectify_hasher(hasher_algo)()
        with timer:
            with open(fpath, 'rb') as file:
                buf = file.read(blocksize)
                while len(buf) > 0:
                    hasher.update(buf)
                    blocksize = min(2 * blocksize, max_blocksize)
                    buf = file.read(blocksize)
        result = hasher.hexdigest()
        results.append(result)
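Since every strategy above hashes the entire file with the same algorithm, all entries in results should be identical; a quick sanity check after the loops is assert len(set(results)) == 1. Packaged as a standalone helper, the adaptive "double blocksize + limit" strategy might look like this sketch (hashlib.sha256 is used so it has no optional dependencies, whereas the benchmark uses xx64; the function name is hypothetical):

import hashlib

def hashfile_adaptive(fpath, init_blocksize=2 ** 20, max_blocksize=16 * 2 ** 20):
    # Start with a 1 MiB read and double the read size after every block,
    # capping it at max_blocksize, until the file is exhausted.
    hasher = hashlib.sha256()
    blocksize = init_blocksize
    with open(fpath, 'rb') as file:
        buf = file.read(blocksize)
        while buf:
            hasher.update(buf)
            blocksize = min(2 * blocksize, max_blocksize)
            buf = file.read(blocksize)
    return hasher.hexdigest()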