def __init__(pfile, fpath):
    """
    Set up lazy incremental-hash state for the file at ``fpath``.

    Nothing is read from disk here; the digest, size, and parts are
    filled in on demand by other methods.
    """
    from ubelt.util_hash import _rectify_hasher
    pfile.fpath = fpath
    # Incremental xxHash-64 state used as bytes are consumed.
    pfile._hasher = _rectify_hasher('xx64')()
    # Cached results, computed lazily.
    pfile._hash = None
    pfile._size = None
    # Progress bookkeeping for the streaming hash.
    pfile._parts = []
    pfile._pos = 0
    pfile._curr_blocks = 1
def _benchmark():
    """
    Benchmark the speed of several hash algorithms at varying input sizes.

    Times two phases separately: (1) feeding data into the hasher via
    ``update`` and (2) producing the final ``hexdigest``.

    On 64-bit processors sha512 may be faster than sha256.

    References:
        https://crypto.stackexchange.com/questions/26336/sha512-faster-than-sha256
    """
    # NOTE(review): the original ended with a stray dangling triple-quote,
    # which left the body inside / after an unterminated string. Removed.
    import pandas as pd
    algos = ['sha1', 'sha256', 'sha512']
    sizes = [1, 10, 100, 1000, 10000, 100000]

    # Phase 1: time hasher construction + update with n bytes of data.
    result = ub.AutoOrderedDict()
    for n in ub.ProgIter(sizes, desc='time'):
        # for key in hashlib.algorithms_guaranteed:
        for key in algos:
            hashtype = _rectify_hasher(key)
            t1 = ub.Timerit(100, bestof=10, label=key, verbose=0)
            for timer in t1:
                data = b'8' * n
                with timer:
                    hasher = hashtype()
                    hasher.update(data)
            result[key][n] = t1.min()
    print(pd.DataFrame(result))

    # Phase 2: time only the final hexdigest step (update is untimed).
    result = ub.AutoOrderedDict()
    for n in ub.ProgIter(sizes, desc='time'):
        # for key in hashlib.algorithms_guaranteed:
        for key in algos:
            hashtype = _rectify_hasher(key)
            t1 = ub.Timerit(100, bestof=10, label=key, verbose=0)
            for timer in t1:
                data = b'8' * n
                hasher = hashtype()
                hasher.update(data)
                with timer:
                    hasher.hexdigest()
            result[key][n] = t1.min()
    print(pd.DataFrame(result))
def bench_hashfile_blocksize():
    """
    Test speed of hashing with various blocksize strategies.

    Writes a large (~600 MB) random file, then times hashing it with a
    constant blocksize, an unbounded doubling blocksize, and a doubling
    blocksize capped at 16 MiB. The digests for each strategy are
    collected in ``results`` (they should all agree).
    """
    import os
    import timerit
    from ubelt.util_hash import _rectify_hasher

    dpath = ub.ensuredir(ub.expandpath('$HOME/raid/data/tmp'))
    size_pool = [10000]
    rng = random.Random(0)

    # Create a pool of random chunks of data
    chunksize = int(2 ** 20)
    pool_size = 8
    part_pool = [_random_data(rng, chunksize) for _ in range(pool_size)]

    # Write a big file (~600 MB)
    fpath = _write_random_file(dpath, part_pool, size_pool, rng)
    size_mb = os.stat(fpath).st_size / 1e6
    print('file size = {!r} MB'.format(size_mb))

    hasher_algo = 'xx64'
    ti = timerit.Timerit(4, bestof=2, verbose=2)

    def _hash_file(hasher, grow):
        # Stream fpath into ``hasher``; ``grow`` maps the current
        # blocksize to the blocksize for the next read.
        blocksize = int(2 ** 20)
        with open(fpath, 'rb') as file:
            buf = file.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                blocksize = grow(blocksize)
                buf = file.read(blocksize)
        return hasher.hexdigest()

    results = []

    # Constant blocksize is the winner as long as its chosen right.
    for timer in ti.reset('constant blocksize'):
        hasher = _rectify_hasher(hasher_algo)()
        with timer:
            result = _hash_file(hasher, lambda b: b)
        results.append(result)

    for timer in ti.reset('double blocksize'):
        hasher = _rectify_hasher(hasher_algo)()
        with timer:
            result = _hash_file(hasher, lambda b: 2 * b)
        results.append(result)

    for timer in ti.reset('double blocksize + limit'):
        max_blocksize = int(2 ** 20) * 16
        hasher = _rectify_hasher(hasher_algo)()
        with timer:
            result = _hash_file(hasher, lambda b: min(2 * b, max_blocksize))
        results.append(result)