def bench_main_blocksize_filesize(tmpdir, maxsize):
    stmt = textwrap.dedent("""
        cfg.update(blocksize={blocksize})
        main.main({files_dirs})
        """)
    params = []

    # single files, test filesize and blocksize
    max_filesize = maxsize
    max_blocksize = min(200*MiB, max_filesize)
    cases = [(np.array([max_filesize]),
              bytes_logspace(10*KiB, max_blocksize, 20),
              'main_blocksize_single'),
             (bytes_linspace(min(1*MiB, max_filesize//2), max_filesize, 5),
              np.array([256*KiB]),
              'main_filesize_single'),
             ]
    for filesize, blocksize, study in cases:
        testdir = mkdtemp(dir=tmpdir, prefix=study + '_')
        files = write_single_files(testdir, filesize)
        this = ps.pgrid(zip(ps.plist('filesize', filesize),
                            ps.plist('filesize_str',
                                     list(map(size2str, filesize))),
                            ps.plist('files_dirs', [[x] for x in files])),
                        ps.plist('study', [study]),
                        ps.plist('maxsize_str', [size2str(maxsize)]),
                        zip(ps.plist('blocksize', blocksize),
                            ps.plist('blocksize_str',
                                     list(map(size2str, blocksize)))))
        params += this

    study = 'main_blocksize'
    testdir, group_dirs, files = write_collection(maxsize, tmpdir=tmpdir,
                                                  study=study)
    blocksize = bytes_logspace(10*KiB, min(200*MiB, maxsize), 20)
    this = ps.pgrid(ps.plist('files_dirs', [[testdir]]),
                    ps.plist('study', [study]),
                    ps.plist('maxsize_str', [size2str(maxsize)]),
                    zip(ps.plist('blocksize', blocksize),
                        ps.plist('blocksize_str',
                                 list(map(size2str, blocksize)))))
    params += this
    return stmt, params, {}
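
# Hedged sketch of the bytes_logspace()/bytes_linspace() helpers used above;
# their real definitions live elsewhere in this file. The assumption is that
# they return integer byte counts spaced logarithmically / linearly between
# two sizes, for use as blocksize and filesize axes. The *_sketch names are
# illustrative only.
def bytes_logspace_sketch(start, stop, num):
    # log-spaced sizes; unique() drops duplicates created by int rounding
    return np.unique(np.logspace(np.log10(start),
                                 np.log10(stop), num).astype(int))

def bytes_linspace_sketch(start, stop, num):
    return np.unique(np.linspace(start, stop, num).astype(int))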
def bench_main_parallel(tmpdir, maxsize):
    stmt = textwrap.dedent("""
        cfg.update(blocksize={blocksize}, nthreads={nthreads},
                   nprocs={nprocs}, share_leafs={share_leafs})
        main.main({files_dirs})
        """)
    params = []
    study = 'main_parallel'
    testdir, group_dirs, files = write_collection(maxsize, tmpdir=tmpdir,
                                                  study=study)
    blocksize = np.array([256*KiB])
    for share_leafs in [True, False]:
        this = ps.pgrid(ps.plist('files_dirs', [[testdir]]),
                        ps.plist('study', [study]),
                        zip(ps.plist('nthreads', range(1, MAXWORKERS+1)),
                            ps.plist('nworkers', range(1, MAXWORKERS+1))),
                        ps.plist('nprocs', [1]),
                        ps.plist('pool_type', ['thread']),
                        ps.plist('maxsize_str', [size2str(maxsize)]),
                        ps.plist('share_leafs', [share_leafs]),
                        zip(ps.plist('blocksize', blocksize),
                            ps.plist('blocksize_str',
                                     list(map(size2str, blocksize)))))
        params += this
        this = ps.pgrid(ps.plist('files_dirs', [[testdir]]),
                        ps.plist('study', [study]),
                        zip(ps.plist('nprocs', range(1, MAXWORKERS+1)),
                            ps.plist('nworkers', range(1, MAXWORKERS+1))),
                        ps.plist('nthreads', [1]),
                        ps.plist('pool_type', ['proc']),
                        ps.plist('maxsize_str', [size2str(maxsize)]),
                        ps.plist('share_leafs', [share_leafs]),
                        zip(ps.plist('blocksize', blocksize),
                            ps.plist('blocksize_str',
                                     list(map(size2str, blocksize)))))
        params += this
    return stmt, params, {}
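
# Hedged sketch, not the project's actual driver: one plausible way the
# (stmt, params, extra) triples returned by the bench_* functions above
# could be consumed. run_study_sketch and the 'timing' key are assumptions
# for illustration; names used inside stmt (cfg, main, ...) must be
# resolvable via extra['globals'] or the caller's globals.
import timeit

def run_study_sketch(bench_func, tmpdir, maxsize):
    stmt, params, extra = bench_func(tmpdir, maxsize)
    results = []
    for pset in params:
        # fill the {placeholders} in stmt from this parameter set, then
        # time single executions and keep the best of a few repeats
        timing = min(timeit.repeat(stmt.format(**pset),
                                   setup=extra.get('setup', 'pass'),
                                   globals=extra.get('globals'),
                                   number=1, repeat=3))
        results.append({**pset, 'timing': timing})
    return results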
def bench_hash_file_parallel(tmpdir, maxsize):
    params = []
    study = 'hash_file_parallel'
    testdir, group_dirs, files = write_collection(maxsize, tmpdir=tmpdir,
                                                  study=study)
    pool_map = {'seq': pl.SequentialPoolExecutor,
                'thread': pl.ThreadPoolExecutor,
                'proc': pl.ProcessPoolExecutor,
                'proc,thread=1': lambda nw: pl.ProcessAndThreadPoolExecutor(nw, 1),
                'thread,proc=1': lambda nw: pl.ProcessAndThreadPoolExecutor(1, nw),
                }
    ctx = dict(pool_map=pool_map,
               pl=pl,
               files=files,
               worker=_worker_bench_hash_file_parallel,
               )
    setup = cache_flush_setup
    stmt = """
with pool_map['{pool_type}']({nworkers}) as pool:
    x = list(pool.map(worker, files))
"""
    this = ps.pgrid(ps.plist('pool_type',
                             [k for k in pool_map.keys() if k != 'seq']),
                    ps.plist('nworkers', range(1, MAXWORKERS+1)),
                    ps.plist('study', [study]),
                    ps.plist('maxsize_str', [size2str(maxsize)]),
                    )
    params += this

    # non-pool reference
    params += [{'study': study,
                'pool_type': 'seq',
                'nworkers': 1,
                'maxsize_str': size2str(maxsize)}]
    return stmt, params, dict(setup=setup, globals=ctx)
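
# Hedged sketch of the module-level worker referenced in ctx above; the
# real _worker_bench_hash_file_parallel is defined elsewhere in this file.
# The assumption is that it hashes one file, so pool.map(worker, files)
# measures parallel hashing throughput; hashlib/sha1 and the default
# blocksize are illustrative choices, not the project's actual ones. Note
# that workers used with process pools must be picklable, hence
# module-level, functions.
import hashlib

def _worker_sketch(fn, blocksize=256*KiB):
    h = hashlib.sha1()
    with open(fn, 'rb') as fd:
        # read in fixed-size chunks to bound memory use
        for chunk in iter(lambda: fd.read(blocksize), b''):
            h.update(chunk)
    return h.hexdigest()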
def bench_main_parallel_2d(tmpdir, maxsize):
    stmt = textwrap.dedent("""
        cfg.update(blocksize={blocksize}, nthreads={nthreads},
                   nprocs={nprocs})
        main.main({files_dirs})
        """)
    params = []
    study = 'main_parallel_2d'
    testdir, group_dirs, files = write_collection(maxsize, tmpdir=tmpdir,
                                                  study=study)
    blocksize = np.array([256*KiB])
    this = ps.pgrid(ps.plist('files_dirs', [[testdir]]),
                    ps.plist('study', [study]),
                    ps.plist('nthreads', range(1, MAXWORKERS+1)),
                    ps.plist('nprocs', range(1, MAXWORKERS+1)),
                    ps.plist('maxsize_str', [size2str(maxsize)]),
                    zip(ps.plist('blocksize', blocksize),
                        ps.plist('blocksize_str',
                                 list(map(size2str, blocksize)))))
    params += this
    return stmt, params, {}
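
# Hedged sketch of helpers this file assumes: the KiB/MiB constants and
# size2str(), whose real definitions live elsewhere. KiB/MiB are the
# standard binary prefixes; the exact output format of size2str (used for
# the *_str label params above) is an assumption.
KiB_sketch = 1024
MiB_sketch = 1024**2

def size2str_sketch(nbytes):
    # e.g. 262144 -> '256K' (assumed label format)
    for unit, div in [('G', 1024**3), ('M', 1024**2), ('K', 1024)]:
        if nbytes >= div:
            return f"{nbytes / div:.0f}{unit}"
    return f"{nbytes}B"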
tool_cmd = ps.plist('tool_cmd',
                    ['findsame',
                     'findsame -t1',
                     'findsame -l 512K',
                     'findsame -l 4K',
                     'findsame -t1 -l 4K',
                     'jdupes -q -r',
                     'jdupes -q -rQ',
                     'jdupes -q -rTT',
                     'duff -ra',
                     'duff -rat',
                     'rdfind -outputname /dev/null',
                     ])

# Warm the cache only once per datadir to save time. That is why we loop
# over ps.run() calls instead of following the usual psweep workflow
# (assemble all params, then call ps.run() once): we rely on ps.run()'s
# ability to append to an existing database on disk.
for _datadir in datadirs:
    print(_datadir)
    for _cache in ['cold', 'warm']:
        print(f" {_cache}")
        if _cache == 'warm':
            subprocess.run(f"findsame {_datadir} > /dev/null", shell=True)
        datadir = ps.plist('datadir', [_datadir])
        cache = ps.plist('cache', [_cache])
        params = ps.pgrid(tool_cmd, datadir, cache)
        # results in calc/results.pk
        ps.run(func, params)
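
# Hedged sketch of the func handed to ps.run() above; the real definition
# lives earlier in this script. The assumption: run one tool command on
# the datadir, discard its output, and return the wall time so psweep can
# append it to the results database. The 'timing' key is illustrative.
import time

def func_sketch(pset):
    cmd = f"{pset['tool_cmd']} {pset['datadir']} > /dev/null"
    t0 = time.time()
    subprocess.run(cmd, shell=True)
    return {'timing': time.time() - t0}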