def _devcheck_inv_stability():
    """
    Developer check: how quickly does error build up when a matrix and its
    inverse are repeatedly re-derived from each other?

    For 30 random float32 3x3 matrices, the pair (matrix, inverse) is
    updated 100 times by replacing each member with the inverse of the
    other. After every update the summed absolute difference from the
    starting pair is recorded, and the error-vs-iteration curves are drawn
    with seaborn (obtained through kwplot).
    """
    import numpy as np

    records = []
    for _ in range(30):
        start_fwd = np.random.rand(3, 3).astype(np.float32)
        start_bwd = np.linalg.inv(start_fwd)

        cur_fwd, cur_bwd = start_fwd.copy(), start_bwd.copy()
        for step in range(100):
            # Simultaneous update: both new values are computed from the
            # previous pair before either one is overwritten.
            cur_fwd, cur_bwd = np.linalg.inv(cur_bwd), np.linalg.inv(cur_fwd)

            drift = (
                ('err_12', np.abs(cur_fwd - start_fwd).sum()),
                ('err_21', np.abs(cur_bwd - start_bwd).sum()),
            )
            for label, error in drift:
                records.append({'idx': step, 'error': error, 'label': label})

    import kwplot
    import pandas as pd
    sns = kwplot.autosns()
    frame = pd.DataFrame(records)
    sns.lineplot(data=frame, x='idx', y='error', hue='label')
def main():
    """
    Draw a two-panel figure comparing ``side='left'`` and ``side='right'``
    associations as rendered by ``plot_searchsorted_visualization``.

    The ``if 1`` / ``if 0`` blocks are manual toggles for choosing which demo
    input is used; only the first one is active.
    """
    sns = kwplot.autosns()  # NOQA
    plt = kwplot.autoplt()  # NOQA

    if 1:
        # Small hand-written example with values below, inside, between,
        # and above the array entries.
        array = list(range(16))
        values = [-1, 0, 1, 4, 6, 6.1, 6.5, 7, 10.3, 10.5, 10.7, 15, 16]

    if 0:
        array = np.linspace(0, 30)
        values = np.linspace(0, 30)

    if 0:
        xscale = 20
        num = 20
        array = np.array(sorted(np.random.rand(num) * xscale)).round()
        exact_hits = np.unique(np.random.choice(array, 3))
        random_queries = np.random.rand(num // 2) * xscale
        values = np.hstack([exact_hits, random_queries])

    # One subplot per side; only the first panel clears the figure.
    for panel_idx, side in enumerate(['left', 'right'], start=1):
        fig = kwplot.figure(fnum=1, doclf=int(panel_idx == 1),
                            pnum=(2, 1, panel_idx))
        ax = fig.gca()
        plot_searchsorted_visualization(array, values, side=side, ax=ax)
        ax.set_title(
            'association = searchsorted(array, values, side={})'.format(side))

    import ubelt as ub
    notice = ub.codeblock(
        ''' Notice: side=left and side=right have the same result except when the value is already in the array. ''')
    fig.suptitle(notice)
def run_benchmark_renormalization():
    """
    See if we can renormalize probabilities after update with a faster method
    that maintains memory a bit better

    Loads the cython variants from ``renormalize_cython.pyx`` through
    ``xdev.import_module_from_pyx``, profiles each pure-python demo once,
    prints a single-call timing for every selected method, then times the
    selected methods over a grid of ``T`` / ``D`` settings (only combinations
    with ``T <= D``) and plots the minimum time against each parameter.

    Example:
        >>> import sys, ubelt
        >>> sys.path.append(ubelt.expandpath('~/misc/tests/python'))
        >>> from bench_renormalization import *  # NOQA
        >>> run_benchmark_renormalization()
    """
    import ubelt as ub
    import xdev
    import pathlib
    import timerit

    fpath = pathlib.Path('~/misc/tests/python/renormalize_cython.pyx').expanduser()
    renormalize_cython = xdev.import_module_from_pyx(
        fpath, annotate=True, verbose=3, recompile=True)

    xdev.profile_now(renormalize_demo_v1)(1000, 100)
    xdev.profile_now(renormalize_demo_v2)(1000, 100)
    xdev.profile_now(renormalize_demo_v3)(1000, 100)
    xdev.profile_now(renormalize_demo_v4)(1000, 100)

    func_list = [
        # renormalize_demo_v1,
        renormalize_demo_v2,
        # renormalize_demo_v3,
        # renormalize_demo_v4,
        renormalize_cython.renormalize_demo_cython_v1,
        renormalize_cython.renormalize_demo_cython_v2,
        renormalize_cython.renormalize_demo_cython_v3,
    ]
    methods = {f.__name__: f for f in func_list}

    # Quick single-call timing of each method before the full grid.
    for key, method in methods.items():
        with timerit.Timer(label=key, verbose=0) as t:
            method(1000, 100)
        print(f'{key:<30} {t.toc():0.6f}')

    arg_basis = {
        'T': [10, 20, 30, 50],
        'D': [10, 50, 100, 300],
    }
    args_grid = []
    for argkw in ub.named_product(arg_basis):
        if argkw['T'] <= argkw['D']:
            # Record the problem size on the grid item itself. (Previously
            # this was written into ``arg_basis``, so no row ever received a
            # ``size`` entry.)
            argkw['size'] = argkw['T'] * argkw['D']
            args_grid.append(argkw)

    ti = timerit.Timerit(100, bestof=10, verbose=2)

    measures = []
    for method_name, method in methods.items():
        for argkw in args_grid:
            row = ub.dict_union({'method': method_name}, argkw)
            key = ub.repr2(row, compact=1)
            argkey = ub.repr2(argkw, compact=1)

            # Only T and D are real arguments of the benchmarked functions.
            kwargs = ub.dict_subset(argkw, ['T', 'D'])
            for timer in ti.reset('time'):
                with timer:
                    method(**kwargs)

            row['mean'] = ti.mean()
            row['min'] = ti.min()
            row['key'] = key
            row['argkey'] = argkey
            measures.append(row)

    import pandas as pd
    df = pd.DataFrame(measures)
    import kwplot
    sns = kwplot.autosns()

    kwplot.figure(fnum=1, pnum=(1, 2, 1), docla=True)
    sns.lineplot(data=df, x='D', y='min', hue='method', style='method')

    kwplot.figure(fnum=1, pnum=(1, 2, 2), docla=True)
    sns.lineplot(data=df, x='T', y='min', hue='method', style='method')

    # pivot arguments are keyword-only in pandas >= 2.0
    p = df.pivot(index=['method'], columns=['argkey'], values=['mean'])
    print(p.mean(axis=1).sort_values())
def main():
    """
    Run password security analysis

    For every (device, password scheme, adversary scale, hashmode benchmark)
    combination this computes how long it takes to try every state of the
    scheme at the benchmarked attempt rate, and what the electricity for that
    would cost. The table is restricted to one chosen device, printed as
    pivot tables, and optionally rendered as heatmaps.

    Command line flags (read with ``ub.argflag``):
        --show: draw the heatmaps and finish with ``plt.show()``
        --latex: (with --show) render figure text with LaTeX
        --save: (with --show) write each heatmap to a png in the cwd

    Example:
        >>> import sys, ubelt
        >>> sys.path.append(ubelt.expandpath('~/misc/notes'))
        >>> from password_model import *  # NOQA
        >>> main()
    """
    import itertools as it
    from fractions import Fraction
    import pandas as pd

    # Build our adversary and our strategies
    devices, scales = build_threat_models()
    password_schemes = build_password_strategy()

    # Other estimates or assumptions
    estimates = {
        # estimated cost of using a kilowatt for an hour
        # http://www.wrecc.com/what-uses-watts-in-your-home/
        # https://www.coinwarz.com/mining/ethereum/calculator
        'dollars_per_kwh': 0.10,
    }
    # Loop invariant, so convert it to an exact rational only once.
    dollars_per_kwh = Fraction(estimates['dollars_per_kwh'])

    rows = []
    for device, scheme, scale in it.product(devices, password_schemes, scales):
        for benchmark in device['benchmarks']:
            states = Fraction(scheme['states'])
            num_devices = Fraction(scale['num_devices'])

            hashmode_attempts_per_second = benchmark['attempts_per_second']
            attempts_per_second = num_devices * Fraction(
                int(hashmode_attempts_per_second))

            # Time to enumerate every state with all devices combined
            seconds = states / Fraction(attempts_per_second)

            hours = seconds / Fraction(3600)
            device_kilowatts = Fraction(device['watts']) / Fraction(1000)
            device_dollars_per_hour = device_kilowatts * dollars_per_kwh
            dollars_per_device = device_dollars_per_hour * hours
            dollars = dollars_per_device * num_devices

            # Power times duration, i.e. energy used by all devices. It is
            # stored under the (historical) 'kilowatts' key.
            total_kilowatts = device_kilowatts * num_devices * hours

            row = {
                'scheme': scheme['name'],
                'entropy': scheme['entropy'],
                'hashmode': benchmark['hashmode'],
                'hashmode_attempts_per_second': int(hashmode_attempts_per_second),
                'device': device['name'],
                'scale': scale['name'],
                'num_devices': scale['num_devices'],
                'seconds': seconds,
                'dollars': dollars,
                'kilowatts': total_kilowatts,
                'hours': hours,
                'dollars_per_kwh': estimates['dollars_per_kwh'],
            }
            rows.append(row)

    df = pd.DataFrame(rows)
    df = df.sort_values('entropy')

    chosen_device = 'RTX_3090'
    # Copy so the column assignments below write to an independent frame
    # rather than to a filtered view of the full table.
    df = df[df['device'] == chosen_device].copy()

    df['time'] = df['seconds'].apply(humanize_seconds)
    df['cost'] = df['dollars'].apply(partial(humanize_dollars, colored=1))
    df['entropy'] = df['entropy'].round(2)
    df['num_devices'] = df['num_devices'].apply(int)

    # Take the hashmodes from the device we actually report on. (Previously
    # this read the leftover ``device`` loop variable, i.e. whichever device
    # happened to be iterated last.)
    device_info = ub.group_items(devices, lambda x: x['name'])[chosen_device][0]
    hashmodes = sorted(d['hashmode'] for d in device_info['benchmarks'])

    # https://github.com/pandas-dev/pandas/issues/18066
    monkeypatch_pandas_colored_stdout()

    # Output our assumptions
    print('\n---')
    print('Assumptions:')
    print('estimates = {!r}'.format(estimates))
    print('device_info = {}'.format(ub.repr2(device_info, nl=2)))

    # For each hashmode, print the scheme-vs-num_devices-vs-time matrix
    hashmode_to_pivots = {}
    for hashmode in hashmodes:
        subdf = df
        subdf = subdf[subdf['hashmode'] == hashmode]
        subdf = subdf.sort_values(['entropy', 'num_devices'])
        # pivot arguments are keyword-only in pandas >= 2.0
        piv = subdf.pivot(index=['entropy', 'cost', 'scheme'],
                          columns=['num_devices', 'scale'],
                          values='time')
        # piv.style.applymap(color_cases)
        hashmode_to_pivots[hashmode] = piv

    for hashmode in hashmodes:
        print('\n---')
        print('hashmode = {!r}'.format(hashmode))
        piv = hashmode_to_pivots[hashmode]
        print(piv)

    # Print the scheme-vs-hashmode-vs-cost matrix
    print('\n---')
    print('Cost Matrix:')
    subdf = df[df['scale'] == df['scale'].iloc[0]]
    piv = subdf.pivot(index=['entropy', 'scheme'],
                      columns=['hashmode_attempts_per_second', 'hashmode'],
                      values='cost')
    piv = piv.sort_index(axis=1, ascending=False)
    piv.columns = piv.columns.droplevel(0)
    print(piv)

    # Make the visualizations
    if ub.argflag('--show'):
        import kwplot
        from matplotlib.colors import LogNorm
        import matplotlib as mpl
        plt = kwplot.autoplt()
        sns = kwplot.autosns()

        use_latex = ub.argflag('--latex')
        if use_latex:
            mpl.rcParams['text.usetex'] = True

        def time_labelize(x):
            # Heatmap annotation: first word on its own (larger) line
            text = humanize_seconds(x, colored=False, named=True, precision=2)
            parts = text.split(' ')
            if use_latex:
                text = r'{\huge ' + parts[0] + '}' + '\n' + ' '.join(parts[1:])
            else:
                text = parts[0] + '\n' + ' '.join(parts[1:])
            return text

        def dollar_labelize(dollars):
            cost = humanize_dollars(dollars, named=(dollars > 1))
            if use_latex:
                # A bare dollar sign would start math mode in LaTeX
                cost = cost.replace('$', r'\$')
            return cost

        hashmode_to_notes = {}
        for dev in devices[0]['benchmarks']:
            hashmode_to_notes[dev['hashmode']] = dev['notes']

        if 1:
            # Independent of the adversary scale we can plot cost versus scheme
            # cost vs hashmod?
            subdf = df[df['scale'] == df['scale'].iloc[0]]
            piv = subdf.pivot(index=['entropy', 'scheme'],
                              columns=['hashmode_attempts_per_second', 'hashmode'],
                              values='dollars')
            piv = piv.sort_index(axis=1, ascending=False)

            # https://stackoverflow.com/questions/64234474/cust-y-lbls-seaborn
            _fig, ax = plt.subplots(figsize=(15, 10))

            annot = piv.applymap(dollar_labelize)
            # The cells hold Fractions; the heatmap needs plain floats
            piv = piv.applymap(float)

            sns.heatmap(piv,
                        annot=annot,
                        ax=ax,
                        fmt='s',
                        norm=LogNorm(vmin=1, vmax=100_000_000_000_000_000),
                        annot_kws={'size': 16},
                        cmap='cividis',
                        cbar_kws={
                            'label': 'dollars',
                            'pad': 0.001
                        })

            # Find colorbar
            for subax in ax.figure.axes:
                if subax.get_label() == '<colorbar>':
                    subax.set_ylabel('dollars', labelpad=0)
                    break

            new_ytick_labels = []
            for ent, scheme in piv.index.to_list():
                if use_latex:
                    scheme = r'{\LARGE ' + scheme + '}'
                _ = '{scheme}\nEntropy={ent}bits'.format(scheme=scheme, ent=ent)
                new_ytick_labels.append(_)

            new_xtick_labels = []
            for _, hashmode in piv.columns.to_list():
                notes = ''
                if hashmode in hashmode_to_notes:
                    notes = '\n(' + hashmode_to_notes[hashmode] + ')'
                new_xtick_labels.append(hashmode + notes)

            ax.set_xticklabels(new_xtick_labels, rotation=0)
            ax.set_yticklabels(new_ytick_labels, rotation=0)

            ax.set_ylabel('Password Scheme, Entropy', labelpad=24)
            ax.set_xlabel('Hashmode', labelpad=16)

            if use_latex:
                title = '{{\\Huge Password Cost Security}}'
                ax.set_title(title)
            else:
                ax.set_title('Password Cost Security')

            ax.figure.subplots_adjust(bottom=0.1,
                                      left=0.20,
                                      right=1.0,
                                      top=0.90,
                                      wspace=0.001)

            if ub.argflag('--save'):
                fname = 'passwd_cost_security.png'
                ax.figure.savefig(fname)

        if 1:
            # For each hashmode plot (scheme versus adversary scale)
            for hashmode in ub.ProgIter(hashmodes, desc='plotting'):
                subdf = df
                subdf = subdf[subdf['hashmode'] == hashmode]
                subdf = subdf.sort_values(['entropy', 'num_devices'])

                piv = subdf.pivot(index=['entropy', 'dollars', 'scheme'],
                                  columns=['num_devices', 'scale'],
                                  values='seconds')
                piv = piv.applymap(float)

                # https://stackoverflow.com/questions/64234474/cust-y-lbls-seaborn
                _fig, ax = plt.subplots(figsize=(15, 10))

                annot = piv.applymap(time_labelize)
                sns.heatmap(piv,
                            annot=annot,
                            ax=ax,
                            fmt='s',
                            norm=LogNorm(vmin=1, vmax=8640000000),
                            annot_kws={'size': 10},
                            cbar_kws={
                                'label': 'seconds',
                                'pad': 0.001
                            })

                # Find colorbar
                for subax in ax.figure.axes:
                    if subax.get_label() == '<colorbar>':
                        subax.set_ylabel('seconds', labelpad=0)
                        break

                new_ytick_labels = []
                for ent, dollars, scheme in piv.index.to_list():
                    cost = dollar_labelize(dollars)
                    if use_latex:
                        scheme = r'{\LARGE ' + scheme + '}'
                    _ = '{scheme}\nEntropy={ent}bits\nCost={cost}'.format(
                        scheme=scheme, cost=cost, ent=ent)
                    new_ytick_labels.append(_)

                new_xtick_labels = []
                for n, name in piv.columns.to_list():
                    if use_latex:
                        name = r'{\LARGE ' + name + '}'
                    _ = name + '\n' + named_large_number(n, precision=0) + ' GPUs'
                    new_xtick_labels.append(_)

                ax.set_xticklabels(new_xtick_labels, rotation=0)
                # ax.set_yticklabels(new_ytick_labels, horizontalalignment='left', pad=30)
                ax.set_yticklabels(new_ytick_labels)

                ax.set_ylabel('Password Scheme, Entropy, and Cost to Crack',
                              labelpad=24)
                ax.set_xlabel('Adversary Resources', labelpad=16)

                notes = ''
                if hashmode in hashmode_to_notes:
                    notes = ' (' + hashmode_to_notes[hashmode] + ')'

                if use_latex:
                    title = '{{\\Huge Password Time Security}}\nhashmode={}{}'.format(
                        hashmode, notes)
                    ax.set_title(title)
                else:
                    ax.set_title(
                        'Password Time Security\n(hashmode={}{})'.format(
                            hashmode, notes))

                ax.figure.subplots_adjust(bottom=0.1,
                                          left=0.20,
                                          right=1.0,
                                          top=0.90,
                                          wspace=0.001)

                if ub.argflag('--save'):
                    fname = 'passwd_robustness_{}.png'.format(hashmode)
                    ax.figure.savefig(fname)

        plt.show()
def benchmark_template():
    """
    Template for benchmarking several methods over a grid of parameters.

    Every combination in ``basis`` is timed with timerit, the measurements
    are collected as long-form rows in a pandas DataFrame, per-setting
    speedups (and optionally openskill ratings) are computed for the
    methods, and the timings are drawn as a seaborn line plot.
    """
    import ubelt as ub
    import pandas as pd
    import timerit

    def method1(x, y, z):
        ret = []
        for i in range((x + y) * z):
            ret.append(i)
        return ret

    def method2(x, y, z):
        ret = [i for i in range((x + y) * z)]
        return ret

    method_lut = locals()  # can populate this some other way

    # Change params here to modify number of trials
    ti = timerit.Timerit(100, bestof=10, verbose=1)

    # if True, record every trail run and show variance in seaborn
    # if False, use the standard timerit min/mean measures
    RECORD_ALL = True

    # These are the parameters that we benchmark over
    basis = {
        'method': ['method1', 'method2'],
        'x': list(range(7)),
        'y': [0, 100],
        'z': [2, 3]
        # 'param_name': [param values],
    }
    xlabel = 'x'
    # Set these to param labels that directly transfer to method kwargs
    kw_labels = ['x', 'y', 'z']
    # Set these to empty lists if they are not used
    group_labels = {
        'style': ['y'],
        'size': ['z'],
    }
    # Whatever is neither the x-axis nor already a style/size group is
    # distinguished by hue.
    already_grouped = set.union(*map(set, group_labels.values()))
    group_labels['hue'] = list((ub.oset(basis) - {xlabel}) - already_grouped)
    grid_iter = list(ub.named_product(basis))

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {
            gname + '_key': ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
            for gname, labels in group_labels.items()
        }
        key = ub.repr2(params, compact=1, si=1)
        # Make any modifications you need to compute input kwargs for each
        # method here.
        kwargs = ub.dict_isect(params.copy(), kw_labels)
        method = method_lut[params['method']]
        # Timerit will run some user-specified number of loops.
        # and compute time stats with similar methodology to timeit
        for timer in ti.reset(key):
            # Put any setup logic you dont want to time here.
            # ...
            with timer:
                # Put the logic you want to time here
                method(**kwargs)

        if not RECORD_ALL:
            rows.append({
                'mean': ti.mean(),
                'min': ti.min(),
                'key': key,
                **group_keys,
                **params,
            })
            continue

        # Seaborn will show the variance if this is enabled, otherwise
        # use the robust timerit mean / min times
        # TODO: timerit method for this
        best_times = [min(chunk) for chunk in ub.chunks(ti.times, ti.bestof)]
        rows.extend(
            {
                # 'mean': ti.mean(),
                'time': best_time,
                'key': key,
                **group_keys,
                **params,
            }
            for best_time in best_times
        )

    time_key = 'time' if RECORD_ALL else 'min'

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows).sort_values(time_key)

    if RECORD_ALL:
        # Show the min / mean if we record all
        per_key = data.groupby('key')
        min_times = per_key.min().rename({'time': 'min'}, axis=1)
        mean_times = per_key[['time']].mean().rename({'time': 'mean'}, axis=1)
        stats_data = pd.concat([min_times, mean_times], axis=1)
        stats_data = stats_data.sort_values('min')
    else:
        stats_data = data

    USE_OPENSKILL = 1
    if USE_OPENSKILL:
        # Lets try a real ranking method
        # https://github.com/OpenDebates/openskill.py
        import openskill
        method_ratings = {m: openskill.Rating() for m in basis['method']}

    bookkeeping_columns = {
        'key', 'method', 'min', 'mean', 'hue_key', 'size_key', 'style_key'}
    other_keys = sorted(set(stats_data.columns) - bookkeeping_columns)
    for params, variants in stats_data.groupby(other_keys):
        variants = variants.sort_values('mean')
        ranking = variants['method'].reset_index(drop=True)

        # Speedup of each variant relative to the slowest one in the group
        slowest_mean = variants['mean'].max()
        mean_speedup = slowest_mean / variants['mean']
        stats_data.loc[mean_speedup.index, 'mean_speedup'] = mean_speedup

        slowest_min = variants['min'].max()
        min_speedup = slowest_min / variants['min']
        stats_data.loc[min_speedup.index, 'min_speedup'] = min_speedup

        if USE_OPENSKILL:
            # The idea is that each setting of parameters is a game, and each
            # "method" is a player. We rank the players by which is fastest,
            # and update their ranking according to the Weng-Lin Bayes ranking
            # model. This does not take the fact that some "games" (i.e.
            # parameter settings) are more important than others, but it should
            # be fairly robust on average.
            old_ratings = [[r] for r in ub.take(method_ratings, ranking)]
            new_values = openskill.rate(old_ratings)  # Not inplace
            new_ratings = [openskill.Rating(*new[0]) for new in new_values]
            method_ratings.update(ub.dzip(ranking, new_ratings))

    print('Statistics:')
    print(stats_data)

    if USE_OPENSKILL:
        from openskill import predict_win
        win_prob = predict_win([[r] for r in method_ratings.values()])
        skill_agg = pd.Series(
            ub.dzip(method_ratings.keys(), win_prob)
        ).sort_values(ascending=False)
        print('Aggregated Rankings =\n{}'.format(skill_agg))

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()
        plt = kwplot.autoplt()

        # Only pass the seaborn grouping arguments that are actually used
        plotkw = {
            gname: gname + '_key'
            for gname, labels in group_labels.items()
            if labels
        }

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y=time_key, marker='o', ax=ax,
                     **plotkw)
        ax.set_title('Benchmark Name')
        ax.set_xlabel('Size (todo: A better x-variable description)')
        ax.set_ylabel('Time (todo: A better y-variable description)')
        # ax.set_xscale('log')
        # ax.set_yscale('log')

        try:
            __IPYTHON__
        except NameError:
            plt.show()
def benchmark_ubelt_import_time_robust():
    """
    Measure how the import time of ubelt changed over its release history.

    A helper script is written into the ubelt app cache directory. The
    script runs ``python -X importtime -c "import ubelt"`` 200 times and
    prints summary statistics as a dict literal. The checkout in
    ``$HOME/code/ubelt`` is then switched to ``dev/1.0.1``, ``main`` and
    every git tag in turn, the script is run for each, and the mean / min /
    max import times are plotted against the release order.

    Note:
        This changes the checked-out revision of ``$HOME/code/ubelt`` while
        it runs. ``dev/1.0.1`` is checked out again afterwards, also when a
        measurement fails part way through.
    """
    import ast
    import re
    import pandas as pd
    import ubelt as ub
    import kwplot
    sns = kwplot.autosns(force='Qt5Agg')

    prog = ub.codeblock(
        r'''
        def _main():
            import subprocess
            import ubelt as ub
            measurements = []
            for i in range(200):
                row = {}
                # info = ub.cmd('python -X importtime -c "import ubelt"')
                # text = info['err']
                prog = subprocess.Popen('python -X importtime -c "import ubelt"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                _, text = prog.communicate()
                text = text.decode()
                final_line = text.rstrip().split('\n')[-1]
                partial = final_line.split(':')[1].split('|')
                row['self_us'] = float(partial[0].strip())
                row['cummulative'] = float(partial[1].strip())
                measurements.append(row)
            import pandas as pd
            df = pd.DataFrame(measurements)
            stats = pd.DataFrame({
                'mean': df.mean(),
                'std': df.std(),
                'min': df.min(),
                'max': df.max(),
                'total': df.sum(),
            })
            info = stats.to_dict()
            info['version'] = ub.__version__
            print(info)
            # print(stats)
        _main()
        ''')

    dpath = ub.Path(ub.ensure_app_cache_dir('ubelt/tests/test_version_import'))
    fpath = dpath / 'do_test.py'
    fpath.write_text(prog)

    repo_root = ub.Path('$HOME/code/ubelt').expand()

    info = ub.cmd('git tag', cwd=repo_root)
    versions = [p for p in info['out'].split('\n') if p]
    branches = ['dev/1.0.1', 'main'] + versions

    fig = kwplot.figure(doclf=True)
    ax = fig.gca()

    bname_to_info = {}
    rows = []
    try:
        for bname in branches:
            print('bname = {!r}'.format(bname))
            ub.cmd('git checkout {}'.format(bname), cwd=repo_root, verbose=3,
                   check=True)
            info = ub.cmd('python {}'.format(fpath), verbose=2)
            # The helper prints a plain dict literal, so parse it as data
            # instead of evaluating it as code.
            dict_info = ast.literal_eval(info['out'].strip())
            bname_to_info[bname] = dict_info
            for stat in ['mean', 'min', 'max']:
                for time_type in ['self_us', 'cummulative']:
                    rows.append({
                        'version': dict_info['version'],
                        'stat': stat,
                        'type': time_type,
                        'time': dict_info[stat][time_type],
                    })
            df = pd.DataFrame(rows[-1:])
            print(df)
            # ax.cla()
            # sns.lineplot(data=df, x='version', y='time', hue='stat', style='type', ax=ax)
    finally:
        # Always put the repository back on the development branch
        ub.cmd('git checkout {}'.format('dev/1.0.1'), cwd=repo_root)

    df = pd.DataFrame(rows)

    def _version_key(text):
        # Sort key that orders version strings component-wise (digit runs
        # numerically, letter runs alphabetically). Replaces
        # distutils.version.LooseVersion, which is no longer available in
        # Python 3.12. Tagging each component keeps numbers and letters
        # comparable with each other.
        parts = [p for p in re.split(r'(\d+|[a-z]+|\.)', text)
                 if p and p != '.']
        return tuple((0, int(p)) if p.isdigit() else (1, p) for p in parts)

    unique_versions = sorted(map(str, df['version'].unique()),
                             key=_version_key)
    version_to_index = {ver: idx for idx, ver in enumerate(unique_versions)}
    df['release_index'] = df['version'].apply(
        lambda x: version_to_index[x])
    ax.cla()

    kwplot.figure(fnum=2, pnum=(2, 1, 1), doclf=True)
    ax = sns.lineplot(data=df[df['type'] == 'cummulative'],
                      x='release_index', y='time', hue='stat', style='type',
                      marker='o')
    ax.set_title('Ubelt import time over release history')
    kwplot.figure(fnum=2, pnum=(2, 1, 2))
    sns.lineplot(data=df[df['type'] == 'self_us'], x='release_index',
                 y='time', hue='stat', style='type', marker='o')
def benchmark_repeat_vs_reduce_mul():
    """
    Benchmark different ways of multiplying a list of arrays together.

    The compared methods are: a hard-coded ``a * b * c * ...`` expression,
    ``np.multiply.reduce``, stacking followed by ``prod(axis=0)``, and a
    recursive divide-and-conquer reduction. Each is timed with timerit over
    a grid of array sizes and array counts, and the minimum times are
    printed and plotted.
    """
    import ubelt as ub
    import pandas as pd
    import timerit

    def reduce_daq_rec(func, arrs):
        """
        Recursive divide-and-conquer reduction of ``arrs`` with the binary
        function ``func``. Inputs longer than three items are split into
        their even-indexed and odd-indexed items.

        Raises:
            ValueError: if ``arrs`` is empty
        """
        num = len(arrs)
        if num == 1:
            return arrs[0]
        if num == 2:
            return func(arrs[0], arrs[1])
        if num == 3:
            # The third item is index 2 (index 3 was an IndexError)
            return func(func(arrs[0], arrs[1]), arrs[2])
        if num == 0:
            # Without this guard an empty input recurses forever
            raise ValueError('cannot reduce an empty sequence')
        res1 = reduce_daq_rec(func, arrs[0::2])
        res2 = reduce_daq_rec(func, arrs[1::2])
        return func(res1, res2)

    def reduce_daq_iter(func, arrs):
        """
        Iterative version of ``reduce_daq_rec`` using an explicit stack. It
        applies ``func`` to the same groupings in the same order as the
        recursive version.

        https://www.baeldung.com/cs/convert-recursion-to-iteration
        https://stackoverflow.com/questions/159590/way-to-go-from-recursion-to-iteration

        arrs = [2, 3, 5, 7, 11, 13, 17, 21]

        Raises:
            ValueError: if ``arrs`` is empty
        """
        if len(arrs) == 0:
            raise ValueError('cannot reduce an empty sequence')
        # Pending work items. ('expand', items) reduces a sub-sequence and
        # ('combine', None) merges the two most recent partial results.
        stack = [('expand', arrs)]
        results = []
        while stack:
            action, items = stack.pop()
            if action == 'combine':
                res2 = results.pop()
                res1 = results.pop()
                results.append(func(res1, res2))
            elif len(items) == 1:
                results.append(items[0])
            elif len(items) == 2:
                results.append(func(items[0], items[1]))
            elif len(items) == 3:
                results.append(func(func(items[0], items[1]), items[2]))
            else:
                # Pushed in reverse so the even half is reduced first, then
                # the odd half, and finally the two are combined.
                stack.append(('combine', None))
                stack.append(('expand', items[1::2]))
                stack.append(('expand', items[0::2]))
        return results[0]

    def method_daq_rec(arrs):
        return reduce_daq_rec(np.multiply, arrs)

    def method_repeat(arrs):
        """
        helper code:
            arr_names = ['a{:02d}'.format(idx) for idx in range(1, 32 + 1)]
            lhs = ', '.join(arr_names)
            rhs = ' * '.join(arr_names)
            print(f'{lhs} = arrs')
            print(f'ret = {rhs}')
        """
        # Hard coded pure python syntax for multiplying
        if len(arrs) == 4:
            a01, a02, a03, a04 = arrs
            ret = a01 * a02 * a03 * a04
        elif len(arrs) == 8:
            a01, a02, a03, a04, a05, a06, a07, a08 = arrs
            ret = a01 * a02 * a03 * a04 * a05 * a06 * a07 * a08
        elif len(arrs) == 32:
            a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32 = arrs
            ret = a01 * a02 * a03 * a04 * a05 * a06 * a07 * a08 * a09 * a10 * a11 * a12 * a13 * a14 * a15 * a16 * a17 * a18 * a19 * a20 * a21 * a22 * a23 * a24 * a25 * a26 * a27 * a28 * a29 * a30 * a31 * a32
        else:
            # Previously this fell through to an UnboundLocalError
            raise NotImplementedError(
                'method_repeat is only hard coded for 4, 8, or 32 arrays, '
                'got {}'.format(len(arrs)))
        return ret

    def method_reduce(arrs):
        ret = np.multiply.reduce(arrs)
        return ret

    def method_stack(arrs):
        stacked = np.stack(arrs)
        ret = stacked.prod(axis=0)
        return ret

    method_lut = locals()  # can populate this some other way

    ti = timerit.Timerit(10000, bestof=10, verbose=2)

    basis = {
        'method': ['method_repeat', 'method_reduce', 'method_stack', 'method_daq_rec'],
        'arr_size': [10, 100, 1000, 10000],
        'num_arrs': [4, 8, 32],
    }
    xlabel = 'arr_size'
    kw_labels = []
    group_labels = {
        'style': ['num_arrs'],
        'size': [],
    }
    group_labels['hue'] = list(
        (ub.oset(basis) - {xlabel}) - set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {}
        for gname, labels in group_labels.items():
            group_keys[gname + '_key'] = ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
        key = ub.repr2(params, compact=1, si=1)
        kwargs = ub.dict_isect(params.copy(), kw_labels)

        # Build the input arrays outside of the timed region
        arr_size = params['arr_size']
        num_arrs = params['num_arrs']
        arrs = [np.random.rand(arr_size) for _ in range(num_arrs)]
        kwargs['arrs'] = arrs

        method = method_lut[params['method']]
        # Timerit will run some user-specified number of loops.
        # and compute time stats with similar methodology to timeit
        for timer in ti.reset(key):
            # Put any setup logic you dont want to time here.
            # ...
            with timer:
                # Put the logic you want to time here
                method(**kwargs)
        row = {
            'mean': ti.mean(),
            'min': ti.min(),
            'key': key,
            **group_keys,
            **params,
        }
        rows.append(row)

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values('min')
    print(data)

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()

        plotkw = {}
        for gname, labels in group_labels.items():
            if labels:
                plotkw[gname] = gname + '_key'

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y='min', marker='o', ax=ax, **plotkw)
        ax.set_title('Benchmark')
        ax.set_xlabel('Array Size')
        ax.set_ylabel('Time')
owid_co2_data_fpath = ub.grabdata('https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.json', expires=timedelta(days=30)) with open(owid_co2_data_fpath, 'r') as file: co2_data = json.load(file) us_co2_data = co2_data['United States']['data'] _raw_us_data = pd.DataFrame(us_co2_data).set_index('year', drop=0) us_data = _raw_us_data.loc[1980:][columns_of_interest] us_data = us_data.assign(year=_raw_us_data['year'].apply(lambda x: x * reg.year)) us_data = us_data.assign(co2=_raw_us_data['co2'].apply(lambda x: x * million * CO2_ton)) us_data = us_data.assign(population=_raw_us_data['population'].apply(lambda x: x * reg.us_person)) us_data = us_data.assign(primary_energy_consumption=_raw_us_data['primary_energy_consumption'].apply(lambda x: x * twh / reg.year)) us_data = us_data.assign(gdp=_raw_us_data['gdp'].apply(lambda x: x * reg.dollar_2011)) if 0: import kwplot sns = kwplot.autosns() sns.lineplot(data=_raw_us_data, x='year', y='co2') us_emissions_2018 = us_data['co2'].loc[2018] us_population_2018 = us_data['population'].loc[2018] us_person_anual_footprint = us_emissions_2018 / us_population_2018 us_data['co2_per_capita'] = us_data['co2'] / us_data['population'] else: us_emissions_2018 = 5.27 * billion * CO2_ton / reg.year_2018 us_population_2018 = 327.2 * million * reg.us_person us_person_anual_footprint = us_emissions_2018 / us_population_2018 # Different estimates for this number us_person_anual_footprint_candidates = { 'nature.org': 16 * CO2_ton / (reg.year * reg.us_person), 'terrapass': (63_934 * CO2_pound).to(CO2_ton) / (reg.year * reg.us_person),
def benchmark_nested_break():
    """
    There are several ways to do a nested break, but which one is best?

    Each method searches the product of two iterables for the pair
    ``(20, 20)`` and stops when it is found. The methods are timed with
    timerit over a grid of input sizes and input styles, per-setting
    speedups and openskill ratings are computed, and the timings are drawn
    on a log-log line plot.

    https://twitter.com/nedbat/status/1515345787563220996
    """
    import ubelt as ub
    import pandas as pd
    import timerit
    import itertools as it

    def method1_itertools(iter1, iter2):
        for i, j in it.product(iter1, iter2):
            if i == 20 and j == 20:
                break

    def method2_except(iter1, iter2):
        class Found(Exception):
            pass
        try:
            for i in iter1:
                for j in iter2:
                    if i == 20 and j == 20:
                        raise Found
        except Found:
            pass

    class FoundPredef(Exception):
        pass

    def method2_5_except_predef(iter1, iter2):
        try:
            for i in iter1:
                for j in iter2:
                    if i == 20 and j == 20:
                        raise FoundPredef
        except FoundPredef:
            pass

    def method3_gendef(iter1, iter2):
        def genfunc():
            for i in iter1:
                for j in iter2:
                    yield i, j

        for i, j in genfunc():
            if i == 20 and j == 20:
                break

    def method4_genexp(iter1, iter2):
        genexpr = ((i, j) for i in iter1 for j in iter2)
        for i, j in genexpr:
            if i == 20 and j == 20:
                break

    method_lut = locals()  # can populate this some other way

    # Change params here to modify number of trials
    ti = timerit.Timerit(1000, bestof=10, verbose=1)

    # if True, record every trail run and show variance in seaborn
    # if False, use the standard timerit min/mean measures
    RECORD_ALL = True

    # These are the parameters that we benchmark over
    import numpy as np
    basis = {
        'method': ['method1_itertools', 'method2_except', 'method2_5_except_predef', 'method3_gendef', 'method4_genexp'],
        # 'n1': np.logspace(1, np.log2(100), 30, base=2).astype(int),
        # 'n2': np.logspace(1, np.log2(100), 30, base=2).astype(int),
        'size': np.logspace(1, np.log2(10000), 30, base=2).astype(int),
        'input_style': ['range', 'list', 'customized_iter'],
        # 'param_name': [param values],
    }
    xlabel = 'size'
    xinput_labels = ['n1', 'n2', 'size']

    # Set these to param labels that directly transfer to method kwargs
    kw_labels = []
    # Set these to empty lists if they are not used
    group_labels = {
        'style': ['input_style'],
        'size': [],
    }
    group_labels['hue'] = list(
        (ub.oset(basis) - {xlabel} - xinput_labels) - set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    def make_input(params):
        # Given the parameterization make the benchmark function input
        # n1 = params['n1']
        # n2 = params['n2']
        size = params['size']
        n1 = int(np.sqrt(size))
        n2 = int(np.sqrt(size))
        if params['input_style'] == 'list':
            iter1 = list(range(n1))
            # The second input follows n2 (this used n1, which only worked
            # because both are currently equal).
            iter2 = list(range(n2))
        elif params['input_style'] == 'range':
            iter1 = range(n1)
            iter2 = range(n2)
        elif params['input_style'] == 'customized_iter':
            import random

            def rando1():
                rng1 = random.Random(0)
                for _ in range(n1):
                    yield rng1.randint(0, n2)

            def rando2():
                rng2 = random.Random(1)
                for _ in range(n2):
                    yield rng2.randint(0, n2)

            # NOTE: these are one-shot generators. The nested-loop methods
            # re-iterate ``iter2`` for every item of ``iter1`` and find it
            # exhausted after the first pass, whereas ``it.product`` reads
            # each input once up front, so the methods do not do the same
            # amount of work for this input style.
            iter1 = rando1()
            iter2 = rando2()
        else:
            raise KeyError
        return {'iter1': iter1, 'iter2': iter2}

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        # size = params['n1'] * params['n2']
        # params['size'] = size
        group_keys = {}
        for gname, labels in group_labels.items():
            group_keys[gname + '_key'] = ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
        key = ub.repr2(params, compact=1, si=1)
        # Make any modifications you need to compute input kwargs for each
        # method here.
        kwargs = ub.dict_isect(params.copy(), kw_labels)

        method = method_lut[params['method']]
        # Timerit will run some user-specified number of loops.
        # and compute time stats with similar methodology to timeit
        for timer in ti.reset(key):
            # Put any setup logic you dont want to time here.
            # ...
            # Fresh inputs for every trial, built outside the timed region
            kwargs.update(make_input(params))
            with timer:
                # Put the logic you want to time here
                method(**kwargs)

        if RECORD_ALL:
            # Seaborn will show the variance if this is enabled, otherwise
            # use the robust timerit mean / min times
            # chunk_iter = ub.chunks(ti.times, ti.bestof)
            # times = list(map(min, chunk_iter))  # TODO: timerit method for this
            times = ti.robust_times()
            for time in times:
                row = {
                    # 'mean': ti.mean(),
                    'time': time,
                    'key': key,
                    **group_keys,
                    **params,
                }
                rows.append(row)
        else:
            row = {
                'mean': ti.mean(),
                'min': ti.min(),
                'key': key,
                **group_keys,
                **params,
            }
            rows.append(row)

    time_key = 'time' if RECORD_ALL else 'min'

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values(time_key)

    if RECORD_ALL:
        # Show the min / mean if we record all
        min_times = data.groupby('key').min().rename({'time': 'min'}, axis=1)
        mean_times = data.groupby('key')[['time']].mean().rename({'time': 'mean'}, axis=1)
        stats_data = pd.concat([min_times, mean_times], axis=1)
        stats_data = stats_data.sort_values('min')
    else:
        stats_data = data

    USE_OPENSKILL = 1
    if USE_OPENSKILL:
        # Lets try a real ranking method
        # https://github.com/OpenDebates/openskill.py
        import openskill
        method_ratings = {m: openskill.Rating() for m in basis['method']}

    other_keys = sorted(set(stats_data.columns) - {'key', 'method', 'min', 'mean', 'hue_key', 'size_key', 'style_key'})
    for params, variants in stats_data.groupby(other_keys):
        variants = variants.sort_values('mean')
        ranking = variants['method'].reset_index(drop=True)

        mean_speedup = variants['mean'].max() / variants['mean']
        stats_data.loc[mean_speedup.index, 'mean_speedup'] = mean_speedup
        min_speedup = variants['min'].max() / variants['min']
        stats_data.loc[min_speedup.index, 'min_speedup'] = min_speedup

        if USE_OPENSKILL:
            # The idea is that each setting of parameters is a game, and each
            # "method" is a player. We rank the players by which is fastest,
            # and update their ranking according to the Weng-Lin Bayes ranking
            # model. This does not take the fact that some "games" (i.e.
            # parameter settings) are more important than others, but it should
            # be fairly robust on average.
            old_ratings = [[r] for r in ub.take(method_ratings, ranking)]
            new_values = openskill.rate(old_ratings)  # Not inplace
            new_ratings = [openskill.Rating(*new[0]) for new in new_values]
            method_ratings.update(ub.dzip(ranking, new_ratings))

    print('Statistics:')
    print(stats_data)

    if USE_OPENSKILL:
        from openskill import predict_win
        win_prob = predict_win([[r] for r in method_ratings.values()])
        skill_agg = pd.Series(ub.dzip(method_ratings.keys(), win_prob)).sort_values(ascending=False)
        print('method_ratings = {}'.format(ub.repr2(method_ratings, nl=1)))
        print('Aggregated Rankings =\n{}'.format(skill_agg))

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()
        plt = kwplot.autoplt()

        plotkw = {}
        for gname, labels in group_labels.items():
            if labels:
                plotkw[gname] = gname + '_key'

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y=time_key, marker='o', ax=ax, **plotkw)
        ax.set_title(f'Benchmark Nested Breaks: #Trials {ti.num}, bestof {ti.bestof}')
        ax.set_xlabel(f'{xlabel}')
        ax.set_ylabel('Time')
        ax.set_xscale('log')
        ax.set_yscale('log')

        try:
            __IPYTHON__
        except NameError:
            plt.show()
def benchmark_reversed_range():
    """
    Benchmark two ways of building a reversed range as a list.

    Compares ``list(reversed(range(start, stop)))`` against
    ``list(range(stop - 1, start - 1, -1))`` for lengths ``2 ** 0`` through
    ``2 ** 13``. The timing table is printed and the minimum time of each
    configuration is drawn as a seaborn line plot via kwplot.

    Raises:
        AssertionError: if the registered methods do not all return the same
            list for ``x=10``.
    """
    import ubelt as ub
    import pandas as pd
    import timerit
    import itertools as it

    methods = []

    # These two generators are only used by the disabled ``custom_v1`` /
    # ``custom_v2`` candidates below.
    def custom_reversed_range_v1(start, stop):
        final = stop - 1
        for idx in range(stop - start):
            yield final - idx

    def custom_reversed_range_v2(start, stop):
        yield from it.islice(it.count(stop - 1, step=-1), stop - start)

    # ``methods.append`` is used as a decorator purely to register the
    # candidate; the local name is rebound to None, which is fine because the
    # candidates are only ever looked up through ``method_lut``.
    @methods.append
    def reversed_builtin(x):
        start = 10
        stop = x + start
        ret = list(reversed(range(start, stop)))
        return ret

    @methods.append
    def negative_range(x):
        start = 10
        stop = x + start
        ret = list(range(stop - 1, start - 1, -1))
        return ret

    # Disabled candidates (uncomment to include them in the benchmark):
    # @methods.append
    # def custom_v1(x):
    #     start = 10
    #     stop = x + start
    #     ret = list(custom_reversed_range_v1(start, stop))
    #     return ret

    # @methods.append
    # def custom_v2(x):
    #     start = 10
    #     stop = x + start
    #     ret = list(custom_reversed_range_v2(start, stop))
    #     return ret

    method_lut = {func.__name__: func for func in methods}

    # Sanity check: every candidate must produce the same output.
    results = {name: func(10) for name, func in method_lut.items()}
    print('results = {}'.format(ub.repr2(results, nl=1, align=':')))
    if not ub.allsame(results.values()):
        raise AssertionError('Failed consistency check')

    ti = timerit.Timerit(1000, bestof=10, verbose=2)

    basis = {
        'method': list(method_lut.keys()),
        'x': [2 ** i for i in range(14)],
    }

    # One row per (method, x) combination.
    rows = []
    for params in ub.named_product(basis):
        key = ub.repr2(params, compact=1, si=1)
        kwargs = params.copy()
        method = method_lut[kwargs.pop('method')]
        for timer in ti.reset(key):
            with timer:
                method(**kwargs)
        rows.append({
            'mean': ti.mean(),
            'min': ti.min(),
            'key': key,
            **params,
        })

    # Long-form table: one row per configuration, ready for seaborn.
    data = pd.DataFrame(rows)
    print(data)

    plot = True
    if plot:
        # kwplot autosns works well for IPython and script execution.
        import kwplot
        sns = kwplot.autosns()

        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x='x', y='min', hue='method', marker='o',
                     ax=ax)
        # ax.set_xscale('log')
        ax.set_title('Benchmark Reversal Methods')
        ax.set_xlabel('Length of the reversed range (x)')
        ax.set_ylabel('Min time per call (seconds)')
def main():
    """
    Plot recent temperature readings from the psensor log.

    Reads the log with ``read_psensor_log``, renames raw sensor names to short
    aliases (rows whose sensor has no alias are dropped), keeps the rows whose
    ``datetime`` falls within the last ``--hours`` hours (default 48), and
    draws one temperature line per device for every ``session_x`` group.
    A black vertical line is drawn at the last timestamp of every session
    except the one that ends last.

    Relies on ``ub``, ``pd``, ``datetime``, ``read_psensor_log`` and
    ``label_xaxis_dates`` being available at module level.
    """
    import kwplot
    plt = kwplot.autoplt()
    sns = kwplot.autosns()

    # NOTE(review): the short names look swapped relative to the sensor
    # strings (the '3090' key points at the 1080 Ti sensor and vice versa).
    # Left unchanged -- confirm against the actual hardware.
    alias = {
        '3090': 'nvctrl GeForce GTX 1080 Ti 1 temp',
        '1080ti': 'nvctrl GeForce RTX 3090 0 temp',
        # 'cpu': 'lmsensor coretemp-isa-0000 Package id 0',
    }

    all_df = read_psensor_log()

    # Register an alias for every CPU core-temp and GPU temp sensor in the log.
    cpu_prefix = 'lmsensor coretemp-isa'
    for rawdev in all_df.device.unique():
        if rawdev.startswith(cpu_prefix):
            suffix = rawdev[len(cpu_prefix):].split(' ', 1)[1].strip()
            alias['CPU ' + suffix] = rawdev
        if 'nvctrl' in rawdev and 'temp' in rawdev:
            # Drops the first 7 and the last 5 characters of the raw name.
            alias['GPU ' + rawdev[7:-5]] = rawdev

    # raw sensor name -> short name; sensors without an alias are removed.
    mapper = ub.invert_dict(alias)
    all_df['device'] = all_df['device'].apply(lambda x: mapper.get(x, None))
    all_df = all_df[all_df['device'].apply(lambda x: x is not None)]

    hours = int(ub.argval('--hours', default=48))
    min_time = datetime.datetime.now() - datetime.timedelta(hours=hours)
    recent_df = all_df[all_df.datetime > min_time]
    chosen = recent_df

    if 0:
        # Disabled experiment: rolling maximum over windows of 10 samples.
        pivtbl = recent_df.pivot(
            index='unix_timestamp', columns='device', values='temp')
        pivtbl = pivtbl.sort_index()
        smoothed_rows = []
        for window_idxs in ub.iter_window(list(range(len(pivtbl))), size=10):
            window = pivtbl.iloc[list(window_idxs)]
            max_val = window.max(axis=0, skipna=True)
            for k, v in max_val.to_dict().items():
                smoothed_rows.append({
                    'unix_timestamp': window.index[1],
                    'device': k,
                    'temp': v,
                })
        max_extra = pd.DataFrame(smoothed_rows)
        sns.lineplot(data=max_extra, x='unix_timestamp', y='temp',
                     hue='device')

    plt.gcf().clf()
    # Fetch the axes once, before the loop, so that ``ax`` is defined even
    # when the selection contains no sessions at all.
    ax = plt.gca()
    for xx, (_, group) in enumerate(chosen.groupby('session_x')):
        # Only the first session contributes a legend.
        sns.lineplot(data=group, x='unix_timestamp', y='temp', hue='device',
                     legend=xx == 0, ax=ax)

    label_xaxis_dates(ax)
    ax.figure.subplots_adjust(bottom=0.2)
    ax.set_ylim(0, 100)
    plt.locator_params(axis='y', nbins=10)

    # Draw shutdown times as black lines. The session that ends last is
    # skipped via the ``[:-1]`` slice.
    end_times = [
        group['unix_timestamp'].max()
        for _, group in chosen.groupby('session_x')
    ]
    for shutdown_time in sorted(end_times)[:-1]:
        ax.plot((shutdown_time, shutdown_time), [0, 100], color='k')

    plt.show()
def ford_circles():
    """
    Draw Ford Circles

    This is a Ford Circle diagram of the Rationals and Float32 numbers.
    Only 163 of the 32608 rationals I generated can be exactly represented by
    a float32.

    Every rational is drawn as a circle of radius ``1 / (2 * q ** 2)`` (``q``
    being its denominator) centered at ``(rational, radius)``. Circles are
    blue when the ``dtype`` conversion of the rational is exact and red
    otherwise; for inexact ones a gray ``x-x`` segment of half-width equal to
    the absolute conversion error is drawn at the circle's center height.

    Relies on a module-level ``Rational`` class.

    [MF 14] https://www.youtube.com/watch?v=83ZjYvkdzYI&list=PL5A714C94D40392AB&index=14
    [MF 95] https://www.youtube.com/watch?v=gATEJ3f3FBM&list=PL5A714C94D40392AB&index=95

    Examples:
        import kwplot
        kwplot.autompl()
    """
    import kwplot
    import ubelt as ub
    import matplotlib as mpl
    import numpy as np
    plt = kwplot.autoplt()
    sns = kwplot.autosns()  # NOQA

    limit = 256 * 256
    print('limit = {!r}'.format(limit))

    maxx = 1
    _iter = Rational.members(limit=limit)
    rats_to_plot = set(ub.ProgIter(_iter, total=limit, desc='gen rats'))
    # Reduce every rational modulo ``maxx`` and include ``maxx`` itself.
    rats_to_plot2 = sorted({Rational(r % maxx) for r in rats_to_plot} | {maxx})

    floats = sorted(
        ub.unique(map(float, rats_to_plot2),
                  key=lambda f: f.as_integer_ratio()))
    print(f'{len(rats_to_plot) = }')
    print(f'{len(rats_to_plot2) = }')
    print(f'{len(floats) = }')

    ax = kwplot.figure(fnum=1, doclf=True).gca()

    dtype = np.float32
    # Use the short type name in every label so the text always agrees with
    # the dtype that is actually tested.
    dtype_name = dtype.__name__

    circle_style = dict(edgecolor='none', linewidth=0, alpha=0.5)
    patches = ub.ddict(list)
    errors = []
    # ``rats_to_plot2`` is already sorted; no need to sort it again.
    for rat in ub.ProgIter(rats_to_plot2, verbose=1):
        denominator = rat.denominator
        radius = 1 / (2 * (denominator * denominator))
        point = (rat, radius)

        # Round-trip through the float dtype and measure the exact error.
        flt = dtype(rat)
        a, b = flt.as_integer_ratio()
        error = abs(rat - Rational(a, b))

        if error == 0:
            patches['good'].append(
                plt.Circle(point, radius, facecolor='dodgerblue',
                           **circle_style))
        else:
            errors.append(error)
            patches['bad'].append(
                plt.Circle(point, radius, facecolor='orangered',
                           **circle_style))
            # Plot a line for error
            ax.plot((rat - error, rat + error), (radius, radius), 'x-',
                    color='darkgray')

    print(ub.map_vals(len, patches))
    total = float(sum(errors))
    print('total = {!r}'.format(total))
    if errors:
        # max / min raise ValueError on an empty sequence.
        print(max(errors))
        print(min(errors))

    # Add each group as a single collection, copying the visual properties
    # from the group's first circle.
    for group in patches.values():
        first = ub.peek(group)
        prop = ub.dict_isect(
            first.properties(),
            ['facecolor', 'linewidth', 'alpha', 'edgecolor'])
        ax.add_collection(mpl.collections.PatchCollection(group, **prop))

    kwplot.phantom_legend({
        f'rationals without a {dtype_name}': 'orangered',
        f'rationals with a {dtype_name}': 'dodgerblue',
        f'x-x indicates {dtype_name} approximation error': 'darkgray',
    })

    ax.set_title(f'Holes in IEEE 754 {dtype_name}')
    ax.set_xlabel('A rational number')
    ax.set_ylabel('Circle radius: 1 / (2 * denominator ** 2)')

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 0.1)
def benchmark_unpack():
    """
    What is faster unpacking items with slice syntax or tuple-unpacking

    Slice unpacking seems to be a tad faster.

    Times ``*prefix, key = items`` against ``items[:-1], items[-1]`` for
    lists of 1 to 64 random strings or floats. Every raw timerit measurement
    becomes one row of a long-form table; a per-method summary is printed and
    the timings are drawn with seaborn via kwplot.
    """
    import ubelt as ub
    import random
    import pandas as pd
    import timerit
    import string

    def tuple_unpack(items):
        *prefix, key = items
        return prefix, key

    def slice_unpack(items):
        prefix, key = items[:-1], items[-1]
        return prefix, key

    method_lut = {
        'tuple_unpack': tuple_unpack,
        'slice_unpack': slice_unpack,
    }

    def make_items(item_type, size):
        """ Build a fresh list of ``size`` random items of the given kind """
        if item_type == 'string':
            return [
                ''.join(random.choices(string.printable, k=5))
                for _ in range(size)
            ]
        if item_type == 'float':
            return [random.random() for _ in range(size)]
        raise ValueError('unknown item type: {!r}'.format(item_type))

    ti = timerit.Timerit(5000, bestof=3, verbose=2)

    basis = {
        'method': ['tuple_unpack', 'slice_unpack'],
        'size': list(range(1, 64 + 1)),
        'type': ['string', 'float'],
    }
    xlabel = 'size'
    group_labels = {
        'style': ['type'],
        'size': [],
    }
    # Every basis key that is not the x-axis, style or size goes to the hue.
    group_labels['hue'] = list(
        (ub.oset(basis) - {xlabel}) -
        set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of the experiment, create rows.
    rows = []
    for params in grid_iter:
        group_keys = {
            gname + '_key': ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
            for gname, labels in group_labels.items()
        }
        key = ub.repr2(params, compact=1, si=1)
        size = params['size']
        # The item kind must be read from the parameters: comparing the
        # builtin ``type`` against a string is always False and would leave
        # ``items`` undefined.
        item_type = params['type']
        method = method_lut[params['method']]
        for timer in ti.reset(key):
            # Setup (not timed): fresh random input for every loop.
            items = make_items(item_type, size)
            with timer:
                method(items)
        for measured in ti.times:
            rows.append({
                'time': measured,
                'key': key,
                **group_keys,
                **params,
            })

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values('time')

    summary_rows = [
        {
            'method': method_name,
            'mean': group['time'].mean(),
            'std': group['time'].std(),
            'min': group['time'].min(),
            'max': group['time'].max(),
        }
        for method_name, group in data.groupby('method')
    ]
    print(pd.DataFrame(summary_rows).sort_values('mean'))

    plot = True
    if plot:
        # kwplot autosns works well for IPython and script execution.
        import kwplot
        sns = kwplot.autosns()

        plotkw = {
            gname: gname + '_key'
            for gname, labels in group_labels.items() if labels
        }

        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y='time', marker='o', ax=ax,
                     **plotkw)
        ax.set_title('Benchmark')
        # x is the list size and y is the measured time.
        ax.set_xlabel('Size of slices')
        ax.set_ylabel('Execution time')
def benchmark_pathlib_vs_fspath():
    """
    Benchmark joining path parts with ``pathlib.Path`` vs ``os.path.join``.

    For 2, 4, 8, 12 and 16 single-character parts, times constructing
    ``pathlib.Path(*parts)`` against ``os.path.join(*parts)``. The timing
    table (sorted by minimum time) is printed and the minimum times are
    drawn with seaborn via kwplot.
    """
    import ubelt as ub
    import pathlib
    import pandas as pd
    import random
    import timerit
    import os

    # The results are deliberately discarded; the candidate bodies are kept
    # symmetric so both pay the same assignment cost.
    def method_pathlib(inputs):
        p = pathlib.Path(*inputs)  # NOQA

    def method_ospath(inputs):
        p = os.path.join(*inputs)  # NOQA

    method_lut = {
        'method_pathlib': method_pathlib,
        'method_ospath': method_ospath,
    }

    ti = timerit.Timerit(10000, bestof=10, verbose=2)

    basis = {
        'method': ['method_pathlib', 'method_ospath'],
        'num_parts': [2, 4, 8, 12, 16],
    }
    xlabel = 'num_parts'
    kw_labels = []
    group_labels = {
        'style': [],
        'size': [],
    }
    # Every basis key that is not the x-axis, style or size goes to the hue.
    group_labels['hue'] = list(
        (ub.oset(basis) - {xlabel}) -
        set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of the experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {
            gname + '_key': ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
            for gname, labels in group_labels.items()
        }
        key = ub.repr2(params, compact=1, si=1)
        kwargs = ub.dict_isect(params.copy(), kw_labels)
        # Random lowercase letters drawn from code points 97..120.
        num_parts = params['num_parts']
        kwargs['inputs'] = [
            chr(random.randint(97, 120)) for _ in range(num_parts)
        ]
        method = method_lut[params['method']]
        for timer in ti.reset(key):
            with timer:
                method(**kwargs)
        rows.append({
            'mean': ti.mean(),
            'min': ti.min(),
            'key': key,
            **group_keys,
            **params,
        })

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values('min')
    print(data)

    plot = True
    if plot:
        # kwplot autosns works well for IPython and script execution.
        import kwplot
        sns = kwplot.autosns()

        plotkw = {
            gname: gname + '_key'
            for gname, labels in group_labels.items() if labels
        }

        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y='min', marker='o', ax=ax,
                     **plotkw)
        ax.set_title('Benchmark')
        # x is the number of parts and y is the measured time.
        ax.set_xlabel('Number of parts')
        ax.set_ylabel('Time')
def benchmark_dict_diff_impl():
    """
    Benchmark several implementations of a dictionary key difference.

    Every candidate takes ``(first_dict, *others)`` and returns a new dict
    holding the items of ``first_dict`` whose keys appear in none of the
    ``others``. The candidates are timed over a grid of dictionary sizes,
    number of other dicts, fraction of removed keys and key type.

    The timing table is printed and the candidates are then ranked twice:
    with a weighted rank table (weight = fastest mean time of the parameter
    setting) and with openskill ratings where every parameter setting is a
    "game". Finally the minimum times are drawn with seaborn via kwplot.

    Requires the third-party ``openskill`` package in addition to ubelt,
    timerit, pandas and kwplot.
    """
    import ubelt as ub
    import pandas as pd
    import timerit
    import random

    # NOTE: the candidate bodies are the subject of the benchmark; keep them
    # exactly as written.
    def method_diffkeys(*args):
        first_dict = args[0]
        keys = set(first_dict)
        keys.difference_update(*map(set, args[1:]))
        new0 = dict((k, first_dict[k]) for k in keys)
        return new0

    def method_diffkeys_list(*args):
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        keep_keys = [k for k in first_dict.keys() if k not in remove_keys]
        new = dict((k, first_dict[k]) for k in keep_keys)
        return new

    def method_diffkeys_oset(*args):
        first_dict = args[0]
        keys = ub.oset(first_dict)
        keys.difference_update(*map(set, args[1:]))
        new0 = dict((k, first_dict[k]) for k in keys)
        return new0

    def method_ifkeys_setcomp(*args):
        first_dict = args[0]
        remove_keys = {k for ks in args[1:] for k in ks}
        new1 = dict((k, v) for k, v in first_dict.items()
                    if k not in remove_keys)
        return new1

    def method_ifkeys_setunion(*args):
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        new2 = dict((k, v) for k, v in first_dict.items()
                    if k not in remove_keys)
        return new2

    def method_ifkeys_getitem(*args):
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        new3 = dict((k, first_dict[k]) for k in first_dict.keys()
                    if k not in remove_keys)
        return new3

    def method_ifkeys_dictcomp(*args):
        # Cannot use until 3.6 is dropped (it is faster)
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        new4 = {k: v for k, v in first_dict.items() if k not in remove_keys}
        return new4

    def method_ifkeys_dictcomp_getitem(*args):
        # Cannot use until 3.6 is dropped (it is faster)
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        new4 = {k: first_dict[k] for k in first_dict.keys()
                if k not in remove_keys}
        return new4

    method_lut = {
        func.__name__: func for func in [
            method_diffkeys,
            method_diffkeys_list,
            method_diffkeys_oset,
            method_ifkeys_setcomp,
            method_ifkeys_setunion,
            method_ifkeys_getitem,
            method_ifkeys_dictcomp,
            method_ifkeys_dictcomp_getitem,
        ]
    }

    def make_data(num_items, num_other, remove_fraction, keytype):
        """
        Build ``[first_dict, other_1, ..., other_k]`` for one benchmark run.

        Keys are random integers in ``[0, 1000]`` converted with ``keytype``
        (``'str'`` / ``'int'`` or any callable). Each "other" dict mixes
        ``remove_fraction * num_items`` keys sampled from the first dict with
        ``num_items`` additional random keys.
        """
        keytype = {'str': str, 'int': int}.get(keytype, keytype)
        first_keys = [random.randint(0, 1000) for _ in range(num_items)]
        k = int(remove_fraction * len(first_keys))
        remove_sets = [
            list(ub.unique(
                random.choices(first_keys, k=k) +
                [random.randint(0, 1000) for _ in range(num_items)]))
            for _ in range(num_other)
        ]
        first_dict = {keytype(k): k for k in first_keys}
        args = [first_dict] + [
            {keytype(k): k for k in ks} for ks in remove_sets]
        return args

    ti = timerit.Timerit(200, bestof=1, verbose=2)

    basis = {
        'method': [
            # Cant use because unordered
            # 'method_diffkeys',

            # Cant use because python 3.6
            'method_ifkeys_dictcomp',
            'method_ifkeys_dictcomp_getitem',

            'method_ifkeys_setunion',
            'method_ifkeys_getitem',
            'method_diffkeys_list',

            # Probably not good
            # 'method_ifkeys_setcomp',
            # 'method_diffkeys_oset',
        ],
        'num_items': [10, 100, 1000],
        'num_other': [1, 3, 5],
        # 'num_other': [1],
        'remove_fraction': [0, 0.2, 0.5, 0.7, 1.0],
        # 'remove_fraction': [0.2, 0.8],
        'keytype': ['str', 'int'],
        # 'keytype': ['str'],
        # 'param_name': [param values],
    }
    xlabel = 'num_items'
    kw_labels = ['num_items', 'num_other', 'remove_fraction', 'keytype']
    group_labels = {
        'style': ['num_other', 'keytype'],
        'size': ['remove_fraction'],
    }
    # Every basis key that is not the x-axis, style or size goes to the hue.
    group_labels['hue'] = list(
        (ub.oset(basis) - {xlabel}) -
        set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of the experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {
            gname + '_key': ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
            for gname, labels in group_labels.items()
        }
        key = ub.repr2(params, compact=1, si=1)
        kwargs = ub.dict_isect(params.copy(), kw_labels)
        args = make_data(**kwargs)
        method = method_lut[params['method']]
        for timer in ti.reset(key):
            with timer:
                method(*args)
        rows.append({
            'mean': ti.mean(),
            'min': ti.min(),
            'key': key,
            **group_keys,
            **params,
        })

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values('min')
    print(data)

    # for each parameter setting, group all methods with that used those exact
    # comparable params. Then rank how good each method did. That will be a
    # preference profile. We will give that preference profile a weight (e.g.
    # based on the fastest method in the bunch) and then aggregate them with
    # some voting method.

    USE_OPENSKILL = 1

    if USE_OPENSKILL:
        # Lets try a real ranking method
        # https://github.com/OpenDebates/openskill.py
        import openskill
        method_ratings = {m: openskill.Rating() for m in basis['method']}

    weighted_rankings = ub.ddict(lambda: ub.ddict(float))
    comparable_keys = ['num_other', 'keytype', 'remove_fraction', 'num_items']
    for _, variants in data.groupby(comparable_keys):
        variants = variants.sort_values('mean')
        ranking = variants['method'].reset_index(drop=True)

        if USE_OPENSKILL:
            # The idea is that each setting of parameters is a game, and each
            # "method" is a player. We rank the players by which is fastest,
            # and update their ranking according to the Weng-Lin Bayes ranking
            # model. This does not take the fact that some "games" (i.e.
            # parameter settings) are more important than others, but it
            # should be fairly robust on average.
            old_ratings = [[r] for r in ub.take(method_ratings, ranking)]
            new_values = openskill.rate(old_ratings)  # Not inplace
            new_ratings = [openskill.Rating(*new[0]) for new in new_values]
            method_ratings.update(ub.dzip(ranking, new_ratings))

        # Choose a ranking weight scheme
        weight = variants['mean'].min()
        # weight = 1
        for rank, method_name in enumerate(ranking):
            weighted_rankings[method_name][rank] += weight
            weighted_rankings[method_name]['total'] += weight

    # Probably a more robust voting method to do this
    weight_rank_rows = []
    for method_name, ranks in weighted_rankings.items():
        weights = ub.dict_diff(ranks, ['total'])
        p_rank = ub.map_vals(lambda w: w / ranks['total'], weights)
        for rank, w in p_rank.items():
            weight_rank_rows.append(
                {'rank': rank, 'weight': w, 'name': method_name})
    weight_rank_df = pd.DataFrame(weight_rank_rows)
    # Keyword arguments: positional arguments to ``pivot`` are rejected by
    # pandas >= 2.0. The list-valued ``values`` keeps the columns a
    # two-level index, which the aggregation below relies on.
    piv = weight_rank_df.pivot(
        index=['name'], columns=['rank'], values=['weight'])
    print(piv)

    if USE_OPENSKILL:
        from openskill import predict_win
        win_prob = predict_win([[r] for r in method_ratings.values()])
        skill_agg = pd.Series(
            ub.dzip(method_ratings.keys(), win_prob)
        ).sort_values(ascending=False)
        print('skill_agg =\n{}'.format(skill_agg))

    # Weight-averaged rank per method (second column level holds the ranks).
    aggregated = (
        piv * piv.columns.levels[1].values).sum(axis=1).sort_values()
    print('weight aggregated =\n{}'.format(aggregated))

    plot = True
    if plot:
        # kwplot autosns works well for IPython and script execution.
        import kwplot
        sns = kwplot.autosns()

        plotkw = {
            gname: gname + '_key'
            for gname, labels in group_labels.items() if labels
        }

        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y='min', marker='o', ax=ax,
                     **plotkw)
        ax.set_title('Benchmark')
        ax.set_xlabel('Number of items in the first dict')
        ax.set_ylabel('Min time (seconds)')
def benchmark_mul_vs_pow():
    """
    Benchmark repeated multiplication against the ``**`` operator.

    For exponents ``n`` in 1..19 and random int / float bases, times a
    generated ``lambda v: v * v * ... * v`` against ``v ** n``. One row is
    recorded for every best-of chunk of raw timings; the table is printed
    and drawn on a log-scaled time axis with seaborn via kwplot.
    """
    import ubelt as ub
    import pandas as pd
    import timerit
    import random
    from functools import reduce
    import operator as op
    import itertools as it

    def method_pow_via_mul_raw(n):
        """ Construct a function that does multiplication of a value n times """
        # ``n`` only ever comes from the fixed ``basis`` below, so the
        # evaluated source is fully under our control.
        return eval('lambda v: ' + ' * '.join(['v'] * n))

    def method_pow_via_mul_for(v, n):
        ret = v
        for _ in range(1, n):
            ret = ret * v
        return ret

    def method_pow_via_mul_reduce(v, n):
        """ Alternative way to multiply a value n times """
        return reduce(op.mul, it.repeat(v, n))

    def method_pow_via_pow(v, n):
        return v ** n

    method_lut = {
        func.__name__: func for func in [
            method_pow_via_mul_raw,
            method_pow_via_mul_for,
            method_pow_via_mul_reduce,
            method_pow_via_pow,
        ]
    }

    def draw_value(spec):
        """ Sample a base for the given spec; other specs pass through """
        if spec == 'random':
            if random.random() > 0.5:
                return random.randint(1, 31000)
            return random.random()
        if spec == 'random-int':
            return random.randint(1, 31000)
        if spec == 'random-float':
            return random.random()
        return spec

    ti = timerit.Timerit(500000, bestof=1000, verbose=2)

    basis = {
        'method': ['method_pow_via_mul_raw', 'method_pow_via_pow'],
        'n': list(range(1, 20)),
        'v': ['random-int', 'random-float'],
        # 'param_name': [param values],
    }
    xlabel = 'n'
    kw_labels = ['v', 'n']
    group_labels = {
        'style': ['v'],
        'size': [],
    }
    # Every basis key that is not the x-axis, style or size goes to the hue.
    group_labels['hue'] = list(
        (ub.oset(basis) - {xlabel}) -
        set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of the experiment, create rows.
    rows = []
    for params in grid_iter:
        group_keys = {
            gname + '_key': ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
            for gname, labels in group_labels.items()
        }
        key = ub.repr2(params, compact=1, si=1)
        kwargs = ub.dict_isect(params.copy(), kw_labels)
        method = method_lut[params['method']]
        if params['method'] == 'method_pow_via_mul_raw':
            # The raw variant is a factory: build the multiplication lambda
            # for this exponent once, outside of the timed region.
            method = method(kwargs.pop('n'))

        value_spec = params['v']
        for timer in ti.reset(key):
            # Setup (not timed): sample from the original spec on every
            # loop. Testing ``kwargs['v']`` instead would only match on the
            # first loop, because it is overwritten with the sampled number.
            kwargs['v'] = draw_value(value_spec)
            with timer:
                method(**kwargs)

        # Keep the best time of every ``bestof`` sized chunk.
        for best_time in map(min, ub.chunks(ti.times, ti.bestof)):
            rows.append({
                'time': best_time,
                'key': key,
                **group_keys,
                **params,
            })

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    print(data)

    plot = True
    if plot:
        # kwplot autosns works well for IPython and script execution.
        import kwplot
        sns = kwplot.autosns()
        plt = kwplot.autoplt()

        plotkw = {
            gname: gname + '_key'
            for gname, labels in group_labels.items() if labels
        }

        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y='time', marker='o', ax=ax,
                     **plotkw)
        ax.set_title('Benchmark')
        ax.set_xlabel('N')
        ax.set_ylabel('Time')
        ax.set_yscale('log')
        plt.show()