Example #1
def _devcheck_inv_stability():
    import numpy as np

    rows = []

    for trial_idx in range(30):
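        # Each trial: start from a random matrix and its inverse, then
        # repeatedly re-invert the pair and measure drift from the originals.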
        orig_12 = np.random.rand(3, 3).astype(np.float32)
        orig_21 = np.linalg.inv(orig_12)

        mat_12 = orig_12.copy()
        mat_21 = orig_21.copy()

        for idx in range(100):
            _mat_12 = np.linalg.inv(mat_21)
            _mat_21 = np.linalg.inv(mat_12)

            mat_12 = _mat_12
            mat_21 = _mat_21

            err_12 = np.abs(mat_12 - orig_12).sum()
            err_21 = np.abs(mat_21 - orig_21).sum()
            rows.append({'idx': idx, 'error': err_12, 'label': 'err_12'})
            rows.append({'idx': idx, 'error': err_21, 'label': 'err_21'})

    import kwplot
    import pandas as pd
    sns = kwplot.autosns()

    data = pd.DataFrame(rows)

    sns.lineplot(data=data, x='idx', y='error', hue='label')
Example #2
def main():
    import kwplot
    import numpy as np
    sns = kwplot.autosns()  # NOQA
    plt = kwplot.autoplt()  # NOQA

    if 1:
        array = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        values = [-1, 0, 1, 4, 6, 6.1, 6.5, 7, 10.3, 10.5, 10.7, 15, 16]

    if 0:
        array = np.linspace(0, 30)
        values = np.linspace(0, 30)

    if 0:
        xscale = 20
        num = 20
        array = np.array(sorted(np.random.rand(num) * xscale)).round()
        values = np.hstack([
            np.unique(np.random.choice(array, 3)),
            np.random.rand(num // 2) * xscale
        ])

    fig = kwplot.figure(fnum=1, doclf=1, pnum=(2, 1, 1))
    ax = fig.gca()
    plot_searchsorted_visualization(array, values, side='left', ax=ax)
    ax.set_title('association = searchsorted(array, values, side=left)')

    fig = kwplot.figure(fnum=1, doclf=0, pnum=(2, 1, 2))
    ax = fig.gca()
    plot_searchsorted_visualization(array, values, side='right', ax=ax)
    ax.set_title('association = searchsorted(array, values, side=right)')

    import ubelt as ub
    fig.suptitle(
        ub.codeblock('''
        Notice:
        side=left and side=right have the same result except when the value is
        already in the array.
        '''))
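
A quick text-only check of the property the figure highlights (an illustrative addition using plain numpy, independent of the plotting helpers above):

import numpy as np
arr = np.array([0, 1, 2, 3])
# The sides only differ when the value is already present in the array.
print(np.searchsorted(arr, 2, side='left'))    # 2: insert before the existing 2
print(np.searchsorted(arr, 2, side='right'))   # 3: insert after the existing 2
print(np.searchsorted(arr, 2.5, side='left'))  # 3: both sides agree
print(np.searchsorted(arr, 2.5, side='right')) # 3: both sides agree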
Example #3
def run_benchmark_renormalization():
    """
    See if we can renormalize probabilities after update with a faster method
    that maintains memory a bit better

    Example:
        >>> import sys, ubelt
        >>> sys.path.append(ubelt.expandpath('~/misc/tests/python'))
        >>> from bench_renormalization import *  # NOQA
        >>> run_benchmark_renormalization()
    """
    import ubelt as ub
    import xdev
    import pathlib
    import timerit

    fpath = pathlib.Path('~/misc/tests/python/renormalize_cython.pyx').expanduser()
    renormalize_cython = xdev.import_module_from_pyx(fpath, annotate=True,
                                                     verbose=3, recompile=True)

    xdev.profile_now(renormalize_demo_v1)(1000, 100)
    xdev.profile_now(renormalize_demo_v2)(1000, 100)
    xdev.profile_now(renormalize_demo_v3)(1000, 100)
    xdev.profile_now(renormalize_demo_v4)(1000, 100)

    func_list = [
        # renormalize_demo_v1,
        renormalize_demo_v2,
        # renormalize_demo_v3,
        # renormalize_demo_v4,
        renormalize_cython.renormalize_demo_cython_v1,
        renormalize_cython.renormalize_demo_cython_v2,
        renormalize_cython.renormalize_demo_cython_v3,
    ]
    methods = {f.__name__: f for f in func_list}
    for key, method in methods.items():
        with timerit.Timer(label=key, verbose=0) as t:
            method(1000, 100)
        print(f'{key:<30} {t.toc():0.6f}')

    arg_basis = {
        'T': [10, 20,  30,  50],
        'D': [10, 50, 100, 300],
    }
    args_grid = []
    for argkw in list(ub.named_product(arg_basis)):
        if argkw['T'] <= argkw['D']:
            argkw['size'] = argkw['T'] * argkw['D']
            args_grid.append(argkw)

    ti = timerit.Timerit(100, bestof=10, verbose=2)

    measures = []

    for method_name, method in methods.items():
        for argkw in args_grid:
            row = ub.dict_union({'method': method_name}, argkw)
            key = ub.repr2(row, compact=1)
            argkey = ub.repr2(argkw, compact=1)

            kwargs = ub.dict_subset(argkw, ['T', 'D'])
            for timer in ti.reset('time'):
                with timer:
                    method(**kwargs)

            row['mean'] = ti.mean()
            row['min'] = ti.min()
            row['key'] = key
            row['argkey'] = argkey
            measures.append(row)

    import pandas as pd
    df = pd.DataFrame(measures)
    import kwplot
    sns = kwplot.autosns()

    kwplot.figure(fnum=1, pnum=(1, 2, 1), docla=True)
    sns.lineplot(data=df, x='D', y='min', hue='method', style='method')
    kwplot.figure(fnum=1, pnum=(1, 2, 2), docla=True)
    sns.lineplot(data=df, x='T', y='min', hue='method', style='method')

    p = df.pivot(index=['method'], columns=['argkey'], values=['mean'])
    print(p.mean(axis=1).sort_values())
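
The renormalize_demo_* variants are defined in the surrounding test module and are not reproduced here; a minimal sketch of the kind of workload being timed (the (T, D) interpretation is an assumption based on the argument grid above):

import numpy as np

def renormalize_demo_sketch(T, D):
    # Repeatedly apply a multiplicative update to a D-dimensional
    # probability vector, renormalizing after each of T updates.
    probs = np.full(D, 1.0 / D)
    for _ in range(T):
        probs = probs * np.random.rand(D)  # simulated update
        probs = probs / probs.sum()        # renormalize to sum to 1
    return probs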
Example #4
def main():
    """
    Run password security analysis

    Example:
        >>> import sys, ubelt
        >>> sys.path.append(ubelt.expandpath('~/misc/notes'))
        >>> from password_model import *  # NOQA
        >>> main()
    """
    import itertools as it
    import ubelt as ub
    from fractions import Fraction
    from functools import partial
    import pandas as pd
    # Build our adversary and our strategies
    devices, scales = build_threat_models()

    password_schemes = build_password_strategy()

    # Other estimates or assumptions
    estimates = {
        # estimated cost of using a kilowatt for an hour
        # http://www.wrecc.com/what-uses-watts-in-your-home/
        # https://www.coinwarz.com/mining/ethereum/calculator
        'dollars_per_kwh': 0.10,
    }

    rows = []
    for device, scheme, scale in it.product(devices, password_schemes, scales):
        for benchmark in device['benchmarks']:

            states = Fraction(scheme['states'])
            num_devices = Fraction(scale['num_devices'])
            dollars_per_kwh = Fraction(estimates['dollars_per_kwh'])

            hashmode_attempts_per_second = benchmark['attempts_per_second']
            attempts_per_second = num_devices * Fraction(
                int(hashmode_attempts_per_second))
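            # Worst case, an exhaustive attack tries every state once, so the
            # crack time is the size of the state space over the guess rate.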

            seconds = states / Fraction(attempts_per_second)

            hours = seconds / Fraction(3600)
            device_kilowatts = Fraction(device['watts']) / Fraction(1000)
            device_dollars_per_hour = device_kilowatts * dollars_per_kwh
            dollars_per_device = device_dollars_per_hour * hours
            dollars = dollars_per_device * num_devices

            total_kilowatts = device_kilowatts * num_devices * hours

            row = {
                'scheme': scheme['name'],
                'entropy': scheme['entropy'],
                'hashmode': benchmark['hashmode'],
                'hashmode_attempts_per_second':
                int(hashmode_attempts_per_second),
                'device': device['name'],
                'scale': scale['name'],
                'num_devices': scale['num_devices'],
                'seconds': seconds,
                'dollars': dollars,
                'kilowatts': total_kilowatts,
                'hours': hours,
                'dollars_per_kwh': estimates['dollars_per_kwh'],
            }
            rows.append(row)

    df = pd.DataFrame(rows)
    df = df.sort_values('entropy')

    chosen_device = 'RTX_3090'
    df = df[df['device'] == chosen_device]
    df['time'] = df['seconds'].apply(humanize_seconds)
    df['cost'] = df['dollars'].apply(partial(humanize_dollars, colored=1))
    df['entropy'] = df['entropy'].round(2)
    df['num_devices'] = df['num_devices'].apply(int)

    hashmodes = sorted([d['hashmode'] for d in devices[0]['benchmarks']])

    # https://github.com/pandas-dev/pandas/issues/18066
    monkeypatch_pandas_colored_stdout()

    # Output our assumptions
    print('\n---')
    print('Assumptions:')
    device_info = ub.group_items(devices,
                                 lambda x: x['name'])[chosen_device][0]
    print('estimates = {!r}'.format(estimates))
    print('device_info = {}'.format(ub.repr2(device_info, nl=2)))

    # For each hashmode, print the scheme-vs-num_devices-vs-time matrix
    hashmode_to_pivots = {}
    for hashmode in hashmodes:
        subdf = df
        subdf = subdf[subdf['hashmode'] == hashmode]
        subdf = subdf.sort_values(['entropy', 'num_devices'])
        piv = subdf.pivot(index=['entropy', 'cost', 'scheme'],
                          columns=['num_devices', 'scale'], values='time')
        # piv.style.applymap(color_cases)
        hashmode_to_pivots[hashmode] = piv

    for hashmode in hashmodes:
        print('\n---')
        print('hashmode = {!r}'.format(hashmode))
        piv = hashmode_to_pivots[hashmode]
        print(piv)

    # Print the scheme-vs-hashmode-vs-cost matrix
    print('\n---')
    print('Cost Matrix:')
    subdf = df[df['scale'] == df['scale'].iloc[0]]
    piv = subdf.pivot(index=['entropy', 'scheme'],
                      columns=['hashmode_attempts_per_second', 'hashmode'],
                      values='cost')
    piv = piv.sort_index(axis=1, ascending=False)
    piv.columns = piv.columns.droplevel(0)
    print(piv)

    # Make the visualizations
    if ub.argflag('--show'):
        import kwplot
        from matplotlib.colors import LogNorm
        import matplotlib as mpl
        plt = kwplot.autoplt()
        sns = kwplot.autosns()

        use_latex = ub.argflag('--latex')
        if use_latex:
            mpl.rcParams['text.usetex'] = True

        def time_labelize(x):
            text = humanize_seconds(x, colored=False, named=True, precision=2)
            parts = text.split(' ')
            if use_latex:
                text = r'{\huge ' + parts[0] + '}' + '\n' + ' '.join(parts[1:])
            else:
                text = parts[0] + '\n' + ' '.join(parts[1:])
            return text

        def dollar_labelize(dollars):
            cost = humanize_dollars(dollars, named=(dollars > 1))
            if use_latex:
                cost = cost.replace('$', r'\$')
            return cost

        hashmode_to_notes = {}
        for dev in devices[0]['benchmarks']:
            hashmode_to_notes[dev['hashmode']] = dev['notes']

        if 1:
            # Independent of the adversary scale, we can plot cost versus
            # scheme and hashmode.
            subdf = df[df['scale'] == df['scale'].iloc[0]]
            piv = subdf.pivot(index=['entropy', 'scheme'],
                              columns=['hashmode_attempts_per_second', 'hashmode'],
                              values='dollars')
            piv = piv.sort_index(axis=1, ascending=False)

            # https://stackoverflow.com/questions/64234474/cust-y-lbls-seaborn
            ax: mpl.axes.Axes = plt.subplots(figsize=(15, 10))[1]

            annot = piv.applymap(dollar_labelize)
            piv = piv.applymap(float)

            sns.heatmap(piv,
                        annot=annot,
                        ax=ax,
                        fmt='s',
                        norm=LogNorm(vmin=1, vmax=100_000_000_000_000_000),
                        annot_kws={'size': 16},
                        cmap='cividis',
                        cbar_kws={
                            'label': 'dollars',
                            'pad': 0.001
                        })

            # Find colorbar
            for subax in ax.figure.axes:
                if subax.get_label() == '<colorbar>':
                    subax.set_ylabel('dollars', labelpad=0)
                    break

            new_ytick_labels = []
            for ent, scheme in piv.index.to_list():
                if use_latex:
                    scheme = r'{\LARGE ' + scheme + '}'
                _ = '{scheme}\nEntropy={ent}bits'.format(scheme=scheme,
                                                         ent=ent)
                new_ytick_labels.append(_)

            new_xtick_labels = []
            for _, hashmode in piv.columns.to_list():
                notes = ''
                if hashmode in hashmode_to_notes:
                    notes = '\n(' + hashmode_to_notes[hashmode] + ')'
                new_xtick_labels.append(hashmode + notes)

            ax.set_xticklabels(new_xtick_labels, rotation=0)
            ax.set_yticklabels(new_ytick_labels, rotation=0)

            ax.set_ylabel('Password Scheme, Entropy', labelpad=24)
            ax.set_xlabel('Hashmode', labelpad=16)

            if use_latex:
                title = '{{\\Huge Password Cost Security}}'
                ax.set_title(title)
            else:
                ax.set_title('Password Cost Security')

            ax.figure.subplots_adjust(bottom=0.1,
                                      left=0.20,
                                      right=1.0,
                                      top=0.90,
                                      wspace=0.001)

            if ub.argflag('--save'):
                fname = 'passwd_cost_security.png'
                ax.figure.savefig(fname)

        if 1:
            # For each hashmode plot (scheme versus adversary scale)
            for hashmode in ub.ProgIter(hashmodes, desc='plotting'):
                subdf = df
                subdf = subdf[subdf['hashmode'] == hashmode]
                subdf = subdf.sort_values(['entropy', 'num_devices'])

                piv = subdf.pivot(index=['entropy', 'dollars', 'scheme'],
                                  columns=['num_devices', 'scale'],
                                  values='seconds')
                piv = piv.applymap(float)

                # https://stackoverflow.com/questions/64234474/cust-y-lbls-seaborn
                ax: mpl.axes.Axes = plt.subplots(figsize=(15, 10))[1]

                annot = piv.applymap(time_labelize)
                sns.heatmap(piv,
                            annot=annot,
                            ax=ax,
                            fmt='s',
                            norm=LogNorm(vmin=1, vmax=8640000000),
                            annot_kws={'size': 10},
                            cbar_kws={
                                'label': 'seconds',
                                'pad': 0.001
                            })

                # Find colorbar
                for subax in ax.figure.axes:
                    if subax.get_label() == '<colorbar>':
                        subax.set_ylabel('seconds', labelpad=0)
                        break

                new_ytick_labels = []
                for ent, dollars, scheme in piv.index.to_list():
                    cost = dollar_labelize(dollars)
                    if use_latex:
                        scheme = r'{\LARGE ' + scheme + '}'
                    _ = '{scheme}\nEntropy={ent}bits\nCost={cost}'.format(
                        scheme=scheme, cost=cost, ent=ent)
                    new_ytick_labels.append(_)

                new_xtick_labels = []
                for n, name in piv.columns.to_list():
                    if use_latex:
                        name = r'{\LARGE ' + name + '}'
                    _ = name + '\n' + named_large_number(n,
                                                         precision=0) + ' GPUs'
                    new_xtick_labels.append(_)

                ax.set_xticklabels(new_xtick_labels, rotation=0)
                # ax.set_yticklabels(new_ytick_labels, horizontalalignment='left', pad=30)
                ax.set_yticklabels(new_ytick_labels)

                ax.set_ylabel('Password Scheme, Entropy, and Cost to Crack',
                              labelpad=24)
                ax.set_xlabel('Adversary Resources', labelpad=16)

                notes = ''
                if hashmode in hashmode_to_notes:
                    notes = ' (' + hashmode_to_notes[hashmode] + ')'

                if use_latex:
                    title = '{{\\Huge Password Time Security}}\nhashmode={}{}'.format(
                        hashmode, notes)
                    ax.set_title(title)
                else:
                    ax.set_title(
                        'Password Time Security\n(hashmode={}{})'.format(
                            hashmode, notes))

                ax.figure.subplots_adjust(bottom=0.1,
                                          left=0.20,
                                          right=1.0,
                                          top=0.90,
                                          wspace=0.001)

                if ub.argflag('--save'):
                    fname = 'passwd_robustness_{}.png'.format(hashmode)
                    ax.figure.savefig(fname)
        plt.show()
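
As a concrete check of the time arithmetic above (illustrative round numbers, not the script's benchmark data):

from fractions import Fraction
states = Fraction(2) ** 40                # a 40-bit password space
attempts_per_second = Fraction(10 ** 10)  # 10 GH/s aggregate guess rate
seconds = states / attempts_per_second
print(float(seconds))                     # ~110 seconds in the worst case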
Example #5
def benchmark_template():
    import ubelt as ub
    import pandas as pd
    import timerit

    def method1(x, y, z):
        ret = []
        for i in range((x + y) * z):
            ret.append(i)
        return ret

    def method2(x, y, z):
        ret = [i for i in range((x + y) * z)]
        return ret

    method_lut = locals()  # can populate this some other way

    # Change params here to modify number of trials
    ti = timerit.Timerit(100, bestof=10, verbose=1)

    # if True, record every trial run and show variance in seaborn
    # if False, use the standard timerit min/mean measures
    RECORD_ALL = True

    # These are the parameters that we benchmark over
    basis = {
        'method': ['method1', 'method2'],
        'x': list(range(7)),
        'y': [0, 100],
        'z': [2, 3]
        # 'param_name': [param values],
    }
    xlabel = 'x'
    # Set these to param labels that directly transfer to method kwargs
    kw_labels = ['x', 'y', 'z']
    # Set these to empty lists if they are not used
    group_labels = {
        'style': ['y'],
        'size': ['z'],
    }
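    # Any basis parameter not already used for x, style, or size becomes a
    # hue so every varied parameter remains distinguishable in the plot.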
    group_labels['hue'] = list((ub.oset(basis) - {xlabel}) -
                               set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {}
        for gname, labels in group_labels.items():
            group_keys[gname + '_key'] = ub.repr2(ub.dict_isect(
                params, labels),
                                                  compact=1,
                                                  si=1)
        key = ub.repr2(params, compact=1, si=1)
        # Make any modifications you need to compute input kwargs for each
        # method here.
        kwargs = ub.dict_isect(params.copy(), kw_labels)
        method = method_lut[params['method']]
        # Timerit will run some user-specified number of loops.
        # and compute time stats with similar methodology to timeit
        for timer in ti.reset(key):
            # Put any setup logic you don't want to time here.
            # ...
            with timer:
                # Put the logic you want to time here
                method(**kwargs)

        if RECORD_ALL:
            # Seaborn will show the variance if this is enabled, otherwise
            # use the robust timerit mean / min times
            chunk_iter = ub.chunks(ti.times, ti.bestof)
            times = list(map(min, chunk_iter))  # TODO: timerit method for this
            for time in times:
                row = {
                    # 'mean': ti.mean(),
                    'time': time,
                    'key': key,
                    **group_keys,
                    **params,
                }
                rows.append(row)
        else:
            row = {
                'mean': ti.mean(),
                'min': ti.min(),
                'key': key,
                **group_keys,
                **params,
            }
            rows.append(row)

    time_key = 'time' if RECORD_ALL else 'min'

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values(time_key)

    if RECORD_ALL:
        # Show the min / mean if we record all
        min_times = data.groupby('key').min().rename({'time': 'min'}, axis=1)
        mean_times = data.groupby('key')[['time'
                                          ]].mean().rename({'time': 'mean'},
                                                           axis=1)
        stats_data = pd.concat([min_times, mean_times], axis=1)
        stats_data = stats_data.sort_values('min')
    else:
        stats_data = data

    USE_OPENSKILL = 1
    if USE_OPENSKILL:
        # Let's try a real ranking method
        # https://github.com/OpenDebates/openskill.py
        import openskill
        method_ratings = {m: openskill.Rating() for m in basis['method']}

    other_keys = sorted(
        set(stats_data.columns) -
        {'key', 'method', 'min', 'mean', 'hue_key', 'size_key', 'style_key'})
    for params, variants in stats_data.groupby(other_keys):
        variants = variants.sort_values('mean')
        ranking = variants['method'].reset_index(drop=True)

        mean_speedup = variants['mean'].max() / variants['mean']
        stats_data.loc[mean_speedup.index, 'mean_speedup'] = mean_speedup
        min_speedup = variants['min'].max() / variants['min']
        stats_data.loc[min_speedup.index, 'min_speedup'] = min_speedup

        if USE_OPENSKILL:
            # The idea is that each setting of parameters is a game, and each
            # "method" is a player. We rank the players by which is fastest,
            # and update their ranking according to the Weng-Lin Bayes ranking
            # model. This does not account for the fact that some "games"
            # (i.e. parameter settings) are more important than others, but it should
            # be fairly robust on average.
            old_ratings = [[r] for r in ub.take(method_ratings, ranking)]
            new_values = openskill.rate(old_ratings)  # Not inplace
            new_ratings = [openskill.Rating(*new[0]) for new in new_values]
            method_ratings.update(ub.dzip(ranking, new_ratings))

    print('Statistics:')
    print(stats_data)

    if USE_OPENSKILL:
        from openskill import predict_win
        win_prob = predict_win([[r] for r in method_ratings.values()])
        skill_agg = pd.Series(ub.dzip(method_ratings.keys(),
                                      win_prob)).sort_values(ascending=False)
        print('Aggregated Rankings =\n{}'.format(skill_agg))

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()
        plt = kwplot.autoplt()

        plotkw = {}
        for gname, labels in group_labels.items():
            if labels:
                plotkw[gname] = gname + '_key'

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data,
                     x=xlabel,
                     y=time_key,
                     marker='o',
                     ax=ax,
                     **plotkw)
        ax.set_title('Benchmark Name')
        ax.set_xlabel('Size (todo: A better x-variable description)')
        ax.set_ylabel('Time (todo: A better y-variable description)')
        # ax.set_xscale('log')
        # ax.set_yscale('log')

        try:
            __IPYTHON__
        except NameError:
            plt.show()
Example #6
def benchmark_ubelt_import_time_robust():
    import pandas as pd
    import ubelt as ub
    import kwplot
    sns = kwplot.autosns(force='Qt5Agg')

    prog = ub.codeblock(r'''
        def _main():
            import subprocess
            import ubelt as ub
            measurements = []
            for i in range(200):
                row = {}
                # info = ub.cmd('python -X importtime -c "import ubelt"')
                # text = info['err']
                prog = subprocess.Popen('python -X importtime -c "import ubelt"', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                _, text = prog.communicate()
                text = text.decode()
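                # The final line of `python -X importtime` reports the
                # top-level module: "import time: self [us] | cumulative | name".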
                final_line = text.rstrip().split('\n')[-1]
                partial = final_line.split(':')[1].split('|')
                row['self_us'] = float(partial[0].strip())
                row['cumulative'] = float(partial[1].strip())
                measurements.append(row)
            import pandas as pd
            df = pd.DataFrame(measurements)
            stats = pd.DataFrame({
                'mean': df.mean(),
                'std': df.std(),
                'min': df.min(),
                'max': df.max(),
                'total': df.sum(),
            })
            info = stats.to_dict()
            info['version'] = ub.__version__
            print(info)
            # print(stats)
        _main()
        ''')

    dpath = ub.Path(ub.ensure_app_cache_dir('ubelt/tests/test_version_import'))
    fpath = dpath / 'do_test.py'
    fpath.write_text(prog)

    repo_root = ub.Path('$HOME/code/ubelt').expand()

    info = ub.cmd('git tag', cwd=repo_root)

    versions = [p for p in info['out'].split('\n') if p]
    branches = ['dev/1.0.1', 'main'] + versions

    fig = kwplot.figure(doclf=True)
    ax = fig.gca()

    bname_to_info = {}
    rows = []
    for bname in branches:
        print('bname = {!r}'.format(bname))
        ub.cmd('git checkout {}'.format(bname),
               cwd=repo_root,
               verbose=3,
               check=True)
        info = ub.cmd('python {}'.format(fpath), verbose=2)
        dict_info = eval(info['out'])
        bname_to_info[bname] = dict_info
        for stat in ['mean', 'min', 'max']:
            for kind in ['self_us', 'cumulative']:
                rows.append({
                    'version': dict_info['version'],
                    'stat': stat,
                    'type': kind,
                    'time': dict_info[stat][kind],
                })
        df = pd.DataFrame(rows[-1:])
        print(df)
        # ax.cla()
        # sns.lineplot(data=df, x='version', y='time', hue='stat', style='type', ax=ax)

    ub.cmd('git checkout {}'.format('dev/1.0.1'), cwd=repo_root)
    df = pd.DataFrame(rows)
    from distutils.version import LooseVersion
    unique_versions = list(
        map(str, sorted(map(LooseVersion, df['version'].unique()))))
    df['release_index'] = df['version'].apply(
        lambda x: unique_versions.index(x))
    ax.cla()
    kwplot.figure(fnum=2, pnum=(2, 1, 1), doclf=True)
    ax = sns.lineplot(data=df[df['type'] == 'cumulative'],
                      x='release_index',
                      y='time',
                      hue='stat',
                      style='type',
                      marker='o')
    ax.set_title('Ubelt import time over release history')
    kwplot.figure(fnum=2, pnum=(2, 1, 2))
    sns.lineplot(data=df[df['type'] == 'self_us'],
                 x='release_index',
                 y='time',
                 hue='stat',
                 style='type',
                 marker='o')
Example #7
def benchmark_repeat_vs_reduce_mul():
    import ubelt as ub
    import numpy as np
    import pandas as pd
    import timerit

    def reduce_daq_rec(func, arrs):
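        # Divide and conquer: reduce the even- and odd-indexed halves
        # separately, then combine them, giving a balanced reduction tree.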
        if len(arrs) == 1:
            return arrs[0]
        if len(arrs) == 2:
            return func(arrs[0], arrs[1])
        elif len(arrs) == 3:
            return func(func(arrs[0], arrs[1]), arrs[2])
        else:
            arrs1 = arrs[0::2]
            arrs2 = arrs[1::2]
            res1 = reduce_daq_rec(func, arrs1)
            res2 = reduce_daq_rec(func, arrs2)
            res = func(res1, res2)
        return res

    def reduce_daq_iter(func, arrs):
        """
        https://www.baeldung.com/cs/convert-recursion-to-iteration
        https://stackoverflow.com/questions/159590/way-to-go-from-recursion-to-iteration
        arrs = [2, 3, 5, 7, 11, 13, 17, 21]
        """
        raise NotImplementedError
        # TODO: make the iterative version
        from collections import deque
        empty_result = None
        stack = deque([(arrs, empty_result)])
        idx = 0
        while stack:
            print('----')
            print('stack = {}'.format(ub.repr2(list(stack), nl=1)))
            arrs0, result = stack.pop()
            if len(arrs0) == 0:
                raise Exception
            if result is not None:
                # raise Exception
                results = [result]
                while stack:
                    next_arrs0, next_result = stack.pop()
                    if next_result is None:
                        break
                    else:
                        results.append(next_result)
                if results:
                    if len(results) == 1:
                        stack.append((results, results[0]))
                    else:
                        stack.append((results, None))
                if next_result is None:
                    stack.append((next_arrs0, None))
            elif result is None:
                if len(arrs0) == 1:
                    result = arrs0[0]
                    stack.append((arrs0, result))
                    # return arrs0[0]
                if len(arrs0) == 2:
                    result = func(arrs0[0], arrs0[1])
                    stack.append((arrs0, result))
                elif len(arrs0) == 3:
                    result = func(func(arrs0[0], arrs0[1]), arrs0[2])
                    stack.append((arrs0, result))
                else:
                    arrs01 = arrs0[0::2]
                    arrs02 = arrs0[1::2]
                    stack.append((arrs0, empty_result))
                    stack.append((arrs01, empty_result))
                    stack.append((arrs02, empty_result))
                    # res1 = reduce_daq_rec(func, arrs01)
                    # res2 = reduce_daq_rec(func, arrs2)
                    # res = func(res1, res2)
            idx += 1
            if idx > 10:
                raise Exception
        return res

    def method_daq_rec(arrs):
        return reduce_daq_rec(np.multiply, arrs)

    def method_repeat(arrs):
        """
        helper code:
            arr_names = ['a{:02d}'.format(idx) for idx in range(1, 32 + 1)]
            lhs = ', '.join(arr_names)
            rhs = ' * '.join(arr_names)
            print(f'{lhs} = arrs')
            print(f'ret = {rhs}')
        """
        # Hard coded pure python syntax for multiplying
        if len(arrs) == 4:
            a01, a02, a03, a04 = arrs
            ret = a01 * a02 * a03 * a04
        elif len(arrs) == 8:
            a01, a02, a03, a04, a05, a06, a07, a08 = arrs
            ret = a01 * a02 * a03 * a04 * a05 * a06 * a07 * a08
        elif len(arrs) == 32:
            a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, a31, a32 = arrs
            ret = a01 * a02 * a03 * a04 * a05 * a06 * a07 * a08 * a09 * a10 * a11 * a12 * a13 * a14 * a15 * a16 * a17 * a18 * a19 * a20 * a21 * a22 * a23 * a24 * a25 * a26 * a27 * a28 * a29 * a30 * a31 * a32
        return ret

    def method_reduce(arrs):
        ret = np.multiply.reduce(arrs)
        return ret

    def method_stack(arrs):
        stacked = np.stack(arrs)
        ret = stacked.prod(axis=0)
        return ret

    method_lut = locals()  # can populate this some other way

    ti = timerit.Timerit(10000, bestof=10, verbose=2)

    basis = {
        'method':
        ['method_repeat', 'method_reduce', 'method_stack', 'method_daq_rec'],
        'arr_size': [10, 100, 1000, 10000],
        'num_arrs': [4, 8, 32],
    }
    xlabel = 'arr_size'
    kw_labels = []
    group_labels = {
        'style': ['num_arrs'],
        'size': [],
    }
    group_labels['hue'] = list((ub.oset(basis) - {xlabel}) -
                               set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {}
        for gname, labels in group_labels.items():
            group_keys[gname + '_key'] = ub.repr2(ub.dict_isect(
                params, labels),
                                                  compact=1,
                                                  si=1)
        key = ub.repr2(params, compact=1, si=1)
        kwargs = ub.dict_isect(params.copy(), kw_labels)

        arr_size = params['arr_size']
        num_arrs = params['num_arrs']

        arrs = []
        for _ in range(num_arrs):
            arr = np.random.rand(arr_size)
            arrs.append(arr)
        kwargs['arrs'] = arrs
        method = method_lut[params['method']]
        # Timerit will run some user-specified number of loops.
        # and compute time stats with similar methodology to timeit
        for timer in ti.reset(key):
            # Put any setup logic you don't want to time here.
            # ...
            with timer:
                # Put the logic you want to time here
                method(**kwargs)
        row = {
            'mean': ti.mean(),
            'min': ti.min(),
            'key': key,
            **group_keys,
            **params,
        }
        rows.append(row)

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values('min')
    print(data)

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()

        plotkw = {}
        for gname, labels in group_labels.items():
            if labels:
                plotkw[gname] = gname + '_key'

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y='min', marker='o', ax=ax, **plotkw)
        ax.set_title('Benchmark')
        ax.set_xlabel('Array Size')
        ax.set_ylabel('Time')
Example #8
if 1:  # assumption: the original guard for this fragment was not preserved
    owid_co2_data_fpath = ub.grabdata('https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.json', expires=timedelta(days=30))
    with open(owid_co2_data_fpath, 'r') as file:
        co2_data = json.load(file)
    us_co2_data = co2_data['United States']['data']

    _raw_us_data = pd.DataFrame(us_co2_data).set_index('year', drop=0)
    us_data = _raw_us_data.loc[1980:][columns_of_interest]
    us_data = us_data.assign(year=_raw_us_data['year'].apply(lambda x: x * reg.year))
    us_data = us_data.assign(co2=_raw_us_data['co2'].apply(lambda x: x * million * CO2_ton))
    us_data = us_data.assign(population=_raw_us_data['population'].apply(lambda x: x * reg.us_person))
    us_data = us_data.assign(primary_energy_consumption=_raw_us_data['primary_energy_consumption'].apply(lambda x: x * twh / reg.year))
    us_data = us_data.assign(gdp=_raw_us_data['gdp'].apply(lambda x: x * reg.dollar_2011))

    if 0:
        import kwplot
        sns = kwplot.autosns()
        sns.lineplot(data=_raw_us_data, x='year', y='co2')

    us_emissions_2018 = us_data['co2'].loc[2018]
    us_population_2018 = us_data['population'].loc[2018]
    us_person_annual_footprint = us_emissions_2018 / us_population_2018
    us_data['co2_per_capita'] = us_data['co2'] / us_data['population']
else:
    us_emissions_2018 = 5.27 * billion * CO2_ton / reg.year_2018
    us_population_2018 = 327.2 * million * reg.us_person
    us_person_annual_footprint = us_emissions_2018 / us_population_2018

# Different estimates for this number
us_person_annual_footprint_candidates = {
    'nature.org': 16 * CO2_ton / (reg.year * reg.us_person),
    'terrapass': (63_934 * CO2_pound).to(CO2_ton) / (reg.year * reg.us_person),
}
Example #9
def benchmark_nested_break():
    """
    There are several ways to do a nested break, but which one is best?

    https://twitter.com/nedbat/status/1515345787563220996
    """
    import ubelt as ub
    import pandas as pd
    import timerit
    import itertools as it

    def method1_itertools(iter1, iter2):
        for i, j in it.product(iter1, iter2):
            if i == 20 and j == 20:
                break

    def method2_except(iter1, iter2):
        class Found(Exception):
            pass
        try:
            for i in iter1:
                for j in iter2:
                    if i == 20 and j == 20:
                        raise Found
        except Found:
            pass

    class FoundPredef(Exception):
        pass
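
    # Predefining the exception avoids re-executing the class statement on
    # every call, isolating the cost of raising from the cost of definition.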

    def method2_5_except_predef(iter1, iter2):
        try:
            for i in iter1:
                for j in iter2:
                    if i == 20 and j == 20:
                        raise FoundPredef
        except FoundPredef:
            pass

    def method3_gendef(iter1, iter2):
        def genfunc():
            for i in iter1:
                for j in iter2:
                    yield i, j

        for i, j in genfunc():
            if i == 20 and j == 20:
                break

    def method4_genexp(iter1, iter2):
        genexpr = ((i, j) for i in iter1 for j in iter2)
        for i, j in genexpr:
            if i == 20 and j == 20:
                break

    method_lut = locals()  # can populate this some other way

    # Change params here to modify number of trials
    ti = timerit.Timerit(1000, bestof=10, verbose=1)

    # if True, record every trial run and show variance in seaborn
    # if False, use the standard timerit min/mean measures
    RECORD_ALL = True

    # These are the parameters that we benchmark over
    import numpy as np
    basis = {
        'method': ['method1_itertools', 'method2_except', 'method2_5_except_predef', 'method3_gendef', 'method4_genexp'],
        # 'n1': np.logspace(1, np.log2(100), 30, base=2).astype(int),
        # 'n2': np.logspace(1, np.log2(100), 30, base=2).astype(int),
        'size': np.logspace(1, np.log2(10000), 30, base=2).astype(int),
        'input_style': ['range', 'list', 'customized_iter'],
        # 'param_name': [param values],
    }
    xlabel = 'size'
    xinput_labels = ['n1', 'n2', 'size']

    # Set these to param labels that directly transfer to method kwargs
    kw_labels = []
    # Set these to empty lists if they are not used
    group_labels = {
        'style': ['input_style'],
        'size': [],
    }
    group_labels['hue'] = list(
        (ub.oset(basis) - {xlabel} - xinput_labels) - set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    def make_input(params):
        # Given the parameterization make the benchmark function input
        # n1 = params['n1']
        # n2 = params['n2']
        size = params['size']
        n1 = int(np.sqrt(size))
        n2 = int(np.sqrt(size))
        if params['input_style'] == 'list':
            iter1 = list(range(n1))
            iter2 = list(range(n2))
        elif params['input_style'] == 'range':
            iter1 = range(n1)
            iter2 = range(n2)
        elif params['input_style'] == 'customized_iter':
            import random
            def rando1():
                rng1 = random.Random(0)
                for _ in range(n1):
                    yield rng1.randint(0, n2)

            def rando2():
                rng2 = random.Random(1)
                for _ in range(n2):
                    yield rng2.randint(0, n2)

            iter1 = rando1()
            iter2 = rando2()
        else:
            raise KeyError(params['input_style'])
        return {'iter1': iter1, 'iter2': iter2}

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        # size = params['n1'] * params['n2']
        # params['size'] = size
        group_keys = {}
        for gname, labels in group_labels.items():
            group_keys[gname + '_key'] = ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
        key = ub.repr2(params, compact=1, si=1)
        # Make any modifications you need to compute input kwargs for each
        # method here.
        kwargs = ub.dict_isect(params.copy(), kw_labels)

        method = method_lut[params['method']]
        # Timerit will run some user-specified number of loops.
        # and compute time stats with similar methodology to timeit
        for timer in ti.reset(key):
            # Put any setup logic you don't want to time here.
            # ...
            kwargs.update(make_input(params))
            with timer:
                # Put the logic you want to time here
                method(**kwargs)

        if RECORD_ALL:
            # Seaborn will show the variance if this is enabled, otherwise
            # use the robust timerit mean / min times
            # chunk_iter = ub.chunks(ti.times, ti.bestof)
            # times = list(map(min, chunk_iter))  # TODO: timerit method for this
            times = ti.robust_times()
            for time in times:
                row = {
                    # 'mean': ti.mean(),
                    'time': time,
                    'key': key,
                    **group_keys,
                    **params,
                }
                rows.append(row)
        else:
            row = {
                'mean': ti.mean(),
                'min': ti.min(),
                'key': key,
                **group_keys,
                **params,
            }
            rows.append(row)

    time_key = 'time' if RECORD_ALL else 'min'

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values(time_key)

    if RECORD_ALL:
        # Show the min / mean if we record all
        min_times = data.groupby('key').min().rename({'time': 'min'}, axis=1)
        mean_times = data.groupby('key')[['time']].mean().rename({'time': 'mean'}, axis=1)
        stats_data = pd.concat([min_times, mean_times], axis=1)
        stats_data = stats_data.sort_values('min')
    else:
        stats_data = data

    USE_OPENSKILL = 1
    if USE_OPENSKILL:
        # Let's try a real ranking method
        # https://github.com/OpenDebates/openskill.py
        import openskill
        method_ratings = {m: openskill.Rating() for m in basis['method']}

    other_keys = sorted(set(stats_data.columns) - {'key', 'method', 'min', 'mean', 'hue_key', 'size_key', 'style_key'})
    for params, variants in stats_data.groupby(other_keys):
        variants = variants.sort_values('mean')
        ranking = variants['method'].reset_index(drop=True)

        mean_speedup = variants['mean'].max() / variants['mean']
        stats_data.loc[mean_speedup.index, 'mean_speedup'] = mean_speedup
        min_speedup = variants['min'].max() / variants['min']
        stats_data.loc[min_speedup.index, 'min_speedup'] = min_speedup

        if USE_OPENSKILL:
            # The idea is that each setting of parameters is a game, and each
            # "method" is a player. We rank the players by which is fastest,
            # and update their ranking according to the Weng-Lin Bayes ranking
            # model. This does not account for the fact that some "games"
            # (i.e. parameter settings) are more important than others, but it should
            # be fairly robust on average.
            old_ratings = [[r] for r in ub.take(method_ratings, ranking)]
            new_values = openskill.rate(old_ratings)  # Not inplace
            new_ratings = [openskill.Rating(*new[0]) for new in new_values]
            method_ratings.update(ub.dzip(ranking, new_ratings))

    print('Statistics:')
    print(stats_data)

    if USE_OPENSKILL:
        from openskill import predict_win
        win_prob = predict_win([[r] for r in method_ratings.values()])
        skill_agg = pd.Series(ub.dzip(method_ratings.keys(), win_prob)).sort_values(ascending=False)
        print('method_ratings = {}'.format(ub.repr2(method_ratings, nl=1)))
        print('Aggregated Rankings =\n{}'.format(skill_agg))

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()
        plt = kwplot.autoplt()

        plotkw = {}
        for gname, labels in group_labels.items():
            if labels:
                plotkw[gname] = gname + '_key'

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y=time_key, marker='o', ax=ax, **plotkw)
        ax.set_title(f'Benchmark Nested Breaks: #Trials {ti.num}, bestof {ti.bestof}')
        ax.set_xlabel(f'{xlabel}')
        ax.set_ylabel('Time')
        ax.set_xscale('log')
        ax.set_yscale('log')

        try:
            __IPYTHON__
        except NameError:
            plt.show()
Example #10
def benchmark_reversed_range():
    import ubelt as ub
    import pandas as pd
    import timerit
    import itertools as it

    methods = []
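    # Each variant below registers itself via the @methods.append decorator.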

    def custom_reversed_range_v1(start, stop):
        final = stop - 1
        for idx in range(stop - start):
            yield final - idx

    def custom_reversed_range_v2(start, stop):
        yield from it.islice(it.count(stop - 1, step=-1), stop - start)

    @methods.append
    def reversed_builtin(x):
        start = 10
        stop = x + start
        ret = list(reversed(range(start, stop)))
        return ret

    @methods.append
    def negative_range(x):
        start = 10
        stop = x + start
        ret = list(range(stop - 1, start - 1, -1))
        return ret

    # @methods.append
    # def custom_v1(x):
    #     start = 10
    #     stop = x + start
    #     ret = list(custom_reversed_range_v1(start, stop))
    #     return ret

    # @methods.append
    # def custom_v2(x):
    #     start = 10
    #     stop = x + start
    #     ret = list(custom_reversed_range_v2(start, stop))
    #     return ret

    method_lut = {f.__name__: f for f in methods}

    results = {k: func(10) for k, func in method_lut.items()}
    print('results = {}'.format(ub.repr2(results, nl=1, align=':')))
    if not ub.allsame(results.values()):
        raise AssertionError('Failed consistency check')

    ti = timerit.Timerit(1000, bestof=10, verbose=2)

    basis = {
        'method': list(method_lut.keys()),
        'x': [2 ** i for i in range(14)],
    }
    grid_iter = ub.named_product(basis)

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        key = ub.repr2(params, compact=1, si=1)
        kwargs = params.copy()
        method_key = kwargs.pop('method')
        method = method_lut[method_key]
        # Timerit will run some user-specified number of loops.
        # and compute time stats with similar methodology to timeit
        for timer in ti.reset(key):
            # Put any setup logic you don't want to time here.
            # ...
            with timer:
                # Put the logic you want to time here
                method(**kwargs)
        row = {
            'mean': ti.mean(),
            'min': ti.min(),
            'key': key,
            **params,
        }
        rows.append(row)

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    print(data)

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x='x', y='min', hue='method', marker='o', ax=ax)
        # ax.set_xscale('log')
        ax.set_title('Benchmark Reversal Methods')
        ax.set_xlabel('Range length (x)')
        ax.set_ylabel('Min time (seconds)')
Example #11
def main():
    import datetime
    import kwplot
    import pandas as pd
    import ubelt as ub
    plt = kwplot.autoplt()
    sns = kwplot.autosns()

    alias = {
        '3090': 'nvctrl GeForce RTX 3090 0 temp',
        '1080ti': 'nvctrl GeForce GTX 1080 Ti 1 temp',
        # 'cpu': 'lmsensor coretemp-isa-0000 Package id 0',
    }

    all_df = read_psensor_log()
    unique_rawdevs = all_df.device.unique()
    for rawdev in unique_rawdevs:
        cpu_prefix = 'lmsensor coretemp-isa'
        if rawdev.startswith(cpu_prefix):
            suffix = rawdev[len(cpu_prefix):].split(' ', 1)[1].strip()
            alias['CPU ' + suffix] = rawdev
        if 'nvctrl' in rawdev and 'temp' in rawdev:
            alias['GPU ' + rawdev[7:-5]] = rawdev

    mapper = ub.invert_dict(alias)

    all_df['device'] = all_df['device'].apply(lambda x: mapper.get(x, None))
    all_df = all_df[all_df['device'].apply(lambda x: x is not None)]

    hours = int(ub.argval('--hours', default=48))

    delta = datetime.timedelta(hours=hours)
    min_time = datetime.datetime.now() - delta
    is_recent = all_df.datetime > min_time
    recent_df = all_df[is_recent]

    chosen = recent_df
    # chosen = all_df

    if 0:

        pivtbl = recent_df.pivot(index='unix_timestamp', columns='device',
                                 values='temp')
        pivtbl = pivtbl.sort_index()
        smoothed_rows = []
        for window_idxs in ub.iter_window(list(range(len(pivtbl))), size=10):
            window = pivtbl.iloc[list(window_idxs)]
            max_val = window.max(axis=0, skipna=True)
            for k, v in max_val.to_dict().items():
                smoothed_rows.append({
                    'unix_timestamp': window.index[1],
                    'device': k,
                    'temp': v,
                })

        max_extra = pd.DataFrame(smoothed_rows)
        sns.lineplot(data=max_extra,
                     x='unix_timestamp',
                     y='temp',
                     hue='device')

        df = recent_df.copy()
        df['device'] = df['device'].apply(lambda x: 'Core'
                                          if x.startswith('Core') else x)
        df['time'] = df['unix_timestamp'].apply(
            datetime.datetime.fromtimestamp)

    plt.gcf().clf()
    # sns.lineplot(data=chosen, x='unix_timestamp', y='temp', hue='device')

    for xx, (sess, group) in enumerate(chosen.groupby('session_x')):
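        # Plot each logging session as its own trace; draw the legend only
        # for the first session so device names are not repeated.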
        # ax.cla()
        ax = plt.gca()
        sns.lineplot(data=group,
                     x='unix_timestamp',
                     y='temp',
                     hue='device',
                     legend=xx == 0)

    label_xaxis_dates(ax)
    ax.figure.subplots_adjust(bottom=0.2)
    ax.set_ylim(0, 100)
    plt.locator_params(axis='y', nbins=10)

    # import matplotlib as mpl
    # Draw shutdown time as black lines
    end_times = []
    for sx, group in chosen.groupby('session_x'):
        shutdown_time = group['unix_timestamp'].max()
        end_times.append(shutdown_time)

    for shutdown_time in sorted(end_times)[:-1]:
        ax.plot((shutdown_time, shutdown_time), [0, 100], color='k')

    # ci_df = pd.concat([max_extra, recent_df])
    # ci_df['device'] = ci_df['device'].apply(lambda x: 'Core' if x.startswith('Core') else x)
    # sns.lineplot(data=ci_df, x='unix_timestamp', y='temp', hue='device')

    # from matplotlib.dates import date2num
    # all_df['date_ord'] = all_df['datetime'].map(lambda a: date2num(a))

    # sns.lineplot(data=pt)
    # sns.lineplot(data=recent_df, x='unix_timestamp', y='temp', hue='device')
    # sns.regplot(data=recent_df, x='unix_timestamp', y='temp', hue='device')
    plt.show()
Example #12
def ford_circles():
    """
    Draw Ford Circles

    This is a Ford Circle diagram of the Rationals and Float32 numbers.
    Only 163 of the 32608 rationals I generated can be exactly represented by a float32.

    [MF 14]
    [MF 95]

    [MF 14] https://www.youtube.com/watch?v=83ZjYvkdzYI&list=PL5A714C94D40392AB&index=14
    [MF 95] https://www.youtube.com/watch?v=gATEJ3f3FBM&list=PL5A714C94D40392AB&index=95

    Example:
        >>> import kwplot
        >>> kwplot.autompl()
    """
    import kwplot
    import ubelt as ub
    import matplotlib as mpl
    plt = kwplot.autoplt()
    sns = kwplot.autosns()  # NOQA

    limit = 256 * 256
    print('limit = {!r}'.format(limit))
    rats_to_plot = set()

    maxx = 1
    _iter = Rational.members(limit=limit)
    _genrat = set(ub.ProgIter(_iter, total=limit, desc='gen rats'))
    rats_to_plot |= _genrat
    rats_to_plot2 = sorted({Rational(r % maxx) for r in rats_to_plot} | {maxx})

    floats = sorted(
        ub.unique(map(float, rats_to_plot2),
                  key=lambda f: f.as_integer_ratio()))
    print(f'{len(rats_to_plot)  = }')
    print(f'{len(rats_to_plot2) = }')
    print(f'{len(floats)        = }')
    import numpy as np

    ax = kwplot.figure(fnum=1, doclf=True).gca()
    prog = ub.ProgIter(sorted(rats_to_plot2), verbose=1)
    dtype = np.float32
    patches = ub.ddict(list)
    errors = []
    for rat in prog:
        denominator = rat.denominator
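        # Ford circle for p/q: center (p/q, 1/(2*q**2)) with radius
        # 1/(2*q**2); circles of distinct reduced fractions never overlap.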
        radius = 1 / (2 * (denominator * denominator))
        point = (rat, radius)
        flt = dtype(rat)
        a, b = flt.as_integer_ratio()
        flt_as_rat = Rational(a, b)
        error = abs(rat - flt_as_rat)
        if error == 0:
            new_circle = plt.Circle(point,
                                    radius,
                                    facecolor='dodgerblue',
                                    edgecolor='none',
                                    linewidth=0,
                                    alpha=0.5)
            patches['good'].append(new_circle)
        else:
            errors.append(error)
            # Plot a line for error
            new_circle = plt.Circle(point,
                                    radius,
                                    facecolor='orangered',
                                    edgecolor='none',
                                    linewidth=0,
                                    alpha=0.5)
            patches['bad'].append(new_circle)
            ax.plot((rat - error, rat + error), (radius, radius),
                    'x-',
                    color='darkgray')

    print(ub.map_vals(len, patches))
    total = float(sum(errors))
    print('total = {!r}'.format(total))
    print(max(errors))
    print(min(errors))

    for v in patches.values():
        first = ub.peek(v)
        prop = ub.dict_isect(first.properties(),
                             ['facecolor', 'linewidth', 'alpha', 'edgecolor'])
        col = mpl.collections.PatchCollection(v, **prop)
        ax.add_collection(col)

    # Lets look for the holes in IEEE float
    # for flt in ub.ProgIter(sorted(floats), verbose=1):

    kwplot.phantom_legend({
        f'rationals without a {dtype}':
        'orangered',
        f'rationals with a {dtype}':
        'dodgerblue',
        f'x-x indicates {dtype} approximation error':
        'darkgray',
    })

    ax.set_title('Holes in IEEE 754 Float32')
    ax.set_xlabel('A rational number p/q')
    ax.set_ylabel('Circle radius = 1 / (2 * q**2)')

    # import numpy as np
    # points = np.array([c.center for c in _circles])
    # maxx, maxy = points.max(axis=0)
    # print('maxx = {!r}'.format(maxx))
    # print('maxy = {!r}'.format(maxy))
    # maxx, maxy = maxx // 2, maxy // 2
    # ax.set_xlim(0, np.sqrt(int(maxx)))
    # ax.set_ylim(0, np.sqrt(int(maxy)))
    # ax.set_aspect('equal')
    # ax.set_xlim(0.2, 0.22)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 0.1)
Example #13
def benchmark_unpack():
    """
    What is faster unpacking items with slice syntax or tuple-unpacking

    Slice unpacking seems to be a tad faster.
    """
    import ubelt as ub
    import random
    import pandas as pd
    import timerit
    import string

    def tuple_unpack(items):
        *prefix, key = items
        return prefix, key

    def slice_unpack(items):
        prefix, key = items[:-1], items[-1]
        return prefix, key

    method_lut = locals()  # can populate this some other way

    ti = timerit.Timerit(5000, bestof=3, verbose=2)

    basis = {
        'method': ['tuple_unpack', 'slice_unpack'],
        'size': list(range(1, 64 + 1)),
        'type': ['string', 'float'],
    }
    xlabel = 'size'
    kw_labels = []
    group_labels = {
        'style': ['type'],
        'size': [],
    }
    group_labels['hue'] = list((ub.oset(basis) - {xlabel}) -
                               set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {}
        for gname, labels in group_labels.items():
            group_keys[gname + '_key'] = ub.repr2(ub.dict_isect(
                params, labels),
                                                  compact=1,
                                                  si=1)
        key = ub.repr2(params, compact=1, si=1)
        size = params['size']
        method = method_lut[params['method']]
        # Timerit runs a user-specified number of loops and computes time
        # stats with a methodology similar to timeit.
        item_type = params['type']
        for timer in ti.reset(key):
            # Setup (not timed): build the sequence to unpack
            if item_type == 'string':
                items = [
                    ''.join(random.choices(string.printable, k=5))
                    for _ in range(size)
                ]
            elif item_type == 'float':
                items = [random.random() for _ in range(size)]
            with timer:
                method(items)
        for time in ti.times:
            row = {
                'time': time,
                'key': key,
                **group_keys,
                **params,
            }
            rows.append(row)

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values('time')
    summary_rows = []
    for method, group in data.groupby('method'):
        row = {}
        row['method'] = method
        row['mean'] = group['time'].mean()
        row['std'] = group['time'].std()
        row['min'] = group['time'].min()
        row['max'] = group['time'].max()
        summary_rows.append(row)
    print(pd.DataFrame(summary_rows).sort_values('mean'))

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()

        plotkw = {}
        for gname, labels in group_labels.items():
            if labels:
                plotkw[gname] = gname + '_key'

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data,
                     x=xlabel,
                     y='time',
                     marker='o',
                     ax=ax,
                     **plotkw)
        ax.set_title('Benchmark: tuple unpacking vs slice unpacking')
        ax.set_xlabel('Number of items')
        ax.set_ylabel('Execution time (seconds)')
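
The "tad faster" claim can be spot-checked without the harness above. A
minimal stdlib-only sketch (sizes and numbers here are illustrative and
machine-dependent):

import timeit
setup = 'items = list(range(32))'
t_tuple = timeit.timeit('*prefix, key = items', setup=setup, number=100_000)
t_slice = timeit.timeit('prefix, key = items[:-1], items[-1]', setup=setup, number=100_000)
print(f'tuple-unpack: {t_tuple:.4f}s  slice-unpack: {t_slice:.4f}s')
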
Exemple #14
0
def benchmark_pathlib_vs_fspath():
    import ubelt as ub
    import pathlib
    import pandas as pd
    import random
    import timerit
    import os

    def method_pathlib(inputs):
        return pathlib.Path(*inputs)

    def method_ospath(inputs):
        return os.path.join(*inputs)

    method_lut = locals()  # can populate this some other way

    ti = timerit.Timerit(10000, bestof=10, verbose=2)

    basis = {
        'method': ['method_pathlib', 'method_ospath'],
        'num_parts': [2, 4, 8, 12, 16],
    }
    xlabel = 'num_parts'
    kw_labels = []
    group_labels = {
        'style': [],
        'size': [],
    }
    group_labels['hue'] = list((ub.oset(basis) - {xlabel}) -
                               set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {}
        for gname, labels in group_labels.items():
            group_keys[gname + '_key'] = ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
        key = ub.repr2(params, compact=1, si=1)
        kwargs = ub.dict_isect(params.copy(), kw_labels)

        n = params['num_parts']
        inputs = [chr(random.randint(97, 120)) for _ in range(n)]
        kwargs['inputs'] = inputs
        method = method_lut[params['method']]
        # Timerit runs a user-specified number of loops and computes time
        # stats with a methodology similar to timeit.
        for timer in ti.reset(key):
            # Put any setup logic you don't want to time here.
            # ...
            with timer:
                # Put the logic you want to time here
                method(**kwargs)
        row = {
            'mean': ti.mean(),
            'min': ti.min(),
            'key': key,
            **group_keys,
            **params,
        }
        rows.append(row)

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values('min')
    print(data)

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()

        plotkw = {}
        for gname, labels in group_labels.items():
            if labels:
                plotkw[gname] = gname + '_key'

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y='min', marker='o', ax=ax, **plotkw)
        ax.set_title('Benchmark: pathlib.Path vs os.path.join')
        ax.set_xlabel('Number of path parts')
        ax.set_ylabel('Minimum time (seconds)')
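
Outside the grid, the same comparison reduces to two timeit calls. A minimal
sketch (the parts list is illustrative):

import os.path
import pathlib
import timeit

parts = ['a', 'b', 'c', 'd']
t_pathlib = timeit.timeit(lambda: pathlib.Path(*parts), number=100_000)
t_ospath = timeit.timeit(lambda: os.path.join(*parts), number=100_000)
print(f'pathlib: {t_pathlib:.4f}s  os.path: {t_ospath:.4f}s')
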
Exemple #15
0
def benchmark_dict_diff_impl():
    import ubelt as ub
    import pandas as pd
    import timerit
    import random

    def method_diffkeys(*args):
        first_dict = args[0]
        keys = set(first_dict)
        keys.difference_update(*map(set, args[1:]))
        new0 = dict((k, first_dict[k]) for k in keys)
        return new0

    def method_diffkeys_list(*args):
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        keep_keys = [k for k in first_dict.keys() if k not in remove_keys]
        new = dict((k, first_dict[k]) for k in keep_keys)
        return new

    def method_diffkeys_oset(*args):
        first_dict = args[0]
        keys = ub.oset(first_dict)
        keys.difference_update(*map(set, args[1:]))
        new0 = dict((k, first_dict[k]) for k in keys)
        return new0

    def method_ifkeys_setcomp(*args):
        first_dict = args[0]
        remove_keys = {k for ks in args[1:] for k in ks}
        new1 = dict((k, v) for k, v in first_dict.items() if k not in remove_keys)
        return new1

    def method_ifkeys_setunion(*args):
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        new2 = dict((k, v) for k, v in first_dict.items() if k not in remove_keys)
        return new2

    def method_ifkeys_getitem(*args):
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        new3 = dict((k, first_dict[k]) for k in first_dict.keys() if k not in remove_keys)
        return new3

    def method_ifkeys_dictcomp(*args):
        # Faster, but cannot be used until Python 3.6 support is dropped
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        new4 = {k: v for k, v in first_dict.items() if k not in remove_keys}
        return new4

    def method_ifkeys_dictcomp_getitem(*args):
        # Faster, but cannot be used until Python 3.6 support is dropped
        first_dict = args[0]
        remove_keys = set.union(*map(set, args[1:]))
        new4 = {k: first_dict[k] for k in first_dict.keys() if k not in remove_keys}
        return new4
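
    # All kept variants should agree; a tiny hand-made check:
    #   a, b = {1: 1, 2: 2, 3: 3}, {2: 2}
    #   method_ifkeys_setunion(a, b) == method_diffkeys_list(a, b) == {1: 1, 3: 3}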

    method_lut = locals()  # can populate this some other way

    def make_data(num_items, num_other, remove_fraction, keytype):
        if keytype == 'str':
            keytype = str
        if keytype == 'int':
            keytype = int
        first_keys = [random.randint(0, 1000) for _ in range(num_items)]
        k = int(remove_fraction * len(first_keys))
        remove_sets = [
            list(ub.unique(
                random.choices(first_keys, k=k) +
                [random.randint(0, 1000) for _ in range(num_items)]))
            for _ in range(num_other)
        ]
        first_dict = {keytype(k): k for k in first_keys}
        args = [first_dict] + [{keytype(k): k for k in ks} for ks in remove_sets]
        return args
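
    # Example (hypothetical sizes): make_data(100, 3, 0.5, 'str') returns
    # [first_dict, other_1, other_2, other_3]; each "other" dict shares
    # roughly half of first_dict's keys plus extra random keys.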

    ti = timerit.Timerit(200, bestof=1, verbose=2)

    basis = {
        'method': [
            # Can't use: result order is not preserved
            # 'method_diffkeys',

            # Can't use until Python 3.6 support is dropped
            'method_ifkeys_dictcomp',
            'method_ifkeys_dictcomp_getitem',

            'method_ifkeys_setunion',
            'method_ifkeys_getitem',
            'method_diffkeys_list',

            # Probably not good
            # 'method_ifkeys_setcomp',
            # 'method_diffkeys_oset',
        ],
        'num_items': [10, 100, 1000],
        'num_other': [1, 3, 5],
        # 'num_other': [1],
        'remove_fraction': [0, 0.2, 0.5, 0.7, 1.0],
        # 'remove_fraction': [0.2, 0.8],
        'keytype': ['str', 'int'],
        # 'keytype': ['str'],
        # 'param_name': [param values],
    }
    xlabel = 'num_items'
    kw_labels = ['num_items', 'num_other', 'remove_fraction', 'keytype']
    group_labels = {
        'style': ['num_other', 'keytype'],
        'size': ['remove_fraction'],
    }
    group_labels['hue'] = list(
        (ub.oset(basis) - {xlabel}) - set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {}
        for gname, labels in group_labels.items():
            group_keys[gname + '_key'] = ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
        key = ub.repr2(params, compact=1, si=1)
        kwargs = ub.dict_isect(params.copy(), kw_labels)
        args = make_data(**kwargs)
        method = method_lut[params['method']]
        # Timerit runs a user-specified number of loops and computes time
        # stats with a methodology similar to timeit.
        for timer in ti.reset(key):
            # Put any setup logic you don't want to time here.
            # ...
            with timer:
                # Put the logic you want to time here
                method(*args)
        row = {
            'mean': ti.mean(),
            'min': ti.min(),
            'key': key,
            **group_keys,
            **params,
        }
        rows.append(row)

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    data = data.sort_values('min')
    print(data)

    # For each parameter setting, group all methods that ran with those exact
    # comparable params, then rank how well each method did. That yields a
    # preference profile. We give each profile a weight (e.g. based on the
    # fastest time in the bunch) and aggregate the profiles with a voting
    # method.
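    #
    # Tiny illustration (hypothetical numbers): methods A and B are compared
    # under two settings. A wins setting 1 (fastest time 1e-3); B wins
    # setting 2 (fastest time 1e-2). Weighting each win by the fastest time
    # in its setting makes B's win count 10x more, since that setting is the
    # more expensive one.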

    USE_OPENSKILL = 1
    if USE_OPENSKILL:
        # Lets try a real ranking method
        # https://github.com/OpenDebates/openskill.py
        import openskill
        method_ratings = {m: openskill.Rating() for m in basis['method']}

    weighted_rankings = ub.ddict(lambda: ub.ddict(float))
    for params, variants in data.groupby(['num_other', 'keytype', 'remove_fraction', 'num_items']):
        variants = variants.sort_values('mean')
        ranking = variants['method'].reset_index(drop=True)

        if USE_OPENSKILL:
            # The idea is that each setting of parameters is a game, and each
            # "method" is a player. We rank the players by which is fastest,
            # and update their ranking according to the Weng-Lin Bayes ranking
            # model. This does not take the fact that some "games" (i.e.
            # parameter settings) are more important than others, but it should
            # be fairly robust on average.
            old_ratings = [[r] for r in ub.take(method_ratings, ranking)]
            new_values = openskill.rate(old_ratings)  # Not inplace
            new_ratings = [openskill.Rating(*new[0]) for new in new_values]
            method_ratings.update(ub.dzip(ranking, new_ratings))

        # Choose a ranking weight scheme
        weight = variants['mean'].min()
        # weight = 1
        for rank, method in enumerate(ranking):
            weighted_rankings[method][rank] += weight
            weighted_rankings[method]['total'] += weight

    # Probably a more robust voting method to do this
    weight_rank_rows = []
    for method_name, ranks in weighted_rankings.items():
        weights = ub.dict_diff(ranks, ['total'])
        p_rank = ub.map_vals(lambda w: w / ranks['total'], weights)

        for rank, w in p_rank.items():
            weight_rank_rows.append({'rank': rank, 'weight': w, 'name': method_name})
    weight_rank_df = pd.DataFrame(weight_rank_rows)
    piv = weight_rank_df.pivot(index=['name'], columns=['rank'], values=['weight'])
    print(piv)

    if USE_OPENSKILL:
        from openskill import predict_win
        win_prob = predict_win([[r] for r in method_ratings.values()])
        skill_agg = pd.Series(ub.dzip(method_ratings.keys(), win_prob)).sort_values(ascending=False)
        print('skill_agg =\n{}'.format(skill_agg))

    aggregated = (piv * piv.columns.levels[1].values).sum(axis=1).sort_values()
    print('weight aggregated =\n{}'.format(aggregated))

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()

        plotkw = {}
        for gname, labels in group_labels.items():
            if labels:
                plotkw[gname] = gname + '_key'

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y='min', marker='o', ax=ax, **plotkw)
        ax.set_title('Benchmark: dict difference implementations')
        ax.set_xlabel('Number of items in the first dict')
        ax.set_ylabel('Minimum time (seconds)')
Exemple #16
0
def benchmark_mul_vs_pow():
    import ubelt as ub
    import pandas as pd
    import timerit

    from functools import reduce
    import operator as op
    import itertools as it

    def method_pow_via_mul_raw(n):
        """ Construct a function that does multiplication of a value n times """
        return eval('lambda v: ' + ' * '.join(['v'] * n))
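    # e.g. method_pow_via_mul_raw(3) builds and returns: lambda v: v * v * v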

    def method_pow_via_mul_for(v, n):
        ret = v
        for _ in range(1, n):
            ret = ret * v
        return ret

    def method_pow_via_mul_reduce(v, n):
        """ Alternative way to multiply a value n times """
        return reduce(op.mul, it.repeat(v, n))

    def method_pow_via_pow(v, n):
        return v ** n

    method_lut = locals()  # can populate this some other way

    ti = timerit.Timerit(500000, bestof=1000, verbose=2)

    basis = {
        'method': ['method_pow_via_mul_raw', 'method_pow_via_pow'],
        'n': list(range(1, 20)),
        'v': ['random-int', 'random-float'],
        # 'param_name': [param values],
    }
    xlabel = 'n'
    kw_labels = ['v', 'n']
    group_labels = {
        'style': ['v'],
        'size': [],
    }
    group_labels['hue'] = list(
        (ub.oset(basis) - {xlabel}) - set.union(*map(set, group_labels.values())))
    grid_iter = list(ub.named_product(basis))

    # For each variation of your experiment, create a row.
    rows = []
    for params in grid_iter:
        group_keys = {}
        for gname, labels in group_labels.items():
            group_keys[gname + '_key'] = ub.repr2(
                ub.dict_isect(params, labels), compact=1, si=1)
        key = ub.repr2(params, compact=1, si=1)
        kwargs = ub.dict_isect(params.copy(), kw_labels)
        method = method_lut[params['method']]
        # Timerit runs a user-specified number of loops and computes time
        # stats with a methodology similar to timeit.

        if params['method'] == 'method_pow_via_mul_raw':
            method = method(kwargs.pop('n'))

        import random
        for timer in ti.reset(key):
            # Setup logic (not timed): draw a fresh random value on every
            # iteration. Checking params['v'] rather than kwargs['v'] keeps
            # drawing new values after kwargs['v'] holds a number.
            if params['v'] == 'random-int':
                kwargs['v'] = random.randint(1, 31000)
            elif params['v'] == 'random-float':
                kwargs['v'] = random.random()
            with timer:
                # Put the logic you want to time here
                method(**kwargs)
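        # ti.times records every raw loop time; the min over each
        # bestof-sized chunk keeps one low-noise measurement per batch.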
        for time in map(min, ub.chunks(ti.times, ti.bestof)):
            row = {
                # 'mean': ti.mean(),
                'time': time,
                'key': key,
                **group_keys,
                **params,
            }
            rows.append(row)

    # The rows define a long-form pandas data array.
    # Data in long-form makes it very easy to use seaborn.
    data = pd.DataFrame(rows)
    # data = data.sort_values('time')
    print(data)

    plot = True
    if plot:
        # import seaborn as sns
        # kwplot autosns works well for IPython and script execution.
        # not sure about notebooks.
        import kwplot
        sns = kwplot.autosns()
        plt = kwplot.autoplt()

        plotkw = {}
        for gname, labels in group_labels.items():
            if labels:
                plotkw[gname] = gname + '_key'

        # Your variables may change
        ax = kwplot.figure(fnum=1, doclf=True).gca()
        sns.lineplot(data=data, x=xlabel, y='time', marker='o', ax=ax, **plotkw)
        ax.set_title('Benchmark: repeated multiplication vs pow')
        ax.set_xlabel('Exponent n')
        ax.set_ylabel('Time (seconds)')
        ax.set_yscale('log')

        plt.show()
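
As an aside, the built-in ** operator stays competitive for larger n partly
because integer exponentiation uses a square-and-multiply strategy, needing
O(log n) multiplications instead of n - 1. A minimal sketch of that idea
(this variant is not part of the benchmark above):

def method_pow_via_square(v, n):
    # Square-and-multiply: O(log n) multiplications instead of n - 1
    result = 1
    while n:
        if n & 1:
            result = result * v
        v = v * v
        n >>= 1
    return result

assert method_pow_via_square(3, 13) == 3 ** 13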