def test_simulate():
    """Check ps.run(..., simulate=True).

    A simulated run must append its psets to a separate
    '<calc_dir>.simulate' database with NaN results, leaving the real
    database untouched; re-running those psets for real then appends the
    actual results to the real database.
    """
    tmpdir = tempfile.mkdtemp(prefix='psweep_test_simulate_')
    params = [{'a': 1}, {'a': 2}, {'a': 3}, {'a': 4}]
    params_sim = [{'a': 88}, {'a': 99}]
    calc_dir = "{}/calc".format(tmpdir)
    calc_dir_sim = calc_dir + '.simulate'

    df = ps.run(func, params, calc_dir=calc_dir)
    df_sim = ps.run(func, params_sim, calc_dir=calc_dir, simulate=True)
    dbfn = "{}/results.pk".format(calc_dir)
    dbfn_sim = "{}/results.pk".format(calc_dir_sim)

    # real run: 4 psets; simulated db: those 4 plus the 2 simulated ones
    assert len(df) == 4
    assert len(df_sim) == 6
    assert os.path.exists(dbfn)
    assert os.path.exists(dbfn_sim)
    # each database round-trips to exactly the returned frame
    assert df.equals(ps.df_read(dbfn))
    assert df_sim.equals(ps.df_read(dbfn_sim))
    # the simulated db starts with a copy of the real rows ...
    assert df.iloc[:4].equals(df_sim.iloc[:4])
    # ... and the simulated psets carry no computed result
    assert np.isnan(df_sim.result.values[-2:]).all()

    # now run the new psets for real: appended to the real database
    df2 = ps.run(func, params_sim, calc_dir=calc_dir)
    assert len(df2) == 6
    assert df.iloc[:4].equals(df2.iloc[:4])
    assert (df2.result.values[-2:] == np.array([880.0, 990.0])).all()
    shutil.rmtree(tmpdir)
def test_run():
    """Two consecutive ps.run() calls on the same calc_dir must append to
    one database; the second call additionally writes per-pset tmp results.
    """
    tmpdir = tempfile.mkdtemp(prefix='psweep_test_run_')
    params = [{'a': 1}, {'a': 2}, {'a': 3}, {'a': 4}]
    calc_dir = "{}/calc".format(tmpdir)

    # first run: one _run_id, a distinct _pset_id per pset
    df = ps.run(func, params, calc_dir=calc_dir)
    assert len(df) == 4
    assert len(df._run_id.unique()) == 1
    assert len(df._pset_id.unique()) == 4

    # second run appends 4 rows under a fresh _run_id, with tmp results
    df = ps.run(func, params, calc_dir=calc_dir, poolsize=2, tmpsave=True)
    assert len(df) == 8
    assert len(df._run_id.unique()) == 2
    assert len(df._pset_id.unique()) == 8
    assert set(df.columns) == {'_calc_dir', '_pset_id', '_run_id',
                               '_time_utc', 'a', 'result'}

    dbfn = "{}/results.pk".format(calc_dir)
    assert os.path.exists(dbfn)
    assert df.equals(ps.df_read(dbfn))

    # tmp results of second run: one pickle per pset of that run
    run_id = df._run_id.unique()[-1]
    for pset_id in df[df._run_id == run_id]._pset_id:
        tmpsave_fn = "{calc_dir}/tmpsave/{run_id}/{pset_id}.pk".format(
            calc_dir=calc_dir, run_id=run_id, pset_id=pset_id)
        assert os.path.exists(tmpsave_fn)
    shutil.rmtree(tmpdir)
def get_holomaps():
    """Build one holoviews HoloMap per varied parameter ("study").

    Reads 'img_dct_rgb.pk' ({pset_id: jpeg byte string}) and the psweep
    database 'results.pk'; returns {study: hv.HoloMap} for the studies
    listed in vary_cols, skipping failed runs.
    """
    # dict with jpeg byte strings of all images from the parameter study
    # {pset_id: jpegstr}
    jpegstr_dct = common.pkread('img_dct_rgb.pk')
    # shape: assume all imgs have the same shape, probe the first one
    first_key = list(jpegstr_dct.keys())[0]
    shape = common.jpegstr2imgarr(jpegstr_dct[first_key]).shape
    # the parameter sweep database (created by the psweep package)
    df = ps.df_read('results.pk')
    df = df[df.fail_state.isna()]
    vary_cols = [
        'style_weight',
        'tv_weight',
        'learning_rate',
        'style_scales',
        'content_weight_blend',
        'style_layer_weight_exp',
    ]
    holos = {}
    print("creating holomaps ...")
    for study in vary_cols:
        print(" " + study)
        this_df = df[df.study == study].sort_values(study)
        # {value of varied param (study): hv.RGB image, ...}
        # NOTE(review): `.loc[...][0]` indexes the Series by *label* 0 --
        # presumably valid for this df's index; confirm vs .iloc[0]
        imgs = {
            this_df.loc[this_df._pset_id == pset_id, study][0]:
                hv.RGB(common.jpegstr2imgarr(jpegstr_dct[pset_id]))
            for pset_id in this_df._pset_id
        }
        holos[study] = hv.HoloMap(imgs, kdims=study)
    # holoviews settings for matplotlib
    hv.util.opts({
        'RGB': {
            'plot': {
                'fig_latex': False,
                'aspect': shape[1] / shape[0],
                'fig_size': 200,
                'xaxis': False,
                'yaxis': False,
            }
        }
    })
    print("\nhang tight, we're rendering stuff ...")
    return holos
# NOTE(review): chunk starts mid-file -- this call is the tail of a plotting
# function whose def line is above the visible region.
    savefig(fig, '{}_{}'.format(study, maxsize_str))


def savefig(fig, name):
    """Save *fig* as pics/<name>.<ext> at 300 dpi, creating pics/ if needed."""
    os.makedirs('pics', exist_ok=True)
    # pdf output currently disabled; kept commented for easy re-enabling
    ## for ext in ['pdf', 'png']:
    for ext in ['png']:
        fig.savefig("pics/{name}.{ext}".format(name=name, ext=ext), dpi=300)


if __name__ == '__main__':
    # optional CLI arg: path to the psweep results database
    if len(sys.argv) > 1:
        results = sys.argv[1]
    else:
        results = 'calc/results.pk'
    dfall = ps.df_read(results)
    if 'share_leafs' in dfall.columns:
        # normalize the column to bool for plotting/grouping
        dfall.share_leafs = dfall.share_leafs.astype(bool)
    # one set of plots per distinct maxsize_str value
    for maxsize_str in np.unique(dfall.maxsize_str.values):
        df = dfall[dfall.maxsize_str == maxsize_str]
        if 'main_blocksize_single' in df.study.values:
            plot('main_blocksize_single', df, 'blocksize', 'timing',
                 'filesize_str', plot='semilogx')
        if 'main_filesize_single' in df.study.values:
            plot('main_filesize_single', df, 'filesize', 'timing',
                 'blocksize_str')
        if 'main_blocksize' in df.study.values:
            # NOTE(review): chunk truncated here -- branch body continues
            # below the visible region.
from matplotlib import pyplot as plt
import numpy as np
from psweep import psweep as ps
import common


def nearest_idx(src, tgt):
    """For each value in *tgt*, return the index of the nearest entry in *src*.

    src: 1d numpy array; tgt: iterable of scalars. Returns a list of ints.
    """
    return [np.abs(src - x).argmin() for x in tgt]


if __name__ == '__main__':
    # parameter-study database written by psweep
    df = ps.df_read('results.pk')
    # keep only runs that did not fail
    df = df[df.fail_state.isna()]
    # all possible parameters, with data limits
    # (presumably None = no limit, [lo, hi] with None for an open end --
    # TODO confirm against the consumer of vary_cols below this chunk)
    vary_cols = dict(
        style_weight=[1, 30],
        tv_weight=None,
        learning_rate=None,
        style_scales=[None, 2],
        content_weight_blend=[None, 0.5],
        style_layer_weight_exp=None,
    )
    # decode every jpeg byte string into an image array, keyed by pset_id
    img_dct = {
        key: common.jpegstr2imgarr(val)
        for key, val in common.pkread('img_dct_rgb.pk').items()
os.makedirs('pics', exist_ok=True) # anonymize dirs for plots which we publish anon_dirs = True # group commands which perform roughly the same thing together, the last # group's commands are unrelated cmd_groups = \ [('s', r"^(findsame|jdupes -rQ|rdfind|duff -ra)$"), ('o', r"^(jdupes -r|duff -rat)$"), ('^', r"^(findsame -l 4K|jdupes -rTT)$"), ('*', r"^findsame.*(-t1|-l 512K).*$"), ] df = ps.df_read('calc/results.pk') letters = iter(string.ascii_uppercase) datadirs = [ analyze.DataDir(pth, next(letters)) for pth in df.datadir.unique() ] datadirs.append(analyze.DataDir(os.environ['HOME'], 'HOME')) print(datadirs) # plot bench data, exclude HOME, for which we only calculate the histogram # later for datadir in datadirs[:-1]: fig, ax = plt.subplots() for cache, color in [('cold', 'tab:blue'), ('warm', 'tab:orange')]: this_df = df[(df.cache == cache) & (df.datadir == datadir.path)] cmds = list(map(filter_cmd, this_df.tool_cmd))
Find all _run_ids

    $ {this} {db} | jq -r '.[]|._run_id' | sort -u
    02ca9694-696e-4fdd-ac08-8c343080bb63
    0a1fb364-8681-4178-869e-1126f3719da4
    97058ee2-2e81-426f-b674-04b7ec718c43

Print a table of some columns

    $ {this} {db} | jq -r '.[]|[._time_utc,.study,._run_id]|@tsv' | column -t

Show which _run_ids have which study

    $ {this} {db} | jq -r '.[]|[.study,._run_id]|@tsv' | uniq | column -t | sort -k1
    foo          02ca9694-696e-4fdd-ac08-8c343080bb63
    bar          0a1fb364-8681-4178-869e-1126f3719da4
    prod=foo:bar 97058ee2-2e81-426f-b674-04b7ec718c43

Complex example: show the start time of each run

    $ {this} {db} > /tmp/json
    $ for x in $(jq -r '.[]|._run_id' /tmp/json | sort -u); do \
    ...   echo $x $(jq -r "[.[]|select(._run_id==\"$x\")|._time_utc]|min" /tmp/json) \
    ... done
    02ca9694-696e-4fdd-ac08-8c343080bb63 2018-09-03T00:00:24Z
    0a1fb364-8681-4178-869e-1126f3719da4 2018-09-02T22:23:47Z
    97058ee2-2e81-426f-b674-04b7ec718c43 2018-09-02T22:09:44Z
""".format(this=os.path.basename(__file__), db='results.pk')

if __name__ == '__main__':
    # CLI parsed by docopt from the usage/docstring above (truncated in
    # this view); <file> is the psweep database, -o the pandas json orient
    args = docopt.docopt(__doc__)
    df = ps.df_read(args['<file>'])
    # dump the whole database as JSON to stdout
    print(ps.df_to_json(df, orient=args['-o']))
def test_df_io():
    """Round-trip a DataFrame of mixed-type values through ps.df_write /
    ps.df_read for both the 'pickle' and 'json' backends.

    pickle must preserve everything exactly; json is checked for every
    supported `orient` (plus the default) with a 12-significant-digit
    tolerance for floats.
    """
    # pandas.util.testing was deprecated and later removed; the public
    # location of assert_frame_equal is pandas.testing
    from pandas.testing import assert_frame_equal
    letters = string.ascii_letters
    ri = np.random.randint
    rn = np.random.rand

    def rs(n):
        # random string of length n
        return ''.join(letters[ii] for ii in ri(0, len(letters), n))

    for fmt in ['pickle', 'json']:
        # build 2 rows of deliberately awkward values, one column per letter
        row_frames = []
        for _ in range(2):
            vals = [
                ri(0, 100),
                rs(5),
                np.nan,
                '"{}"'.format(rs(5)),
                "'{}'".format(rs(5)),
                (ri(0, 99), rn(), '{}'.format(rs(5))),
                [ri(0, 99), rn(), "{}".format(rs(5))],
                rn(),
                rn(5),
                rn(5, 5),
                list(rn(5)),
                {
                    'a': 1,
                    'b': 3,
                    'c': [1, 2, 3]
                },
            ]
            if fmt == 'pickle':
                # types json cannot represent faithfully
                vals += [
                    True,
                    False,
                    None,
                    set(ri(0, 99, 10)),
                ]
            row_frames.append(pd.DataFrame([dict(zip(letters, vals))]))
        # DataFrame.append() was removed in pandas 2.0; concat instead
        df = pd.concat(row_frames, ignore_index=True)

        if fmt == 'json':
            for orient in [
                    None, 'split', 'records', 'index', 'columns', '_default_'
            ]:
                print("orient: ", orient)
                # mkstemp instead of the insecure, deprecated mktemp
                fd, fn = tempfile.mkstemp(
                    prefix='psweep_test_df_io_{}_{}_'.format(fmt, orient))
                os.close(fd)
                if orient != '_default_':
                    ps.df_write(df, fn, fmt=fmt, orient=orient)
                    read = ps.df_read(fn, fmt=fmt, orient=orient)
                else:
                    # exercise the backend's default orient as well
                    ps.df_write(df, fn, fmt=fmt)
                    read = ps.df_read(fn, fmt=fmt)
                os.remove(fn)
                # check_less_precise was removed; rtol expresses the same
                # "compare to ~12 significant digits" intent
                assert_frame_equal(df, read, check_exact=False, rtol=1e-12)
        elif fmt == 'pickle':
            fd, fn = tempfile.mkstemp(
                prefix='psweep_test_df_io_{}_'.format(fmt))
            os.close(fd)
            ps.df_write(df, fn, fmt=fmt)
            read = ps.df_read(fn, fmt=fmt)
            os.remove(fn)
            # pickle round-trips must be exact
            assert_frame_equal(df, read)
        else:
            raise Exception("unknown fmt")
import os
import common
from psweep import psweep as ps

pj = os.path.join

if __name__ == '__main__':
    # jpeg compression quality (percent)
    quality = 40
    # all data, not part of this repo
    basedir = ps.fullpath('~/work/data/hackathon/calc')
    df = ps.df_read(f'{basedir}/results.pk')
    # new column, detect failed run
    df = df.reindex(columns=df.columns.tolist() + ['fail_state'])
    # (log-file regex, fail_state label) pairs for classifying failed runs
    cases = [
        (r'std::bad_alloc', 'bad_alloc'),
        (r'Killed', 'killed'),
    ]
    img_dct_rgb = {}
    img_dct_gray = {}
    # scan each pset's log file for failure signatures
    for ii, pset_id in enumerate(df._pset_id.values):
        with open(pj(basedir, pset_id, 'log')) as fd:
            txt = fd.read()
        go = True
        for regex, fail_state in cases:
            # NOTE(review): chunk truncated here -- the classification body
            # is below the visible region.