def aggregate_multiple_runtime_trials(Ds, Ps):
    """Collapse multiple dataframes `Ds` from different timing runs into a
    single one by taking the min over runtimes (i.e., the new runtime is the
    "best of k", where k=|Ds|).

    This function also collapses over sentences, computing corpus-level EVALB
    and the average best-of-k runtime per policy.

    """
    D0 = Ds[0]

    # Append trials together.
    foo = Ds[0]
    for dd in Ds[1:]:
        foo = foo.append(dd)

    # Take min over time_total for each policy-example pair.
    minz = foo[['policy','example','time_total']].groupby(['policy','example']).min()

    data = []
    for policy in iterview(Ps):
        dump = path(policy).dirname()
        args = cPickle.load(file(dump / 'args.pkl'))
        log = pd.read_csv(dump / 'log.csv')

        # TODO: will need to add extra cases.
        if 'DP' in args.roll_out:
            type_ = 'DP'
        elif 'CP' in args.roll_out:
            type_ = 'CP'
        elif 'HY' in args.roll_out:
            type_ = 'HY'
        elif 'BODEN' in args.roll_out:
            type_ = 'baseline'
        else:
            raise ValueError(args.roll_out)

        min_times = minz.ix[policy]['time_total']

        P = D0[D0.policy == policy]
        f = cgw_f(P.want_and_got.sum(), P.got.sum(), P.want.sum())

        #pl.scatter(df.avg_bestof_time, df.evalb, c=C[name], lw=0)
        #show_frontier(df.avg_bestof_time, df.evalb, c=C[name], interpolation='linear', label=name)
        #[w,b] = np.polyfit(df.pushes, df.avg_bestof_time, deg=1)
        #show_frontier(df.pushes*w + b, df.evalb, interpolation='linear', c=C[name])

        if 0:
            # log-log plot of pushes v. seconds. Really great correlation!
            PP = P[['example','pushes']].join(min_times, on='example')
            PP['log(pushes)'] = np.log(PP.pushes)
            PP['log(seconds)'] = np.log(PP.time_total)
            compare('log(pushes)', 'log(seconds)', data=PP, scatter=1, show_regression=1)

            #pl.figure()
            # pushes v. seconds. Really great correlation!
            #PP = P[['example','pushes']].join(min_times, on='example')
            #compare('pushes', 'time_total', data=PP, scatter=1, show_regression=1)

            pl.ioff(); pl.show()

        if 0:
            # empirical runtime estimates
            # scatter plot sentence length against runtime.
            n_by_time = P[['example','N']].join(min_times, on='example')
            pl.scatter(n_by_time.N, n_by_time.time_total, alpha=0.5, lw=0)

            # highlight median runtime per sentence length.
            n_by_median_time = n_by_time.groupby('N').median()
            pl.plot(n_by_median_time.index, n_by_median_time.time_total, c='k', lw=2)

            # empirical exponent and constant factor
            compare(np.log(n_by_time.time_total), np.log(n_by_time.N), scatter=1, show_regression=1)
            pl.ioff(); pl.show()

        # use early stopping on dev to pick the policy.
        dev = log.ix[log['dev_new_policy_reward'].argmax()]

        row = {'avg_bestof_time': np.mean(min_times),
               'wps': np.mean(P.N) / np.mean(min_times),
               'pushes': np.mean(P.pushes),
               'pops': np.mean(P.pops),
               'policy': policy,
               'dev_pushes': dev.dev_new_policy_pushes,
               'dev_evalb': dev.dev_new_policy_evalb_corpus,
               'type': type_,
               'evalb': f}
        row.update({'args_' + k: v for k, v in args.__dict__.items()})
        data.append(row)

    # remove unused baselines (sorry, this is a bit ugly).
    ddd = pd.DataFrame(data)
    others = ddd[ddd.type != 'baseline']
    B = ddd[ddd.type == 'baseline']
    used = set()
    for _, z in others.iterrows():
        [ix] = B[B.policy == z.args_init_weights].index
        used.add(ix)
    B = B.ix[list(used)]
    ddd = others.append(B)
    return ddd

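# Hedged usage sketch (not part of the original pipeline): one way the
# per-trial evaluation files written by `main()` below might be fed into
# `aggregate_multiple_runtime_trials`. The trial directories are hypothetical;
# `Ps` must be paths to policy files whose dump directory holds args.pkl and
# log.csv, which is what the `policy` column written by `main()` contains.
def _example_aggregate_trials(trial_dirs):
    """Load each trial directory's *-evaluation.csv.gz files and aggregate."""
    Ds = [pd.concat([pd.read_csv(f, compression='gzip')
                     for f in path(d).glob('*-evaluation.csv.gz')])
          for d in trial_dirs]
    Ps = sorted(Ds[0].policy.unique())   # policies must appear in every trial.
    summary = aggregate_multiple_runtime_trials(Ds, Ps)
    print summary[['policy', 'type', 'evalb', 'avg_bestof_time', 'wps']]
    return summary
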
def get_acc_run(X):
    """Return (accuracy, runtime) for the per-sentence rows in `X`:
    corpus-level EVALB F1 from the pooled want/got counts, and the mean
    number of pushes."""
    acc = cgw_f(X.want_and_got.sum(), X.got.sum(), X.want.sum())
    run = X.pushes.mean()
    return acc, run

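# Hedged usage sketch (illustrative, not part of the original code): report
# the accuracy/runtime trade-off of each policy in an evaluation dataframe
# `D` carrying the per-sentence columns used above (policy, want_and_got,
# got, want, pushes).
def _example_report_acc_run(D):
    for policy, X in D.groupby('policy'):
        acc, run = get_acc_run(X)
        print '%s\tevalb=%.3f\tavg pushes=%.1f' % (policy, acc, run)
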
def main():
    """Evaluate pruning policies: pick the best iteration of each run by dev
    reward (early stopping), download the model file if needed, and write one
    evaluation CSV per policy into the output directory."""
    from argparse import ArgumentParser
    p = ArgumentParser()
    p.add_argument('directory', type=path)
    p.add_argument('--grammar', choices=('medium', 'big'), required=1)
    p.add_argument('--split', choices=('dev', 'test', 'other', 'train'), required=1)
    p.add_argument('--download-only', action='store_true')
    args = p.parse_args()

    if 'unpruned' in args.directory:
        policies = [path('unpruned') / 'unpruned']
    else:
        # Grab the best model parameters according to early stopping on each
        # dev reward surrogate.
        policies = []
        for x in (path('results/*-lols10-*/').glob('dump')
                  + path('results/*-baseline9-*/').glob('dump')):
        # for x in (path('results/*-lols11-*/').glob('dump')):

            # only grab models matching the grammar specified.
            if cPickle.load(file(x / 'args.pkl')).grammar != args.grammar:
                continue

            df = pd.read_csv(x / 'log.csv')

            if df.get('dev_new_policy_evalb_corpus') is None:
                # [2016-04-29 Fri] SKIP some of the baseline9 jobs. These guys
                # are using the wrong evaluation.
                assert 'baseline9' in x
                continue

            # identify iteration with best dev reward (surrogate).
            el = df.datetime.map(pd.to_datetime).tolist()
            df['elapsed'] = [(t - el[0]).total_seconds() / (24 * 60 * 60) for t in el]
            df = df[df.elapsed <= 6]   # take the best policy <= 6 days of training.

            best = df.ix[df.dev_new_policy_reward.argmax()]
            print colors.yellow % 'best policy:'
            print best[['iteration', 'elapsed']]

            # download model file for that iteration, if we don't already have it.
            policy = x / ('new_policy-%03d.npz' % best.iteration)
            if not policy.exists():
                assert 0 == os.system(
                    'rsync --progress "[email protected]:/export/a11/timv/ldp/%s" %s'
                    % (policy, policy))
            policies.append(policy)

    if args.download_only:
        return

    s = Setup(grammar=args.grammar, train=0, dev=0, features=0)
    examples = list(s.load(args.split))

    shuffle(policies)

    outdir = args.directory
    outdir.mkdir_p()

    for pp, policy in enumerate(policies, start=1):
        print
        print colors.green % '[%s/%s] %s' % (pp, len(policies), policy)

        evaluation_file = outdir / (policy.dirname() / '..').abspath().basename() + '-evaluation.csv.gz'
        print evaluation_file

        if path(evaluation_file).exists():
            last_time = pd.read_csv(evaluation_file)
            [last_policy] = last_time.policy.unique()
            if last_policy == policy:
                print colors.yellow % 'SKIP: evaluation file exists.'
                continue
            print colors.red % 'replace old evaluation.'

        if 'unpruned' in policy:
            w = None
        else:
            w = np.load(policy)['coef']

        d = get_data(s.grammar, policy, w, examples)
        df = pd.DataFrame(d)

        if 1:
            xx = df.groupby('policy').sum()
            yy = df.groupby('policy').mean()
            print 'evalb: %.3f' % (cgw_f(yy.want_and_got.sum(), yy.got.sum(), yy.want.sum()))
            print 'words/sec: %.1f' % (xx.N / xx.time_total)
            print 'sent/sec: %.1f' % (1 / yy.time_total)
            print 'sec/sent: %g' % (yy.time_total)
            #print 'features: %4.1f%%' % (100 * yy.time_feature / yy.time_total)
            #print 'parse: %4.1f%%' % (100 * yy.time_parse / yy.time_total)

        pd.DataFrame(df).to_csv(evaluation_file, compression='gzip')

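# Hedged usage note: the script name below is a placeholder (the real entry
# point may live elsewhere); the positional argument and flags match the
# ArgumentParser in main() above. Each run writes one *-evaluation.csv.gz per
# policy into the given output directory.
#
#   python evaluate_policies.py results/eval-big-dev --grammar big --split dev
#   python evaluate_policies.py results/eval-big-dev --grammar big --split dev --download-only
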
def _main(args):
    """Benchmark one or more parsers on PTB examples and report accuracy
    (EVALB, optionally UAS/LAS) and runtime."""
    if args.deps:
        import StanfordDependencies
        dep = StanfordDependencies.get_instance(backend='subprocess')

    delta = args.delta
    assert 0 <= delta <= 1

    T = {}
    P = {}
    check_llh = 1

    color = {name: c for name, c in zip(sorted(P), 'rgbym' * len(P))}
    marker = {name: 'o' for name in P}

    if args.experiment not in ('grammars', ):
        # benchmark default parser
        if args.grammar == 'medium':
            grammar_file = 'data/medium'
            g = Grammar.load('data/medium')
        elif args.grammar == 'big':
            grammar_file = 'data/bubs/wsj_6'
            chomsky = False
            g = Grammar.load(grammar_file)

    if args.experiment == 'default-parser':
        P['lchild'] = Parser(leftchild, g, chomsky=0)

    elif args.experiment == 'grammar-loops':
        # Experiment: grammar loops
        P['lcbptr'] = Parser(leftchild_bp, g, chomsky=chomsky)
        P['lchild'] = Parser(leftchild, g, chomsky=chomsky)
        #P['x-prod'] = Parser(xprod, g, chomsky=chomsky)
        #P['agenda'] = AgendaParser(g, chomsky=chomsky)

    elif args.experiment == 'grammars':
        #P, color, marker = _leftchild_v_dense_yj_on_many_grammars()
        P, color, marker = _many_grammars()
        check_llh = False

    else:
        raise ValueError('Failed to recognize experiment %r' % args.experiment)

    T = {x: Timer(x) for x in P}

    overall = []
    errors = []

    examples = ptb(args.fold, minlength=3, maxlength=args.maxlength, n=args.examples)

    if 1:
        examples = list(examples)
        np.random.shuffle(examples)

    _evalb_gold = {}
    _evalb_pred = {}
    for k, p in enumerate(P):
        _evalb_gold[p] = open('tmp/evalb-%s.gold' % k, 'wb')
        _evalb_pred[p] = open('tmp/evalb-%s.pred' % k, 'wb')

    if args.policy:
        from ldp.prune.features import Features
        theta = np.load(args.policy)['coef']
        policy_grammar = Grammar.load('data/bubs/wsj_6')   # FIXME: shouldn't be hardcoded!
        F = Features(policy_grammar, nfeatures=2**22)

    for i, (s, t) in enumerate(examples):
        print
        print green % 'Example: %s, length: %s' % (i, len(s.split()))
        print yellow % s

        e = Example(s, grammar=None, gold=t)
        sentence = e.tokens
        N = e.N

        if args.policy:
            e.tokens = policy_grammar.encode_sentence(e.sentence.split())
            keep = F.mask(e, theta)
        else:
            # don't prune anything
            keep = np.ones((N, N + 1), dtype=np.int)
            for x in e.nodes:
                keep[x] = np.random.uniform(0, 1) <= delta
            for x in e.gold_spans:
                keep[x] = 1

        data = []

        #ugold = Tree.fromstring(e.gold_unbinarized)

        if args.deps:
            dep_gold = dep.convert_tree(e.gold_unbinarized, universal=0)   # TODO: include function tags???
            dep_unlabel_gold = {(z.index, z.head) for z in dep_gold}
            dep_label_gold = {(z.index, z.deprel, z.head) for z in dep_gold}

        for parser in sorted(P):
            b4 = time()
            with T[parser]:
                state = P[parser](e, keep)
            wallclock = time() - b4

            s = state.likelihood
            d = state.derivation
            pops = state.pops
            pushes = state.pushes

            ucoarse = P[parser].decode(e, d)

            # print
            # print parser
            # print ucoarse

            # write gold and predicted trees to files so we can call evalb
            print >> _evalb_gold[parser], e.gold_unbinarized
            print >> _evalb_pred[parser], oneline(ucoarse)

            GW, G, W = evalb_unofficial(e.gold_unbinarized, binarize(ucoarse))
            h = cgw_f(GW, G, W)
            # h = evalb(e.gold_unbinarized, ucoarse)

            row = {'name': parser,
                   'llh': s,
                   'sentence': sentence,
                   'N': N,
                   #'tree': tree,
                   'evalb': h,
                   'GotWant': GW,
                   'Got': G,
                   'Want': W,
                   'pops': pops,
                   'pushes': pushes,
                   'wallclock': wallclock}

            if args.deps:
                # TODO: include function tags? What is the correct way to get
                # target trees?
                dep_parse = dep.convert_tree(oneline(ucoarse), universal=0)
                dep_label = {(z.index, z.deprel, z.head) for z in dep_parse}
                dep_unlabel = {(z.index, z.head) for z in dep_parse}

                # TODO: Use the official eval.pl script from the CoNLL task.
                UAS = len(dep_unlabel & dep_unlabel_gold) / float(e.N)
                LAS = len(dep_label & dep_label_gold) / float(e.N)
                row['LAS'] = LAS
                row['UAS'] = UAS

            data.append(row)
            overall.append(row)

        df = DataFrame(overall).groupby('name').mean()
        #df['wallclock'] = sum_df.wallclock   # use total time
        df.sort_values('wallclock', inplace=1)
        df['speedup'] = df.wallclock.max() / df.wallclock
        df['wps'] = df['N'] / df['wallclock']   # ok to use avg instead of sum

        # Determine which columns to display given command-line options.
        show_cols = ['evalb_corpus', 'wallclock', 'wps', 'speedup', 'pushes', 'pops', 'LAS', 'UAS']
        if len(P) == 1:
            show_cols.remove('speedup')
        if not args.deps:
            show_cols.remove('LAS')
            show_cols.remove('UAS')

        def foo(df):
            "Add corpus-level EVALB column."
            s = DataFrame(overall).groupby('name').sum()   # create separate sum dataframe.
            P = s.GotWant / s.Got
            R = s.GotWant / s.Want
            df['evalb_corpus'] = 2 * P * R / (P + R)
            df['evalb_avg'] = df.pop('evalb')   # get rid of old column.

        foo(df)

        print df[show_cols]

        if args.pareto:
            accuracy_name = 'evalb'
            with axman('speed-accuracy ($\delta= %g$)' % delta) as ax:
                df = DataFrame(overall).groupby('name').mean()
                runtime = df.wallclock / df.wallclock.max()
                for name, x, y in zip(df.index, runtime, df[accuracy_name]):
                    c = color[name]
                    ax.scatter([x], [y], alpha=0.75, lw=0, s=50, c=c, label=name, marker=marker[name])
                ax.legend(loc=4)
                ax.set_xlim(-0.1, 1.1)
                ax.set_ylim(0, 1)
                ax.grid(True)
                ax.set_xlabel('runtime (relative to slowest)')
                ax.set_ylabel('accuracy (%s)' % accuracy_name)
                show_frontier(runtime, df[accuracy_name], ax=ax)

        if args.bylength:
            # Breakdown runtime differences of parsers by length.
            bylength = {name: [] for name in T}
            for length, df in DataFrame(overall).groupby('N'):
                df = df.groupby('name').mean()
                for name, v in df.wallclock.iteritems():
                    bylength[name].append([length, v])

            with axman('benchmark') as ax:
                for name, d in sorted(bylength.items()):
                    d.sort()
                    xs, ys = np.array(d).T
                    ax.plot(xs, ys, alpha=0.5, c=color[name], label=name)
                    ax.scatter(xs, ys, alpha=0.5, lw=1, c=color[name])
                ax.legend(loc=2)
                ax.set_xlabel('sentence length')
                ax.set_ylabel('seconds / sentence')

        if check_llh:
            # Only run this test when it makes sense, e.g., when all parses
            # come from the same grammar.
            s0 = data[0]['llh']
            for x in data:
                s = x['llh']
                name = x['name']
                if abs(s0 - s) > 1e-10:
                    errors.append({'parser': name, 'sentence': sentence})
                    print '[%s]: name: %s expect: %g got: %g' % (red % 'error', name, s0, s)

    Timer.compare_many(*T.values(), verbose=False)

    if errors:
        print red % 'errors: %s' % len(errors)

    print
    print green % '==============================='
    print green % 'DONE!'
    print

    print 'EVALB-unofficial:'
    print 2 * (df.GotWant / df.Got * df.GotWant / df.Want) / (df.GotWant / df.Got + df.GotWant / df.Want)
    print

    print 'EVALB-official:'
    import os
    for k, p in enumerate(P):
        _evalb_pred[p].close()
        _evalb_gold[p].close()
        out = 'tmp/evalb-%s.out' % k
        os.system('./bin/EVALB/evalb %s %s > %s' % (_evalb_gold[p].name, _evalb_pred[p].name, out))
        with file(out) as f:
            for x in f:
                if x.startswith('Bracketing FMeasure'):
                    print p, float(x.strip().split()[-1])
                    break   # use the first one, which is for all lengths.

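# Hedged sketch of the command-line options `_main` expects. Only the
# attribute names are taken from the code above (experiment, grammar, fold,
# maxlength, examples, delta, policy, deps, pareto, bylength); the defaults,
# types, and the function name are assumptions, and the real CLI may differ.
def _example_benchmark_cli():
    from argparse import ArgumentParser
    q = ArgumentParser()
    q.add_argument('--experiment', choices=('default-parser', 'grammar-loops', 'grammars'),
                   default='default-parser')
    q.add_argument('--grammar', choices=('medium', 'big'), default='medium')
    q.add_argument('--fold', default='dev')              # PTB fold passed to ptb(...)
    q.add_argument('--maxlength', type=int, default=40)
    q.add_argument('--examples', type=int, default=100)
    q.add_argument('--delta', type=float, default=1.0)   # keep probability for random pruning
    q.add_argument('--policy', default=None)             # path to a new_policy-*.npz file
    q.add_argument('--deps', action='store_true')
    q.add_argument('--pareto', action='store_true')
    q.add_argument('--bylength', action='store_true')
    return q.parse_args()

# _main(_example_benchmark_cli())
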
def f1(self):
    return cgw_f(self.C, self.G, self.W)
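
# For reference, a hedged sketch of what `cgw_f(C, G, W)` presumably computes;
# the real implementation is imported from elsewhere in the repo. It matches
# the corpus-F1 computation in `foo()` above: precision = C/G ("correct over
# got"), recall = C/W ("correct over want"), F1 = their harmonic mean.
def _cgw_f_sketch(C, G, W):
    P = C / float(G) if G else 0.0   # precision
    R = C / float(W) if W else 0.0   # recall
    return 2 * P * R / (P + R) if (P + R) else 0.0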