Example #1
File: mds_twitter.py Project: afcarl/viz
def callback(br, m):
    _ = users, sct  # FIXME: ip only seems to take local and global; sct is neither..
    newsizes = np.array(map(m.ix[0].similarity, users)) * 30
    #s = sct.get_sizes()
    #s[:] = newsizes
    sct._sizes = newsizes
    ip()
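Every snippet on this page ends by dropping into an interactive shell via `arsenal.debug.ip`. That helper is not shown here; below is a minimal sketch of such a function using only the standard library (the real `ip` presumably wraps IPython instead). It also illustrates the FIXME above: only names visible in the caller's frame are exposed, which is why `users` and `sct` are pulled into local scope with `_ = users, sct` before calling `ip()`.

import code
import sys

def ip():
    """Sketch of an ip()-style helper: open an interactive console over the
    caller's namespace (the real arsenal.debug.ip presumably uses IPython)."""
    frame = sys._getframe(1)                      # the frame that called ip()
    ns = dict(frame.f_globals, **frame.f_locals)  # merge globals and locals
    code.interact(local=ns)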
Example #2
File: scattermatrix.py Project: afcarl/viz
def callback(event):
    print 'callback:', event
    ax = event.inaxes
    pl.ion()
    newfig = pl.figure()
    ax.set_figure(newfig)
    newfig.set_axes([ax])
    newfig.canvas.show()

    ip()
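For context, a callback like this receives a Matplotlib mouse event (hence `event.inaxes`) and is typically wired up through the canvas event system. A hedged sketch of how it might be registered; the figure setup here is hypothetical:

import matplotlib.pyplot as pl

# Hypothetical setup: a small grid of scatter axes, with the callback above
# fired on every mouse click inside the figure.
fig, axes = pl.subplots(2, 2)
fig.canvas.mpl_connect('button_press_event', callback)
pl.show()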
Example #3
File: pdfmill.py Project: afcarl/skid
    def play(self):

        df = self.data_frame()
        df1 = df.set_index(['fontsize', 'fontname']).sort(ascending=False)

        for k, v in df.groupby(['fontsize', 'fontname'], sort=True):
            print '-----'
            print unicode(k).encode('utf8'), unicode(v).encode('utf8')

        print df1.to_string()

        from arsenal.debug import ip
        ip()
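Note that `DataFrame.sort(ascending=False)` was deprecated and later removed from pandas. A sketch of the same pattern in modern pandas / Python 3, assuming `self.data_frame()` still provides `fontsize` and `fontname` columns:

    def play(self):
        df = self.data_frame()
        # DataFrame.sort() is gone; sorting by the new index is now sort_index().
        df1 = df.set_index(['fontsize', 'fontname']).sort_index(ascending=False)

        for k, v in df.groupby(['fontsize', 'fontname'], sort=True):
            print('-----')
            print(k, v)  # Python 3 strings are already Unicode; no .encode needed

        print(df1.to_string())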
Example #4
def main():
    "Command-line interface for running test cases."
    from argparse import ArgumentParser
    p = ArgumentParser()

    p.add_argument('--boolean', action='store_true')

    p.add_argument('--minlength', type=int, default=5)
    p.add_argument('--maxlength', type=int, default=30)
    p.add_argument('--examples', type=int, required=True)
    p.add_argument('--seed', type=int, default=None)
    p.add_argument('--grammar', choices=('medium', 'big'), default='medium')
    p.add_argument('--aggressive',
                   type=float,
                   default=0.5,
                   help='Pruning rate (zero=no pruning, one=lots of pruning).')

    args = p.parse_args()

    np.random.seed(args.seed)

    s = Setup(train=args.examples,
              grammar=args.grammar,
              maxlength=args.maxlength,
              minlength=args.minlength,
              features=False)

    test = _test_correctness_boolean if args.boolean else _test_correctness

    for i, example in enumerate(s.train):
        print colors.yellow % '=============================================================='
        print 'example: %s length: %s' % (i, example.N)
        test(example, s.grammar, args.aggressive)

    print colors.green % '=============================================================='
    print colors.green % 'DONE'
    print

    if 0:
        from arsenal.debug import ip
        ip()
    else:
        pl.ioff()
        pl.show()
Example #5
File: mds_twitter.py Project: afcarl/viz
def icons(users, distance):
    """Visualization using user profile images as the points."""

    # It would be pretty cool to put user thumbnails where the points are,
    # but I'm still not sure how to do this yet.
    images = []

    try:
        print 'getting images..'
        for p in users:
            print p
            f = p.image
            img = imread('image.tmp')
            images.append(img)
    except Exception as e:
        print 'got an error...'
        import traceback
        etype, evalue, tb = sys.exc_info()
        print yellow % '\n'.join(traceback.format_exception(etype, evalue, tb))
        ip()

    (W, H, _) = shape(img)  # thumbnails should all be the same size
    count = len(images)

    pl.figure()

    P2, _ = mds(distance, 2)
    X, Y = P2[:, 0], P2[:, 1]

    ## XXX: not a great transformation b/c we might stretch more in one dimension
    def N(x):
        "force x to fit in interval [0,1]"
        x = (x - x.min())
        x = x / x.max()
        assert all(x >= 0) and all(x <= 1)
        return x

    X = N(X) * 475
    Y = N(Y) * 425

    figimages = [
        pl.figimage(img, xo=x, yo=y) for img, x, y in zip(images, X, Y)
    ]
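The XXX comment above notes that normalizing X and Y independently can stretch one dimension more than the other. A minimal sketch of a variant that rescales both axes by a single shared factor, preserving the relative distances from the MDS embedding (function name hypothetical):

import numpy as np

def N_shared(X, Y, size=425):
    "Scale both coordinate arrays into [0, size] using one shared scale factor."
    X = X - X.min()
    Y = Y - Y.min()
    scale = max(X.max(), Y.max()) or 1.0   # guard against degenerate input
    return X / scale * size, Y / scale * size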
Example #6
def main():
    from argparse import ArgumentParser
    p = ArgumentParser()
    p.add_argument('--save', default='tmp/results.csv')
    p.add_argument(
        '--interpolation',
        choices=['linear', 'pessimistic', 'parametric', 'linear-convex'],
        default='pessimistic')
    # reward definition
    p.add_argument(
        '--accuracy',
        #choices=ACC.OPTS,
        #default='evalb',
        required=True,
        help='Measurement used for plotting.')
    p.add_argument(
        '--runtime',
        #choices=RUN.OPTS,
        #default='pushes',
        required=True,
        help='Measurement used for plotting.')
    # what jobs to show
    p.add_argument('--target', required=True)
    p.add_argument('--baseline', required=False)
    p.add_argument('--others', nargs='*', default=[])
    p.add_argument('--filter',
                   nargs='*',
                   default=[],
                   help="e.g., --filter 'df.args_C==-12'")

    # TODO: add nicer filters for things I've been doing with --filter.
    #    p.add_argument('--grammar')
    #    p.add_argument('--surrogate-accuracy', help='measure to filter jobs by.')
    #    p.add_argument('--surrogate-runtime', help='measure to filter jobs by.')

    # finalization
    p.add_argument('--last', action='store_true')
    p.add_argument('--early-stop', action='store_true')
    p.add_argument('--early-stop-dev-cheat', action='store_true')
    p.add_argument('--baseline-is-init', action='store_true')
    # extra plots
    p.add_argument('--tradeoff-plot', action='store_true')
    p.add_argument('--lc', action='store_true')
    p.add_argument('--show-train', action='store_true')
    p.add_argument('--show-each',
                   action='store_true',
                   help='flip thru learning curves')
    # misc
    p.add_argument('--kill-mode',
                   action='store_true',
                   help='flip thru learning curves asking "kill? [y/n]"')
    p.add_argument('--jobids', nargs='*', default=[])
    p.add_argument('-i', action='store_true')

    p.add_argument('--other-files', nargs='*', default=[])

    args = p.parse_args()

    # use linear interpolation for plotting Pareto frontier
    global show_frontier
    if args.interpolation in {'linear', 'linear-convex'}:

        def _show_frontier_linear(*a, **k):
            k['interpolation'] = args.interpolation  #'linear'
            _show_frontier(*a, **k)

        show_frontier = _show_frontier_linear

    if args.kill_mode:
        args.show_each = True
    if args.show_each:
        args.lc = True

    results = path('results')

    filters = set(args.others + [args.target, args.baseline])

    ACCURACY = args.accuracy
    RUNTIME = args.runtime

    #    if args.load:
    #        D = read_csv('tmp/results.csv', index_col=0)
    #        jobs = read_csv('tmp/jobs.csv', index_col=0)
    #    else:

    data, jobs = load_results(results, args, filters)
    D = DataFrame(data)
    #D.to_csv('tmp/results.csv')

    jobs = DataFrame(jobs)
    #jobs.to_csv('tmp/jobs.csv')

    target = args.target
    df = D[D.name == target]

    # apply CLI filter options
    for f in args.filter:
        df = df[eval(f)]  # this is pretty ghetto.

    if args.baseline_is_init:
        # Note: this will use the same filters as the main experiment (e.g.,
        # regularization parameters).
        baseline = df[(df.iteration == 1)]
    else:
        baseline = D[D.name == args.baseline]

    # NOTE: do before iteration filters (e.g., early-stop/last)
    def PPPP(df):
        "Patience: Find out how long it's been since the last improvement."
        P = []
        for jobid, D in df.groupby('jobid'):
            #p = running_max(list(D.iteration), list(D.dev_reward))
            p = running_max(list(D.iteration), list(D.dev_new_policy_reward))
            P.append({
                'jobid': jobid,
                'patience': D.iteration.max() - p[-1, 0]
            })
        return DataFrame(P).set_index('jobid')

    P = PPPP(df)

    # TODO: In most experiments, train rewards are based on a sample which
    # varies across iterations -- maybe I should use the same examples
    # throughout. Thus, we probably don't really want to do early stopping based
    # on this value without smoothing or something.
    if args.last or args.early_stop or args.early_stop_dev_cheat:
        assert not (args.early_stop and args.last), "Can't have both."
        ddd = []
        for _, dd in df.groupby('jobid'):

            if args.last:
                best = dd.ix[dd.iteration ==
                             dd.iteration.max()]  # last iteration
            elif args.early_stop:
                best = dd.ix[
                    dd.train_new_policy_reward ==
                    dd.train_new_policy_reward.max()]  # best train iteration
            elif args.early_stop_dev_cheat:
                best = dd.ix[dd.dev_new_policy_reward == dd.
                             dev_new_policy_reward.max()]  # best dev iteration

                # Break ties in dev reward in favor of the training reward
                #_best = dd.ix[dd[dd.dev_new_policy_reward == dd.dev_new_policy_reward.max()].train_new_policy_reward.argmax()]
                #_best = dd.ix[dd.dev_new_policy_reward.argmax()]

                #print _best.iteration, 'out of', dd.iteration.max(), 'iterations'
                #print _best.log
                #print >> get_params, _best.log.dirname() / 'new_policy-%03d.npz' % _best.iteration

            else:
                raise ValueError('Unrecognized option.')

            # We require the following yucky code because `best.to_dict()`
            # returns a dict whose values are themselves dicts.
            #
            #  ^^^ I think this happens because `best` might contain more than
            #  one row, so when you convert it to a dict you get a collection of
            #  potential values -- that's why there's an inner dict.
            row = {}
            for k, v in best.to_dict().items():
                v = list(
                    v.values())[-1]  # take the last one if there are ties.
                row[k] = v
            ddd.append(row)

        df = DataFrame(ddd)

    assert not df.empty, 'DataFrame is empty.'

    if baseline is not None and not baseline.empty:
        args_check(baseline, 'baseline')
    args_check(df, 'df')

    if args.tradeoff_plot:
        # TODO: [2015-02-27 Fri] maybe we should sample tradeoff on a nonlinear
        #   scale (e.g., log-scale). We seem to get a much more linear response
        #   from training. This would help prevent the over-sampling of values
        #   with low-accuracy and low-runtime.
        pl.figure()
        pl.scatter(df.tradeoff, df.dev_accuracy, lw=0)
        pl.title(r'accuracy (%s) by $\lambda$' % ACCURACY)
        pl.xlabel(r'tradeoff ($\lambda$)')
        pl.ylabel(r'accuracy (%s)' % ACCURACY)
        pl.figure()
        pl.scatter(df.tradeoff, df.dev_runtime, lw=0)
        pl.title(r'runtime (%s) by $\lambda$' % RUNTIME)
        pl.xlabel(r'tradeoff ($\lambda$)')
        pl.ylabel(r'runtime (%s)' % RUNTIME)

        # TODO: It would be interesting to compare the baseline's tradeoff
        # parameter to ours, but they are sort of incomparable.
        #
        #pl.figure()
        #pl.scatter(baseline.tradeoff, baseline.dev_accuracy, lw=0)
        #pl.title(r'BASELINE accuracy (%s) by $\lambda$' % ACCURACY)
        #pl.xlabel(r'tradeoff ($\lambda$)')
        #pl.ylabel(r'accuracy (%s)' % ACCURACY)
        #pl.figure()
        #pl.scatter(baseline.tradeoff, baseline.dev_runtime, lw=0)
        #pl.title(r'BASELINE runtime (%s) by $\lambda$' % RUNTIME)
        #pl.xlabel(r'tradeoff ($\lambda$)')
        #pl.ylabel(r'runtime (%s)' % RUNTIME)

    frontier = Frontier(args.target, df, args.accuracy, args.runtime)
    frontier.plot()

    #    # Plot reference policies (oracle1, unpruned and fast-mle).
    #    #if args.target in {'searn4', 'searn5'}:
    #    if ACCURACY != 'no-fail':
    #        # Note: this is sort of silly. Jobs each have a copy of the
    #        # baseline.csv file. Here we take the first one that comes
    #        # up. (Warning: we might mix baselines, so be careful.) This "guess"
    #        # lets us avoid passing the file in at the command-line. It's
    #        # conceivable that we might want to show multiple reference policies
    #        # (e.g., different grammars on the same plot), in which case we
    #        # should probably have a CLI option to specify these files.
    #
    #        # TODO: report baseline parser's accuracy at most-acc's runtime
    #        #
    #        #  - Create a class for representing a Pareto frontier, which supports the
    #        #    relevant query types: accuracy @ runtime and runtime @ accuracy.
    #
    #        show_reference_policies = 0
    #        if show_reference_policies:
    #
    #            baseline_csv = path('.').glob('results/*-%s-*/dump/baseline.csv' % args.target)[0]
    #            B = read_csv(baseline_csv)
    #
    #            # Show reference policies (e.g., unpruned, oracle)
    #            marker = {'oracle1': '*', 'fastmle': '^', 'unpruned': 'x'}
    #            for policy in ['oracle1', 'unpruned']:
    #                for name in ['train', 'dev']:
    #                    if policy == 'unpruned':   # XXX: skip unpruned because it makes the plot ugly
    #                        continue
    #                    getattr(frontier, '%s_ax' % name) \
    #                        .scatter([B['%s_%s_%s' % (name, policy, RUNTIME)]],
    #                                 [B['%s_%s_%s' % (name, policy, ACCURACY)]],
    #                                 c='r', s=40, marker=marker[policy])
    #
    #        if args.show_init:
    #            if len(init_run) == 0:
    #                print '[%s]' % red % 'error', 'Failed to find initializer.'
    #            else:
    #                frontier.dev_ax.scatter(init_run, init_acc, s=75, c='k', marker='^')
    #
    #        if show_reference_policies:
    #            [[unpruned_acc, unpruned_run]] = B[['dev_unpruned_%s' % ACCURACY, 'dev_unpruned_%s' % RUNTIME]].get_values()
    #            most_acc_acc, most_acc_run = df.ix[df['dev_accuracy'].argmax()][['dev_accuracy', 'dev_runtime']]
    #            print 'unpruned: %.4f %g' % (unpruned_acc*100, unpruned_run)
    #            print 'most_acc: %.4f %g' % (most_acc_acc*100, most_acc_run)
    #            print 'MOSTACC:  %.2f points more accurate and %.2fx faster than unpruned.' % (100*(most_acc_acc - unpruned_acc), unpruned_run / most_acc_run)
    #
    #        if show_reference_policies:
    #            # [2015-06-08 Mon] hack together fast-mle by piecing together
    #            #   unpruned with oracle runtime (which isn't really exact if more
    #            #   grammar rules fire on the unpruned mask... it's not unreasonable that
    #            #   more rules fire on unpruned since the gold mask might be
    #            #   unsupported by the parser).
    #            [acc] = B['dev_unpruned_%s' % ACCURACY].get_values()
    #            [run] = B['dev_oracle1_%s' % RUNTIME].get_values()
    #            frontier.dev_ax.scatter([run], [acc], c='r', s=40, marker=marker['fastmle'])
    #            [acc] = B['train_unpruned_%s' % ACCURACY].get_values()
    #            [run] = B['train_oracle1_%s' % RUNTIME].get_values()
    #            frontier.train_ax.scatter([run], [acc], c='r', s=40, marker=marker['fastmle'])
    #
    #    else:
    #        print '[%s] %s' % (red % 'ERROR', 'no baseline.csv file found')

    if 1:
        frontier.show_baseline(baseline)

    # Show frontiers for 'other' things. Not the baseline (because the baseline
    # gets special handling), but things like older experiments.
    others = set(D.name.unique()) - {args.target, args.baseline}
    if others:
        print
        print yellow % 'Other curves'
        print yellow % '============'
        for other, color in zip(sorted(others),
                                cycle(['m', 'g', 'c', 'k', 'b'])):
            print '%-9s' % other, color
            alpha = 1.0
            d = D[D.name == other]
            args_check(d, other)
            if not d.train_runtime.isnull().all():
                show_frontier(d.train_runtime,
                              d.train_accuracy,
                              ax=frontier.train_ax,
                              c=color,
                              alpha=alpha,
                              XMAX=frontier.XMAX,
                              YMIN=frontier.YMIN,
                              lw=LW)
            if not d.dev_runtime.isnull().all():

                # XXX: this is just some cruft from debugging a set of jobs. Can probably delete.
                #print d[['dev_runtime','dev_accuracy']].sort('dev_accuracy')
                #print df[['dev_runtime','dev_accuracy']].sort('dev_accuracy')
                #assert (np.abs(np.array(df.dev_accuracy.sort_values()) - np.array(d.dev_accuracy.sort_values())) < 1e-5).all()
                #assert (np.abs(np.array(df.dev_runtime.sort_values()) - np.array(d.dev_runtime.sort_values())) < 1e-5).all()

                show_frontier(d.dev_runtime,
                              d.dev_accuracy,
                              ax=frontier.dev_ax,
                              c=color,
                              alpha=alpha,
                              lw=LW,
                              XMAX=frontier.XMAX,
                              YMIN=frontier.YMIN,
                              label=other)

    for other in args.other_files:
        dd = read_csv(other, index_col=0)

        dd['dev_accuracy'] = dd['dev_new_policy_%s' % ACCURACY]
        dd['dev_runtime'] = dd['dev_new_policy_%s' % RUNTIME]
        dd['dev_reward'] = dd.dev_accuracy - dd.tradeoff * dd.dev_runtime

        show_frontier(dd.dev_runtime,
                      dd.dev_accuracy,
                      ax=frontier.dev_ax,
                      lw=LW,
                      label=other)

    frontier.dev_ax.legend(loc=4)

    print

    if len(df.args_C.unique()) > 1:
        show_groupby_frontiers(df,
                               'args_C',
                               frontier.XMAX,
                               frontier.YMIN,
                               baseline=baseline)

#    if len(df.args_accuracy.unique()) > 1:
#        show_groupby_frontiers(df, 'args_accuracy', frontier.XMAX, frontier.YMIN)

    if len(df.args_accuracy.unique()) > 1:
        show_groupby_frontiers(df, 'args_roll_out', frontier.XMAX,
                               frontier.YMIN)


#    if len(df.args_classifier.unique()) > 1:
#        show_groupby_frontiers(df, 'args_classifier', frontier.XMAX, frontier.YMIN)

#show_groupby_frontiers(df, 'iteration', baseline=baseline)
#asymmetry_plots(baseline)

    job_summary(jobs)

    #pl.ion()
    #pl.show()

    # Summary of jobs that are currently running: how many iterations have
    # they run for, and how long has it been since they last improved (patience)?
    J = df.join(jobs, 'jobid',
                rsuffix='_xxx')  # needs a suffix because columns overlap.
    J = J.groupby('jobid').max()
    J = J.join(P)
    J['elapsed'] = map(htime, J.elapsed)
    J['startdate'] = J.start.map(lambda x: x.date())
    J = J.sort_values('start')

    show_cols = [
        'iteration',
        'running',
        'patience',
        'tradeoff',
        'elapsed',
        'startdate',
        'dev_accuracy',
        'dev_runtime',
        'log',
    ]
    running = J[J['running']][show_cols]
    if running.empty:
        print red % 'No jobs running.'
    else:
        print running

    #highlight_region(df, baseline, B, frontier.dev_ax, ACCURACY, RUNTIME)
    frontier.dev_ax.set_title('Pareto frontier *DEV*')
    frontier.dev_ax.set_xlabel('runtime (%s)' % RUNTIME)
    frontier.dev_ax.set_ylabel('accuracy (%s)' % ACCURACY)
    frontier.dev_ax.set_xlim(0, None)
    frontier.dev_ax.set_ylim(0, 1)
    frontier.dev_ax.figure.canvas.draw()
    frontier.dev_ax.figure.savefig('tmp/pareto.png')

    if args.save:
        df.to_csv(args.save)
    #baseline.to_csv('tmp/baseline.csv')

    # hide the train plot.
    if not args.show_train:
        pl.close(frontier.train_ax.figure)

    if args.i:
        from arsenal.debug import ip
        ip()
    else:
        pl.ioff()
        pl.show()
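`show_frontier` and `Frontier` are defined elsewhere in this project. As a rough illustration of the idea they implement, here is a hedged sketch of extracting a Pareto frontier from (runtime, accuracy) points, where a point is kept only if no other point is both faster and at least as accurate:

import numpy as np

def pareto_frontier(runtime, accuracy):
    "Return the non-dominated (runtime, accuracy) points, sorted by runtime."
    pts = sorted(zip(runtime, accuracy), key=lambda p: (p[0], -p[1]))
    frontier, best_acc = [], -np.inf
    for r, a in pts:
        if a > best_acc:          # strictly more accurate than anything faster
            frontier.append((r, a))
            best_acc = a
    return np.array(frontier)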
Example #7
pl.tight_layout()
pl.legend(loc=4)
pl.show()


if 0:
    # Show separate plots for gold/nongold to highlight "label" errors
    pl.figure()
    pl.scatter(ok_gold.delta_run, ok_gold.delta_acc, c=c_g, lw=0, alpha=0.25)
    pl.scatter(bad_gold.delta_run, bad_gold.delta_acc, c=c_g, lw=0, alpha=0.25)
    pl.plot(xs, xs, c='k', lw=3)
    pl.title('gold')

    pl.figure()
    pl.scatter(bad_nongold.delta_run, bad_nongold.delta_acc, c=c_n, lw=0, alpha=0.25, label='bad nongold')
    pl.scatter(ok_nongold.delta_run, ok_nongold.delta_acc, c=c_n, lw=0, alpha=0.25, label='ok nongold')
    pl.plot(xs, xs, c='k', lw=3)
    pl.title('non-gold')

#sns.jointplot('delta_run', 'delta_acc', data=gold)
#pl.title('gold')

#sns.jointplot('delta_run', 'delta_acc', data=nongold)
#pl.title('nongold')

if 0:
    from arsenal.debug import ip; ip()
else:
    pl.ioff()
    pl.show()
Example #8
    'args_roll_out', 'args_tradeoff', 'dev_evalb', 'earlystop_elapsed',
    'earlystop_passes', 'hours_iterations', 'hours_pass', 'total_time',
    'total_passes', 'args_results'
]
print cp[show_cols]
print
print dp[show_cols]

with file('tmp/convergence-and-iterations-%s.html' % _args.grammar,
          'wb') as html:
    html.write('<h1>%s</h1>' % _args.grammar)
    html.write('<h2>CP</h2>')
    html.write(cp[show_cols].to_html())
    html.write('<h2>DP</h2>')
    html.write(dp[show_cols].to_html())

    print >> html, '<center><table><tr style="text-align:center; font-size: 30pt;"><th>CP</th><th>DP</th></tr>'
    for c, d in zip(cp.args_results, dp.args_results):
        C = file(c / 'learning-curve.svg').read()
        D = file(d / 'learning-curve.svg').read()
        print >> html, '<tr><td>%s</td><td>%s</td></tr>' % (C, D)
    print >> html, '</table></center>'

print
print colors.green % 'wrote %s' % html.name
print

if _args.i:
    from arsenal.debug import ip
    ip()

def main():

    pl.ion()

    p = ArgumentParser()
    p.add_argument('root', type=path)
    p.add_argument('--quick', action='store_true',
                   help="Load a single evaluation log (for quick tests). Won't run bestof-k runtime.")
    p.add_argument('-i', action='store_true',
                   help='Interactive mode => open an IPython shell after execution.')
    args = p.parse_args()

    runs = [r for r in sorted(args.root.glob('*')) if r.isdir()]

    if args.quick:
        print colors.bold % colors.red % 'Warning! only using some of the runs for timing information.'
        runs = runs[:1]

    Ds = [(r, load(r)) for r in iterview(runs)]

    D0, Ds, bestof = sanity_check(Ds)

#    if 0:
#        pl.figure()
#        for name, df in D0.groupby('type'):
#            pl.scatter(df.avg_bestof_time, df.evalb, c=C[name], lw=0)
#            show_frontier(df.avg_bestof_time, df.evalb, c=C[name], interpolation='linear-convex', label=name)
#            #[w,b] = np.polyfit(df.pushes, df.avg_bestof_time, deg=1)
#            #show_frontier(df.pushes*w + b, df.evalb, interpolation='linear', c=C[name])
#        pl.xlabel('sec/sentence (best of %s)' % len(Ds))
#        pl.ylabel('Corpus EVALB-F1')
#        pl.legend(loc=4)
#        pl.show()

    rescale = 1/bestof.pushes.max()
    bestof['pushes_r'] = bestof.pushes*rescale

    B = bestof[bestof.type=='baseline'].copy()
    lols = bestof[bestof.type!='baseline']

    RO_types = lols.args_roll_out.unique()

    ax = pl.figure().add_subplot(111)
    for name, df in reversed(sorted(bestof.groupby('type'))):
        pl.scatter(df.pushes_r, df.evalb, c=C[name], lw=0, zorder=10, label='', s=50)
        pts = show_frontier(df.pushes_r, df.evalb, interpolation='linear-convex', lw=2, c=C[name], label=name)
        ax.plot(pts[:,0], pts[:,1], label=name, c=C[name])

    pl.ylabel('Corpus $F_1$')
    pl.legend(loc=4)
    pl.tight_layout()

    ax = pl.gca()
    conesize = .06
    lambda_cone(np.array(B.evalb), np.array(B.pushes_r), ax=ax, c=c_baseline, conesize=conesize, lines=0)

    # --------------------------------------------------------------------------
    # Fit a parametric curve to the dev points and show arrows on the test points.
    from ldp.viz.parametric_fit import fit

    df = join_with_dev(B)

    ff, gg = fit(df.dev_pushes, df.dev_evalb)
    if 0:
        # enable to show the parametric curve.
        xs = pl.linspace(0, df.dev_pushes.max()+.1*df.dev_pushes.ptp(), 100)
        ax.plot(xs*rescale, ff(xs), c='k')

    ax = pl.gca()
    for _, z in df.iterrows():
        x, y = z.test_pushes*rescale, z.test_evalb
        arrow(x, y, gg(z.dev_pushes)/rescale, offset=-conesize, c=c_vec_baseline, ax=ax)

    # --------------------------------------------------------------------------
    B.loc[:,'tradeoff'] = np.nan

    data = []

    # Loop over all rollout types joined on initial policy (i.e., the baseline).
    for i, bl in B.iterrows():
        spawn = lols[lols.args_initializer_penalty == bl.args_tradeoff]
        assert len(spawn) == len(RO_types)

        models = {}
        for ro in RO_types:
            [ix] = spawn[spawn.args_roll_out == ro].index
            models[ro] = lols.ix[ix]

        [dev_pushes] = df[df.policy == bl.policy].dev_pushes
        tradeoff = gg(dev_pushes)
        B.loc[i,'tradeoff'] = tradeoff

        if 1:
            # set to 0 to skip the hypothesis tests.

            print colors.bold % colors.green % '============================================================='
            print 'tradeoff: %g' % tradeoff
            print

            baseline_acc, baseline_run = get_acc_run(D0[D0.policy == bl.policy])

            row = {
                'baseline': bl.policy,
                'baseline_accuracy': baseline_acc,
                'baseline_runtime': baseline_run,
                'baseline_reward': baseline_acc - tradeoff*baseline_run,
                'wps_baseline': bl.wps,
                'wallclock_baseline': bl.avg_bestof_time,
                'tradeoff': tradeoff,
            }

            star_sty = dict(alpha=1, lw=0, marker='*', s=700, zorder=100)

            for ro, model in sorted(models.items()):
                print colors.bold % '# %s' % ro
                sig, win = paired_permutation_test(D0,
                                                   a=bl.policy,
                                                   b=model.policy,
                                                   tradeoff=tradeoff,
                                                   R=5000)
                acc, run = get_acc_run(D0[D0.policy == model.policy])

                row[ro] = model.policy
                row['%s_accuracy'  % ro] = acc
                row['%s_runtime'   % ro] = run
                row['%s_reward'    % ro] = acc - tradeoff*run
                row['wps_%s'       % ro] = model.wps
                row['wallclock_%s' % ro] = model.avg_bestof_time
                row['winner_%s'    % ro] = win
                row['sig_%s'       % ro] = sig

                if win == +1:
                    pl.scatter([model.pushes_r],
                               [model.evalb],
                               c=C[ro],
                               **star_sty)

                elif win == -1:
                    pl.scatter([bl.pushes_r],
                               [bl.evalb],
                               c=C['baseline'],
                               **star_sty)

                # draw a dotted line to the baseline point.
                pl.plot([bl.pushes_r, model.pushes_r],
                        [bl.evalb, model.evalb],
                        c=C[ro],
                        lw=1,
                        alpha=0.75,
                        label=None,
                        linestyle='--')

            data.append(row)

    #[w,b] = np.polyfit(B.pushes, B.avg_bestof_time, deg=1)

    xx = lols.pushes_r
    xx = np.linspace(xx.min(), xx.max(), 12)

    # put ticks on the top of the plot.
    #ax.xaxis.tick_top()

    #pl.xticks(xx, ['%.2g\m(%.2g)' % (x/rescale / 1e6, (x/rescale*w+b)*100) for x in xx], rotation=0)
    #pl.text(0.4, 0.401, re.sub('(\d)e([\-+]\d+)', r'\1e^{\2}', r'$\textit{seconds} \approx %.2g \cdot \textit{pushes} + %.2g$' % (w,b)))
    #pl.xlabel('average megapushes ($\\approx$ milliseconds)')

    pl.xticks(xx, [r'$%.2g$' % (x/rescale / 1e6) for x in xx])
#    pl.xticks(xx, [r'%.2g' % (x/rescale / 1e6) for x in xx], rotation=45)

    if 'medium' not in args.root:
        pl.xlabel('millions of hyperedges built per sentence')

    pl.ylim(bestof.evalb.min()-0.02, bestof.evalb.max()+0.015)
    pl.xlim(bestof.pushes_r.min()-.01, bestof.pushes_r.max()+0.01)


    zf = pd.DataFrame(data).sort_values('tradeoff')
#    print zf[['tradeoff', 'baseline_reward', 'cp_reward', 'dp_reward', 'winner_cp', 'winner_dp']].sort_values('tradeoff').to_string(float_format='%.4g'.__mod__, index=0)
#    print zf[['tradeoff',
#              'baseline_accuracy', 'baseline_runtime',
#              'cp_accuracy', 'cp_runtime',
#              'dp_accuracy', 'dp_runtime',
#              'winner_cp', 'winner_dp']].sort_values('tradeoff').to_string(float_format='%.4g'.__mod__, index=0)

    if not args.quick:
        sig_file = args.root / 'significance.csv'
        print
        print colors.green % 'wrote %s' % sig_file
        print
        zf.to_csv(sig_file)

    if args.i:
        pl.ion(); pl.show()
        from arsenal.debug import ip; ip()
    else:
        pl.ioff(); pl.show()
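`paired_permutation_test` above comes from this codebase. For reference, a minimal sketch of the general technique it names: a sign-flip permutation test on per-sentence reward differences (reward = accuracy - tradeoff * runtime), with hypothetical argument names:

import numpy as np

def paired_permutation_test(reward_a, reward_b, R=5000, seed=0):
    """Two-sided paired (sign-flip) permutation test.

    Returns (p_value, winner): winner is +1 if b has the higher mean reward,
    -1 if a does, 0 on an exact tie.
    """
    rng = np.random.RandomState(seed)
    d = np.asarray(reward_b) - np.asarray(reward_a)   # paired differences
    observed = abs(d.mean())
    hits = 0
    for _ in range(R):
        flips = rng.choice([-1, 1], size=len(d))      # randomly swap a/b per pair
        if abs((flips * d).mean()) >= observed:
            hits += 1
    p_value = (hits + 1.0) / (R + 1.0)
    return p_value, int(np.sign(d.mean()))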