Example #1
def main():
    from arsenal.iterview import iterview
    from time import sleep
    from numpy.random import uniform
    import numpy as np                 # np.random.exponential is used below
    import matplotlib.pyplot as pl     # pl.show() is used below

    T = Benchmark('A vs B')
    for _ in iterview(range(1000), T.title):
        with T['A']:
            sleep(np.random.exponential(.001))
        with T['B']:
            sleep(np.random.exponential(.001))

    T.compare()

    t = Timer('test')

    for i in iterview(range(1, 20)):
        for _ in range(10):
            with t(i=i):
                c = 0.01
                z = max(i**2 * 0.0001 + uniform(-c, c), 0.0)
                sleep(z)

    t.plot_feature('i')
    pl.show()
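Every example on this page wraps its main loop in iterview, arsenal's progress-view iterator (similar in spirit to tqdm): it yields the items of the underlying iterable unchanged while printing a message, a counter, and timing. A rough sketch of that pattern (not arsenal's actual implementation) is:

import sys, time

def iterview_sketch(iterable, msg='', length=None):
    # Yield items unchanged while reporting progress on stderr.
    items = list(iterable) if length is None else iterable
    n = length if length is not None else len(items)
    start = time.time()
    for i, x in enumerate(items, 1):
        sys.stderr.write('\r%s %d/%d (%.1fs elapsed)' % (msg, i, n, time.time() - start))
        sys.stderr.flush()
        yield x
    sys.stderr.write('\n')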
Example #2
File: util.py Project: Prodject/arsenal-1
def contour_plot(f,
                 xdomain,
                 ydomain,
                 color='viridis',
                 alpha=0.5,
                 levels=None,
                 ax=None):
    "Contour plot of a function of two variables."
    from arsenal import iterview
    if ax is None: ax = pl.gca()
    [xmin, xmax, _] = xdomain
    [ymin, ymax, _] = ydomain
    X, Y = np.meshgrid(np.linspace(*xdomain), np.linspace(*ydomain))
    Z = np.array([
        f(np.array([x, y]))
        for (x, y) in iterview(zip(X.flat, Y.flat), length=len(X.flat))
    ]).reshape(X.shape)
    contours = ax.contour(X, Y, Z, 20, colors='black', levels=levels)
    ax.clabel(contours, inline=True, fontsize=8)
    if color is not None:
        ax.imshow(Z,
                  extent=[xmin, xmax, ymin, ymax],
                  origin='lower',
                  cmap=color,
                  alpha=alpha)
        ax.axis(aspect='scalar')
    ax.figure.tight_layout()
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)
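A quick way to exercise contour_plot above (assuming np and pl are the module-level numpy and pylab imports that util.py relies on). The third entry of each domain tuple is the number of np.linspace grid points:

# Contour plot of a simple quadratic bowl on a 50x50 grid.
contour_plot(lambda v: v[0]**2 + 10 * v[1]**2,
             xdomain=(-2, 2, 50),
             ydomain=(-1, 1, 50))
pl.show()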
Example #3
def build(GRAMMAR, ACC, RUN, INIT):

    data = []

    if INIT == 'policy':
        csvs = path('results').glob('*/dump/new_policy*.npz.inspect_rollouts.csv.gz')
    elif INIT == 'init':
        csvs = path('results').glob('*/dump/init.npz.inspect_rollouts.csv.gz')
    else:
        raise ValueError('dont understand INIT=%s' % INIT)

    for f in iterview(csvs):
        args = cPickle.load(file((f / '..' / 'args.pkl').abspath()))
        if args.grammar != GRAMMAR or args.accuracy != ACC or args.runtime != RUN:
            continue
        try:
            breakdown(data,
                      args,
                      pd.read_csv(f))
        except pd.io.common.EmptyDataError:
            print colors.red % '*** skipping empty file %s' % f
            print

    df = pd.DataFrame(data)
    df = df.sort_values('tradeoff')
    print df
    return df
Example #4
    def inner_optimization(self, iterations, prox_every=25):
        budget = self.group_budget
        for t in range(iterations):
            print()
            np.random.shuffle(self.train)
            for x in iterview(self.train, colors.green % 'Pass %s' % (t+1)):

                S = ScoringModel(x, self.A, self.feature_backoff, self.sparse, self.dense)
                self.gradient(x.N, x.tags, S)
                S.backprop()

                if budget is not None and self.sparse.step % prox_every == 0:
                    self.dense.prox_budget(budget)

                self.sparse.step += 1

            assert np.isfinite(self.sparse.w).all()
            assert np.isfinite(self.dense.w).all()

            # Make sure to call the prox update before finishing this pass; this
            # keeps the number of features within the budget.
            if budget is not None:
                self.dense.prox_budget(budget)

            self.after_inner_pass()
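dense.prox_budget is not shown in this snippet; the surrounding comments suggest a proximal step that caps the number of active feature groups. A hypothetical sketch of that idea (keep the `budget` largest-norm groups, zero the rest); the real implementation may use a different group structure or shrinkage:

import numpy as np

def prox_budget(W, budget):
    # W: (n_groups, dim) weight matrix; keep only the `budget` groups with the
    # largest L2 norm and zero out everything else. (Assumed behavior only.)
    norms = np.linalg.norm(W, axis=1)
    keep = np.argsort(-norms)[:budget]
    mask = np.zeros(len(W), dtype=bool)
    mask[keep] = True
    W[~mask] = 0.0
    return W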
Example #5
    def make_instances(self, fold, cls):
        "Convert tuples in data `fold` to instances of `cls`."
        data = []
        for x in iterview(getattr(self, fold), msg='Features (%s)' % fold):
            tags, tokens = list(zip(*x))
            data.append(cls(tokens, self.Y.map(tags), self))
        return data
Example #6
def fdcheck(func, w, g, keys=None, eps=1e-5):
    """
    Finite-difference check.

    Returns `arsenal.math.compare` instance.

    - `func`: zero argument function, which references `w` in caller's scope.
    - `w`: parameters.
    - `g`: gradient estimate to compare against
    - `keys`: dimensions to check
    - `eps`: perturbation size

    """
    if keys is None:
        if hasattr(w, 'keys'):
            keys = w.keys()
        else:
            keys = range(len(w))
    fd = {}
    for key in iterview(keys):
        was = w[key]
        w[key] = was + eps
        b = func()
        w[key] = was - eps
        a = func()
        w[key] = was
        fd[key] = (b - a) / (2 * eps)

    return compare([fd[k] for k in keys], [g[k] for k in keys])
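A minimal usage sketch for fdcheck (assuming numpy is available as np, as elsewhere on this page): verify the analytic gradient of f(w) = w·w, which is 2w.

w = np.random.randn(10)
g = 2 * w                              # analytic gradient of w.dot(w)
res = fdcheck(lambda: w.dot(w), w, g)  # finite differences should agree closely with g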
Example #7
def test_parse():
    from arsenal.cache import memoize
    from hypergraphs.semirings import LazySort

    w = memoize(lambda *edge: np.exp(np.random.randn()))

    sentence = 'Papa ate the caviar with the spoon .'.split()

    grammar = load_grammar("""
    S       X .
    X       X X
    X       Papa
    X       ate
    X       the
    X       caviar
    X       with
    X       spoon
    X       in
    X       park
    """)

    def distribution():
        root = parser(sentence, grammar, w, LazySort)
        Z = 0.0
        p = {}
        for x in root:
            p[str(extract2(x.data))] = x.score
            Z += x.score
        return normalize(p)

    p = distribution()

    # TODO: make the parser return the root rather than the full chart.
    q = {x: 0 for x in p}
    reps = 10_000

    def run():
        return parser(sentence, grammar, w, Sample)

    if EAGER:
        def sampler():
            while True:
                yield run().value

    else:
        def sampler():
            yield from run()


    sample = iter(sampler())
    #sample = iter(lazy_sampler())

    for r in iterview(range(1, 1+reps)):
        _, s = next(sample)
        q[str(s)] += 1

        if r % 1_000 == 0:
            print(f'\nerr({r})=', 0.5*sum(abs(p[x] - q[x]/r) for x in p))
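normalize is not shown in this snippet; presumably it just rescales the score dictionary into a probability distribution. A minimal sketch under that assumption:

def normalize(p):
    # Assumed behavior: rescale non-negative scores so they sum to one.
    Z = sum(p.values())
    return {k: v / Z for k, v in p.items()}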
Example #8
    def __call__(self, policy, examples, grammar, msg='eval'):
        "Evaluate test-time pruning policy ``policy`` on ``examples``."
        rs = []
        for e in iterview(examples, msg=msg):
            rs.append(self.parse(e, grammar, policy))
        a = AvgReward(rs)
        if hasattr(a, self.ACCURACY):
            a.accuracy = getattr(a, self.ACCURACY)
        else:
            a.accuracy = a.attrs[self.ACCURACY]
        a.attrs['accuracy'] = a.accuracy
        return a
Example #9
File: util.py Project: timvieira/arsenal
def contour_plot(f, xdomain, ydomain, color='viridis', alpha=0.5, levels=None):
    "Contour plot of a function of two variables."
    from arsenal import iterview
    [xmin, xmax, _] = xdomain; [ymin, ymax, _] = ydomain
    X, Y = np.meshgrid(np.linspace(*xdomain), np.linspace(*ydomain))
    Z = np.array([f(np.array([x,y])) for (x,y) in iterview(zip(X.flat, Y.flat), length=len(X.flat))]).reshape(X.shape)
    contours = pl.contour(X, Y, Z, 20, colors='black', levels=levels)
    pl.clabel(contours, inline=True, fontsize=8)
    if color is not None:
        pl.imshow(Z, extent=[xmin, xmax, ymin, ymax], origin='lower', cmap=color, alpha=alpha)
        pl.axis(aspect='scalar')
    pl.gcf().tight_layout()
    pl.xlim(xmin,xmax); pl.ylim(ymin,ymax)
Example #10
File: timer.py Project: alok/arsenal
    def run(self, methods, reps):
        from arsenal import iterview, restore_random_state
        if isinstance(methods, (tuple, list)):
            methods = {m.__name__: m for m in methods}

        jobs = [
            (name, seed)
            for seed in range(reps)   # TODO: use a better strategy for picking random seeds.
            for name in methods
        ]
        np.random.shuffle(jobs)       # shuffle jobs to avoid weird ordering correlations
        for name, seed in iterview(jobs):
            with restore_random_state(seed):
                with self[name]:
                    methods[name]()
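A hypothetical usage sketch for the run method above: the (method, seed) jobs are shuffled and each method is replayed under the same random seeds, so the timing comparison is not skewed by ordering effects or by different random inputs. This assumes run() lives on the same Benchmark class used in Example #1 and that it is importable from arsenal.timer:

import numpy as np
from arsenal.timer import Benchmark   # assumed import path

def sum_python(): return sum(np.random.rand(100_000))
def sum_numpy():  return np.random.rand(100_000).sum()

T = Benchmark('python sum vs numpy sum')
T.run([sum_python, sum_numpy], reps=25)   # each method sees the same replayed seeds
T.compare()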
Example #11
File: timer.py Project: alok/arsenal
def main():
    from arsenal.iterview import iterview
    from time import sleep
    from numpy.random import uniform
    import matplotlib.pyplot as pl     # pl.show() is used below
    t = Timer('test')

    for i in iterview(range(1, 20)):
        for _ in range(10):
            with t(i=i):
                c = 0.01
                z = max(i**2 * 0.0001 + uniform(-c, c), 0.0)
                sleep(z)

    a = t.plot_feature('i')
    print(a)
    pl.show()
Example #12
def test():
    import numpy as np
    import matplotlib.pyplot as pl
    from arsenal.timer import timers
    from arsenal import iterview

    T = timers()

    params = [
        (i, rep)
        for i in range(5, 20)
        for rep in range(5)
    ]

    np.random.shuffle(params)

    for (i, _) in iterview(params):

        # BY's runtime scales logarithmically with the bigger set and linearly
        # with the smaller set.  Hash-based intersection scales with their sum.
        n = 2**i
        m = 2*10

        U = range(max(n,m)*5)
        A = list(np.random.choice(U, n, replace=0))
        B = list(np.random.choice(U, m, replace=0))
        A.sort()
        B.sort()

        sA = set(A)
        sB = set(B)
        with T['set'](i=2**i):
            E = (sA & sB)

        with T['make-set'](i=2**i):
            E = (set(A) & set(B))

        with T['B-Y'](i=2**i):
            C = list(sorted_intersection(A, B))

        assert sorted(E) == C

    T.compare()
    T.plot_feature('i')

    pl.show()
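sorted_intersection is not defined in this snippet; from the comment above it is presumably a Baeza-Yates-style intersection of two sorted lists (binary search of the smaller list's elements into the larger one), which is what gives the min·log(max) scaling being benchmarked. A sketch under that assumption, not the actual implementation:

from bisect import bisect_left

def sorted_intersection(A, B):
    # Assumed behavior: intersect two sorted lists by binary-searching each
    # element of the smaller list in the larger one -- O(|small| log |big|).
    if len(A) > len(B):
        A, B = B, A
    out = []
    for x in A:
        j = bisect_left(B, x)
        if j < len(B) and B[j] == x:
            out.append(x)
    return out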
Example #13
    def evaluate(self, predict, data, name, verbosity=1):
        if not data:
            return
        if verbosity:
            print()
            print('Phrase-based F1:', name)
        f1 = F1()
        for i, x in enumerate(iterview(data, msg='Eval %s' % name)):
            pred = extract_contiguous(predict(x))
            gold = extract_contiguous(self.Y.lookup_many(x.tags))
            # (i, begin, end) uniquely identifies the span
            for (label, begins, ends) in gold:
                f1.add_relevant(label, (i, begins, ends))
            for (label, begins, ends) in pred:
                f1.add_retrieved(label, (i, begins, ends))
        if verbosity:
            print()
        return f1.scores(verbose=verbosity >= 1)
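extract_contiguous is assumed here to collapse a label sequence into (label, begin, end) triples over maximal runs of the same label, which is what the F1 bookkeeping above expects. A hypothetical sketch of that behavior:

def extract_contiguous(labels):
    # Assumed behavior: yield (label, begin, end) for each maximal run of
    # identical labels; the real helper may also filter out an 'O'/background label.
    labels = list(labels)
    spans, start = [], 0
    for i in range(1, len(labels) + 1):
        if i == len(labels) or labels[i] != labels[start]:
            spans.append((labels[start], start, i))
            start = i
    return spans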
Example #14
    def evaluate(self, predict, data, msg, verbosity=2):
        "Run predict `predict` function on data."

        if not data:
            return float('nan'), []

        ff = F1()

        correct = Counter()
        total = Counter()

        for ii, x in enumerate(iterview(data,
                                        colors.blue % 'Eval (%s)' % msg)):

            y = predict(x)
            gold = self.Y.lookup_many(x.tags)

            for t, (got, want) in enumerate(zip(y, gold)):
                if verbosity >= 2:
                    ff.report(instance=(ii, t), prediction=got, target=want)
                for c in self.error_classifications(x, t):
                    if got == want:
                        correct[c] += 1
                    total[c] += 1

        #print 'sentences:', len(data), 'tokens:', total['overall']

        c = 'overall'
        acc = '%s: %.2f' % (colors.light_yellow % c,
                            100 * correct[c] / total[c])
        other = total.keys()
        other.remove(c)
        breakdown = ', '.join('%s: %.2f' % (c, 100 * correct[c] / total[c])
                              for c in sorted(other))

        print '%s (%s)' % (acc, breakdown)

        if verbosity >= 2:
            print
            print 'F1 breakdown'
            print '============'
            ff.scores()

        return correct['overall'] / total['overall']
Example #15
    def __init__(self, args, setup, policy, output_file, tradeoff, roll_out):
        self.setup = setup
        self.tradeoff = tradeoff
        self.grammar = grammar = setup.grammar
        self.train = setup.train
        self.ACCURACY = args.accuracy
        self.RUNTIME = args.runtime

        self.policy_name = policy
        self.output_file = output_file

        self.nfeatures = nfeatures = setup.nfeatures
        self.policy = GLM(nfeatures, C=np.nan, loss=0)  # dummy

        self.policy._coef = np.load(policy)['coef']

        if roll_out == ROLLOUT.CP:
            Rollouts = CP
        elif roll_out == ROLLOUT.BF:
            Rollouts = BF
        elif roll_out == ROLLOUT.HY:
            Rollouts = HY
        elif roll_out == ROLLOUT.DP:
            Rollouts = DP
        else:
            raise ValueError('Unrecognized rollout option %s' % roll_out)

        tmp = workaround(output_file, context=self)
        for e in iterview(self.train, msg='rollouts'):
            p = Rollouts(grammar,
                         e,
                         self.policy,
                         accuracy=args.accuracy,
                         runtime=args.runtime,
                         tradeoff=tradeoff)
            p.roll_outs(tmp)

        tmp.save()
Example #16
    def __init__(self,
                 args,
                 setup,
                 tradeoff,
                 iterations,
                 minibatch,
                 results=None,
                 C=None,
                 roll_out=ROLLOUT.CP,
                 initializer=INIT.BODENSTAB_GOLD,
                 initializer_penalty=0.01,
                 show_reference=0):

        self.evals = []
        self.tradeoff = tradeoff
        self.grammar = grammar = setup.grammar
        self.nfeatures = nfeatures = setup.nfeatures

        self.ACCURACY = args.accuracy
        self.RUNTIME = args.runtime

        self.C = C
        if args.classifier == CLASSIFIER.LOGISTIC:
            self.policy = GLM(nfeatures, C=self.C, loss=0)
        elif args.classifier == CLASSIFIER.LINEAR:
            self.policy = GLM(nfeatures, C=self.C, loss=1)
        elif args.classifier == CLASSIFIER.ADAGRAD:
            self.policy = Adagrad(self.nfeatures,
                                  C=self.C,
                                  loss=0,
                                  eta=args.learning_rate)
        elif args.classifier == CLASSIFIER.ADAGRAD_LINEAR:
            self.policy = Adagrad(self.nfeatures,
                                  C=self.C,
                                  loss=1,
                                  eta=args.learning_rate)
        elif args.classifier == CLASSIFIER.ADAGRAD_HINGE:
            self.policy = Adagrad(self.nfeatures,
                                  C=self.C,
                                  loss=2,
                                  eta=args.learning_rate)
        elif args.classifier == CLASSIFIER.SVM:
            self.policy = SVM(self.nfeatures, C=self.C)
        elif args.classifier == CLASSIFIER.PERCEPTRON:
            self.policy = Perceptron(self.nfeatures)
        else:
            raise AssertionError('Unrecognized classifier option %r' %
                                 args.classifier)

        if args.init_weights is not None:
            # XXX: Hack to warm start weights
            print '[init weights]', args.init_weights
            assert args.init_weights.exists()
            self.policy._coef = np.load(args.init_weights)['coef']

        self.evaluate = Evaluate(args.accuracy, args.runtime)

        sty = {
            'oracle1': dict(c='k', alpha=0.5, linestyle=':'),
            'fastmle': dict(c='g', alpha=0.5, linestyle=':'),
            'unpruned': dict(c='k', alpha=0.5, linestyle='--'),
            'new_policy': dict(c='b', lw=2),
        }
        self.lc = ddict(lambda name: viz.LearningCurve(name, sty=sty))

        train = list(setup.train)
        random.shuffle(train)
        dev = list(setup.dev)
        dataset = [('train', train)]
        if dev:
            dataset.append(('dev', dev))
        self.train = train
        self.dev = dev
        self.results = results

        # Do we need to run the unpruned parser? (This can be very slow, so we
        # should only do it when necessary.)
        if (show_reference or roll_out == ROLLOUT.BODENSTAB_MLE
                or initializer == INIT.BODENSTAB_MLE):

            from ldp.parsing.util import item_tree, item_tree_get_items
            for e in iterview(train + dev, msg='unpruned'):
                # unpruned
                r = self.evaluate.parse(e,
                                        grammar,
                                        mask=e.mask,
                                        with_derivations=1)
                e.mle_spans = frozenset({
                    (I, K)
                    for (_, I, K) in item_tree_get_items(item_tree(r.coarse))
                    if K - I > 1 and K - I != e.N
                })
                del r.coarse, r.derivation  # delete to save memory
                e.baseline = r
        # Do we need to run the oracle parser?
        if show_reference:
            for e in iterview(train + dev, msg='oracle'):
                # oracle
                m = e.mask
                for x in e.nodes:
                    m[x] = (x in e.gold_spans)
                r = self.evaluate.parse(e, grammar, mask=m, with_derivations=1)
                del r.coarse, r.derivation  # delete to save memory
                e.oracle = r
            # plot/log baselines
            self.baselines()

        # ----------------------------------
        # Baseline --
        # Assumes no dynamic features.
        if roll_out in {ROLLOUT.BODENSTAB_GOLD, ROLLOUT.BODENSTAB_MLE}:
            for e in iterview(train, msg='rollouts'):
                p = BodenstabParser(grammar,
                                    e,
                                    target=roll_out,
                                    tradeoff=tradeoff)
                p.roll_outs()
            print colors.yellow % 'Training...'
            with timeit('train'):
                self.policy.train(train)
            ps = dict(new_policy=self.policy)
            x = dict(iteration=1, tradeoff=tradeoff)
            self.iteration = 1
            self.performance(dataset, ps, x)
            return
        # ----------------------------------
        if roll_out == ROLLOUT.CP:
            Rollouts = CP
        elif roll_out == ROLLOUT.BF:
            Rollouts = BF
        elif roll_out == ROLLOUT.HY:
            Rollouts = HY
        elif roll_out == ROLLOUT.DP:
            Rollouts = DP
        else:
            raise ValueError('Unrecognized rollout option %s' % roll_out)
        for iteration in xrange(1, iterations + 1):
            self.iteration = iteration
            print
            print colors.green % 'Iter %s' % iteration
            if iteration == 1 and initializer in [
                    ROLLOUT.BODENSTAB_MLE, ROLLOUT.BODENSTAB_GOLD
            ]:
                # first iteration uses asymmetric classification to initialize
                for e in iterview(train, msg='rollouts'):
                    p = BodenstabParser(grammar,
                                        e,
                                        target=initializer,
                                        tradeoff=initializer_penalty)
                    p.roll_outs()
            else:
                M = choice(train, min(len(train), minibatch), replace=0)

                tmp = []
                for e in iterview(M, msg='rollouts'):
                    p = Rollouts(grammar,
                                 e,
                                 self.policy,
                                 accuracy=args.accuracy,
                                 runtime=args.runtime,
                                 tradeoff=tradeoff)
                    p.roll_outs(tmp)

                # corpus-level accuracy
                if args.accuracy in (ACC.EVALB_corpus,
                                     ACC.EXPECTED_RECALL_corpus):
                    self.postprocess_rollouts_corpus(M, tmp)

                else:

                    # Compare baseline labels to LOLS's "labels" via rollouts.
                    self.asym_v_lols(tmp)

                    # propagate back to CSC datasets
                    for [e, (I, K), w, (action0, r0), (action1, r1)] in tmp:

                        if args.accuracy == ACC.EVALB_avg:
                            acc1 = r1.f1()
                            acc0 = r0.f1()
                        elif args.accuracy == ACC.EXPECTED_RECALL_avg:
                            acc1 = r1.recall()
                            acc0 = r0.recall()
                        else:
                            acc1 = r1.accuracy
                            acc0 = r0.accuracy

                        e.Q[I, K,
                            action1] += w * (acc1 - tradeoff * r1.runtime)
                        e.Q[I, K,
                            action0] += w * (acc0 - tradeoff * r0.runtime)

            print colors.yellow % 'Training...'
            with timeit('train'):
                self.policy.train(train)
            # Specify additional policies to evaluate on training data.
            ps = dict(new_policy=self.policy)
            # metadata to log
            x = dict(iteration=iteration, tradeoff=tradeoff)
            self.performance(dataset, ps, x)
Example #17
def get_data(G, policy_name, w, examples, verbose=0):

    data = []
    for eid, e in enumerate(iterview(examples, msg='evalb')):
        if verbose:
            print
            print
            print colors.yellow % e.sentence

        F = Features(G, nfeatures=2**22)

        words = e.sentence.split()

        if 'unpruned' in policy_name:
            # <TIMING BLOCK>
            mask = e.mask
            b1 = time()
            e.tokens = np.asarray(G.encode_sentence(words))
            b2 = time()
            state = pruned_parser(e.tokens, G, mask)
            b3 = time()
            # </TIMING BLOCK>
            coarse = G.coarse_derivation(state.derivation)

        else:
            # <TIMING BLOCK>
            b1 = time()
            e.tokens = np.asarray(G.encode_sentence(words))
            mask = F.mask(e, w)
            b2 = time()
            state = pruned_parser(e.tokens, G, mask)
            b3 = time()
            # </TIMING BLOCK>
            coarse = G.coarse_derivation(state.derivation)

        nodes = e.nodes
        mask_size = sum(mask[x] for x in nodes)
        keep_rate = mask_size / len(nodes) if len(nodes) > 0 else 0

        want_and_got, got, want = e.evalb_unofficial(coarse)
        evalb_avg, _, recall_avg = fpr(want_and_got, got, want)

        if isinstance(coarse, Tree):
            parse = oneline(coarse)
            fail = 0
        else:
            parse = '(FAIL %s)' % ' '.join('(X %s)' % x
                                           for x in e.sentence.split())
            fail = 1

        data.append({
            'example': e,
            'eid': eid,
            'N': e.N,
            'fail': fail,
            'mask': mask_size,
            'keep_rate': keep_rate,
            'parse': parse,
            'policy': policy_name,
            'time_total': b3 - b1,
            'time_feature': b2 - b1,
            'time_parse': b3 - b2,
            'evalb_avg': evalb_avg,
            'recall_avg': recall_avg,
            'want_and_got': want_and_got,
            'want': want,
            'got': got,
            'pushes': state.pushes,
            'pops': state.pops
        })

    return data
def aggregate_multiple_runtime_trials(Ds, Ps):
    """Collapse multiple dataframes `Ds` from different timing runes into a single
    one, by taking the min over runtimes (i.e., new runtime will be "best-of k"
    where k=|Ds|).

    Actually, this function does more than that: it also collapses over
    sentences, e.g., computing corpus-EVALB and avg[best-of-k runtimes].

    """
    D0 = Ds[0]

    # Append trials together
    foo = Ds[0]
    for dd in Ds[1:]:
        foo = foo.append(dd)

    # Take min over time_total for this policy-example pair.
    minz = foo[['policy','example','time_total']].groupby(['policy','example']).min()

    data = []
    for policy in iterview(Ps):
        dump = path(policy).dirname()
        args = cPickle.load(file(dump / 'args.pkl'))
        log = pd.read_csv(dump / 'log.csv')

        # TODO: will need to add extra cases.
        if 'DP' in args.roll_out:
            type_ = 'DP'
        elif 'CP' in args.roll_out:
            type_ = 'CP'
        elif 'HY' in args.roll_out:
            type_ = 'HY'
        elif 'BODEN' in args.roll_out:
            type_ = 'baseline'
        else:
            raise ValueError(args.roll_out)

        min_times = minz.ix[policy]['time_total']

        P = D0[D0.policy == policy]
        f = cgw_f(P.want_and_got.sum(), P.got.sum(), P.want.sum())

        #pl.scatter(df.avg_bestof_time, df.evalb, c=C[name], lw=0)
        #show_frontier(df.avg_bestof_time, df.evalb, c=C[name], interpolation='linear', label=name)
        #[w,b] = np.polyfit(df.pushes, df.avg_bestof_time, deg=1)
        #show_frontier(df.pushes*w + b, df.evalb, interpolation='linear', c=C[name])

        if 0:
            # log-log plot of pushes v. seconds. Really great correlation!
            PP = P[['example','pushes']].join(min_times, on='example')
            PP['log(pushes)'] = np.log(PP.pushes)
            PP['log(seconds)'] = np.log(PP.time_total)
            compare('log(pushes)', 'log(seconds)', data=PP, scatter=1, show_regression=1)
            #pl.figure()
            # pushes v. seconds. Really great correlation!
            #PP = P[['example','pushes']].join(min_times, on='example')
            #compare('pushes', 'time_total', data=PP, scatter=1, show_regression=1)
            pl.ioff(); pl.show()

        if 0:
            # empirical runtime estimates

            # scatter plot sentence length against runtime.
            n_by_time = P[['example','N']].join(min_times, on='example')
            pl.scatter(n_by_time.N, n_by_time.time_total, alpha=0.5, lw=0)

            # highlight median runtime per sentence length.
            n_by_median_time = n_by_time.groupby('N').median()
            pl.plot(n_by_median_time.index, n_by_median_time.time_total, c='k', lw=2)

            # empirical exponent and constant factor
            compare(np.log(n_by_time.time_total), np.log(n_by_time.N), scatter=1, show_regression=1)
            pl.ioff(); pl.show()

        # use early stopping on dev to pick the policy.
        dev = log.ix[log['dev_new_policy_reward'].argmax()]

        row = {'avg_bestof_time': np.mean(min_times),
               'wps': np.mean(P.N) / np.mean(min_times),
               'pushes': np.mean(P.pushes),
               'pops': np.mean(P.pops),
               'policy': policy,
               'dev_pushes': dev.dev_new_policy_pushes,
               'dev_evalb': dev.dev_new_policy_evalb_corpus,
               'type': type_,
               'evalb': f}

        row.update({'args_'+k: v for k,v in args.__dict__.items()})

        data.append(row)

    # remove unused baselines (sorry this is a bit ugly).
    ddd = pd.DataFrame(data)
    others = ddd[ddd.type != 'baseline']
    B = ddd[ddd.type == 'baseline']
    used = set()
    for _, z in others.iterrows():
        [ix] = B[B.policy == z.args_init_weights].index
        used.add(ix)
    B = B.ix[list(used)]
    ddd = others.append(B)

    return ddd
def main():

    pl.ion()

    p = ArgumentParser()
    p.add_argument('root', type=path)
    p.add_argument('--quick', action='store_true',
                   help="Load a single evaluation log (for quick tests). Won't run bestof-k runtime.")
    p.add_argument('-i', action='store_true',
                   help='Interactive mode => open an IPython shell after execution.')
    args = p.parse_args()

    runs = [r for r in sorted(args.root.glob('*')) if r.isdir()]

    if args.quick:
        print colors.bold % colors.red % 'Warning! only using some of the runs for timing information.'
        runs = runs[:1]

    Ds = [(r, load(r)) for r in iterview(runs)]

    D0, Ds, bestof = sanity_check(Ds)

#    if 0:
#        pl.figure()
#        for name, df in D0.groupby('type'):
#            pl.scatter(df.avg_bestof_time, df.evalb, c=C[name], lw=0)
#            show_frontier(df.avg_bestof_time, df.evalb, c=C[name], interpolation='linear-convex', label=name)
#            #[w,b] = np.polyfit(df.pushes, df.avg_bestof_time, deg=1)
#            #show_frontier(df.pushes*w + b, df.evalb, interpolation='linear', c=C[name])
#        pl.xlabel('sec/sentence (best of %s)' % len(Ds))
#        pl.ylabel('Corpus EVALB-F1')
#        pl.legend(loc=4)
#        pl.show()

    rescale = 1/bestof.pushes.max()
    bestof['pushes_r'] = bestof.pushes*rescale

    B = bestof[bestof.type=='baseline'].copy()
    lols = bestof[bestof.type!='baseline']

    RO_types = lols.args_roll_out.unique()

    ax = pl.figure().add_subplot(111)
    for name, df in reversed(sorted(bestof.groupby('type'))):
        pl.scatter(df.pushes_r, df.evalb, c=C[name], lw=0, zorder=10, label='', s=50)
        pts = show_frontier(df.pushes_r, df.evalb, interpolation='linear-convex', lw=2, c=C[name], label=name)
        ax.plot(pts[:,0], pts[:,1], label=name, c=C[name])

    pl.ylabel('Corpus $F_1$')
    pl.legend(loc=4)
    pl.tight_layout()

    ax = pl.gca()
    conesize = .06
    lambda_cone(np.array(B.evalb), np.array(B.pushes_r), ax=ax, c=c_baseline, conesize=conesize, lines=0)

    # --------------------------------------------------------------------------
    # Fit parametric curve to dev points show arrows on test points.
    from ldp.viz.parametric_fit import fit

    df = join_with_dev(B)

    ff, gg = fit(df.dev_pushes, df.dev_evalb)
    if 0:
        # enable to show the parametric curve.
        xs = pl.linspace(0, df.dev_pushes.max()+.1*df.dev_pushes.ptp(), 100)
        ax.plot(xs*rescale, ff(xs), c='k')

    ax = pl.gca()
    for _, z in df.iterrows():
        x, y = z.test_pushes*rescale, z.test_evalb
        arrow(x, y, gg(z.dev_pushes)/rescale, offset=-conesize, c=c_vec_baseline, ax=ax)

    # --------------------------------------------------------------------------
    B.loc[:,'tradeoff'] = np.nan

    data = []

    # Loop over all rollout types joined on initial policy (i.e., the baseline).
    for i, bl in B.iterrows():
        spawn = lols[lols.args_initializer_penalty == bl.args_tradeoff]
        assert len(spawn) == len(RO_types)

        models = {}
        for ro in RO_types:
            [ix] = spawn[spawn.args_roll_out == ro].index
            models[ro] = lols.ix[ix]

        [dev_pushes] = df[df.policy == bl.policy].dev_pushes
        tradeoff = gg(dev_pushes)
        B.loc[i,'tradeoff'] = tradeoff

        if 1:
            # uncomment to run hypothesis tests.

            print colors.bold % colors.green % '============================================================='
            print 'tradeoff: %g' % tradeoff
            print

            baseline_acc, baseline_run = get_acc_run(D0[D0.policy == bl.policy])

            row = {
                'baseline': bl.policy,
                'baseline_accuracy': baseline_acc,
                'baseline_runtime': baseline_run,
                'baseline_reward': baseline_acc - tradeoff*baseline_run,
                'wps_baseline': bl.wps,
                'wallclock_baseline': bl.avg_bestof_time,
                'tradeoff': tradeoff,
            }

            star_sty = dict(alpha=1, lw=0, marker='*', s=700, zorder=100)

            for ro, model in sorted(models.items()):
                print colors.bold % '# %s' % ro
                sig, win = paired_permutation_test(D0,
                                                   a=bl.policy,
                                                   b=model.policy,
                                                   tradeoff=tradeoff,
                                                   R=5000)
                acc, run = get_acc_run(D0[D0.policy == model.policy])

                row[ro] = model.policy
                row['%s_accuracy'  % ro] = acc
                row['%s_runtime'   % ro] = run
                row['%s_reward'    % ro] = acc - tradeoff*run
                row['wps_%s'       % ro] = model.wps
                row['wallclock_%s' % ro] = model.avg_bestof_time
                row['winner_%s'    % ro] = win
                row['sig_%s'       % ro] = sig

                if win == +1:
                    pl.scatter([model.pushes_r],
                               [model.evalb],
                               c=C[ro],
                               **star_sty)

                elif win == -1:
                    pl.scatter([bl.pushes_r],
                               [bl.evalb],
                               c=C['baseline'],
                               **star_sty)

                # draw a dotted line to the baseline point.
                pl.plot([bl.pushes_r, model.pushes_r],
                        [bl.evalb, model.evalb],
                        c=C[ro],
                        lw=1,
                        alpha=0.75,
                        label=None,
                        linestyle='--')

            data.append(row)

    #[w,b] = np.polyfit(B.pushes, B.avg_bestof_time, deg=1)

    xx = lols.pushes_r
    xx = np.linspace(xx.min(), xx.max(), 12)

    # put ticks on the top of the plot.
    #ax.xaxis.tick_top()

    #pl.xticks(xx, ['%.2g\m(%.2g)' % (x/rescale / 1e6, (x/rescale*w+b)*100) for x in xx], rotation=0)
    #pl.text(0.4, 0.401, re.sub('(\d)e([\-+]\d+)', r'\1e^{\2}', r'$\textit{seconds} \approx %.2g \cdot \textit{pushes} + %.2g$' % (w,b)))
    #pl.xlabel('average megapushes ($\\approx$ milliseconds)')

    pl.xticks(xx, [r'$%.2g$' % (x/rescale / 1e6) for x in xx])
#    pl.xticks(xx, [r'%.2g' % (x/rescale / 1e6) for x in xx], rotation=45)

    if 'medium' not in args.root:
        pl.xlabel('millions of hyperedges built per sentence')

    pl.ylim(bestof.evalb.min()-0.02, bestof.evalb.max()+0.015)
    pl.xlim(bestof.pushes_r.min()-.01, bestof.pushes_r.max()+0.01)


    zf = pd.DataFrame(data).sort_values('tradeoff')
#    print zf[['tradeoff', 'baseline_reward', 'cp_reward', 'dp_reward', 'winner_cp', 'winner_dp']].sort_values('tradeoff').to_string(float_format='%.4g'.__mod__, index=0)
#    print zf[['tradeoff',
#              'baseline_accuracy', 'baseline_runtime',
#              'cp_accuracy', 'cp_runtime',
#              'dp_accuracy', 'dp_runtime',
#              'winner_cp', 'winner_dp']].sort_values('tradeoff').to_string(float_format='%.4g'.__mod__, index=0)

    if not args.quick:
        sig_file = args.root / 'significance.csv'
        print
        print colors.green % 'wrote %s' % sig_file
        print
        zf.to_csv(sig_file)

    if args.i:
        pl.ion(); pl.show()
        from arsenal.debug import ip; ip()
    else:
        pl.ioff(); pl.show()