def aggregate_multiple_runtime_trials(Ds, Ps):
    """Collapse multiple dataframes `Ds` from different timing runes into a single
    one, by taking the min over runtimes (i.e., new runtime will be "best-of k"
    where k=|Ds|).

    Actually, this function does more than that. It appears to collapse over
    sentence too, e.g., computing corpus-EVALB and avg[best-of-k runtimes].

    """
    D0 = Ds[0]

    # Append trials together.
    all_trials = Ds[0]
    for dd in Ds[1:]:
        all_trials = all_trials.append(dd)

    # Take the min over time_total for each policy-example pair ("best of k").
    minz = all_trials[['policy','example','time_total']].groupby(['policy','example']).min()
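    # E.g., with k=2 trials, rows (policy_a, example_7, 1.3s) and
    # (policy_a, example_7, 1.1s) collapse to a single best-of-2 time of 1.1s.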

    data = []
    for policy in iterview(Ps):
        dump = path(policy).dirname()
        args = cPickle.load(file(dump / 'args.pkl'))
        log = pd.read_csv(dump / 'log.csv')

        # TODO: will need to add extra cases.
        if 'DP' in args.roll_out:
            type_ = 'DP'
        elif 'CP' in args.roll_out:
            type_ = 'CP'
        elif 'HY' in args.roll_out:
            type_ = 'HY'
        elif 'BODEN' in args.roll_out:
            type_ = 'baseline'
        else:
            raise ValueError(args.roll_out)

        min_times = minz.ix[policy]['time_total']

        P = D0[D0.policy == policy]
        f = cgw_f(P.want_and_got.sum(), P.got.sum(), P.want.sum())

        #pl.scatter(df.avg_bestof_time, df.evalb, c=C[name], lw=0)
        #show_frontier(df.avg_bestof_time, df.evalb, c=C[name], interpolation='linear', label=name)
        #[w,b] = np.polyfit(df.pushes, df.avg_bestof_time, deg=1)
        #show_frontier(df.pushes*w + b, df.evalb, interpolation='linear', c=C[name])

        if 0:
            # log-log plot of pushes v. seconds. Really great correlation!
            PP = P[['example','pushes']].join(min_times, on='example')
            PP['log(pushes)'] = np.log(PP.pushes)
            PP['log(seconds)'] = np.log(PP.time_total)
            compare('log(pushes)', 'log(seconds)', data=PP, scatter=1, show_regression=1)
            #pl.figure()
            # pushes v. seconds. Really great correlation!
            #PP = P[['example','pushes']].join(min_times, on='example')
            #compare('pushes', 'time_total', data=PP, scatter=1, show_regression=1)
            pl.ioff(); pl.show()

        if 0:
            # empirical runtime estimates

            # scatter plot sentence length against runtime.
            n_by_time = P[['example','N']].join(min_times, on='example')
            pl.scatter(n_by_time.N, n_by_time.time_total, alpha=0.5, lw=0)

            # highlight median runtime per sentence length.
            n_by_median_time = n_by_time.groupby('N').median()
            pl.plot(n_by_median_time.index, n_by_median_time.time_total, c='k', lw=2)

            # empirical exponent and constant factor
            compare(np.log(n_by_time.time_total), np.log(n_by_time.N), scatter=1, show_regression=1)
            pl.ioff(); pl.show()

        # use early stopping on dev to pick the policy.
        dev = log.ix[log['dev_new_policy_reward'].argmax()]

        row = {'avg_bestof_time': np.mean(min_times),
               'wps': np.mean(P.N) / np.mean(min_times),
               'pushes': np.mean(P.pushes),
               'pops': np.mean(P.pops),
               'policy': policy,
               'dev_pushes': dev.dev_new_policy_pushes,
               'dev_evalb': dev.dev_new_policy_evalb_corpus,
               'type': type_,
               'evalb': f}

        row.update({'args_'+k: v for k,v in args.__dict__.items()})

        data.append(row)

    # remove unused baselines (sorry this is a bit ugly).
    ddd = pd.DataFrame(data)
    others = ddd[ddd.type != 'baseline']
    B = ddd[ddd.type == 'baseline']
    used = set()
    for _, z in others.iterrows():
        [ix] = B[B.policy == z.args_init_weights].index
        used.add(ix)
    B = B.ix[list(used)]
    ddd = others.append(B)

    return ddd
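
# A hypothetical usage sketch (filenames are illustrative): `Ds` holds k
# repeated timing runs over the same policies/examples, and `Ps` lists the
# policy paths appearing in the `policy` column.
#
#   Ds = [pd.read_csv('trial-%d-evaluation.csv.gz' % i) for i in range(3)]
#   Ps = sorted(Ds[0].policy.unique())
#   summary = aggregate_multiple_runtime_trials(Ds, Ps)
#   print summary[['policy', 'type', 'evalb', 'avg_bestof_time', 'wps']]
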
def get_acc_run(X):
    "Return (accuracy, runtime) for dataframe `X`: corpus EVALB F1 and mean pushes."
    acc = cgw_f(X.want_and_got.sum(), X.got.sum(), X.want.sum())
    run = X.pushes.mean()
    return acc, run
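
# Hypothetical usage sketch, assuming `df` is one evaluation dataframe of the
# kind produced by `main` below:
#
#   for policy, X in df.groupby('policy'):
#       acc, run = get_acc_run(X)
#       print policy, acc, run
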
Example #3
def main():

    from argparse import ArgumentParser
    p = ArgumentParser()
    p.add_argument('directory', type=path)
    p.add_argument('--grammar', choices=('medium', 'big'), required=1)
    p.add_argument('--split',
                   choices=('dev', 'test', 'other', 'train'),
                   required=1)
    p.add_argument('--download-only', action='store_true')

    args = p.parse_args()

    if 'unpruned' in args.directory:
        policies = [path('unpruned') / 'unpruned']
    else:
        # Grab the best model parameters according to early stopping on each dev
        # reward surrogate.
        policies = []
        for x in (path('results/*-lols10-*/').glob('dump') +
                  path('results/*-baseline9-*/').glob('dump')):
            #        for x in (path('results/*-lols11-*/').glob('dump')):

            # only grab models matching the grammar specified.
            if cPickle.load(file(x / 'args.pkl')).grammar != args.grammar:
                continue
            df = pd.read_csv(x / 'log.csv')
            if df.get('dev_new_policy_evalb_corpus') is None:
                # [2016-04-29 Fri] SKIP some of the baseline9 jobs: they use
                # the wrong evaluation.
                assert 'baseline9' in x
                continue

            # identify iteration with best dev reward (surrogate).
            el = df.datetime.map(pd.to_datetime).tolist()
            df['elapsed'] = [(t - el[0]).total_seconds() / (24 * 60 * 60)
                             for t in el]
            df = df[df.elapsed <= 6]  # take the best policy <= 6 days of training.

            best = df.ix[df.dev_new_policy_reward.argmax()]

            print colors.yellow % 'best policy:'
            print best[['iteration', 'elapsed']]

            # download model file for that iteration, if we don't already have it.
            policy = x / ('new_policy-%03d.npz' % best.iteration)
            if not policy.exists():
                assert 0 == os.system(
                    'rsync --progress "[email protected]:/export/a11/timv/ldp/%s" %s'
                    % (policy, policy))
            policies.append(policy)

    if args.download_only:
        return

    s = Setup(grammar=args.grammar, train=0, dev=0, features=0)

    examples = list(s.load(args.split))
    shuffle(policies)

    outdir = args.directory
    outdir.mkdir_p()

    for pp, policy in enumerate(policies, start=1):
        print
        print colors.green % '[%s/%s] %s' % (pp, len(policies), policy)

        evaluation_file = (outdir / (policy.dirname() / '..').abspath().basename()
                           + '-evaluation.csv.gz')
        print evaluation_file

        if path(evaluation_file).exists():
            last_time = pd.read_csv(evaluation_file)
            [last_policy] = last_time.policy.unique()
            if last_policy == policy:
                print colors.yellow % 'SKIP: evaluation file exists.'
                continue
            print colors.red % 'replace old evaluation.'

        if 'unpruned' in policy:
            w = None
        else:
            w = np.load(policy)['coef']

        d = get_data(s.grammar, policy, w, examples)
        df = pd.DataFrame(d)

        if 1:
            xx = df.groupby('policy').sum()
            yy = df.groupby('policy').mean()
            print 'evalb:     %.3f' % (cgw_f(yy.want_and_got.sum(),
                                             yy.got.sum(), yy.want.sum()))
            print 'words/sec: %.1f' % (xx.N / xx.time_total)
            print 'sent/sec:  %.1f' % (1 / yy.time_total)
            print 'sec/sent:  %g' % (yy.time_total)
            #print 'features:  %4.1f%%' % (100 * yy.time_feature / yy.time_total)
            #print 'parse:     %4.1f%%' % (100 * yy.time_parse / yy.time_total)

        df.to_csv(evaluation_file, compression='gzip')
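
# Example invocation (script name and output directory are illustrative; the
# flags follow the argparse setup above). Use --download-only to just fetch the
# selected policy files and exit.
#
#   python evaluate.py results/evaluations --grammar big --split dev
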
Example #4
def _main(args):

    if args.deps:
        import StanfordDependencies
        dep = StanfordDependencies.get_instance(backend='subprocess')

    delta = args.delta
    assert 0 <= delta <= 1

    T = {}
    P = {}
    check_llh = 1

    if args.experiment not in ('grammars', ):
        # benchmark default parser
        if args.grammar == 'medium':
            grammar_file = 'data/medium'
        elif args.grammar == 'big':
            grammar_file = 'data/bubs/wsj_6'
        chomsky = False
        g = Grammar.load(grammar_file)

    if args.experiment == 'default-parser':
        P['lchild'] = Parser(leftchild, g, chomsky=0)

    elif args.experiment == 'grammar-loops':
        # Experiment: grammar loops
        P['lcbptr'] = Parser(leftchild_bp, g, chomsky=chomsky)
        P['lchild'] = Parser(leftchild, g, chomsky=chomsky)
        #P['x-prod'] = Parser(xprod, g, chomsky=chomsky)
        #P['agenda'] = AgendaParser(g, chomsky=chomsky)

    elif args.experiment == 'grammars':
        #P, color, marker = _leftchild_v_dense_yj_on_many_grammars()
        P, color, marker = _many_grammars()
        check_llh = False
    else:
        raise ValueError('Unrecognized experiment %r' % args.experiment)

    if args.experiment != 'grammars':
        # Default plot colors/markers; the 'grammars' branch returns its own.
        color = {name: c for name, c in zip(sorted(P), 'rgbym' * len(P))}
        marker = {name: 'o' for name in P}

    T = {x: Timer(x) for x in P}
    overall = []
    errors = []

    examples = ptb(args.fold,
                   minlength=3,
                   maxlength=args.maxlength,
                   n=args.examples)

    if 1:
        examples = list(examples)
        np.random.shuffle(examples)

    _evalb_gold = {}
    _evalb_pred = {}
    for k, p in enumerate(P):
        _evalb_gold[p] = open('tmp/evalb-%s.gold' % k, 'wb')
        _evalb_pred[p] = open('tmp/evalb-%s.pred' % k, 'wb')

    if args.policy:
        from ldp.prune.features import Features
        theta = np.load(args.policy)['coef']

        policy_grammar = Grammar.load(
            'data/bubs/wsj_6')  # FIXME: shouldn't be hardcoded!
        F = Features(policy_grammar, nfeatures=2**22)

    for i, (s, t) in enumerate(examples):
        print
        print green % 'Example: %s, length: %s' % (i, len(s.split()))
        print yellow % s

        e = Example(s, grammar=None, gold=t)
        sentence = e.tokens
        N = e.N

        if args.policy:
            e.tokens = policy_grammar.encode_sentence(e.sentence.split())
            keep = F.mask(e, theta)

        else:
            # No learned policy: randomly keep each span with probability
            # `delta` (delta=1 keeps everything); gold spans are never pruned.
            keep = np.ones((N, N + 1), dtype=np.int)
            for x in e.nodes:
                keep[x] = np.random.uniform(0, 1) <= delta
            for x in e.gold_spans:
                keep[x] = 1

        data = []

        #ugold = Tree.fromstring(e.gold_unbinarized)

        if args.deps:
            dep_gold = dep.convert_tree(
                e.gold_unbinarized,
                universal=0)  # TODO: include function tags???
            dep_unlabel_gold = {(z.index, z.head) for z in dep_gold}
            dep_label_gold = {(z.index, z.deprel, z.head) for z in dep_gold}

        for parser in sorted(P):

            b4 = time()

            with T[parser]:
                state = P[parser](e, keep)

            wallclock = time() - b4

            s = state.likelihood
            d = state.derivation
            pops = state.pops
            pushes = state.pushes

            ucoarse = P[parser].decode(e, d)

            #            print
            #            print parser
            #            print ucoarse

            # write gold and predicted trees to files so we can call evalb
            print >> _evalb_gold[parser], e.gold_unbinarized
            print >> _evalb_pred[parser], oneline(ucoarse)

            GW, G, W = evalb_unofficial(e.gold_unbinarized, binarize(ucoarse))
            h = cgw_f(GW, G, W)
            #            h = evalb(e.gold_unbinarized, ucoarse)

            row = {
                'name': parser,
                'llh': s,
                'sentence': sentence,
                'N': N,
                #'tree': tree,
                'evalb': h,
                'GotWant': GW,
                'Got': G,
                'Want': W,
                'pops': pops,
                'pushes': pushes,
                'wallclock': wallclock
            }

            if args.deps:
                # TODO: include function tags? What is the correct way to get target trees?
                dep_parse = dep.convert_tree(oneline(ucoarse), universal=0)
                dep_label = {(z.index, z.deprel, z.head) for z in dep_parse}
                dep_unlabel = {(z.index, z.head) for z in dep_parse}

                # TODO: Use the official eval.pl script from CoNLL task.
                UAS = len(dep_unlabel & dep_unlabel_gold) / float(e.N)
                LAS = len(dep_label & dep_label_gold) / float(e.N)
                row['LAS'] = LAS
                row['UAS'] = UAS

            data.append(row)
            overall.append(row)

        df = DataFrame(overall).groupby('name').mean()
        #df['wallclock'] = sum_df.wallclock  # use total time

        df.sort_values('wallclock', inplace=1)
        df['speedup'] = df.wallclock.max() / df.wallclock
        df['wps'] = df['N'] / df['wallclock']  # ok to use avg instead of sum

        # Determine which columns to display given command-line options.
        show_cols = [
            'evalb_corpus', 'wallclock', 'wps', 'speedup', 'pushes', 'pops',
            'LAS', 'UAS'
        ]
        if len(P) == 1:
            show_cols.remove('speedup')
        if not args.deps:
            show_cols.remove('LAS')
            show_cols.remove('UAS')

        def add_corpus_evalb(df):
            """Add corpus-level EVALB F1, computed from summed counts (this can
            differ from `evalb_avg`, the macro-average of per-sentence F1)."""
            s = DataFrame(overall).groupby('name').sum()  # separate sum dataframe.
            P = s.GotWant / s.Got   # corpus precision
            R = s.GotWant / s.Want  # corpus recall
            df['evalb_corpus'] = 2 * P * R / (P + R)
            df['evalb_avg'] = df.pop('evalb')  # rename the old per-sentence average.

        add_corpus_evalb(df)

        print df[show_cols]

        if args.pareto:
            accuracy_name = 'evalb'
            with axman(r'speed-accuracy ($\delta = %g$)' % delta) as ax:
                df = DataFrame(overall).groupby('name').mean()
                runtime = df.wallclock / df.wallclock.max()
                for name, x, y in zip(df.index, runtime, df[accuracy_name]):
                    c = color[name]
                    ax.scatter([x], [y],
                               alpha=0.75,
                               lw=0,
                               s=50,
                               c=c,
                               label=name,
                               marker=marker[name])
                ax.legend(loc=4)
                ax.set_xlim(-0.1, 1.1)
                ax.set_ylim(0, 1)
                ax.grid(True)
                ax.set_xlabel('runtime (relative to slowest)')
                ax.set_ylabel('accuracy (%s)' % accuracy_name)
                show_frontier(runtime, df[accuracy_name], ax=ax)

        if args.bylength:
            # Breakdown runtime differences of parsers by length.
            bylength = {name: [] for name in T}
            for length, df in DataFrame(overall).groupby('N'):
                df = df.groupby('name').mean()
                for name, v in df.wallclock.iteritems():
                    bylength[name].append([length, v])
            with axman('benchmark') as ax:
                for name, d in sorted(bylength.items()):
                    d.sort()
                    xs, ys = np.array(d).T
                    ax.plot(xs, ys, alpha=0.5, c=color[name], label=name)
                    ax.scatter(xs, ys, alpha=0.5, lw=1, c=color[name])
                ax.legend(loc=2)
                ax.set_xlabel('sentence length')
                ax.set_ylabel('seconds / sentence')

        if check_llh:
            # Only run this test when it makes sense, e.g., when all parses come
            # from the same grammar.
            s0 = data[0]['llh']
            for x in data:
                s = x['llh']
                name = x['name']
                if abs(s0 - s) > 1e-10:
                    errors.append({'parser': name, 'sentence': sentence})
                    print '[%s]: name: %s expect: %g got: %g' % (red % 'error',
                                                                 name, s0, s)

        Timer.compare_many(*T.values(), verbose=False)

        if errors:
            print red % 'errors: %s' % len(errors)

    print
    print green % '==============================='
    print green % 'DONE!'
    print

    print 'EVALB-unofficial:'
    print 2 * (df.GotWant / df.Got * df.GotWant /
               df.Want) / (df.GotWant / df.Got + df.GotWant / df.Want)
    print
    print 'EVALB-official:'
    import os
    for k, p in enumerate(P):
        _evalb_pred[p].close()
        _evalb_gold[p].close()
        out = 'tmp/evalb-%s.out' % k
        os.system('./bin/EVALB/evalb %s %s > %s' %
                  (_evalb_gold[p].name, _evalb_pred[p].name, out))
        with file(out) as f:
            for x in f:
                if x.startswith('Bracketing FMeasure'):
                    print p, float(x.strip().split()[-1])
                    break  # use the first one which is for all lengths
Example #5
def f1(self):
    return cgw_f(self.C, self.G, self.W)
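
# For reference, a minimal sketch of what `cgw_f` appears to compute in these
# snippets: an F1 score from matched/got/want bracket counts, matching the
# 2*P*R/(P+R) formula used in `_main` above. Illustrative assumption only, not
# the project's actual implementation.
def _cgw_f_sketch(C, G, W):
    "F1 from counts: precision = C/G, recall = C/W, F1 = 2*P*R/(P+R)."
    P = C / float(G) if G else 0.0
    R = C / float(W) if W else 0.0
    return 2.0 * P * R / (P + R) if (P + R) else 0.0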