Example #1
def test_load():
    "Smoke test: time several PTB loading paths and check expected sentence counts."
    from arsenal.timer import timeit
    with timeit('load 10 sentences maxlength 8'):
        assert len(list(ptb('train', n=10, maxlength=8))) == 10
    d = PTB.standard_split(PTB_ROOT)
    # skip training because it's slow to load all of it.
    #with timeit('loading train'):
    #    train = list(d.load_fold(d.train))
    with timeit('load dev'):
        dev = list(d.load_fold(d.dev))
    with timeit('load test'):
        test = list(d.load_fold(d.test))

    print 'n_sentences: dev %s' % len(dev)
    print 'n_sentences: test %s' % len(test)
    assert len(dev) == 1700
    assert len(test) == 2416
    with timeit('load dev w/ preprocessing'):
        dev = list(ptb('dev'))
    assert len(dev) == 1700
    with timeit('load test w/ preprocessing'):
        test = list(ptb('test'))
    assert len(test) == 2416

    from ldp.prune.example import Setup
    s = Setup(grammar='medium',
              train=0,
              dev=3000,
              minlength=0,
              maxlength=1000000)
    assert len(s.dev) == 1700

    #s = Setup(grammar='medium', train=0, dev=0, minlength=0, maxlength=1000000)
    assert len(list(s.load('test'))) == 2416
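All of these examples lean on `arsenal.timer.timeit` as a timing context manager. Its actual implementation is not shown in this listing, but a minimal sketch with the same shape (label in, elapsed time printed on exit) might look like the following; `timeit_sketch` is a hypothetical name, and the real arsenal implementation may differ.

from contextlib import contextmanager
import time

@contextmanager
def timeit_sketch(msg):
    # Print wall-clock time for the enclosed block, mirroring how
    # timeit('load dev') is used in the examples above.
    start = time.time()
    yield
    print '%s: %.4f sec' % (msg, time.time() - start)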
Example #2
def load(filename, default=None, saveit=False, verbose=False):
    "Load cached item by `name`, on miss call `get` function and cached results."
    f = path(filename)
    if f.exists():
        if verbose:
            print '[load] %s, size = %s' % (f, filesize(f))
            with timeit('[load] %s' % filename):
                with open(f, 'rb') as pkl:  # binary mode, matching the 'wb' used by save()
                    return cPickle.load(pkl)
        else:
            with open(f, 'rb') as pkl:
                return cPickle.load(pkl)
    else:
        if default is None:
            raise OSError("File not found '%s'" % filename)
        with timeit('[load] make %s' % filename):
            val = default()
        if saveit:
            save(filename, val, verbose=verbose)
        return val
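A typical cache-on-miss call would look like the line below; `features.pkl` and `build_features` are hypothetical names used purely for illustration, not part of the original module.

# Build only when the pickle is missing, then cache it for the next run.
features = load('features.pkl', default=build_features, saveit=True, verbose=True)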
Example #3
def save(filename, val, verbose=False):
    "Save `val` so we can load it via `load`."
    if verbose:
        with timeit('[save] %s' % filename):
            with open(filename, 'wb') as pkl:
                cPickle.dump(val, pkl)
            print '[save] %s, size = %s' % (filename, filesize(filename))
    else:
        with open(filename, 'wb') as pkl:
            cPickle.dump(val, pkl)
    return val
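`save` and `load` together form a simple pickle-backed cache pair. A round trip looks like the snippet below; '/tmp/demo.pkl' is an arbitrary example path.

save('/tmp/demo.pkl', {'answer': 42})            # writes the pickle, returns val
assert load('/tmp/demo.pkl') == {'answer': 42}   # hits the cached file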
Example #4
def _main(args):
    with timeit('load data'):
        corpus = CoNLL_U('data/UD/{lang}/UD_{lang}'.format(lang=args.lang),
                         tag_type=args.tag_type)

    if args.quick:
        corpus.train = corpus.train[:100]
        corpus.dev = corpus.train[:0]   # empty dev set in quick mode

    allowed_contexts = None
    if args.context_count is not None:
        print 'context count filter threshold %s' % args.context_count

        max_order = args.initial_order + args.outer_iterations
        if args.max_order is not None:
            max_order = args.max_order

        allowed_contexts = contexts_by_count(corpus, max_order,
                                             args.context_count)
        print 'allowed_contexts:', len(allowed_contexts)

        B = groupby2(allowed_contexts, len)
        print '(sizes %s)' % (', '.join('%s: %s' % (z, len(B[z]))
                                        for z in sorted(B)))

        if 0:
            # things that survived the threshold.
            for k, v in B.items():
                if k >= 10:  # context size >= 10
                    print
                    print k
                    for vv in v:
                        print '-'.join(vv)
            pl.plot(B.keys(), map(len, B.values()))
            pl.show()

        if 0:
            max_order = args.outer_iterations
            C = {}
            for n in xrange(1, max_order + 1):  # initial order + num iters
                C.update(corpus.tag_ngram_counts(n=n))
            pl.scatter(map(len, C.keys()), C.values(), lw=0, alpha=0.5)
            pl.show()

    elif args.max_order is not None:
        allowed_contexts = prefix_closure(
            fixed_order_contexts(corpus.Y, order=args.max_order))
        print 'allowed_contexts:', len(allowed_contexts)

    A = ActiveSet(corpus,
                  Y=corpus.Y,
                  train=corpus.make_instances('train', Instance),
                  dev=corpus.make_instances('dev', Instance),
                  group_budget=args.budget,
                  regularizer=args.C,
                  outer_iterations=args.outer_iterations,
                  inner_iterations=args.inner_iterations,
                  initial_contexts=fixed_order_contexts(
                      corpus.Y, args.initial_order),
                  allowed_contexts=allowed_contexts,
                  no_failure_arcs=args.baseline,
                  dump=args.dump)

    A.active_set()
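The `groupby2` helper is used above to bucket contexts by length (`B = groupby2(allowed_contexts, len)`). Its source is not shown in this listing; assuming it returns a dict mapping each key to the list of items with that key, which is consistent with the `B[z]` and `sorted(B)` usage in `_main`, an equivalent sketch is:

from collections import defaultdict

def groupby2_sketch(items, key):
    # Assumed behavior: group items into a dict of lists keyed by
    # key(item), matching how B is indexed in _main above.
    groups = defaultdict(list)
    for x in items:
        groups[key(x)].append(x)
    return dict(groups)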