def training_job(corpus_fname, k, p, seed, static, dev_fname, model_out_prefix):
   sents = [s for s in io.conll_to_sents(file(corpus_fname)) if isprojective.is_projective(s)]
   print "training ",corpus_fname,k,p,seed,len(sents)

   explore = ExplorePolicy(k,p)
   TRAIN_OUT_FILE = "%s-ef.kps-k%s-p%s-seed%s" % (model_out_prefix, k, p, seed)
   if static:
      TRAIN_OUT_FILE = "%s-ef.kps-static-seed%s" % (model_out_prefix, seed)
      explore=None

   model = Model("features/znp.py", "%s.weights" % TRAIN_OUT_FILE)
   model.save("%s.model" % TRAIN_OUT_FILE)
   random.seed(seed)
   train(sents, model, dev=None, ITERS=20,save_every=None,explore_policy=explore,shuffle_sents=True)
   print "training of",corpus_fname,k,p,seed,"done"
   print "parsing"

   parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE + ".weights.FINAL", "features/znp.py")
   outf = file(TRAIN_OUT_FILE + ".dev.parsed","w")
   for sent in parsed:
      io.out_conll(sent, outf, parent='pparent',prel='pprel')
   uas,las,complete = eval(parsed)
   puas,plas,complete = eval(parsed,ignore_punct=True)
   outf.close()
   outf = file(TRAIN_OUT_FILE + ".dev.scores","w")
   print >> outf, "UAS:",uas,"LAS:",las,"NP_UAS:",puas,"NP_LAS:",plas
   outf.close()

   print "deleting"
   os.unlink(TRAIN_OUT_FILE + ".weights.FINAL")
   os.unlink(TRAIN_OUT_FILE + ".model")
def test_bu_labeler(fname, sents, fext=AnEdgeLabelFeatureExtractor()):
    """Label each sentence with a bottom-up labeler and print it in CoNLL form.

    The labeler is built from ``Labeler.load(fname)`` and ``fext``.  Every
    sentence in ``sents`` is labelled (heads read from 'parent', labels written
    to 'pprel') and then passed to ``io.out_conll``.  The elapsed wall-clock
    time of the whole loop is printed to stderr afterwards.
    """
    loaded = Labeler.load(fname)
    sentence_labeler = BottomupSimpleSentenceLabeler(loaded, fext)
    started = time.time()
    for sentence in sents:
        sentence_labeler.label(sentence, par='parent', prelout='pprel')
        io.out_conll(sentence, prel='pprel')
    elapsed = time.time() - started
    print >> sys.stderr, elapsed
Esempio n. 3
0
def training_job(corpus_fname, k, p, seed, static, dev_fname,
                 model_out_prefix):
    from training import online_greedy_train
    sents = [
        s for s in io.conll_to_sents(file(corpus_fname))
        if isprojective.is_projective(s)
    ]
    print "training ", corpus_fname, k, p, seed, len(sents)
    labels = set()
    for sent in sents:
        for tok in sent:
            if tok['prel'] == '_': tok['prel'] = 'dep'
            #tok['prel'] = 'dep'
            labels.add(tok['prel'])

    oracle = ArcHybridStaticOracle() if static else ArcHybridDynamicOracle()
    explore = None if static else ExplorePolicy(k, p)
    print "start"
    feature_extractor = features.extractors.get("hybrid.1")
    action_map, params = online_greedy_train(
        sents,
        transition_system=ArcHybridState,
        oracle=oracle,
        feature_extractor=feature_extractor,
        labels=labels,
        iterations=15,
        explore_policy=ExplorePolicy(k, p),
        random_seed=seed,
        shuffle_corpus=True)
    print "end"
    params.finalize()
    TRAIN_OUT_FILE = "%s-hybrid-k%s-p%s-seed%s" % (model_out_prefix, k, p,
                                                   seed)
    if static:
        TRAIN_OUT_FILE = "%s-hybrid-static-seed%s" % (model_out_prefix, seed)
    params.dump(file(TRAIN_OUT_FILE, "w"), sparse=True)
    pickle.dump(action_map, file(TRAIN_OUT_FILE + ".amap", "w"))
    print "training of", corpus_fname, k, p, seed, "done"

    print "parsing"
    parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE, feature_extractor,
                          ArcHybridState)
    print "writing"
    outf = file(TRAIN_OUT_FILE + ".dev.parsed", "w")
    for sent in parsed:
        io.out_conll(sent, outf, parent='pparent', prel='pprel')
    uas, las, complete = eval(parsed)
    puas, plas, complete = eval(parsed, ignore_punct=True)
    outf.close()
    outf = file(TRAIN_OUT_FILE + ".dev.scores", "w")
    print >> outf, "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas
    print "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas
    outf.close()

    print "deleting"
    os.unlink(TRAIN_OUT_FILE)
    os.unlink(TRAIN_OUT_FILE + ".amap")
def test_labeler(fname,
                 sents,
                 fext=AnEdgeLabelFeatureExtractor(),
                 guides=_dummyguides()):
    """Label each sentence with its paired guide and print it in CoNLL form.

    The labeler is built from ``Labeler.load(fname)`` and ``fext``.  Sentences
    and guides are paired with ``zip``; each sentence is labelled (heads read
    from 'parent', labels written to 'pprel', guide passed as ``sent_guides``)
    and then handed to ``io.out_conll``.  The elapsed wall-clock time of the
    loop is printed to stderr, prefixed with "time:".
    """
    loaded = Labeler.load(fname)
    sentence_labeler = SimpleSentenceLabeler(loaded, fext)
    started = time.time()
    for sentence, guide in zip(sents, guides):
        sentence_labeler.label(
            sentence, par='parent', prelout='pprel', sent_guides=guide)
        io.out_conll(sentence, prel='pprel')
    print >> sys.stderr, "time:", time.time() - started
Esempio n. 5
0
def conll_out(sent, extra=None, outf=sys.stdout):
    """Append the ``extra`` relations to ``sent`` as new tokens and print it.

    For every ``(rel, par, chl)`` triple in ``extra`` a shallow copy of ``chl``
    is appended to ``sent`` with:
      * 'tag'     set to ``'COIDX_<original chl id>'``
      * 'id'      set to the next position, ``len(sent) + 1``
      * 'pparent' set to ``par['id']``
      * 'prel'    set to ``rel``

    ``sent`` is modified in place, then written with ``io.out_conll`` using
    'pparent' as the head column.

    :param sent: list of token dicts; extended in place
    :param extra: iterable of ``(rel, par, chl)`` triples, or None for none
    :param outf: stream forwarded to ``io.out_conll`` as ``out``
    """
    from pio import io
    # The default extra=None used to be iterated directly and raise TypeError;
    # treat None as "no extra relations".
    for rel, par, chl in extra or ():
        newchl = copy.copy(chl)
        newchl['tag'] = 'COIDX_%s' % chl['id']
        newchl['id'] = len(sent) + 1
        newchl['pparent'] = par['id']
        newchl['prel'] = rel
        sent.append(newchl)
    io.out_conll(sent, parent='pparent', out=outf)
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if denominator is 0."""
    if not denominator:
        return 0.0
    return numerator / float(denominator)


def test():
    '''
    Test Model

    Parses every sentence of FLAGS.test_data with an ArcEagerParser driven by
    the model in FLAGS.model, prints each annotated sentence with
    io.out_conll, and logs attachment statistics.

    Tokens whose predicted head is -1 are not scored as good or bad.  They
    still count towards "recall" and "assigned" through the reference set.

    Per-sentence accuracies are written to FLAGS.test_results, one per line.
    Summary figures (accuracy, complete, recall, precision, assigned) go to
    the log.  A ratio whose denominator is zero is reported as 0.0.
    '''
    logging.info("test ...")
    featExt = extractors.get(FLAGS.feature_extarctor)
    p = ArcEagerParser(
        MLActionDecider(ml.MulticlassModel(FLAGS.model), featExt))

    good = 0.0
    bad = 0.0
    complete = 0.0
    # Counted in the loop so that `sents` does not have to support len().
    num_sents = 0

    # main test loop
    reals = set()
    preds = set()
    with open(FLAGS.test_results, "w") as fout:
        sents = io.transform_conll_sents(FLAGS.test_data,
                                         FLAGS.only_projective, FLAGS.unlex)
        for i, sent in enumerate(sents):
            num_sents += 1
            sgood = 0.0
            sbad = 0.0
            mistake = False
            sys.stderr.write("%s %s %s\n" % ("@@@", i, good /
                                             (good + bad + 1)))
            try:
                d = p.parse(sent)
            except MLTrainerWrongActionException:
                # this happens only in "early update" parsers, and then we just go on to
                # the next sentence..
                continue
            sent = d.annotate_allow_none(sent)
            for tok in sent:
                if FLAGS.ignore_punc and tok['form'][0] in "`',.-;:!?{}":
                    continue
                reals.add((i, tok['parent'], tok['id']))
                preds.add((i, tok['pparent'], tok['id']))
                # No predicted head: neither right nor wrong.
                if tok['pparent'] == -1:
                    continue
                if tok['parent'] == tok['pparent']:
                    good += 1
                    sgood += 1
                else:
                    bad += 1
                    sbad += 1
                    mistake = True
            if FLAGS.unlex:
                io.out_conll(sent, parent='pparent', form='oform')
            else:
                io.out_conll(sent, parent='pparent', form='form')
            if not mistake:
                complete += 1
            logging.info("test result: sgood[%s], sbad[%s]", sgood, sbad)
            # NOTE(review): only sentences with both hits and misses are
            # written, so fully correct / fully wrong ones are omitted --
            # kept as-is, confirm this is intended.
            if sgood > 0.0 and sbad > 0.0:
                fout.write("%s\n" % (sgood / (sgood + sbad)))

    logging.info("accuracy: %s", _safe_ratio(good, good + bad))
    logging.info("complete: %s", _safe_ratio(complete, num_sents))
    # Generator expression: unlike a Python 2 list comprehension, its loop
    # variable does not leak and overwrite `p` / `i` in this scope.
    assigned = set(pred for pred in preds if pred[1] != -1)
    matched = len(assigned.intersection(reals))
    logging.info("recall: %s", _safe_ratio(matched, len(reals)))
    logging.info("precision: %s", _safe_ratio(matched, len(assigned)))
    logging.info("assigned: %s", _safe_ratio(len(assigned), len(reals)))
Esempio n. 7
0
now = time.time()
for i, sent in enumerate(sents):
    sgood = 0.0
    sbad = 0.0
    mistake = False
    print >> sys.stdout, "%s %s %s\n" % ("@@@", i, good / (good + bad + 1))
    #io.out_conll(sent)
    try:
        d = p.parse(sent)
    except MLTrainerWrongActionException:
        # this happens only in "early update" parsers, and then we just go on to
        # the next sentence..
        print "WTF"
        continue
    sent = d.annotate(sent)
    io.out_conll(sent, parent='parent')
    io.out_conll(sent, parent='pparent')
    for tok in sent:
        #print tok['parent'],tok['pparent']
        if FLAGS.ignore_punc and tok['form'][0] in "`',.-;:!?{}": continue
        reals.add((i, tok['parent'], tok['id']))
        preds.add((i, tok['pparent'], tok['id']))
        if FLAGS.ignore_punc and tok['prel'] == 'punct': continue
        if tok['pparent'] == -1: continue
        if tok['parent'] == tok['pparent'] or tok['pparent'] == -1:
            if tok['prel'] == tok['pprel']: lgood += 1
            else: print "badl:", tok['prel'], tok['pprel']
            good += 1
            sgood += 1
        else:
            bad += 1