def training_job(corpus_fname, k, p, seed, static, dev_fname, model_out_prefix): sents = [s for s in io.conll_to_sents(file(corpus_fname)) if isprojective.is_projective(s)] print "training ",corpus_fname,k,p,seed,len(sents) explore = ExplorePolicy(k,p) TRAIN_OUT_FILE = "%s-ef.kps-k%s-p%s-seed%s" % (model_out_prefix, k, p, seed) if static: TRAIN_OUT_FILE = "%s-ef.kps-static-seed%s" % (model_out_prefix, seed) explore=None model = Model("features/znp.py", "%s.weights" % TRAIN_OUT_FILE) model.save("%s.model" % TRAIN_OUT_FILE) random.seed(seed) train(sents, model, dev=None, ITERS=20,save_every=None,explore_policy=explore,shuffle_sents=True) print "training of",corpus_fname,k,p,seed,"done" print "parsing" parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE + ".weights.FINAL", "features/znp.py") outf = file(TRAIN_OUT_FILE + ".dev.parsed","w") for sent in parsed: io.out_conll(sent, outf, parent='pparent',prel='pprel') uas,las,complete = eval(parsed) puas,plas,complete = eval(parsed,ignore_punct=True) outf.close() outf = file(TRAIN_OUT_FILE + ".dev.scores","w") print >> outf, "UAS:",uas,"LAS:",las,"NP_UAS:",puas,"NP_LAS:",plas outf.close() print "deleting" os.unlink(TRAIN_OUT_FILE + ".weights.FINAL") os.unlink(TRAIN_OUT_FILE + ".model")
def test_bu_labeler(fname, sents, fext=AnEdgeLabelFeatureExtractor()):
    """Label each sentence bottom-up with a saved labeler and print it.

    The labeler is loaded from ``fname`` and wrapped in a
    ``BottomupSimpleSentenceLabeler`` together with ``fext``.  Each sentence
    is labeled using its ``parent`` field, the label is stored under
    ``pprel``, and the sentence is emitted through ``io.out_conll``.  The
    total elapsed wall-clock time is written to stderr afterwards.

    Note that the default ``fext`` instance is created once, when the
    function is defined, and is shared by every call that omits it.
    """
    loaded_model = Labeler.load(fname)
    sentence_labeler = BottomupSimpleSentenceLabeler(loaded_model, fext)

    started_at = time.time()
    for sentence in sents:
        sentence_labeler.label(sentence, par='parent', prelout='pprel')
        io.out_conll(sentence, prel='pprel')
    elapsed = time.time() - started_at

    print >> sys.stderr, elapsed
def training_job(corpus_fname, k, p, seed, static, dev_fname, model_out_prefix): from training import online_greedy_train sents = [ s for s in io.conll_to_sents(file(corpus_fname)) if isprojective.is_projective(s) ] print "training ", corpus_fname, k, p, seed, len(sents) labels = set() for sent in sents: for tok in sent: if tok['prel'] == '_': tok['prel'] = 'dep' #tok['prel'] = 'dep' labels.add(tok['prel']) oracle = ArcHybridStaticOracle() if static else ArcHybridDynamicOracle() explore = None if static else ExplorePolicy(k, p) print "start" feature_extractor = features.extractors.get("hybrid.1") action_map, params = online_greedy_train( sents, transition_system=ArcHybridState, oracle=oracle, feature_extractor=feature_extractor, labels=labels, iterations=15, explore_policy=ExplorePolicy(k, p), random_seed=seed, shuffle_corpus=True) print "end" params.finalize() TRAIN_OUT_FILE = "%s-hybrid-k%s-p%s-seed%s" % (model_out_prefix, k, p, seed) if static: TRAIN_OUT_FILE = "%s-hybrid-static-seed%s" % (model_out_prefix, seed) params.dump(file(TRAIN_OUT_FILE, "w"), sparse=True) pickle.dump(action_map, file(TRAIN_OUT_FILE + ".amap", "w")) print "training of", corpus_fname, k, p, seed, "done" print "parsing" parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE, feature_extractor, ArcHybridState) print "writing" outf = file(TRAIN_OUT_FILE + ".dev.parsed", "w") for sent in parsed: io.out_conll(sent, outf, parent='pparent', prel='pprel') uas, las, complete = eval(parsed) puas, plas, complete = eval(parsed, ignore_punct=True) outf.close() outf = file(TRAIN_OUT_FILE + ".dev.scores", "w") print >> outf, "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas print "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas outf.close() print "deleting" os.unlink(TRAIN_OUT_FILE) os.unlink(TRAIN_OUT_FILE + ".amap")
def test_labeler(fname, sents, fext=AnEdgeLabelFeatureExtractor(), guides=_dummyguides()):
    """Label each sentence with a saved labeler and print it as CoNLL.

    The labeler is loaded from ``fname`` and wrapped in a
    ``SimpleSentenceLabeler`` together with ``fext``.  Sentences are paired
    with the entries of ``guides``; each sentence is labeled using its
    ``parent`` field and its guide, the label is stored under ``pprel``, and
    the sentence is emitted through ``io.out_conll``.  The total elapsed
    wall-clock time is written to stderr afterwards.

    Both default arguments are created once, when the function is defined,
    and are shared by every call that omits them.
    NOTE(review): if ``_dummyguides()`` returns a one-shot iterator, later
    calls relying on the default will see it partly consumed -- confirm.
    """
    loaded_model = Labeler.load(fname)
    sentence_labeler = SimpleSentenceLabeler(loaded_model, fext)

    started_at = time.time()
    for sentence, guide in zip(sents, guides):
        sentence_labeler.label(
            sentence, par='parent', prelout='pprel', sent_guides=guide)
        io.out_conll(sentence, prel='pprel')
    elapsed = time.time() - started_at

    print >> sys.stderr, "time:", elapsed
def conll_out(sent, extra=None, outf=sys.stdout):
    """Write ``sent`` in CoNLL format, with extra edges appended as tokens.

    Arguments:
        sent: list of token dicts; it is extended IN PLACE with one new
            token per entry of ``extra``.
        extra: optional iterable of ``(rel, par, chl)`` triples.  For each
            triple a shallow copy of ``chl`` is appended to ``sent`` with
            ``tag`` set to ``'COIDX_<original id>'``, ``id`` set to
            ``len(sent) + 1``, ``pparent`` set to ``par['id']`` and ``prel``
            set to ``rel``.  ``None`` (the default) means no extra edges.
        outf: file-like object passed to ``io.out_conll`` as ``out``.  The
            default is bound to ``sys.stdout`` when the function is defined.
    """
    from pio import io

    # The default used to be iterated directly, which raised TypeError for
    # every call that did not pass ``extra``.
    if extra is None:
        extra = ()

    for rel, par, chl in extra:
        newchl = copy.copy(chl)
        newchl['tag'] = 'COIDX_%s' % chl['id']
        newchl['id'] = len(sent) + 1
        newchl['pparent'] = par['id']
        newchl['prel'] = rel
        sent.append(newchl)

    io.out_conll(sent, parent='pparent', out=outf)
def test():
    '''
    Test Model

    Parse every sentence of ``FLAGS.test_data`` with an ``ArcEagerParser``
    driven by the model in ``FLAGS.model``, print each parsed sentence via
    ``io.out_conll``, write per-sentence accuracies to ``FLAGS.test_results``
    and log the overall accuracy, complete-sentence rate, recall, precision
    and assigned ratio.

    Tokens whose predicted head is -1 are left out of the accuracy counts
    but still count towards recall.  When ``FLAGS.ignore_punc`` is set,
    tokens whose form starts with a punctuation character are skipped
    entirely.  A ratio whose denominator is zero is reported as 0.0.
    '''

    def _ratio(numerator, denominator):
        # Guard the summary metrics against empty input / nothing scored.
        if not denominator:
            return 0.0
        return float(numerator) / denominator

    logging.info("test ...")
    featExt = extractors.get(FLAGS.feature_extarctor)
    p = ArcEagerParser(
        MLActionDecider(ml.MulticlassModel(FLAGS.model), featExt))

    good = 0.0
    bad = 0.0
    complete = 0.0
    # Counted while iterating so that ``sents`` may be any iterable, not
    # only a sequence supporting len().
    num_sents = 0

    # main test loop
    reals = set()
    preds = set()
    with open(FLAGS.test_results, "w") as fout:
        sents = io.transform_conll_sents(FLAGS.test_data,
                                         FLAGS.only_projective, FLAGS.unlex)
        for i, sent in enumerate(sents):
            num_sents += 1
            sgood = 0.0
            sbad = 0.0
            mistake = False
            sys.stderr.write("%s %s %s\n" % ("@@@", i, good / (good + bad + 1)))
            try:
                d = p.parse(sent)
            except MLTrainerWrongActionException:
                # this happens only in "early update" parsers, and then we
                # just go on to the next sentence..
                continue
            sent = d.annotate_allow_none(sent)
            for tok in sent:
                if FLAGS.ignore_punc and tok['form'][0] in "`',.-;:!?{}":
                    continue
                reals.add((i, tok['parent'], tok['id']))
                preds.add((i, tok['pparent'], tok['id']))
                # Unassigned heads are neither right nor wrong.
                if tok['pparent'] == -1:
                    continue
                if tok['parent'] == tok['pparent']:
                    good += 1
                    sgood += 1
                else:
                    bad += 1
                    sbad += 1
                    mistake = True
            if FLAGS.unlex:
                io.out_conll(sent, parent='pparent', form='oform')
            else:
                io.out_conll(sent, parent='pparent', form='form')
            if not mistake:
                complete += 1
            logging.info("test result: sgood[%s], sbad[%s]", sgood, sbad)
            # NOTE(review): this only records sentences that have BOTH
            # correct and incorrect heads; fully correct or fully wrong
            # sentences are not written -- confirm this is intended.
            if sgood > 0.0 and sbad > 0.0:
                fout.write("%s\n" % (sgood / (sgood + sbad)))

    logging.info("accuracy: %s", _ratio(good, good + bad))
    logging.info("complete: %s", _ratio(complete, num_sents))
    # Keep only predictions with an assigned head.  A generator expression
    # with fresh names avoids rebinding ``p`` (the parser) and ``i``.
    preds = set((sent_idx, head, child)
                for sent_idx, head, child in preds if head != -1)
    matched = len(preds.intersection(reals))
    logging.info("recall: %s", _ratio(matched, len(reals)))
    logging.info("precision: %s", _ratio(matched, len(preds)))
    logging.info("assigned: %s", _ratio(len(preds), len(reals)))
now = time.time() for i, sent in enumerate(sents): sgood = 0.0 sbad = 0.0 mistake = False print >> sys.stdout, "%s %s %s\n" % ("@@@", i, good / (good + bad + 1)) #io.out_conll(sent) try: d = p.parse(sent) except MLTrainerWrongActionException: # this happens only in "early update" parsers, and then we just go on to # the next sentence.. print "WTF" continue sent = d.annotate(sent) io.out_conll(sent, parent='parent') io.out_conll(sent, parent='pparent') for tok in sent: #print tok['parent'],tok['pparent'] if FLAGS.ignore_punc and tok['form'][0] in "`',.-;:!?{}": continue reals.add((i, tok['parent'], tok['id'])) preds.add((i, tok['pparent'], tok['id'])) if FLAGS.ignore_punc and tok['prel'] == 'punct': continue if tok['pparent'] == -1: continue if tok['parent'] == tok['pparent'] or tok['pparent'] == -1: if tok['prel'] == tok['pprel']: lgood += 1 else: print "badl:", tok['prel'], tok['pprel'] good += 1 sgood += 1 else: bad += 1