def train():
    """Train an arc-standard parser model and save it to ``FLAGS.model``.

    Loads the training sentences from ``FLAGS.train_data``, makes
    ``FLAGS.epoch`` passes over them (reshuffling before every pass, with a
    fixed random seed so runs are repeatable), and finally writes the
    trained parameters with ``trainer.save``.

    Raises:
        Exception: whatever ``p.parse`` raises.  The offending sentence is
            logged first and the exception is then re-raised unchanged.
    """
    # NOTE(review): another ``train`` is defined later in this file; if both
    # live at module level, the later definition replaces this one.
    featExt = extractors.get(FLAGS.feature_extarctor)
    sents = io.transform_conll_sents(FLAGS.train_data, FLAGS.only_projective,
                                     FLAGS.unlex)
    trainer = MLTrainerActionDecider(ml.MultitronParameters(3),
                                     ArcStandardParsingOracle(), featExt)
    p = ArcStandardParser2(trainer)
    total = len(sents)

    random.seed("seed")
    for x in range(FLAGS.epoch):
        random.shuffle(sents)
        logging.info("iter %s/%s", x + 1, FLAGS.epoch)
        logging.info(" shuffle data ...")
        for i, sent in enumerate(sents):
            if i % 500 == 0:
                logging.info(" step %s/%s ...", i, total)
            try:
                # Parsing with a trainer-backed decider updates the model as
                # a side effect; the returned structure is not needed here.
                p.parse(sent)
            except Exception:
                logging.info("prob in sent: %s", i)
                logging.info("\n".join([
                    "%s %s %s %s" % (t['id'], t['form'], t['tag'], t['parent'])
                    for t in sent
                ]))
                # Bare ``raise`` keeps the original traceback; ``raise e``
                # would reset it to this line under Python 2.
                raise

    with open(FLAGS.model, "w") as fout:
        logging.info("save model file to disk [%s] ...", FLAGS.model)
        trainer.save(fout)
def train():
    """Train an arc-eager parser model, or dump feature vectors instead.

    Two modes, selected by ``FLAGS.externaltrainfile``:

    * set   -- "write" mode: every training sentence is parsed with a
      ``LoggingActionDecider`` that writes to ``FLAGS.externaltrainfile``,
      and the process then exits via ``sys.exit()``.
    * unset -- "train" mode: the model is trained for ``FLAGS.epoch`` passes
      (data reshuffled each pass with a fixed random seed) and saved to
      ``FLAGS.model``.

    Raises:
        IndexError: re-raised unchanged from ``p.parse`` in train mode, after
            the offending sentence has been logged.
        SystemExit: always, at the end of write mode.
    """
    # NOTE(review): an earlier ``train`` is defined in this file; if both
    # live at module level, this later definition is the one that is used.
    import random

    featExt = extractors.get(FLAGS.feature_extarctor)
    sents = io.transform_conll_sents(FLAGS.train_data, FLAGS.only_projective,
                                     FLAGS.unlex)

    if FLAGS.externaltrainfile:
        # Create feature vector files for training with an external
        # classifier.  If you don't know what it means, just ignore this
        # option.  The model file format is the same as Megam's.
        with open(FLAGS.externaltrainfile, "w") as fout:
            trainer = LoggingActionDecider(
                ArcEagerParsingOracle(pop_when_can=FLAGS.lazypop),
                featExt, fout)
            p = ArcEagerParser(trainer)
            for i, sent in enumerate(sents):
                sys.stderr.write(". %s " % i)
                sys.stderr.flush()
                p.parse(sent)
        # The output file is closed before the process terminates.
        sys.exit()

    nactions = 4
    trainer = MLTrainerActionDecider(
        ml.MultitronParameters(nactions),
        ArcEagerParsingOracle(pop_when_can=FLAGS.lazypop),
        featExt)
    p = ArcEagerParser(trainer)

    random.seed("seed")
    total = len(sents)
    for x in range(FLAGS.epoch):
        logging.info("iter %s/%s", x + 1, FLAGS.epoch)
        logging.info(" shuffle data ...")
        random.shuffle(sents)
        for i, sent in enumerate(sents):
            if i % 500 == 0:
                logging.info(" step %s/%s ...", i, total)
            try:
                p.parse(sent)
            except IndexError:
                logging.info("prob in sent: %s", i)
                logging.info("\n".join([
                    "%s %s %s %s" % (t['id'], t['form'], t['tag'], t['parent'])
                    for t in sent
                ]))
                # Bare ``raise`` keeps the original traceback intact.
                raise

    logging.info("save model file to disk [%s] ...", FLAGS.model)
    # Open the model file only once training has finished, so a failed run
    # does not leave a truncated model behind, and close it deterministically.
    with open(FLAGS.model, "w") as fout:
        trainer.save(fout)
def test():
    """Evaluate the arc-standard model in ``FLAGS.model`` on ``FLAGS.test_data``.

    For every sentence a ``@@@ <index> <running score>`` line is written to
    ``FLAGS.test_results``, followed by that sentence's attachment accuracy
    when at least one of its tokens was scored.  Sentences whose parse
    raises ``MLTrainerWrongActionException`` are skipped.  With
    ``FLAGS.ignore_punc`` set, tokens whose form starts with a punctuation
    character are not scored.

    Overall token accuracy and the fraction of sentences parsed without any
    mistake are printed at the end; both are reported as 0.0 when there is
    nothing to divide by.
    """
    # NOTE(review): another ``test`` is defined later in this file; if both
    # live at module level, the later definition replaces this one.
    featExt = extractors.get(FLAGS.feature_extarctor)
    sents = io.transform_conll_sents(FLAGS.test_data, FLAGS.only_projective,
                                     FLAGS.unlex)
    p = ArcStandardParser2(
        MLActionDecider(ml.MulticlassModel(FLAGS.model, True), featExt))

    good = 0.0
    bad = 0.0
    complete = 0.0
    with open(FLAGS.test_results, "w") as fout:
        for i, sent in enumerate(sents):
            mistake = False
            sgood = 0.0
            sbad = 0.0
            # The "+ 1" keeps this running score defined before any token
            # has been scored.
            fout.write("%s %s %s\n" % ("@@@", i, good / (good + bad + 1)))
            try:
                d = p.parse(sent)
            except MLTrainerWrongActionException:
                continue
            sent = d.annotate(sent)
            for tok in sent:
                if FLAGS.ignore_punc and tok['form'][0] in "`',.-;:!?{}":
                    continue
                if tok['parent'] == tok['pparent']:
                    good += 1
                    sgood += 1
                else:
                    bad += 1
                    sbad += 1
                    mistake = True
            if not mistake:
                complete += 1
            scored = sgood + sbad
            # A sentence with no scored token (empty, or punctuation only
            # with ignore_punc) has no defined accuracy; skip the line
            # instead of dividing by zero.
            if scored:
                fout.write("%s\n" % (sgood / scored))

    total_scored = good + bad
    accuracy = (good / total_scored) if total_scored else 0.0
    complete_rate = (complete / len(sents)) if len(sents) else 0.0
    # Single-string form prints identically whether ``print`` is the
    # Python 2 statement or the print function.
    print("accuracy: %s" % accuracy)
    print("complete: %s" % complete_rate)
def test():
    """Evaluate the arc-eager model in ``FLAGS.model`` on ``FLAGS.test_data``.

    Each sentence is parsed and annotated with ``annotate_allow_none``; a
    predicted parent of ``-1`` means the token was left unattached and is
    neither counted as good nor bad.  The annotated sentence is emitted with
    ``io.out_conll`` (using the ``oform`` column when ``FLAGS.unlex`` is
    set), and the per-sentence accuracy is written to ``FLAGS.test_results``
    whenever at least one token of the sentence was scored.  Sentences whose
    parse raises ``MLTrainerWrongActionException`` are skipped.

    Logged at the end: token accuracy, fraction of mistake-free sentences,
    and recall / precision / assigned ratio over ``(sentence, parent, child)``
    arcs.  Each ratio is reported as 0.0 when its denominator is zero.
    """
    # NOTE(review): an earlier ``test`` is defined in this file; if both
    # live at module level, this later definition is the one that is used.
    logging.info("test ...")
    featExt = extractors.get(FLAGS.feature_extarctor)
    p = ArcEagerParser(
        MLActionDecider(ml.MulticlassModel(FLAGS.model), featExt))

    good = 0.0
    bad = 0.0
    complete = 0.0
    reals = set()   # gold arcs: (sentence index, parent, token id)
    preds = set()   # predicted arcs, same layout (may include parent == -1)

    with open(FLAGS.test_results, "w") as fout:
        sents = io.transform_conll_sents(FLAGS.test_data,
                                         FLAGS.only_projective, FLAGS.unlex)
        for i, sent in enumerate(sents):
            sgood = 0.0
            sbad = 0.0
            mistake = False
            sys.stderr.write("%s %s %s\n" % ("@@@", i, good / (good + bad + 1)))
            try:
                d = p.parse(sent)
            except MLTrainerWrongActionException:
                # This happens only in "early update" parsers, and then we
                # just go on to the next sentence.
                continue
            sent = d.annotate_allow_none(sent)
            for tok in sent:
                if FLAGS.ignore_punc and tok['form'][0] in "`',.-;:!?{}":
                    continue
                reals.add((i, tok['parent'], tok['id']))
                preds.add((i, tok['pparent'], tok['id']))
                if tok['pparent'] == -1:
                    # Unattached token: not scored either way.
                    continue
                if tok['parent'] == tok['pparent']:
                    good += 1
                    sgood += 1
                else:
                    bad += 1
                    sbad += 1
                    mistake = True
            if FLAGS.unlex:
                io.out_conll(sent, parent='pparent', form='oform')
            else:
                io.out_conll(sent, parent='pparent', form='form')
            if not mistake:
                complete += 1
            logging.info("test result: sgood[%s], sbad[%s]", sgood, sbad)
            scored = sgood + sbad
            # Only the zero-token case has to be skipped to avoid a division
            # by zero; fully correct (sbad == 0) and fully wrong (sgood == 0)
            # sentences still get their score written.
            if scored > 0.0:
                fout.write("%s\n" % (sgood / scored))

    total_scored = good + bad
    logging.info("accuracy: %s",
                 (good / total_scored) if total_scored else 0.0)
    logging.info("complete: %s",
                 (complete / len(sents)) if len(sents) else 0.0)

    # Drop unattached predictions.  A generator over a single name is used
    # so no loop variable can leak and overwrite ``p`` or ``i`` (list
    # comprehension variables leak into the function scope on Python 2).
    assigned = set(arc for arc in preds if arc[1] != -1)
    hits = len(assigned.intersection(reals))
    logging.info("recall: %s",
                 (hits / float(len(reals))) if reals else 0.0)
    logging.info("precision: %s",
                 (hits / float(len(assigned))) if assigned else 0.0)
    logging.info("assigned: %s",
                 (len(assigned) / float(len(reals))) if reals else 0.0)