def training_job(corpus_fname, k, p, seed, static, dev_fname, model_out_prefix): sents = [s for s in io.conll_to_sents(file(corpus_fname)) if isprojective.is_projective(s)] print "training ",corpus_fname,k,p,seed,len(sents) explore = ExplorePolicy(k,p) TRAIN_OUT_FILE = "%s-ef.kps-k%s-p%s-seed%s" % (model_out_prefix, k, p, seed) if static: TRAIN_OUT_FILE = "%s-ef.kps-static-seed%s" % (model_out_prefix, seed) explore=None model = Model("features/znp.py", "%s.weights" % TRAIN_OUT_FILE) model.save("%s.model" % TRAIN_OUT_FILE) random.seed(seed) train(sents, model, dev=None, ITERS=20,save_every=None,explore_policy=explore,shuffle_sents=True) print "training of",corpus_fname,k,p,seed,"done" print "parsing" parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE + ".weights.FINAL", "features/znp.py") outf = file(TRAIN_OUT_FILE + ".dev.parsed","w") for sent in parsed: io.out_conll(sent, outf, parent='pparent',prel='pprel') uas,las,complete = eval(parsed) puas,plas,complete = eval(parsed,ignore_punct=True) outf.close() outf = file(TRAIN_OUT_FILE + ".dev.scores","w") print >> outf, "UAS:",uas,"LAS:",las,"NP_UAS:",puas,"NP_LAS:",plas outf.close() print "deleting" os.unlink(TRAIN_OUT_FILE + ".weights.FINAL") os.unlink(TRAIN_OUT_FILE + ".model")
def training_job(corpus_fname, k, p, seed, static, dev_fname, model_out_prefix): from training import online_greedy_train sents = [ s for s in io.conll_to_sents(file(corpus_fname)) if isprojective.is_projective(s) ] print "training ", corpus_fname, k, p, seed, len(sents) labels = set() for sent in sents: for tok in sent: if tok['prel'] == '_': tok['prel'] = 'dep' #tok['prel'] = 'dep' labels.add(tok['prel']) oracle = ArcHybridStaticOracle() if static else ArcHybridDynamicOracle() explore = None if static else ExplorePolicy(k, p) print "start" feature_extractor = features.extractors.get("hybrid.1") action_map, params = online_greedy_train( sents, transition_system=ArcHybridState, oracle=oracle, feature_extractor=feature_extractor, labels=labels, iterations=15, explore_policy=ExplorePolicy(k, p), random_seed=seed, shuffle_corpus=True) print "end" params.finalize() TRAIN_OUT_FILE = "%s-hybrid-k%s-p%s-seed%s" % (model_out_prefix, k, p, seed) if static: TRAIN_OUT_FILE = "%s-hybrid-static-seed%s" % (model_out_prefix, seed) params.dump(file(TRAIN_OUT_FILE, "w"), sparse=True) pickle.dump(action_map, file(TRAIN_OUT_FILE + ".amap", "w")) print "training of", corpus_fname, k, p, seed, "done" print "parsing" parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE, feature_extractor, ArcHybridState) print "writing" outf = file(TRAIN_OUT_FILE + ".dev.parsed", "w") for sent in parsed: io.out_conll(sent, outf, parent='pparent', prel='pprel') uas, las, complete = eval(parsed) puas, plas, complete = eval(parsed, ignore_punct=True) outf.close() outf = file(TRAIN_OUT_FILE + ".dev.scores", "w") print >> outf, "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas print "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas outf.close() print "deleting" os.unlink(TRAIN_OUT_FILE) os.unlink(TRAIN_OUT_FILE + ".amap")
def main(): model_dir = 'test_model' train_file = 'data' test_file = 'data' beam_size = 1 output_file = None iter = 200 is_train = False if is_train: train_data = list(read_corpus(train_file)) print len(train_data) train_sents = [s for s in train_data if isprojective.is_projective(s)] print len(train_sents) train(model_dir, train_sents, iter, beam_size) else: test_data = list(read_corpus(test_file)) test(model_dir, test_data, output_file, beam_size, str(iter))
type="int", default=20)
# ^ Tail of a call that opens above this excerpt (presumably a
#   parser.add_option(...) for an integer option defaulting to 20 -- confirm).
parser.add_option("--every", dest="save_every", action="store", type="int", default=1)
opts, args = parser.parse_args()
# Print usage and exit with status 1 unless a training file was given and at
# least one of opts.model_file / opts.features_file is truthy.
# NOTE(review): because the check uses `or`, opts.model_file may still be
# unset here, in which case the file names below are built from that value.
if len(args) < 1 or not (opts.model_file or opts.features_file):
    parser.print_usage()
    sys.exit(1)
# Positional arguments: the training file, then an optional dev file.
TRAIN_FILE = args[0]
DEV_FILE = args[1] if len(args) > 1 else None
FEATURES = opts.features_file
MODEL = opts.model_file
# Create the model with "<MODEL>.weights" as its second argument and save it
# as "<MODEL>.model" before any training happens.
model = Model(FEATURES, "%s.weights" % MODEL)
model.save("%s.model" % MODEL)
# dev is an empty list when no dev file was supplied.
dev = [s for s in io.conll_to_sents(file(DEV_FILE))] if DEV_FILE else []
train_sents = list(io.conll_to_sents(file(TRAIN_FILE)))
# The two prints report the sentence count before and after dropping the
# sentences that isprojective.is_projective() rejects.
print len(train_sents)
train_sents = [s for s in train_sents if isprojective.is_projective(s)]
print len(train_sents)
train(train_sents, model, dev, opts.iters, save_every=opts.save_every)
# attachonly is switched on when the training file name ends in "E00"; the
# matching suffix is printed when that happens.
attachonly = False
if TRAIN_FILE[-3:] == "E00":
    print TRAIN_FILE[-3:]
    attachonly = True
# Create the model with "<MODEL>.weights" as its second argument and save it
# as "<MODEL>.model" before any training happens.
model = Model(FEATURES, "%s.weights" % MODEL)
model.save("%s.model" % MODEL)
# dev is an empty list when no dev file was supplied.
dev = [s for s in io.conll_to_sents(file(DEV_FILE))] if DEV_FILE else []
train_sents = list(io.conll_to_sents(file(TRAIN_FILE)))
gold_sents = list(io.conll_to_sents(file(GOLD_FILE)))
print len(train_sents), len(gold_sents)
# Pair gold and train sentences by position and keep the pairs whose GOLD
# sentence is projective.  Despite its name, `nonproj` holds the projective pairs.
# NOTE(review): zip() silently truncates to the shorter list, so a length
# mismatch between the two files is not detected here.
nonproj = [(s_g, s_t) for s_g, s_t in zip(gold_sents, train_sents) if isprojective.is_projective(s_g)]
# Split the pairs back into two parallel sequences.
# NOTE(review): this unpacking raises ValueError when no pair survives the filter.
gold_sents, train_sents = zip(*nonproj)
train_sents = list(train_sents)
gold_sents = list(gold_sents)
print len(train_sents), len(gold_sents)
# Both lists were just rebuilt from the same pairs, so this assert cannot fail.
assert len(train_sents) == len(gold_sents)
random.seed(opts.random_seed)
# With --follow_incorrect an ExplorePolicy(2, 0.9) is used; otherwise none.
if opts.follow_incorrect:
    explore = ExplorePolicy(2, 0.9) # almost always
else:
    explore = None
# Labeled training: the trainer is imported only when it is needed.  The call
# below continues past the end of this excerpt.
if (opts.labeled):
    from easyfirst import train_labeled
    train_labeled(train_sents, gold_sents,
parser.add_option("-o","--model",dest="model_file") parser.add_option("-f","--features",dest="features_file",default="None") parser.add_option("--iters",dest="iters",action="store",type="int",default=20) parser.add_option("--every",dest="save_every",action="store",type="int",default=1) opts, args = parser.parse_args() if len(args)<1 or not (opts.model_file or opts.features_file): parser.print_usage() sys.exit(1) TRAIN_FILE = args[0] DEV_FILE = args[1] if len(args)>1 else None FEATURES = opts.features_file MODEL = opts.model_file model = Model(FEATURES, "%s.weights" % MODEL) model.save("%s.model" % MODEL) dev = [s for s in io.conll_to_sents(file(DEV_FILE))] if DEV_FILE else [] train_sents = list(io.conll_to_sents(file(TRAIN_FILE))) print len(train_sents) train_sents = [s for s in train_sents if isprojective.is_projective(s)] print len(train_sents) train(train_sents, model, dev, opts.iters,save_every=opts.save_every)
def load_sentences(filename, ONLY_PROJECTIVE=False):
    """Return the sentences that ``io.conll_to_sents`` reads from ``filename``.

    Args:
        filename: path of the file to read.
        ONLY_PROJECTIVE: when true, keep only the sentences for which
            ``isprojective.is_projective`` returns a true value.

    Returns:
        A list of sentences, in file order.
    """
    # Build the list while the file is open so the handle can be closed
    # deterministically; the original left it to the garbage collector.
    with open(filename) as conll_file:
        return [s for s in io.conll_to_sents(conll_file)
                if (not ONLY_PROJECTIVE) or isprojective.is_projective(s)]
gflags.DEFINE_integer("random_seed", 0, "Random seed.") gflags.DEFINE_integer("save_every", 0, "Dump a model every k iterations.") args = FLAGS(sys.argv) print args DATA_FILE = args[1] featExt = extractors.get(FLAGS.feature_extractor) sents = list(io.conll_to_sents(file(DATA_FILE))) if FLAGS.train and (True or FLAGS.only_proj): import isprojective sents = [s for s in sents if isprojective.is_projective(s)] if FLAGS.add_dep_label: for sent in sents: for tok in sent: if tok['prel'] == '_': tok['prel'] = "dep" EXPLORE = 1 LABELED = True MODE = "train" if FLAGS.train else "test" #MODE="test_system" system = FLAGS.system
def load_sentences(filename, ONLY_PROJECTIVE=False):
    """Return the sentences that ``io.conll_to_sents`` reads from ``filename``.

    Args:
        filename: path of the file to read.
        ONLY_PROJECTIVE: when true, keep only the sentences for which
            ``isprojective.is_projective`` returns a true value.

    Returns:
        A list of sentences, in file order.
    """
    # Both branches build their list before the `with` block exits, so the
    # handle is closed deterministically (the original never closed it).
    with open(filename) as conll_file:
        all_sents = io.conll_to_sents(conll_file)
        if not ONLY_PROJECTIVE:
            return list(all_sents)
        return [s for s in all_sents if isprojective.is_projective(s)]