def main():
    from ngram import Ngram
    from model import Model
    from forest import Forest

    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "use cube pruning to speed up decoding")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "maximum number of items popped from the PQ: ratio*b", short_name="r")

    argv = FLAGS(sys.argv)

    weights = Model.cmdline_model()
    lm = Ngram.cmdline_ngram()

    false_decoder = CYKDecoder(weights, lm)

    def non_local_scorer(cedge, ders):
        # delta LM score of combining the sub-derivations under this edge
        (lmsc, alltrans, sig) = false_decoder.deltLMScore(cedge.lhsstr, ders)
        fv = Vector()
        fv["lm"] = lmsc
        return ((weights.dot(fv), fv), alltrans, sig)

    cube_prune = CubePruning(FeatureScorer(weights), non_local_scorer,
                             FLAGS.k, FLAGS.ratio)

    for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
        a = false_decoder.beam_search(forest, b=FLAGS.beam)
        b = cube_prune.run(forest.root)
        # beam search and cube pruning must agree on the 1-best model score
        # and feature vector. (The original "assert a[0], b[0].score[0]"
        # passed the second expression as the assert *message*, so it never
        # actually compared anything; "==" is the intended check.)
        assert a[0] == b[0].score[0]
        assert a[1] == b[0].score[1]
        print a
        print b[0]
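
# Hedged usage note: this main() is a consistency check rather than a decoder
# proper. It reads target forests from stdin, decodes each sentence twice
# (full beam search vs. cube pruning), and asserts both find the same 1-best.
# Weight and LM files are supplied through whatever flags Model.cmdline_model()
# and Ngram.cmdline_ngram() register in their own modules; the module name
# below is hypothetical:
#
#   cat dev.tforest | python cube_check.py -b 100 -k 1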
def main():
    from ngram import Ngram
    from model import Model
    from forest import Forest

    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "use cube pruning to speed up decoding")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "maximum number of items popped from the PQ: ratio*b", short_name="r")

    argv = FLAGS(sys.argv)
    [outfile] = argv[1:]

    weights = Model.cmdline_model()
    lm = Ngram.cmdline_ngram()

    false_decoder = CYKDecoder(weights, lm)
    out = utility.getfile(outfile, 1)
    old_bleu = Bleu()
    new_bleu = Bleu()

    for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
        oracle_forest, oracle_item = oracle_extracter(forest, weights,
                                                      false_decoder, 100, 2,
                                                      extract=100)
        print >>sys.stderr, "processed sent %s" % i
        oracle_forest.dump(out)

        # old oracle: best-BLEU derivation found in the original forest
        bleu, hyp, fv, edgelist = forest.compute_oracle(weights, 0.0, 1)
        forest.bleu.rescore(hyp)
        old_bleu += forest.bleu

        # new oracle: best item extracted into the oracle forest
        forest.bleu.rescore(oracle_item[0].full_derivation)
        new_bleu += forest.bleu

        # sanity check: the worst derivation in the new oracle forest
        bad_bleu, _, _, _ = oracle_forest.compute_oracle(weights, 0.0, -1)

        #for i in range(min(len(oracle_item), 5)):
        #    print >>sys.stderr, "Oracle Trans: %s %s %s" % (oracle_item[i].full_derivation, oracle_item[i].score, str(oracle_item[i].score[2]))
        #    print >>sys.stderr, "Oracle BLEU Score: %s" % (forest.bleu.rescore(oracle_item[i].full_derivation))

        print >>sys.stderr, "Oracle BLEU Score: %s" % forest.bleu.rescore(oracle_item[0].full_derivation)
        print >>sys.stderr, "Worst new Oracle BLEU Score: %s" % bad_bleu
        print >>sys.stderr, "Old Oracle BLEU Score: %s" % bleu
        print >>sys.stderr, "Running Oracle BLEU Score: %s" % new_bleu.compute_score()
        print >>sys.stderr, "Running Old Oracle BLEU Score: %s" % old_bleu.compute_score()
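
# Hedged reading of the two compute_oracle calls above: the final argument
# appears to flip the sign of the objective (+1 extracts the best-BLEU
# derivation, -1 the worst), which is why bad_bleu serves as a lower-bound
# sanity check on the extracted oracle forest. The "+=" on Bleu objects is
# assumed to accumulate per-sentence n-gram statistics, so compute_score()
# on new_bleu/old_bleu reports a running corpus-level BLEU; the Bleu class
# itself is not shown here.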
from ngram import Ngram
from model import Model
from forest import Forest

flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
flags.DEFINE_integer("debuglevel", 0, "debug level")
flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
flags.DEFINE_boolean("cube", True, "use cube pruning to speed up decoding")
flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
flags.DEFINE_integer("ratio", 3, "maximum number of items popped from the PQ: ratio*b", short_name="r")

argv = FLAGS(sys.argv)

weights = Model.cmdline_model()
lm = Ngram.cmdline_ngram()

decoder = CYKDecoder(weights, lm)

# corpus-level totals, accumulated over all sentences
tot_bleu = Bleu()
tot_score = 0.
tot_time = 0.
tot_len = tot_fnodes = tot_fedges = 0
tot_lmedges = 0
tot_lmnodes = 0

if FLAGS.debuglevel > 0:
    print >>logs, "beam size = %d" % FLAGS.beam

for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
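    # -- hedged sketch: the loop body is truncated in the source -------------
    # Given the totals initialized above, the body presumably times a
    # beam-search decode per sentence and accumulates scores and forest
    # statistics. The unpacking below follows the cube-pruning main() above
    # (model score first, then feature vector); "import time" at the top of
    # the file and everything past the beam_search call are assumptions, not
    # the original code. The remaining tot_* counters (sentence length,
    # forest/LM node and edge counts) would be updated from forest statistics
    # here as well.
    t0 = time.time()
    res = decoder.beam_search(forest, b=FLAGS.beam)
    tot_time += time.time() - t0
    tot_score += res[0]
    if FLAGS.debuglevel > 0:
        print >>logs, "sent %d decoded in %.2f s" % (i, time.time() - t0)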
def main():
    gc.set_threshold(100000, 10, 10)

    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "use cube pruning to speed up decoding")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "maximum number of items popped from the PQ: ratio*b", short_name="r")
    flags.DEFINE_boolean("dist", False, "distributed (hadoop) training")
    flags.DEFINE_string("prefix", "", "prefix for distributed training")
    flags.DEFINE_string("hadoop_weights", "", "hadoop weights (formatted specially)")
    flags.DEFINE_boolean("add_features", False, "add features to training data")
    flags.DEFINE_boolean("prune_train", False, "prune before decoding")
    flags.DEFINE_boolean("no_lm", False, "don't use the unigram language model")
    flags.DEFINE_boolean("pickleinput", False, "assume input is pickled")
    flags.DEFINE_string("oracle_forests", None, "oracle forests", short_name="o")
    flags.DEFINE_string("feature_map_file", None, "file with the integer-to-feature mapping (for LBFGS)")
    flags.DEFINE_boolean("cache_input", False, "cache input sentences (only works for pruned input)")
    flags.DEFINE_string("rm_features", None, "list of features to remove")
    flags.DEFINE_boolean("just_basic", False, "remove all features but basic")

    argv = FLAGS(sys.argv)

    # load weights either from the command line or from hadoop part files
    if FLAGS.weights:
        weights = Model.cmdline_model()
    else:
        vector = Vector()
        assert glob.glob(FLAGS.hadoop_weights), \
            "no weight files match --hadoop_weights"
        for file in glob.glob(FLAGS.hadoop_weights):
            for l in open(file):
                if not l.strip():
                    continue
                f, v = l.strip().split()
                vector[f] = float(v)
        weights = Model(vector)

    rm_features = set()
    if FLAGS.rm_features:
        for l in open(FLAGS.rm_features):
            rm_features.add(l.strip())

    lm = Ngram.cmdline_ngram()
    if FLAGS.no_lm:
        lm = None

    # pick the decoder for the requested mode (train / sgd / crf / eval)
    if argv[1] == "train":
        local_decode = ChiangPerceptronDecoder(weights, lm)
    elif argv[1] in ("sgd", "crf"):
        local_decode = MarginalDecoder(weights, lm)
    else:
        local_decode = MarginalDecoder(weights, lm)

    if FLAGS.add_features:
        tdm = local_features.TargetDataManager()
        local_decode.feature_adder = FeatureAdder(tdm)

    local_decode.prune_train = FLAGS.prune_train
    local_decode.use_pickle = FLAGS.pickleinput
    local_decode.cache_input = FLAGS.cache_input
    print >> logs, "Cache input is %s" % FLAGS.cache_input

    if FLAGS.debuglevel > 0:
        print >> logs, "beam size = %d" % FLAGS.beam

    if argv[1] == "train":
        if not FLAGS.dist:
            perc = trainer.Perceptron.cmdline_perc(local_decode)
        else:
            train_files = [FLAGS.prefix + file.strip() for file in sys.stdin]
            perc = distributed_trainer.DistributedPerceptron.cmdline_perc(local_decode)
            perc.set_training(train_files)
        perc.train()
    elif argv[1] == "sgd":
        crf = sgd.BaseCRF.cmdline_crf(local_decode)
        crf.set_oracle_files([FLAGS.oracle_forests])
        crf.train()
    elif argv[1] == "crf":
        if not FLAGS.dist:
            crf = CRF.LBFGSCRF.cmdline_crf(local_decode)
            crf.set_oracle_files([FLAGS.oracle_forests])
            crf.set_feature_mappers(add_features.read_features(FLAGS.feature_map_file))
            crf.rm_features(rm_features)
            if FLAGS.just_basic:
                print "Enforcing Basic"
                crf.enforce_just_basic()
            crf.train()
        else:
            # train_files = [FLAGS.prefix + file.strip() for file in sys.stdin]
            # oracle_files = [file + ".oracle" for file in train_files]
            print >> sys.stderr, "DistributedCRF"
            crf = distCRF.DistributedCRF.cmdline_distibuted_crf(local_decode)
            # os.system("~/.python/bin/dumbo rm train_input -hadoop /home/nlg-03/mt-apps/hadoop/0.20.1+169.89/")
            # os.system("~/.python/bin/dumbo put " + crf.trainfiles[0] + " train_input -hadoop /home/nlg-03/mt-apps/hadoop/0.20.1+169.89/")
            crf.set_feature_mappers(add_features.read_features(FLAGS.feature_map_file))
            crf.rm_features(rm_features)
            if FLAGS.just_basic:
                print "Enforcing Basic"
                crf.enforce_just_basic()
            # crf.set_oracle_files(oracle_files)
            crf.train()
    else:
        if not FLAGS.dist:
            print "Evaluating"
            eval = Evaluator(local_decode, [FLAGS.dev])
            eval.tune()
        else:
            dev_files = [FLAGS.prefix + file.strip() for file in sys.stdin]
            eval = Evaluator(local_decode, dev_files)
            print eval.eval(verbose=True).compute_score()