def __init__(self, opts):
    """Load the models and the French input sentences named by ``opts``.

    Args:
        opts: Options object. ``opts.tm`` and ``opts.lm`` are passed to
            ``models.TM`` / ``models.LM``; ``opts.input`` is the path of the
            file whose lines are the sentences to translate.
    """
    self.opts = opts
    # sys.maxint as second argument: presumably "no limit" on translations
    # kept per phrase -- confirm against models.TM.
    self.tm = models.TM(opts.tm, sys.maxint)
    self.lm = models.LM(opts.lm)

    # Use a context manager so the input handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(opts.input) as input_file:
        self.french = [tuple(line.strip().split()) for line in input_file]

    # tm should translate unknown words as-is with probability 1
    # (built from a generator: sum(tuples, ()) re-copies the growing tuple
    # for every sentence and is quadratic in corpus size).
    vocabulary = set(word for sentence in self.french for word in sentence)
    for word in vocabulary:
        if (word,) not in self.tm:
            self.tm[(word,)] = [models.phrase(word, 0.0)]
def main():
    """Sum per-sentence log-probabilities for French/English sentence pairs.

    French sentences are read from the ``--input`` file and English
    sentences from standard input, one whitespace-tokenized sentence per
    line. Each pair is scored with ``sent_logp`` and the total is written
    to standard output. Exits with status 1 when the two inputs do not
    contain the same number of lines.
    """
    parser = argparse.ArgumentParser(
        description=
        'Compute unnormalized translation probability by marginalizing over alignments.'
    )
    parser.add_argument(
        '-i',
        '--input',
        dest='input',
        default='data/input',
        help='File containing sentences to translate (default=data/input)')
    parser.add_argument(
        '-t',
        '--translation-model',
        dest='tm',
        default='data/tm',
        help='File containing translation model (default=data/tm)')
    parser.add_argument(
        '-l',
        '--language-model',
        dest='lm',
        default='data/lm',
        help='File containing ARPA-format language model (default=data/lm)')
    opts = parser.parse_args()

    # NOTE(review): tm and lm are locals and are not passed to sent_logp
    # below -- confirm how sent_logp obtains its models.
    tm = models.TM(opts.tm, sys.maxint)
    lm = models.LM(opts.lm)

    # Close the input file as soon as it has been read.
    with open(opts.input) as input_file:
        french_sents = [tuple(line.strip().split()) for line in input_file]
    english_sents = [tuple(line.strip().split()) for line in sys.stdin]

    if len(french_sents) != len(english_sents):
        sys.stderr.write(
            "ERROR: French and English files are not the same length! Only complete output can be graded!\n"
        )
        sys.exit(1)

    total_logprob = 0.0
    # NOTE(review): nothing in this function increments this counter, so
    # the error report below can never fire as written.
    unaligned_sentences = 0
    for sent_num, (f, e) in enumerate(zip(french_sents, english_sents)):
        total_logprob += sent_logp(sent_num, (f, e))

    if unaligned_sentences > 0:
        sys.stderr.write(
            "ERROR: There were %d unaligned sentences! Only sentences that align under the model can be graded!\n"
            % unaligned_sentences)
    sys.stdout.write("%f\n" % total_logprob)
import models
from collections import namedtuple

# Command-line interface of the decoder.
optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/input", help="File containing sentences to translate (default=data/input)")
optparser.add_option("-t", "--translation-model", dest="tm", default="data/tm", help="File containing translation model (default=data/tm)")
optparser.add_option("-l", "--language-model", dest="lm", default="data/lm", help="File containing ARPA-format language model (default=data/lm)")
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=sys.maxint, type="int", help="Number of sentences to decode (default=no limit)")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=1, type="int", help="Limit on number of translations to consider per phrase (default=1)")
optparser.add_option("-s", "--stack-size", dest="s", default=1, type="int", help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

# NOTE(review): this unconditionally overrides whatever was passed with -k,
# so the command-line option currently has no effect.
opts.k = 4

tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)

# Read at most opts.num_sents sentences; the context manager closes the
# input file instead of leaving the handle open for the rest of the run.
with open(opts.input) as input_file:
    french = [tuple(line.strip().split()) for line in input_file.readlines()[:opts.num_sents]]

# tm should translate unknown words as-is with probability 1
# (the vocabulary is collected with a generator; sum(french, ()) copies the
# accumulated tuple once per sentence and is quadratic).
for word in set(word for sentence in french for word in sentence):
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0)]

# values for the model parameter
dd = 5
nn = -4
beta = 2
# NOTE(review): hard-coded lookup of the phrase ("de", "ce"); it looks like
# a debugging leftover and will fail if that phrase is missing from the
# translation model -- confirm whether `gooby` is still needed.
gooby = tm[("de", "ce")]
help="weight for language model") optparser.add_option("-b", "--beta", dest="beta", default=1.0, type="float", help="weight for translation model") optparser.add_option("-m", "--mute", dest="mute", default=0, type="int", help="mute the output") opts = optparser.parse_args()[0] tm = models.TM(opts.tm, opts.k, opts.mute) lm = models.LM(opts.lm, opts.mute) french = [ tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents] ] bound_width = float(opts.bwidth) hypothesis = namedtuple( "hypothesis", "lm_state, logprob, coverage, end, predecessor, phrase") def bitmap(sequence): """ Generate a coverage bitmap for a sequence of indexes """ return reduce(lambda x, y: x | y, map(lambda i: long('1' + '0' * i, 2), sequence), 0)
batch_size=trainingconfig['batch_size'], shuffle=True, num_workers=dataconfig["fetchworker_num"])
# (The line above is the tail of a call that starts before this excerpt --
# presumably the training DataLoader; confirm against the full file.)
# NOTE(review): the original indentation of this fragment is not visible;
# the nesting below is reconstructed and should be checked against the file.

# Validation loader: same batch size and worker count as above, but
# without shuffling.
cv_loader = torch.utils.data.DataLoader(
    valid_set, collate_fn=collate, batch_size=trainingconfig['batch_size'],
    shuffle=False, num_workers=dataconfig["fetchworker_num"])

# Only the "lstm" layer type is supported; anything else is rejected.
if modelconfig["type"] == "lstm":
    lmlayer = lm_layers.LSTM(modelconfig)
else:
    raise ValueError("Unknown model")

model = models.LM(lmlayer)
logging.info("\nModel info:\n{}".format(model))

# When continuing training, restore the model from exp_dir/last.pt.
if args.continue_training:
    logging.info("Load package from {}.".format(
        os.path.join(trainingconfig["exp_dir"], "last.pt")))
    pkg = torch.load(os.path.join(trainingconfig["exp_dir"], "last.pt"))
    model.restore(pkg["model"])

# Wrap the model in DataParallel when the config sets multi_gpu to True.
if "multi_gpu" in trainingconfig and trainingconfig["multi_gpu"] == True:
    logging.info("Let's use {} GPUs!".format(torch.cuda.device_count()))
    model = torch.nn.DataParallel(model)

# Move the model to CUDA (presumably unconditional -- confirm nesting).
model = model.cuda()

trainer = LmTrainer(model, trainingconfig, tr_loader, cv_loader)
argparser.add_argument("-o", "--output", dest="output", default="output", help="Ouput result file")
argparser.add_argument("-n", "--num_sentences", dest="num_sents", default=2**64, type=int, help="Number of sentences to decode (default=2^64)")
argparser.add_argument("-k", "--translations-per-phrase", dest="k", default=2**64, type=int, help="Limit on number of translations to consider per phrase (default=2^64)")
argparser.add_argument("-b", "--beam-size", dest="beam_size", default=1000, type=int, help="Maximum beam size (default=1000)")
argparser.add_argument("-dl", "--distortion-limit", dest="distortion_limit", default=10, type=int, help="Hard distortion limit (default=10)")
argparser.add_argument("-dp", "--distortion-parameter", dest="distortion_parameter", default=-0.01, type=float, help="Soft distortion parameter (default=-0.01)")
args = argparser.parse_args()

tm = models.TM(args.tm, args.k)
lm = models.LM(args.lm)

# Read at most args.num_sents sentences; the context manager closes the
# input file instead of leaking the handle.
with open(args.input, 'r', encoding='utf-8') as input_file:
    french = [tuple(line.strip().split()) for line in input_file.readlines()[:args.num_sents]]

# The output handle stays open on purpose: nothing in this excerpt writes to
# it, so it is presumably used (and closed) further down -- confirm.
output = open(args.output, 'w', encoding='utf-8')

# tm should translate unknown words as-is with probability 1
# (set comprehension instead of sum(french, ()), which copies the growing
# tuple once per sentence and is quadratic).
for word in {word for sentence in french for word in sentence}:
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0)]

# Record types used by the search.
hypothesis = namedtuple("hypothesis", "logprob, state, predecessor, phrase")
state = namedtuple("state", "e1, e2, bitString, r")
ph = namedtuple("ph", "s, t, phrase")

for f in french:
    # Starting point for each sentence: bitString 0 and logprob 0.0, with
    # the language model's begin state in the e2 slot.
    initial_state = state(None, lm.begin(), 0, 0.0)
    initial_hypothesis = hypothesis(0.0, initial_state, None, None)
import scorereranker

# Command-line interface: separate data/model paths for training (dev) and
# evaluation (test).
optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/dev/all.cn-en.cn", help="File containing sentences to translate for training")
optparser.add_option("-e", "--eval", dest="eval", default="data/test/all.cn-en.cn", help="File containing sentences to translate for evaluation")
optparser.add_option("-r", "--reference-train", dest="refdev", default="data/dev/all.cn-en.en0", help="English reference sentences for training")
optparser.add_option("-q", "--reference-test", dest="reftest", default="data/test/all.cn-en.en0", help="English reference sentences for evaluation")
optparser.add_option("-t", "--translation-model-train", dest="tmdev", default="data/large/phrase-table/dev-filtered/rules_cnt.final.out", help="File containing translation model for training")
optparser.add_option("-u", "--translation-model-test", dest="tmtest", default="data/large/phrase-table/test-filtered/rules_cnt.final.out", help="File containing translation model for evaluation")
optparser.add_option("-l", "--language-model-train", dest="lmdev", default="data/lm/en.gigaword.3g.filtered.train_dev_test.arpa.gz", help="File containing ARPA-format language model")
optparser.add_option("-m", "--language-model-test", dest="lmtest", default="data/lm/en.gigaword.3g.filtered.dev_test.arpa.gz", help="File containing ARPA-format language model")
# The help texts below now state the defaults that are actually configured
# (they previously both claimed "default=1").
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=3, type="int", help="Limit on number of translations to consider per phrase (default=3)")
optparser.add_option("-s", "--stack-size", dest="s", default=100, type="int", help="Maximum stack size (default=100)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False, help="Verbose mode (default=off)")
# type="int" so a value given on the command line has the same type as the
# integer default instead of arriving as a string.
optparser.add_option("-f", "--feedback-loop", dest="loop", default=3, type="int", help="The number of times the weight vector loops between decoder and reranker")
opts = optparser.parse_args()[0]

# NOTE(review): lmdev is loaded but not used anywhere in this excerpt.
lmdev = models.LM(opts.lmdev)
lmevl = models.LM(opts.lmtest)
weights = [1, 1, 1, 1, 1]

# evaluation
tm = models.TM(opts.tmtest, opts.k, weights[0:-1])
nbest_list = decode.get_candidates(opts.eval, tm, lmevl, weights, s=opts.s)
results = rerank.rerank(weights, nbest_list)
print >> sys.stderr, "BLEU SCORE: %f:" % scorereranker.score(results, opts.reftest)

# Write the reranked results; the context manager closes the file even if
# the write fails, and avoids shadowing the builtin name `file`.
with open("output_simp", "w") as out_file:
    out_file.write("\n".join(results))
def init(tmpath, lmpath):
    """Load the translation model and the language model.

    Args:
        tmpath: Path handed to ``models.TM`` as the translation-model file.
        lmpath: Path handed to ``models.LM`` as the language-model file.

    Returns:
        A ``(tm, lm)`` tuple with the two loaded models.
    """
    # Use the paths given by the caller: the previous body ignored both
    # parameters and read the module-level `opts` instead.
    tm = models.TM(tmpath, sys.maxint)
    lm = models.LM(lmpath)
    return tm, lm