def __init__(self, opts):
    self.opts = opts
    # load the phrase translation model and the ARPA language model
    self.tm = models.TM(opts.tm, sys.maxint)
    self.lm = models.LM(opts.lm)
    # one French sentence per input line, tokenized into a tuple of words
    self.french = [
        tuple(line.strip().split())
        for line in open(opts.input).readlines()
    ]
    # tm should translate unknown words as-is with probability 1
    for word in set(sum(self.french, ())):
        if (word,) not in self.tm:
            self.tm[(word,)] = [models.phrase(word, 0.0)]
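
# These snippets all rely on the course's models module. As a rough,
# hypothetical sketch of the interface they assume (not the shipped code):
# models.phrase is a namedtuple of an English string and a log-probability,
# and models.TM reads at most k candidates per French phrase into a dict.
from collections import namedtuple

phrase = namedtuple("phrase", "english, logprob")

def TM(filename, k):
    # one candidate per line, formatted "french ||| english ||| logprob"
    tm = {}
    for line in open(filename):
        f, e, logprob = line.strip().split(" ||| ")
        tm.setdefault(tuple(f.split()), []).append(phrase(e, float(logprob)))
    for f in tm:
        # keep only the k most probable candidates per phrase
        tm[f] = sorted(tm[f], key=lambda p: -p.logprob)[:k]
    return tm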
Example #2
import argparse
import sys

import models


def main():
    parser = argparse.ArgumentParser(
        description='Compute unnormalized translation probability '
        'by marginalizing over alignments.')
    parser.add_argument(
        '-i',
        '--input',
        dest='input',
        default='data/input',
        help='File containing sentences to translate (default=data/input)')
    parser.add_argument(
        '-t',
        '--translation-model',
        dest='tm',
        default='data/tm',
        help='File containing translation model (default=data/tm)')
    parser.add_argument(
        '-l',
        '--language-model',
        dest='lm',
        default='data/lm',
        help='File containing ARPA-format language model (default=data/lm)')
    opts = parser.parse_args()

    tm = models.TM(opts.tm, sys.maxint)
    lm = models.LM(opts.lm)
    french_sents = [
        tuple(line.strip().split()) for line in open(opts.input).readlines()
    ]
    english_sents = [tuple(line.strip().split()) for line in sys.stdin]
    if len(french_sents) != len(english_sents):
        sys.stderr.write(
            "ERROR: French and English files are not the same length! Only complete output can be graded!\n"
        )
        sys.exit(1)

    total_logprob = 0.0
    unaligned_sentences = 0
    for sent_num, (f, e) in enumerate(zip(french_sents, english_sents)):
        logprob = sent_logp(sent_num, (f, e))
        if logprob == float("-inf"):
            # count sentence pairs the model cannot align
            unaligned_sentences += 1
        total_logprob += logprob

    if unaligned_sentences > 0:
        sys.stderr.write(
            "ERROR: There were %d unaligned sentences! Only sentences that align under the model can be graded!\n"
            % unaligned_sentences)

    sys.stdout.write("%f\n" % total_logprob)
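
# main() assumes a sent_logp helper defined elsewhere. A minimal sketch of
# such a scorer under an IBM Model 1-style assumption, using a module-level
# tm as in the other examples (illustrative only; the original helper is
# not shown in this snippet):
import math

def sent_logp(sent_num, pair):
    f, e = pair
    logprob = 0.0
    for e_word in e:
        # marginalize e_word over every source position it could align to
        total = sum(math.exp(p.logprob)
                    for f_word in f
                    for p in tm.get((f_word,), [])
                    if p.english == e_word)
        if total == 0.0:
            return float("-inf")  # no alignment explains this word
        logprob += math.log(total / len(f))
    return logprob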
Example #3
import optparse
import sys

import models
from collections import namedtuple

optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/input", help="File containing sentences to translate (default=data/input)")
optparser.add_option("-t", "--translation-model", dest="tm", default="data/tm", help="File containing translation model (default=data/tm)")
optparser.add_option("-l", "--language-model", dest="lm", default="data/lm", help="File containing ARPA-format language model (default=data/lm)")
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=sys.maxint, type="int", help="Number of sentences to decode (default=no limit)")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=1, type="int", help="Limit on number of translations to consider per phrase (default=1)")
optparser.add_option("-s", "--stack-size", dest="s", default=1, type="int", help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,  help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

opts.k = 4  # hard-coded override of the command-line option
tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents]]

# tm should translate unknown words as-is with probability 1
for word in set(sum(french,())):
  if (word,) not in tm:
    tm[(word,)] = [models.phrase(word, 0.0)]
# values for the model parameters
dd = 5
nn = -4
beta = 2

# sample lookup: candidate translations for the French phrase ('de', 'ce')
candidates = tm[("de", "ce")]
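
# For illustration, each candidate is a models.phrase whose fields are the
# English string and its log-probability:
for candidate in candidates:
    print("%s\t%f" % (candidate.english, candidate.logprob))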
Example #4
                     help="weight for language model")
optparser.add_option("-b",
                     "--beta",
                     dest="beta",
                     default=1.0,
                     type="float",
                     help="weight for translation model")
optparser.add_option("-m",
                     "--mute",
                     dest="mute",
                     default=0,
                     type="int",
                     help="mute the output")
opts = optparser.parse_args()[0]
tm = models.TM(opts.tm, opts.k, opts.mute)
lm = models.LM(opts.lm, opts.mute)
french = [
    tuple(line.strip().split())
    for line in open(opts.input).readlines()[:opts.num_sents]
]
bound_width = float(opts.bwidth)
hypothesis = namedtuple(
    "hypothesis", "lm_state, logprob, coverage, end, predecessor, phrase")


def bitmap(sequence):
    """ Generate a coverage bitmap for a sequence of indexes """
    return reduce(lambda x, y: x | y, (1 << i for i in sequence), 0)
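
# Quick sanity checks: index i contributes bit i, so covering words 0 and 2
# leaves word 1 open in the coverage bitmap.
assert bitmap([]) == 0
assert bitmap([0]) == 0b1
assert bitmap([0, 2]) == 0b101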

Example #5
    tr_loader = torch.utils.data.DataLoader(
        train_set,
        collate_fn=collate,
        batch_size=trainingconfig['batch_size'],
        shuffle=True,
        num_workers=dataconfig["fetchworker_num"])
    cv_loader = torch.utils.data.DataLoader(
        valid_set,
        collate_fn=collate,
        batch_size=trainingconfig['batch_size'],
        shuffle=False,
        num_workers=dataconfig["fetchworker_num"])

    if modelconfig["type"] == "lstm":
        lmlayer = lm_layers.LSTM(modelconfig)
    else:
        raise ValueError("Unknown model")

    model = models.LM(lmlayer)
    logging.info("\nModel info:\n{}".format(model))

    if args.continue_training:
        logging.info("Load package from {}.".format(
            os.path.join(trainingconfig["exp_dir"], "last.pt")))
        pkg = torch.load(os.path.join(trainingconfig["exp_dir"], "last.pt"))
        model.restore(pkg["model"])

    if "multi_gpu" in trainingconfig and trainingconfig["multi_gpu"] == True:
        logging.info("Let's use {} GPUs!".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    model = model.cuda()

    trainer = LmTrainer(model, trainingconfig, tr_loader, cv_loader)
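
# The training snippet is driven by three config dicts. A hypothetical
# minimal example; only the keys referenced above are grounded in the code,
# the values are made up:
dataconfig = {"fetchworker_num": 4}
modelconfig = {"type": "lstm"}  # any other type raises ValueError above
trainingconfig = {
    "batch_size": 32,
    "exp_dir": "exp/lm",  # directory that last.pt is loaded from
    "multi_gpu": False,
}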
Example #6
  argparser.add_argument("-o", "--output", dest="output", default="output",
                         help="Ouput result file")
  argparser.add_argument("-n", "--num_sentences", dest="num_sents", default=2**64, type=int,
                         help="Number of sentences to decode (default=2^64)")
  argparser.add_argument("-k", "--translations-per-phrase", dest="k", default=2**64, type=int,
                         help="Limit on number of translations to consider per phrase (default=2^64)")
  argparser.add_argument("-b", "--beam-size", dest="beam_size", default=1000, type=int,
                         help="Maximum beam size (default=1000)")
  argparser.add_argument("-dl", "--distortion-limit", dest="distortion_limit", default=10, type=int,
                         help="Hard distortion limit (default=10)")
  argparser.add_argument("-dp", "--distortion-parameter", dest="distortion_parameter", default=-0.01, type=float,
                         help="Soft distortion parameter (default=-0.01)")
  args = argparser.parse_args()

  tm = models.TM(args.tm, args.k)
  lm = models.LM(args.lm)
  french = [tuple(line.strip().split()) for line in open(args.input, 'r', encoding='utf-8').readlines()[:args.num_sents]]
  output = open(args.output, 'w', encoding='utf-8')

  # tm should translate unknown words as-is with probability 1
  for word in set(sum(french,())):
    if (word,) not in tm:
      tm[(word,)] = [models.phrase(word, 0.0)]

  hypothesis = namedtuple("hypothesis", "logprob, state, predecessor, phrase")
  state = namedtuple("state", "e1, e2, bitString, r")
  ph = namedtuple("ph", "s, t, phrase")

  for f in french:
    initial_state = state(None, lm.begin(), 0, 0.0)
    initial_hypothesis = hypothesis(0.0, initial_state, None, None)
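    # The snippet ends after seeding the beam. A rough sketch of how stack
    # decoding could continue from here, assuming lm.score(state, word)
    # returns (new_state, logprob) as in the course's models module; the
    # pruning and scoring details below are illustrative, not the original:
    beams = [{} for _ in range(len(f) + 1)]  # beams[n]: hypotheses covering n words
    beams[0][initial_state] = initial_hypothesis
    for n in range(len(f)):
      best = sorted(beams[n].values(), key=lambda h: -h.logprob)
      for h in best[:args.beam_size]:
        for s in range(len(f)):
          if abs(h.state.r - s) > args.distortion_limit:
            continue  # hard distortion limit
          for t in range(s + 1, len(f) + 1):
            mask = ((1 << (t - s)) - 1) << s  # coverage bits for span [s, t)
            if h.state.bitString & mask or f[s:t] not in tm:
              continue
            for p in tm[f[s:t]]:
              logprob = h.logprob + p.logprob
              logprob += args.distortion_parameter * abs(h.state.r - s)
              lm_state = h.state.e2
              for word in p.english.split():
                lm_state, word_logprob = lm.score(lm_state, word)
                logprob += word_logprob
              new_state = state(h.state.e2, lm_state, h.state.bitString | mask, t)
              new_h = hypothesis(logprob, new_state, h, ph(s, t, p))
              m = n + t - s
              if new_state not in beams[m] or beams[m][new_state].logprob < logprob:
                beams[m][new_state] = new_h  # recombination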
Example #7
import optparse
import sys

import models
import decode
import rerank
import scorereranker

optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/dev/all.cn-en.cn", help="File containing sentences to translate for training")
optparser.add_option("-e", "--eval", dest="eval", default="data/test/all.cn-en.cn", help="File containing sentences to translate for evaluation")
optparser.add_option("-r", "--reference-train", dest="refdev", default="data/dev/all.cn-en.en0", help="English reference sentences for training")
optparser.add_option("-q", "--reference-test", dest="reftest", default="data/test/all.cn-en.en0", help="English reference sentences for evaluation")
optparser.add_option("-t", "--translation-model-train", dest="tmdev", default="data/large/phrase-table/dev-filtered/rules_cnt.final.out", help="File containing translation model for training")
optparser.add_option("-u", "--translation-model-test", dest="tmtest", default="data/large/phrase-table/test-filtered/rules_cnt.final.out", help="File containing translation model for evaluation")
optparser.add_option("-l", "--language-model-train", dest="lmdev", default="data/lm/en.gigaword.3g.filtered.train_dev_test.arpa.gz", help="File containing ARPA-format language model")
optparser.add_option("-m", "--language-model-test", dest="lmtest", default="data/lm/en.gigaword.3g.filtered.dev_test.arpa.gz", help="File containing ARPA-format language model")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=3, type="int", help="Limit on number of translations to consider per phrase (default=1)")
optparser.add_option("-s", "--stack-size", dest="s", default=100, type="int", help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,  help="Verbose mode (default=off)")
optparser.add_option("-f", "--feedback-loop", dest="loop", default=3,  help="The number of times the weight vector loops between decoder and reranker")

opts = optparser.parse_args()[0]

lmdev = models.LM(opts.lmdev)
lmevl = models.LM(opts.lmtest)

weights = [1, 1, 1, 1, 1]  # uniform initial feature weights; the first four go to the TM
# evaluation
tm = models.TM(opts.tmtest, opts.k, weights[0:-1])
nbest_list = decode.get_candidates(opts.eval, tm, lmevl, weights, s=opts.s)
results = rerank.rerank(weights, nbest_list)
print >> sys.stderr, "BLEU SCORE: %f:" % scorereranker.score(results, opts.reftest)
outfile = open("output_simp", "w")
outfile.write("\n".join(results) + "\n")
outfile.close()
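
# rerank.rerank picks one candidate per sentence using the weight vector.
# A minimal sketch of that idea, assuming each n-best entry carries a
# feature vector (the entry layout here is hypothetical, not the real
# module's format):
def rerank_sketch(weights, nbest_list):
    results = []
    for candidates in nbest_list:  # one n-best list per source sentence
        best = max(candidates,
                   key=lambda c: sum(w * v for w, v in zip(weights, c.features)))
        results.append(best.english)
    return results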
Example #8
def init(tmpath, lmpath):
    # use the path arguments, not a global opts object
    tm = models.TM(tmpath, sys.maxint)
    lm = models.LM(lmpath)

    return tm, lm
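
# Example call with the default paths used elsewhere in these snippets:
tm, lm = init("data/tm", "data/lm")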