def __init__(self, opts):
    """Load the models and the input sentences named by ``opts``.

    Reads ``opts.tm``, ``opts.lm`` and ``opts.input``, and stores the
    options, the translation model, the language model and the tokenized
    input on ``self`` as ``opts``, ``tm``, ``lm`` and ``french``.

    Args:
        opts: Object exposing ``tm``, ``lm`` and ``input`` attributes.
    """
    self.opts = opts
    # sys.maxsize exists on Python 2.6+ and Python 3; sys.maxint was
    # removed in Python 3.
    self.tm = models.TM(opts.tm, sys.maxsize)
    self.lm = models.LM(opts.lm)
    # One tuple of whitespace-separated tokens per input line. The context
    # manager closes the file as soon as it has been read.
    with open(opts.input) as input_file:
        self.french = [tuple(line.strip().split()) for line in input_file]
    # tm should translate unknown words as-is with probability 1
    # (the vocabulary is built in one pass instead of concatenating every
    # sentence tuple with sum(), which is quadratic).
    vocabulary = {word for sentence in self.french for word in sentence}
    for word in vocabulary:
        if (word, ) not in self.tm:
            self.tm[(word, )] = [models.phrase(word, 0.0)]
Exemple #2
0
def main():
    """Sum ``sent_logp`` over paired source and target sentences.

    Parses the command-line options, loads the translation and language
    models, reads the source sentences from the ``--input`` file and the
    target sentences from standard input, and writes the total of
    ``sent_logp`` over all sentence pairs to standard output.

    Exits with status 1 when the two inputs do not contain the same number
    of lines.
    """
    parser = argparse.ArgumentParser(
        description=
        'Compute unnormalized translation probability by marginalizing over alignments.'
    )
    parser.add_argument(
        '-i',
        '--input',
        dest='input',
        default='data/input',
        help='File containing sentences to translate (default=data/input)')
    parser.add_argument(
        '-t',
        '--translation-model',
        dest='tm',
        default='data/tm',
        help='File containing translation model (default=data/tm)')
    parser.add_argument(
        '-l',
        '--language-model',
        dest='lm',
        default='data/lm',
        help='File containing ARPA-format language model (default=data/lm)')
    opts = parser.parse_args()

    # NOTE(review): tm and lm are never read again inside main(); they are
    # kept because constructing them may matter to code outside this view.
    # sys.maxsize exists on Python 2.6+ and Python 3; sys.maxint was removed
    # in Python 3.
    tm = models.TM(opts.tm, sys.maxsize)
    lm = models.LM(opts.lm)
    # One tuple of whitespace-separated tokens per line. The context manager
    # closes the input file once it has been read.
    with open(opts.input) as input_file:
        french_sents = [tuple(line.strip().split()) for line in input_file]
    english_sents = [tuple(line.strip().split()) for line in sys.stdin]
    if len(french_sents) != len(english_sents):
        sys.stderr.write(
            "ERROR: French and English files are not the same length! Only complete output can be graded!\n"
        )
        sys.exit(1)

    total_logprob = 0.0
    # NOTE(review): nothing in main() ever increments this counter, so the
    # error message below cannot currently be emitted — confirm whether
    # sent_logp was meant to report unaligned sentences.
    unaligned_sentences = 0
    for sent_num, (f, e) in enumerate(zip(french_sents, english_sents)):
        total_logprob += sent_logp(sent_num, (f, e))

    if unaligned_sentences > 0:
        sys.stderr.write(
            "ERROR: There were %d unaligned sentences! Only sentences that align under the model can be graded!\n"
            % unaligned_sentences)

    sys.stdout.write("%f\n" % total_logprob)
Exemple #3
0
import sys
import models
from collections import namedtuple

# Command-line options.
optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/input",
                     help="File containing sentences to translate (default=data/input)")
optparser.add_option("-t", "--translation-model", dest="tm", default="data/tm",
                     help="File containing translation model (default=data/tm)")
optparser.add_option("-l", "--language-model", dest="lm", default="data/lm",
                     help="File containing ARPA-format language model (default=data/lm)")
# sys.maxsize exists on Python 2.6+ and Python 3; sys.maxint was removed in
# Python 3.
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=sys.maxsize, type="int",
                     help="Number of sentences to decode (default=no limit)")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=1, type="int",
                     help="Limit on number of translations to consider per phrase (default=1)")
optparser.add_option("-s", "--stack-size", dest="s", default=1, type="int",
                     help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,
                     help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

# NOTE(review): this unconditionally replaces whatever -k supplied (and its
# documented default of 1) — confirm the override is intended.
opts.k = 4
tm = models.TM(opts.tm, opts.k)
lm = models.LM(opts.lm)
# One tuple of whitespace-separated tokens per input line, keeping only the
# first opts.num_sents lines. The context manager closes the input file.
with open(opts.input) as input_file:
    french = [tuple(line.strip().split())
              for line in input_file.readlines()[:opts.num_sents]]

# tm should translate unknown words as-is with probability 1
# (the vocabulary is built in one pass instead of concatenating every
# sentence tuple with sum(), which is quadratic).
for word in {token for sentence in french for token in sentence}:
    if (word,) not in tm:
        tm[(word,)] = [models.phrase(word, 0.0)]
# values for the model parameter
dd = 5
nn = -4
beta = 2
Exemple #4
0
                     type="float",
                     help="weight for language model")
optparser.add_option("-b",
                     "--beta",
                     dest="beta",
                     default=1.0,
                     type="float",
                     help="weight for translation model")
optparser.add_option("-m",
                     "--mute",
                     dest="mute",
                     default=0,
                     type="int",
                     help="mute the output")
opts = optparser.parse_args()[0]
tm = models.TM(opts.tm, opts.k, opts.mute)
lm = models.LM(opts.lm, opts.mute)
# One tuple of whitespace-separated tokens per input line, keeping only the
# first opts.num_sents lines. The context manager closes the input file
# instead of leaving the handle open until garbage collection.
with open(opts.input) as input_file:
    french = [
        tuple(line.strip().split())
        for line in input_file.readlines()[:opts.num_sents]
    ]
bound_width = float(opts.bwidth)
# Record type with the fields named in the string below.
hypothesis = namedtuple(
    "hypothesis", "lm_state, logprob, coverage, end, predecessor, phrase")

def bitmap(sequence):
    """Generate a coverage bitmap for a sequence of indexes.

    Args:
        sequence: Iterable of non-negative integer bit positions.

    Returns:
        An integer with bit ``i`` set for every ``i`` in ``sequence``;
        ``0`` when ``sequence`` is empty. Repeated indexes have no extra
        effect.
    """
    # Shifting replaces building a binary string and parsing it, and avoids
    # the Python 2-only builtins `reduce` and `long`.
    mask = 0
    for index in sequence:
        mask |= 1 << index
    return mask
Exemple #5
0
    help="Try to resegment unknown words into two known words (default=off)")
optparser.add_option("-v",
                     "--verbose",
                     dest="verbose",
                     action="store_true",
                     default=False,
                     help="Verbose mode (default=off)")

opts = optparser.parse_args()[0]

# The language model is loaded once and reused by every training pass.
lm = models.LM(opts.lm)

# Training
# Start from seven equal weights. Each pass rebuilds the translation model
# from the first four current weights, asks decode.get_candidates for a
# candidate list, lets trainreranker.train produce the next weight vector,
# and reports the score of the reranked output on standard error.
# NOTE(review): range() needs opts.loop to be an int; the option's definition
# is outside this view — confirm it is parsed with an integer type.
weights = [1.0 / 7] * 7
for i in range(opts.loop):
    tm = models.TM(opts.tmdev, opts.k, weights[:4], simpmode=opts.simplify)
    # Materialized with list() because the candidates are consumed twice
    # below (by train and by rerank).
    nbest_list = list(
        decode.get_candidates(opts.input,
                              tm,
                              lm,
                              weights,
                              stack_size=opts.s,
                              verbose=opts.verbose,
                              simpmode=opts.simplify,
                              separate_unknown_words=opts.reseg_unknown))
    # opts.reference is split on whitespace before being handed on.
    weights = trainreranker.train(nbest_list, opts.reference.split(), weights)
    print weights
    results = rerank.rerank(weights, nbest_list)
    print >> sys.stderr, "TRAINING LOOP %d BLEU SCORE: %f:" % \
        (i, scorereranker.score(results, opts.reference.split()))
Exemple #6
0
                         help="File containing ARPA-format language model (default=data/lm)")
  argparser.add_argument("-o", "--output", dest="output", default="output",
                         help="Ouput result file")
  argparser.add_argument("-n", "--num_sentences", dest="num_sents", default=2**64, type=int,
                         help="Number of sentences to decode (default=2^64)")
  argparser.add_argument("-k", "--translations-per-phrase", dest="k", default=2**64, type=int,
                         help="Limit on number of translations to consider per phrase (default=2^64)")
  argparser.add_argument("-b", "--beam-size", dest="beam_size", default=1000, type=int,
                         help="Maximum beam size (default=1000)")
  argparser.add_argument("-dl", "--distortion-limit", dest="distortion_limit", default=10, type=int,
                         help="Hard distortion limit (default=10)")
  argparser.add_argument("-dp", "--distortion-parameter", dest="distortion_parameter", default=-0.01, type=float,
                         help="Soft distortion parameter (default=-0.01)")
  args = argparser.parse_args()

  tm = models.TM(args.tm, args.k)
  lm = models.LM(args.lm)
  french = [tuple(line.strip().split()) for line in open(args.input, 'r', encoding='utf-8').readlines()[:args.num_sents]]
  output = open(args.output, 'w', encoding='utf-8')

  # tm should translate unknown words as-is with probability 1
  for word in set(sum(french,())):
    if (word,) not in tm:
      tm[(word,)] = [models.phrase(word, 0.0)]

  hypothesis = namedtuple("hypothesis", "logprob, state, predecessor, phrase")
  state = namedtuple("state", "e1, e2, bitString, r")
  ph = namedtuple("ph", "s, t, phrase")

  for f in french:
    initial_state = state(None, lm.begin(), 0, 0.0)
                     default=1,
                     type="int",
                     help="Verbosity level, 0-3 (default=1)")
optparser.add_option("-o",
                     "--logfile",
                     dest="logfile",
                     default=None,
                     help="filename for logging output")
opts = optparser.parse_args()[0]

# Logging is only configured when a log file was requested.
if opts.logfile:
    logging.basicConfig(filename=opts.logfile,
                        filemode='w',
                        level=logging.INFO)

# sys.maxsize exists on Python 2.6+ and Python 3; sys.maxint was removed in
# Python 3.
tm = models.TM(opts.tm, sys.maxsize)
lm = models.LM(opts.lm)
# One tuple of whitespace-separated tokens per line. The context manager
# closes the input file once it has been read.
with open(opts.input) as input_file:
    french = [tuple(line.strip().split()) for line in input_file]
english = [tuple(line.strip().split()) for line in sys.stdin]

# tm should translate unknown words as-is with probability 1
# (the vocabulary is built in one pass instead of concatenating every
# sentence tuple with sum(), which is quadratic).
for word in {token for sentence in french for token in sentence}:
    if (word, ) not in tm:
        tm[(word, )] = [models.phrase(word, 0.0)]


def maybe_write(s, verbosity):
    """Send ``s`` to the log and/or standard output.

    ``s`` is passed to ``logging.info`` whenever ``opts.logfile`` is truthy.
    Independently, it is written to standard output (no newline is appended)
    whenever ``opts.verbosity`` is strictly greater than ``verbosity``.
    """
    log_enabled = bool(opts.logfile)
    if log_enabled:
        logging.info(s)
    if not (opts.verbosity > verbosity):
        return
    sys.stdout.write(s)
Exemple #8
0
                     dest="verbose",
                     action="store_true",
                     default=False,
                     help="Verbose mode (default=off)")
# type="int" keeps a value given on the command line consistent with the
# integer default; without it optparse would hand back a string.
optparser.add_option(
    "-f",
    "--feedback-loop",
    dest="loop",
    default=10,
    type="int",
    help=
    "The number of times the weight vector loops between decoder and reranker")

opts = optparser.parse_args()[0]

weights = [1, 1, 1, 1, 1]
lm = models.LM(opts.lm)
# The translation model receives every weight except the last one.
tm = models.TM(opts.tmtest, opts.k, weights[0:-1])
nbest_list = decode.get_candidates(opts.eval, tm, lm, weights)
# Keep one sentence per run of equal indexes: the first candidate seen each
# time the leading index field changes.
results = []
pt = -1
for cand in nbest_list:
    # Each candidate is expected to hold exactly three "|||"-separated fields.
    (i, sentence, features) = cand.strip().split("|||")
    i = int(i)
    if i != pt:
        pt = i
        results.append(sentence)

# The context manager closes the file even if the write fails, and the handle
# no longer shadows the Python 2 builtin `file`.
with open("output_d", "w") as output_file:
    output_file.write("\n".join(results))
Exemple #9
0
        opts.lm = "data/lm/en.gigaword.3g.filtered.train_dev_test.arpa.gz"
        opts.tm = "data/large/phrase-table/dev-filtered/rules_cnt.final.out"
    elif opts.dataset == "test":
        opts.input = "data/test/all.cn-en.cn"
        opts.lm = "data/lm/en.gigaword.3g.filtered.train_dev_test.arpa.gz"
        opts.tm = "data/large/phrase-table/test-filtered/rules_cnt.final.out"
    if opts.weights is None:
        weights = [1. / number_of_features] * number_of_features
    else:
        with open(opts.weights) as weights_file:
            weights = [float(line.strip()) for line in weights_file]
            # weights = map(lambda x: 1.0 if math.isnan(x) or x == float("-inf") or x == float("inf") or x == 0.0 else x, w)
            assert len(weights) == number_of_features

    if opts.simpmode:
        tm = models.TM(opts.tm, opts.k, weights, simpmode=True)
    else:
        tm = models.TM(opts.tm, opts.k, weights, simpmode=False)
    lm = models.LM(opts.lm)

    candidates = get_candidates(opts.input,
                                tm,
                                lm,
                                weights,
                                stack_size=opts.s,
                                nbest=opts.nbest,
                                simpmode=opts.simpmode,
                                separate_unknown_words=opts.reseg_unknown,
                                verbose=opts.verbose)
    for i in candidates:
        print i
Exemple #10
0
def init(tmpath, lmpath):
    """Load the translation and language models from the given paths.

    Args:
        tmpath: Path handed to ``models.TM``.
        lmpath: Path handed to ``models.LM``.

    Returns:
        A ``(tm, lm)`` tuple holding the two loaded models.
    """
    # Use the arguments rather than a module-level ``opts``: the function
    # then loads what the caller asked for and does not depend on a global
    # being defined. sys.maxsize replaces sys.maxint, which was removed in
    # Python 3.
    tm = models.TM(tmpath, sys.maxsize)
    lm = models.LM(lmpath)

    return tm, lm