def main():
    """Command-line entry point: train a word-alignment model on a parallel corpus.

    Reads the parallel training data named by --train, builds the translation
    base distribution (uniform, a character LM, or a PYP over a character LM),
    runs the Gibbs sampler for --iter iterations, optionally pickles the trained
    model to --output, and prints one line of "f-e" alignment links per sentence
    pair to stdout.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Train alignment model')
    parser.add_argument('--train', help='training corpus', required=True)
    parser.add_argument('--iter', help='number of iterations', type=int,
                        required=True)
    parser.add_argument('--charlm', help='character language model')
    parser.add_argument('--pyp', help='G_w^0 is PYP(CharLM)', action='store_true')
    parser.add_argument('--output', help='model output path')
    parser.add_argument(
        '--reverse', help='train model in reverse direction (but output f-e)',
        action='store_true')
    args = parser.parse_args()

    source_vocabulary = Vocabulary()
    # Look up NULL first so the null word gets the first vocabulary index.
    source_vocabulary[NULL]
    target_vocabulary = Vocabulary()

    logging.info('Reading parallel training data')
    with open(args.train) as train:
        training_corpus = read_parallel_corpus(
            train, source_vocabulary, target_vocabulary, args.reverse)

    # Choose the base distribution G_w^0 over target words.
    if args.charlm:
        logging.info('Preloading character language model')
        if args.charlm == 'pu':
            char_lm = PoissonUniformCharLM(target_vocabulary)
        else:
            char_lm = CharLM(args.charlm, target_vocabulary)
        if args.pyp:
            t_base = PYP(char_lm, PYPPrior(1.0, 1.0, 1.0, 1.0, 0.1, 1.0))
        else:
            t_base = char_lm
    else:
        t_base = Uniform(len(target_vocabulary))

    model = AlignmentModel(len(source_vocabulary), t_base)

    logging.info('Training alignment model')
    alignments = run_sampler(model, training_corpus, args.iter)

    if args.output:
        # protocol=-1 selects the highest (binary) pickle protocol, so the
        # file must be opened in binary mode ('wb'), not text mode.
        with open(args.output, 'wb') as f:
            # Attach the vocabularies so the pickled model is self-contained.
            model.source_vocabulary = source_vocabulary
            model.target_vocabulary = target_vocabulary
            cPickle.dump(model, f, protocol=-1)

    # Alignments were sampled in the training direction; when --reverse was
    # used, swap the fields so the printed links are always f-e.
    fmt = ('{e}-{f}' if args.reverse else '{f}-{e}')
    for a, (f, e) in izip(alignments, training_corpus):
        # a[i] = j means target word i aligns to source word j; j == 0 is the
        # NULL word and is not printed, hence j - 1 for the real source index.
        print(' '.join(
            fmt.format(f=j - 1, e=i) for i, j in enumerate(a) if j > 0))
def main():
    """Train a word-alignment model on a parallel corpus (CLI entry point).

    Parses command-line options, reads the --train corpus, constructs the
    target-side base distribution (Uniform, CharLM, or PYP(CharLM)), samples
    alignments for --iter iterations, optionally pickles the model to
    --output, and writes one "f-e" link line per sentence pair to stdout.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Train alignment model')
    parser.add_argument('--train', help='training corpus', required=True)
    parser.add_argument('--iter', help='number of iterations', type=int,
                        required=True)
    parser.add_argument('--charlm', help='character language model')
    parser.add_argument('--pyp', help='G_w^0 is PYP(CharLM)', action='store_true')
    parser.add_argument('--output', help='model output path')
    parser.add_argument(
        '--reverse', help='train model in reverse direction (but output f-e)',
        action='store_true')
    args = parser.parse_args()

    source_vocabulary = Vocabulary()
    # Touch NULL first so it occupies the initial source-vocabulary slot.
    source_vocabulary[NULL]
    target_vocabulary = Vocabulary()

    logging.info('Reading parallel training data')
    with open(args.train) as train:
        training_corpus = read_parallel_corpus(
            train, source_vocabulary, target_vocabulary, args.reverse)

    # Select the base distribution for target-word generation.
    if args.charlm:
        logging.info('Preloading character language model')
        if args.charlm == 'pu':
            char_lm = PoissonUniformCharLM(target_vocabulary)
        else:
            char_lm = CharLM(args.charlm, target_vocabulary)
        t_base = (PYP(char_lm, PYPPrior(1.0, 1.0, 1.0, 1.0, 0.1, 1.0))
                  if args.pyp else char_lm)
    else:
        t_base = Uniform(len(target_vocabulary))

    model = AlignmentModel(len(source_vocabulary), t_base)

    logging.info('Training alignment model')
    alignments = run_sampler(model, training_corpus, args.iter)

    if args.output:
        # Fix: the highest pickle protocol (protocol=-1) is binary, so the
        # output file must be opened with 'wb' rather than text-mode 'w'.
        with open(args.output, 'wb') as f:
            # Store the vocabularies on the model so it can be reloaded alone.
            model.source_vocabulary = source_vocabulary
            model.target_vocabulary = target_vocabulary
            cPickle.dump(model, f, protocol=-1)

    # Output is always in f-e order; swap the template when training reversed.
    fmt = ('{e}-{f}' if args.reverse else '{f}-{e}')
    for a, (f, e) in izip(alignments, training_corpus):
        # j == 0 denotes alignment to NULL and is suppressed; real source
        # positions are shifted down by one to undo the NULL offset.
        print(' '.join(fmt.format(f=j - 1, e=i)
                       for i, j in enumerate(a) if j > 0))