Code example #1
0
def main():
    """CLI entry point: train a Bayesian word-alignment model.

    Reads a parallel corpus, builds source/target vocabularies, constructs
    the translation base distribution (uniform, char-LM, or PYP over a
    char-LM), runs the Gibbs sampler, optionally pickles the trained model,
    and prints the final alignments one sentence per line.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Train alignment model')
    parser.add_argument('--train', help='training corpus', required=True)
    parser.add_argument('--iter',
                        help='number of iterations',
                        type=int,
                        required=True)
    parser.add_argument('--charlm', help='character language model')
    parser.add_argument('--pyp',
                        help='G_w^0 is PYP(CharLM)',
                        action='store_true')
    parser.add_argument('--output', help='model output path')
    parser.add_argument(
        '--reverse',
        help='train model in reverse direction (but output f-e)',
        action='store_true')

    args = parser.parse_args()

    source_vocabulary = Vocabulary()
    # Touch NULL first so the null word occupies a fixed slot (presumably
    # index 0) in the source vocabulary — TODO confirm against Vocabulary.
    source_vocabulary[NULL]
    target_vocabulary = Vocabulary()

    logging.info('Reading parallel training data')
    with open(args.train) as train:
        training_corpus = read_parallel_corpus(train, source_vocabulary,
                                               target_vocabulary, args.reverse)

    # Choose the base distribution G^0 for target words.
    if args.charlm:
        logging.info('Preloading character language model')
        if args.charlm == 'pu':
            # Sentinel value 'pu' selects the Poisson-uniform char model
            # instead of loading an LM from a file path.
            char_lm = PoissonUniformCharLM(target_vocabulary)
        else:
            char_lm = CharLM(args.charlm, target_vocabulary)
        if args.pyp:
            t_base = PYP(char_lm, PYPPrior(1.0, 1.0, 1.0, 1.0, 0.1, 1.0))
        else:
            t_base = char_lm
    else:
        t_base = Uniform(len(target_vocabulary))
    model = AlignmentModel(len(source_vocabulary), t_base)

    logging.info('Training alignment model')
    alignments = run_sampler(model, training_corpus, args.iter)

    if args.output:
        # BUGFIX: protocol=-1 selects the highest (binary) pickle protocol,
        # so the file must be opened in binary mode; text mode ('w')
        # corrupts the dump on platforms that translate newlines.
        with open(args.output, 'wb') as f:
            # Attach the vocabularies so the pickled model is self-contained.
            model.source_vocabulary = source_vocabulary
            model.target_vocabulary = target_vocabulary
            cPickle.dump(model, f, protocol=-1)

    # Emit alignments in "f-e" order regardless of training direction.
    fmt = ('{e}-{f}' if args.reverse else '{f}-{e}')
    for a, (f, e) in izip(alignments, training_corpus):
        # j == 0 is the NULL alignment and is skipped; j-1 undoes the
        # NULL offset in the source sentence indexing.
        print(' '.join(
            fmt.format(f=j - 1, e=i) for i, j in enumerate(a) if j > 0))
Code example #2
0
File: train.py — Project: mfaruqui/vpyp
def main():
    """CLI entry point: train a Bayesian word-alignment model.

    Reads a parallel corpus, builds source/target vocabularies, constructs
    the translation base distribution (uniform, char-LM, or PYP over a
    char-LM), runs the Gibbs sampler, optionally pickles the trained model,
    and prints the final alignments one sentence per line.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Train alignment model')
    parser.add_argument('--train', help='training corpus', required=True)
    parser.add_argument('--iter', help='number of iterations', type=int, required=True)
    parser.add_argument('--charlm', help='character language model')
    parser.add_argument('--pyp', help='G_w^0 is PYP(CharLM)', action='store_true')
    parser.add_argument('--output', help='model output path')
    parser.add_argument('--reverse', help='train model in reverse direction (but output f-e)', 
            action='store_true')

    args = parser.parse_args()

    source_vocabulary = Vocabulary()
    # Touch NULL first so the null word occupies a fixed slot (presumably
    # index 0) in the source vocabulary — TODO confirm against Vocabulary.
    source_vocabulary[NULL]
    target_vocabulary = Vocabulary()

    logging.info('Reading parallel training data')
    with open(args.train) as train:
        training_corpus = read_parallel_corpus(train, source_vocabulary, target_vocabulary,
                args.reverse)

    # Choose the base distribution G^0 for target words.
    if args.charlm:
        logging.info('Preloading character language model')
        if args.charlm == 'pu':
            # Sentinel value 'pu' selects the Poisson-uniform char model
            # instead of loading an LM from a file path.
            char_lm = PoissonUniformCharLM(target_vocabulary)
        else:
            char_lm = CharLM(args.charlm, target_vocabulary)
        if args.pyp:
            t_base = PYP(char_lm, PYPPrior(1.0, 1.0, 1.0, 1.0, 0.1, 1.0))
        else:
            t_base = char_lm
    else:
        t_base = Uniform(len(target_vocabulary))
    model = AlignmentModel(len(source_vocabulary), t_base)

    logging.info('Training alignment model')
    alignments = run_sampler(model, training_corpus, args.iter)

    if args.output:
        # BUGFIX: protocol=-1 selects the highest (binary) pickle protocol,
        # so the file must be opened in binary mode; text mode ('w')
        # corrupts the dump on platforms that translate newlines.
        with open(args.output, 'wb') as f:
            # Attach the vocabularies so the pickled model is self-contained.
            model.source_vocabulary = source_vocabulary
            model.target_vocabulary = target_vocabulary
            cPickle.dump(model, f, protocol=-1)

    # Emit alignments in "f-e" order regardless of training direction.
    fmt = ('{e}-{f}' if args.reverse else '{f}-{e}')
    for a, (f, e) in izip(alignments, training_corpus):
        # j == 0 is the NULL alignment and is skipped; j-1 undoes the
        # NULL offset in the source sentence indexing.
        print(' '.join(fmt.format(f=j-1, e=i) for i, j in enumerate(a) if j > 0))