Example #1
# Standard-library imports assumed by this snippet; Datasets, de_bpe and
# get_bleu are project-level helpers defined elsewhere in the source tree.
import argparse
import operator
import os
import subprocess


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        'model_prefix',
        nargs='?',
        default='model/complete/enfr',
        help='The prefix of the NMT model path, default is "%(default)s"')
    parser.add_argument(
        '--start',
        action="store",
        metavar="index",
        dest="start",
        type=int,
        default=1,
        help='The starting index of saved model to test, default is %(default)s'
    )
    parser.add_argument(
        '--end',
        action="store",
        metavar="index",
        dest="end",
        type=int,
        default=10,
        help='The ending index of saved model to test, default is %(default)s')
    parser.add_argument(
        '--gap',
        action="store",
        metavar="index",
        dest="interval",
        type=int,
        default=10000,
        help="The interval between two consecutive tested models' indexes, "
        'default is %(default)s')
    parser.add_argument('--result',
                        action='store',
                        metavar='filename',
                        dest='result_file',
                        type=str,
                        default='trans_result.tsv',
                        help='The translation result file, default is %(default)s')
    parser.add_argument('--beam',
                        action="store",
                        metavar="beam_size",
                        dest="beam_size",
                        type=int,
                        default=4,
                        help='The beam size for translation, default is %(default)s')
    parser.add_argument('--dataset',
                        action='store',
                        dest='dataset',
                        default='en-fr_bpe',
                        help='Dataset, default is "%(default)s"')

    args = parser.parse_args()

    # If no result file was given, derive one from the model name and beam size.
    if args.result_file == 'trans_result.tsv':
        model_file_name = os.path.split(args.model_prefix)[-1]
        args.result_file = './translated/complete/{}_bs{}.txt'.format(
            os.path.splitext(model_file_name)[0], args.beam_size)
    else:
        model_file_name = os.path.split(args.result_file)[-1]

    print args

    bleus = {}
    train1, train2, small1, small2, dev1, dev2, dev3, test1, test2, dic1, dic2 = Datasets[
        args.dataset]

    # Translate each saved checkpoint in turn and collect its BLEU score.
    for idx in xrange(args.start, args.end + 1):
        trans_model_file = '%s.iter%d.npz' % (os.path.splitext(
            args.model_prefix)[0], idx * args.interval)
        trans_result_file = '%s.iter%d.txt' % (os.path.splitext(
            args.result_file)[0], idx * args.interval)

        if not os.path.exists(trans_result_file):
            exec_str = 'python translate_single.py -b 32 -k {} -p 1 -n {} {} {} {} {}\n'.format(
                args.beam_size, trans_model_file, './data/dic/{}'.format(dic1),
                './data/dic/{}'.format(dic2), './data/test/{}'.format(test1),
                trans_result_file)
            print 'Translate model {} '.format(trans_model_file)
            print exec_str
            pl_output = subprocess.Popen(exec_str,
                                         shell=True,
                                         stdout=subprocess.PIPE).stdout.read()

        if 'tc' in args.dataset:  # first de-truecase, then de-bpe
            exec_str = 'perl scripts/moses/detruecase.perl < {} > {}.detc'.format(
                trans_result_file, trans_result_file)
            pl_output = subprocess.Popen(exec_str,
                                         shell=True,
                                         stdout=subprocess.PIPE).stdout.read()
            trans_result_file = '{}.detc'.format(trans_result_file)

        if 'bpe' in args.dataset:
            with open('{}.bpe'.format(trans_result_file), 'w') as fout:
                fout.write(de_bpe(open(trans_result_file, 'r').read()))
            trans_result_file = '{}.bpe'.format(trans_result_file)

        bleus[idx] = get_bleu('./data/test/{}'.format(test2),
                              trans_result_file)

        print 'model %s, bleu %.2f' % (idx * args.interval, bleus[idx])

    args.result_file = './translated/complete/{}_s{}_e{}.txt'.format(
        os.path.splitext(model_file_name)[0], args.start, args.end)
    bleu_array = sorted(bleus.items(),
                        key=operator.itemgetter(0),
                        reverse=False)
    with open(args.result_file, 'w') as fout:
        fout.write('\n'.join(
            [str(idx) + '\t' + str(score) for (idx, score) in bleu_array]))
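
All three examples rely on the project helper de_bpe, which is not shown here. A minimal sketch of what it likely does, assuming the standard subword-nmt convention of marking subword continuations with "@@ " (the behavior is inferred, not confirmed by the source):

import re


def de_bpe(text):
    # Hypothetical reconstruction: strip the "@@ " continuation markers that
    # subword-nmt inserts, merging BPE subword pieces back into full words.
    return re.sub(r'(@@ )|(@@ ?$)', '', text, flags=re.M)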
Example #2
# Assumed imports for this snippet; load_options_test, build_and_init_model,
# translate_whole, chosen_by_len_alpha, seqs2words, de_tc, de_bpe and get_bleu
# are project-level helpers defined elsewhere.
import cPickle as pkl

import numpy as np
import theano


def main(model,
         dictionary,
         dictionary_target,
         source_file,
         saveto,
         k=5,
         alpha=0,
         normalize=False,
         chr_level=False,
         batch_size=1,
         zhen=False,
         src_trg_table_path=None,
         search_all_alphas=False,
         ref_file=None,
         dump_all=False,
         args=None):
    # This script only supports batched decoding (batch_size > 1).
    batch_mode = batch_size > 1
    assert batch_mode

    # load model options
    options = load_options_test(model)

    src_trg_table = None
    if src_trg_table_path:
        with open(src_trg_table_path, 'rb') as f:
            src_trg_table = pkl.load(f)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    # args defaults to None, so guard the attribute access.
    model_type = 'NMTModel'
    if args is not None and args.trg_attention:
        model_type = 'TrgAttnNMTModel'

    model, _ = build_and_init_model(model,
                                    options=options,
                                    build=False,
                                    model_type=model_type)

    f_init, f_next = model.build_sampler(trng=trng,
                                         use_noise=use_noise,
                                         batch_mode=batch_mode,
                                         dropout=options['use_dropout'],
                                         need_srcattn=zhen)

    trans, all_cand_ids, all_cand_trans, all_scores, word_idic_tgt = translate_whole(
        model,
        f_init,
        f_next,
        trng,
        dictionary,
        dictionary_target,
        source_file,
        k,
        normalize,
        alpha=alpha,
        src_trg_table=src_trg_table,
        zhen=zhen,
        n_words_src=options['n_words_src'],
        echo=True,
        batch_size=batch_size)

    if search_all_alphas:
        # Sweep the length-normalization alpha over 0.0, 0.1, ..., 1.0 and
        # report BLEU for each setting.
        all_alpha_values = 0.1 * np.array(xrange(11))
        for alpha_v in all_alpha_values:
            trans_ids = []
            for samples, sample_scores in zip(all_cand_ids, all_scores):
                trans_ids.append(samples[chosen_by_len_alpha(
                    samples, sample_scores, alpha_v)])
            trans_strs = '\n'.join(seqs2words(trans_ids, word_idic_tgt))

            if 'tc' in source_file:
                trans_strs = de_tc(trans_strs)

            if 'bpe' in source_file:
                trans_strs = de_bpe(trans_strs)
            print 'alpha %.2f, bleu %.2f' % (
                alpha_v, get_bleu(ref_file, trans_strs, type_in='string'))
    else:
        with open(saveto, 'w') as f:
            print >> f, '\n'.join(trans)
        if dump_all:
            saveto_dump_all = '%s.all_beam%d' % (saveto, k)
            with open(saveto_dump_all, 'w') as f:
                print >> f, '\n'.join(all_cand_trans)
    print 'Done'
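
Example #2 leans on chosen_by_len_alpha to pick a beam candidate under a given length penalty. A minimal sketch, assuming the usual normalization where each candidate's cost is divided by its length raised to alpha (lower normalized cost wins); the exact definition in the project may differ:

import numpy as np


def chosen_by_len_alpha(samples, sample_scores, alpha):
    # Hypothetical reconstruction: index of the candidate with the lowest
    # cost after length normalization by len(sample) ** alpha.
    lengths = np.array([len(s) for s in samples], dtype='float32')
    return int(np.argmin(np.array(sample_scores) / lengths ** alpha))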
Example #3
# Standard-library imports assumed by this snippet; Datasets, _load_data,
# replace_unk, de_bpe and get_bleu are project-level helpers defined elsewhere.
import argparse


def main():
    parser = argparse.ArgumentParser(
        description='Replace UNK in the translated file, and get BLEU.')
    parser.add_argument('model', help='The model path')
    parser.add_argument('translated_file', help='The translated file with UNK')
    parser.add_argument(
        'table',
        nargs='?',
        default='./data/dic/fastAlign_en2fr.pkl',
        help='Source-Target table path, default is %(default)s')
    parser.add_argument('--dataset',
                        action='store',
                        dest='dataset',
                        default='en-fr_bpe',
                        help='Dataset, default is "%(default)s"')
    parser.add_argument('--nbest',
                        action="store",
                        metavar="N",
                        dest="nbest",
                        type=int,
                        default=1,
                        help='number of best, default is %(default)s')
    parser.add_argument('-B',
                        action='store_false',
                        default=True,
                        dest='bleu',
                        help='Disable BLEU computation (computed by default)')
    parser.add_argument(
        '-d',
        '--dump',
        action='store_true',
        default=False,
        dest='dump',
        help='Dump the translated file with UNKs replaced (off by default)')

    args = parser.parse_args()

    print 'model: {}, translated file: {}'.format(args.model,
                                                  args.translated_file)

    train1, train2, small1, small2, valid1, valid2, test1, test2, dic1, dic2 = Datasets[
        args.dataset]

    options, src_sents_num, trans_sents_num, src_sents_str, trans_sents_str, src_tgt_table = _load_data(
        args,
        './data/dic/{}'.format(dic1),
        './data/dic/{}'.format(dic2),
        './data/test/{}'.format(test1),
    )

    replace_unk(args, src_sents_num, trans_sents_num, src_sents_str,
                trans_sents_str, src_tgt_table)

    translated_string = '\n'.join(' '.join(w for w in s)
                                  for s in trans_sents_str) + '\n'

    postfix = '.nounk'

    if 'bpe' in args.dataset:
        translated_string = de_bpe(translated_string)
        postfix = '.bpe' + postfix

    if args.dump:
        with open('{}{}'.format(args.translated_file, postfix), 'w') as f:
            # Trailing comma: translated_string already ends with a newline.
            print >> f, translated_string,

    if args.bleu:
        bleu = get_bleu(
            './data/test/{}'.format(test2),
            translated_string,
            type_in='string',
        )

        print 'BLEU: {:.2f}'.format(bleu)
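
get_bleu appears in all three examples, taking a reference file plus either a translation file or a raw string (type_in='string'). A minimal sketch, assuming it wraps Moses' multi-bleu.perl; the scripts/moses/ path mirrors the detruecase.perl call in Example #1 and is an assumption, as is the exact signature:

import re
import subprocess


def get_bleu(ref_file, trans, type_in='filename'):
    # Hypothetical reconstruction: score against ref_file with multi-bleu.perl
    # and parse the leading "BLEU = xx.xx" figure from its output.
    cmd = 'perl scripts/moses/multi-bleu.perl {}'.format(ref_file)
    if type_in == 'string':
        out = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE).communicate(trans)[0]
    else:
        out = subprocess.Popen('{} < {}'.format(cmd, trans), shell=True,
                               stdout=subprocess.PIPE).communicate()[0]
    return float(re.search(r'BLEU = ([\d.]+)', out).group(1))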