# Example 1
def translate_model_single(input_, model_name, options, k, normalize):
    """Translate a single prepared input with a freshly built sampler.

    Initializes the model named *model_name* (parameters only, no
    training graph), builds its sampling functions, and delegates to
    ``translate`` with beam width *k* and optional length normalization.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    # Load parameters without constructing the full computation graph.
    nmt_model, _ = build_and_init_model(model_name, options=options,
                                        build=False)

    # Fixed seed keeps sampling reproducible; the noise switch stays off.
    rng = RandomStreams(1234)
    noise_switch = theano.shared(np.float32(0.))

    f_init, f_next = nmt_model.build_sampler(trng=rng,
                                             use_noise=noise_switch)

    return translate(input_, nmt_model, f_init, f_next, rng, k, normalize)
# Example 2
def replace_unk(args, seq_source, seq_trans, src_sents, trans_sents,
                src_tgt_table):
    """Replace UNK tokens in *trans_sents* in place via attention alignment.

    Builds the trained model's decoder-attention function, then processes
    the corpus in mini-batches of 80. Each target position holding token
    id 1 (presumably the UNK id -- confirm against the vocabulary) is
    replaced by the source word that received maximal attention, mapped
    through *src_tgt_table* when an entry exists; otherwise the source
    word itself is copied.
    """
    print 'Load and build models...',
    model, _, (trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, context_mean) = \
        build_and_init_model(args.model, build=True)
    # Compiled function returning the decoder attention weights.
    f_get_attention = theano.function([x, x_mask, y, y_mask],
                                      opt_ret['dec_alphas'])
    print 'Done'

    print 'Start to calculate the scores...'

    current_id = 0
    batch_size = 80

    while True:
        # Slice the next mini-batch of id sequences and raw token lists.
        block_x = seq_source[current_id * batch_size:(current_id + 1) *
                             batch_size]
        block_y = seq_trans[current_id * batch_size:(current_id + 1) *
                            batch_size]
        block_x_str = src_sents[current_id * batch_size:(current_id + 1) *
                                batch_size]
        block_y_str = trans_sents[current_id * batch_size:(current_id + 1) *
                                  batch_size]

        if len(block_x) == 0:
            break

        # NOTE(review): this rebinding shadows the symbolic x/x_mask/y/y_mask
        # unpacked above -- harmless since f_get_attention is already compiled.
        x, x_mask, y, y_mask = prepare_data(block_x, block_y)
        attn_score_ = f_get_attention(x, x_mask, y, y_mask)
        # Per target step, the source index with the highest attention.
        # Assumes dec_alphas is (tgt_len, batch, src_len) -- TODO confirm.
        srcWordsByAttn = attn_score_.argmax(axis=2)
        for idx, (sentx, senty, strx, stry) in enumerate(
                zip(block_x, block_y, block_x_str, block_y_str)):
            attn_mapping = srcWordsByAttn[:, idx]
            # Target positions whose token id equals 1 (UNK).
            unk_pos = np.where(np.array(senty, dtype='int64') == 1)
            badder = 0      # count of UNKs aligned beyond the source length
            end_pos = -1    # first such bad position; truncation point
            for ii in unk_pos[0].tolist():
                srcidx = attn_mapping[ii]
                if srcidx < len(sentx):
                    # Valid alignment: copy the aligned source word,
                    # translated via the table when an entry exists.
                    trans_sents[idx + current_id *
                                batch_size][ii] = src_tgt_table.get(
                                    strx[srcidx], strx[srcidx])
                else:
                    badder += 1
                    if badder == 1:
                        end_pos = ii
                    if badder > 1:
                        # Two out-of-range alignments: treat the tail as
                        # unreliable and truncate at the first bad UNK.
                        trans_sents[idx + current_id * batch_size] = \
                            trans_sents[idx + current_id * batch_size][:end_pos]
                        break

        print 'Minibatch', current_id, ' Done'
        current_id += 1
# Example 3
def main(model,
         dictionary,
         dictionary_target,
         source_file,
         saveto,
         k=5,
         alpha=0,
         normalize=False,
         chr_level=False,
         batch_size=1,
         zhen=False,
         src_trg_table_path=None,
         search_all_alphas=False,
         ref_file=None,
         dump_all=False,
         args=None):
    """Batch-translate *source_file* and either write translations to
    *saveto* or, with *search_all_alphas*, sweep length-penalty alpha
    values 0.0..1.0 and report BLEU against *ref_file*.

    k: beam size; alpha: length-penalty weight; zhen: enables source
    attention output for the zh-en setup; dump_all: also dump every
    beam candidate to '<saveto>.all_beam<k>'.

    NOTE(review): this definition is shadowed by a later `def main`
    in the same module; only the last one bound is callable by name.
    """
    batch_mode = batch_size > 1
    # Batched decoding only; single-sentence mode is unsupported here.
    assert batch_mode

    # load model model_options
    options = load_options_test(model)

    # Optional source->target lexical table (pickled dict).
    src_trg_table = None
    if src_trg_table_path:
        with open(src_trg_table_path, 'rb') as f:
            src_trg_table = pkl.load(f)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)  # fixed seed for reproducible sampling
    use_noise = theano.shared(np.float32(0.))

    # Select the model variant; target-attention when requested via args.
    model_type = 'NMTModel'
    if args.trg_attention:
        model_type = 'TrgAttnNMTModel'

    model, _ = build_and_init_model(model,
                                    options=options,
                                    build=False,
                                    model_type=model_type)

    f_init, f_next = model.build_sampler(trng=trng,
                                         use_noise=use_noise,
                                         batch_mode=batch_mode,
                                         dropout=options['use_dropout'],
                                         need_srcattn=zhen)

    # Decode the whole file; also returns per-sentence beam candidates
    # and scores so alpha can be re-chosen without re-decoding.
    trans, all_cand_ids, all_cand_trans, all_scores, word_idic_tgt = translate_whole(
        model,
        f_init,
        f_next,
        trng,
        dictionary,
        dictionary_target,
        source_file,
        k,
        normalize,
        alpha=alpha,
        src_trg_table=src_trg_table,
        zhen=zhen,
        n_words_src=options['n_words_src'],
        echo=True,
        batch_size=batch_size)

    if search_all_alphas:
        # Re-rank the cached beam candidates for alpha in {0.0, ..., 1.0}.
        all_alpha_values = 0.1 * np.array(xrange(11))
        for alpha_v in all_alpha_values:
            trans_ids = []
            for samples, sample_scores in zip(all_cand_ids, all_scores):
                trans_ids.append(samples[chosen_by_len_alpha(
                    samples, sample_scores, alpha_v)])
            trans_strs = '\n'.join(seqs2words(trans_ids, word_idic_tgt))

            # Undo truecasing / BPE when the filename indicates they apply.
            if 'tc' in source_file:
                trans_strs = de_tc(trans_strs)

            if 'bpe' in source_file:
                trans_strs = de_bpe(trans_strs)
            print 'alpha %.2f, bleu %.2f' % (
                alpha_v, get_bleu(ref_file, trans_strs, type_in='string'))
    else:
        with open(saveto, 'w') as f:
            print >> f, '\n'.join(trans)
        if dump_all:
            # All beam candidates, one file per beam width.
            saveto_dump_all = '%s.all_beam%d' % (saveto, k)
            with open(saveto_dump_all, 'w') as f:
                print >> f, '\n'.join(all_cand_trans)
    print 'Done'
def main(model,
         dictionary,
         dictionary_target,
         source_file,
         saveto,
         k=5,
         normalize=False,
         chr_level=False,
         batch_size=-1,
         args=None):
    batch_mode = batch_size > 0

    # load model model_options
    option_file = '%s.pkl' % model
    if not os.path.exists(option_file):
        m = re.search("iter(\d+)\.npz", model)
        if m:
            uidx = int(m.group((1)))
            option_file = '%s.iter%d.npz.pkl' % (os.path.splitext(model)[0],
                                                 uidx)
    assert os.path.exists(option_file)

    with open(option_file, 'rb') as f:
        options = DefaultOptions.copy()
        options.update(pkl.load(f))

        if 'fix_dp_bug' not in options:
            options['fix_dp_bug'] = False
        print 'Options:'
        pprint(options)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    model_type = 'NMTModel'
    if args.trg_attention:
        model_type = 'TrgAttnNMTModel'

    model, _ = build_and_init_model(model,
                                    options=options,
                                    build=False,
                                    model_type=model_type)

    f_init, f_next = model.build_sampler(trng=trng,
                                         use_noise=use_noise,
                                         batch_mode=batch_mode,
                                         dropout=options['use_dropout'])

    if not batch_mode:
        word_dict, word_idict, word_idict_trg, input_ = load_translate_data(
            dictionary,
            dictionary_target,
            source_file,
            batch_mode=False,
            chr_level=chr_level,
            options=options,
        )

        print 'Translating ', source_file, '...'
        trans = seqs2words(
            translate(input_, model, f_init, f_next, trng, k, normalize),
            word_idict_trg,
        )
    else:
        word_dict, word_idict, word_idict_trg, all_src_blocks, m_block = load_translate_data(
            dictionary,
            dictionary_target,
            source_file,
            batch_mode=True,
            chr_level=chr_level,
            n_words_src=options['n_words_src'],
            batch_size=batch_size,
        )

        print 'Translating ', source_file, '...'
        all_sample = []
        for bidx, seqs in enumerate(all_src_blocks):
            all_sample.extend(
                translate_block(seqs, model, f_init, f_next, trng, k))
            print bidx, '/', m_block, 'Done'

        trans = seqs2words(all_sample, word_idict_trg)

    with open(saveto, 'w') as f:
        print >> f, '\n'.join(trans)
    print 'Done'
def get_gate_weights(model_name,
                     dictionary,
                     dictionary_target,
                     source_file,
                     args,
                     k=5,
                     normalize=False,
                     chr_level=False):
    """Translate up to args.test_number sentences and collect gate weights.

    Returns a list of dicts, one per input line, containing the input,
    its translation, and the gate values captured during sampling
    (kw_ret). When args.encoder is set, delegates to
    ``get_encoder_gate_weights`` instead.
    """
    options = load_options(model_name)

    word_dict, word_idict, word_idict_trg = load_translate_data(
        dictionary,
        dictionary_target,
        source_file,
        batch_mode=False,
        chr_level=chr_level,
        load_input=False)

    inputs = []
    lines = []

    print 'Loading input...',
    with open(source_file, 'r') as f:
        for idx, line in enumerate(f):
            # Only the first args.test_number lines are processed.
            if idx >= args.test_number:
                break

            lines.append(line)
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            # Map to ids; 1 stands in for OOV words and for ids beyond
            # the source vocabulary size (presumably the UNK id).
            x = [word_dict[w] if w in word_dict else 1 for w in words]
            x = [ii if ii < options['n_words_src'] else 1 for ii in x]
            x.append(0)  # sentence-end marker (id 0)

            inputs.append(x)
    print 'Done'

    print 'Building model...',
    model, _ = build_and_init_model(model_name, options, build=False)
    print 'Done'

    # Encoder-side gates are handled by a dedicated routine.
    if args.encoder:
        return get_encoder_gate_weights(args, model, options, inputs, lines)

    print 'Building sampler...'
    trng = RandomStreams(1234)  # fixed seed for reproducible sampling
    use_noise = theano.shared(np.float32(0.))
    # get_gates=True makes the sampler also return gate activations.
    f_init, f_next = model.build_sampler(
        trng=trng,
        use_noise=use_noise,
        batch_mode=False,
        get_gates=True,
    )
    build_result = model, f_init, f_next, trng
    print 'Done'

    results = []

    for i, src_seq in enumerate(inputs):
        results.append({
            'index': i,
            'input': lines[i].strip(),
            'dim': options['dim'],
            'encoder': False,
        })

        # Translate one sentence; kw_ret carries the captured gate values.
        tgt_seq, kw_ret = translate_sentence(src_seq, build_result, k,
                                             normalize)

        results[-1]['output'] = seq2words(tgt_seq, word_idict_trg)
        results[-1]['kw_ret'] = kw_ret
        results[-1]['n_layers'] = len(kw_ret['input_gates_list'][0])

        print 'Input:', lines[i]
        print 'Output:', results[-1]['output']
        print '=============================='

    return results