def translate_model_single(input_, model_name, options, k, normalize):
    """Load one saved model, build its beam-search sampler, and translate.

    :param input_: prepared source word-id sequences to translate
    :param model_name: path of the saved model to load
    :param options: model options dict used when building the model
    :param k: beam size for the sampler
    :param normalize: whether to length-normalize beam scores
    :return: whatever ``translate`` returns for ``input_``
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    rng = RandomStreams(1234)
    noise_flag = theano.shared(np.float32(0.))
    # build=False: only parameters are loaded; the sampler graph below is all we need.
    model, _ = build_and_init_model(model_name, options=options, build=False)
    f_init, f_next = model.build_sampler(trng=rng, use_noise=noise_flag)
    return translate(input_, model, f_init, f_next, rng, k, normalize)
def replace_unk(args, seq_source, seq_trans, src_sents, trans_sents, src_tgt_table):
    """Replace UNK tokens in translations using the model's attention weights.

    For every UNK in a translated sentence, the source word receiving the
    highest attention at that target step is looked up in ``src_tgt_table``
    (falling back to copying the source word itself) and substituted in place.
    ``trans_sents`` is mutated in place.

    :param args: parsed CLI args; only ``args.model`` is read here
    :param seq_source: source sentences as word-id sequences
    :param seq_trans: translated sentences as word-id sequences
    :param src_sents: source sentences as token string lists
    :param trans_sents: translated sentences as token string lists (mutated)
    :param src_tgt_table: source-word -> target-word dictionary for replacement
    """
    print 'Load and build models...',
    model, _, (trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, context_mean) = \
        build_and_init_model(args.model, build=True)
    # Compiled function returning the decoder attention weights
    # (presumably shaped [tgt_len, batch, src_len] — TODO confirm against opt_ret).
    f_get_attention = theano.function([x, x_mask, y, y_mask], opt_ret['dec_alphas'])
    print 'Done'
    print 'Start to calculate the scores...'
    current_id = 0
    batch_size = 80
    while True:
        # Slice the current minibatch out of all four parallel lists.
        block_x = seq_source[current_id * batch_size:(current_id + 1) * batch_size]
        block_y = seq_trans[current_id * batch_size:(current_id + 1) * batch_size]
        block_x_str = src_sents[current_id * batch_size:(current_id + 1) * batch_size]
        block_y_str = trans_sents[current_id * batch_size:(current_id + 1) * batch_size]
        if len(block_x) == 0:
            break
        x, x_mask, y, y_mask = prepare_data(block_x, block_y)
        attn_score_ = f_get_attention(x, x_mask, y, y_mask)
        # For each target step, index of the most-attended source position.
        srcWordsByAttn = attn_score_.argmax(axis=2)
        for idx, (sentx, senty, strx, stry) in enumerate(
                zip(block_x, block_y, block_x_str, block_y_str)):
            attn_mapping = srcWordsByAttn[:, idx]
            # Positions of UNK tokens (assumes UNK word id is 1 — TODO confirm).
            unk_pos = np.where(np.array(senty, dtype='int64') == 1)
            badder = 0  # count of UNKs whose attention points past the real source
            end_pos = -1  # first such bad position, used for truncation below
            for ii in unk_pos[0].tolist():
                srcidx = attn_mapping[ii]
                if srcidx < len(sentx):
                    # Replace the UNK with the table translation of the attended
                    # source word, or the source word itself if not in the table.
                    trans_sents[idx + current_id * batch_size][ii] = src_tgt_table.get(
                        strx[srcidx], strx[srcidx])
                else:
                    badder += 1
                    if badder == 1:
                        end_pos = ii
                if badder > 1:
                    # Two or more attention targets fell outside the source:
                    # truncate the translation at the first bad position and
                    # stop processing this sentence.
                    trans_sents[idx + current_id * batch_size] = \
                        trans_sents[idx + current_id * batch_size][:end_pos]
                    break
        print 'Minibatch', current_id, ' Done'
        current_id += 1
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         alpha=0, normalize=False, chr_level=False, batch_size=1, zhen=False,
         src_trg_table_path=None, search_all_alphas=False, ref_file=None,
         dump_all=False, args=None):
    """Batched translation driver with optional length-penalty (alpha) search.

    Translates ``source_file`` via ``translate_whole``; then either writes the
    translations to ``saveto`` (optionally dumping all beam candidates), or —
    when ``search_all_alphas`` is set — rescores the cached candidates for
    alpha in {0.0, 0.1, ..., 1.0} and prints the BLEU of each against
    ``ref_file`` instead of writing anything.

    :param model: path of the saved model
    :param dictionary: source vocabulary pickle path
    :param dictionary_target: target vocabulary pickle path
    :param source_file: input text file, one sentence per line
    :param saveto: output path for translations (ignored in alpha-search mode)
    :param k: beam size
    :param alpha: length penalty used for the main translation pass
    :param normalize: length-normalize beam scores
    :param chr_level: character-level input
    :param batch_size: must be > 1 — this driver only supports batch mode
    :param zhen: enables source-attention output (Chinese-English pipeline flag)
    :param src_trg_table_path: optional pickled source->target word table
    :param search_all_alphas: rescore candidates across alpha values (see above)
    :param ref_file: reference file for BLEU in alpha-search mode
    :param dump_all: also dump every beam candidate to ``saveto.all_beamK``
    :param args: parsed CLI args; only ``args.trg_attention`` is read here
    """
    batch_mode = batch_size > 1
    # This entry point only supports batched decoding.
    assert batch_mode

    # load model model_options
    options = load_options_test(model)

    src_trg_table = None
    if src_trg_table_path:
        with open(src_trg_table_path, 'rb') as f:
            src_trg_table = pkl.load(f)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    model_type = 'NMTModel'
    if args.trg_attention:
        model_type = 'TrgAttnNMTModel'

    model, _ = build_and_init_model(model, options=options, build=False,
                                    model_type=model_type)

    f_init, f_next = model.build_sampler(trng=trng, use_noise=use_noise,
                                         batch_mode=batch_mode,
                                         dropout=options['use_dropout'],
                                         need_srcattn=zhen)

    # translate_whole returns both the chosen translations and the full
    # candidate lists/scores so alpha search below can rescore without decoding again.
    trans, all_cand_ids, all_cand_trans, all_scores, word_idic_tgt = translate_whole(
        model, f_init, f_next, trng, dictionary, dictionary_target, source_file,
        k, normalize, alpha=alpha, src_trg_table=src_trg_table, zhen=zhen,
        n_words_src=options['n_words_src'], echo=True, batch_size=batch_size)

    if search_all_alphas:
        # Sweep alpha over 0.0 .. 1.0 in 0.1 steps, re-picking the best
        # candidate per sentence under each length penalty.
        all_alpha_values = 0.1 * np.array(xrange(11))
        for alpha_v in all_alpha_values:
            trans_ids = []
            for samples, sample_scores in zip(all_cand_ids, all_scores):
                trans_ids.append(samples[chosen_by_len_alpha(
                    samples, sample_scores, alpha_v)])
            trans_strs = '\n'.join(seqs2words(trans_ids, word_idic_tgt))
            # Undo truecasing / BPE if the source filename indicates they were used.
            if 'tc' in source_file:
                trans_strs = de_tc(trans_strs)
            if 'bpe' in source_file:
                trans_strs = de_bpe(trans_strs)
            print 'alpha %.2f, bleu %.2f' % (
                alpha_v, get_bleu(ref_file, trans_strs, type_in='string'))
    else:
        with open(saveto, 'w') as f:
            print >> f, '\n'.join(trans)
        if dump_all:
            # Dump every beam candidate alongside the chosen translations.
            saveto_dump_all = '%s.all_beam%d' % (saveto, k)
            with open(saveto_dump_all, 'w') as f:
                print >> f, '\n'.join(all_cand_trans)
    print 'Done'
def main(model, dictionary, dictionary_target, source_file, saveto, k=5, normalize=False, chr_level=False, batch_size=-1, args=None): batch_mode = batch_size > 0 # load model model_options option_file = '%s.pkl' % model if not os.path.exists(option_file): m = re.search("iter(\d+)\.npz", model) if m: uidx = int(m.group((1))) option_file = '%s.iter%d.npz.pkl' % (os.path.splitext(model)[0], uidx) assert os.path.exists(option_file) with open(option_file, 'rb') as f: options = DefaultOptions.copy() options.update(pkl.load(f)) if 'fix_dp_bug' not in options: options['fix_dp_bug'] = False print 'Options:' pprint(options) from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) use_noise = theano.shared(np.float32(0.)) model_type = 'NMTModel' if args.trg_attention: model_type = 'TrgAttnNMTModel' model, _ = build_and_init_model(model, options=options, build=False, model_type=model_type) f_init, f_next = model.build_sampler(trng=trng, use_noise=use_noise, batch_mode=batch_mode, dropout=options['use_dropout']) if not batch_mode: word_dict, word_idict, word_idict_trg, input_ = load_translate_data( dictionary, dictionary_target, source_file, batch_mode=False, chr_level=chr_level, options=options, ) print 'Translating ', source_file, '...' trans = seqs2words( translate(input_, model, f_init, f_next, trng, k, normalize), word_idict_trg, ) else: word_dict, word_idict, word_idict_trg, all_src_blocks, m_block = load_translate_data( dictionary, dictionary_target, source_file, batch_mode=True, chr_level=chr_level, n_words_src=options['n_words_src'], batch_size=batch_size, ) print 'Translating ', source_file, '...' all_sample = [] for bidx, seqs in enumerate(all_src_blocks): all_sample.extend( translate_block(seqs, model, f_init, f_next, trng, k)) print bidx, '/', m_block, 'Done' trans = seqs2words(all_sample, word_idict_trg) with open(saveto, 'w') as f: print >> f, '\n'.join(trans) print 'Done'
def get_gate_weights(model_name, dictionary, dictionary_target, source_file, args, k=5, normalize=False, chr_level=False): options = load_options(model_name) word_dict, word_idict, word_idict_trg = load_translate_data( dictionary, dictionary_target, source_file, batch_mode=False, chr_level=chr_level, load_input=False) inputs = [] lines = [] print 'Loading input...', with open(source_file, 'r') as f: for idx, line in enumerate(f): if idx >= args.test_number: break lines.append(line) if chr_level: words = list(line.decode('utf-8').strip()) else: words = line.strip().split() x = [word_dict[w] if w in word_dict else 1 for w in words] x = [ii if ii < options['n_words_src'] else 1 for ii in x] x.append(0) inputs.append(x) print 'Done' print 'Building model...', model, _ = build_and_init_model(model_name, options, build=False) print 'Done' if args.encoder: return get_encoder_gate_weights(args, model, options, inputs, lines) print 'Building sampler...' trng = RandomStreams(1234) use_noise = theano.shared(np.float32(0.)) f_init, f_next = model.build_sampler( trng=trng, use_noise=use_noise, batch_mode=False, get_gates=True, ) build_result = model, f_init, f_next, trng print 'Done' results = [] for i, src_seq in enumerate(inputs): results.append({ 'index': i, 'input': lines[i].strip(), 'dim': options['dim'], 'encoder': False, }) tgt_seq, kw_ret = translate_sentence(src_seq, build_result, k, normalize) results[-1]['output'] = seq2words(tgt_seq, word_idict_trg) results[-1]['kw_ret'] = kw_ret results[-1]['n_layers'] = len(kw_ret['input_gates_list'][0]) print 'Input:', lines[i] print 'Output:', results[-1]['output'] print '==============================' return results