def encode_model(queue, rqueue, pid, model, options):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _encode(seq):
        # encode the source sentence
        code = f_init(numpy.array(seq).reshape([len(seq), 1]))[1]
        return code

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print(pid, '-', idx)
        cod = _encode(x)
        rqueue.put((idx, cod))

    return
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose,
                    nbest, return_alignment, suppress_unk):
    from nmt import (build_sampler, gen_sample, load_params,
                     init_params, init_tparams)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=return_alignment)
        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        input = [numpy.array(s).T.reshape([len(s[0]), len(s), 1]) for s in seq]
        sample, score, word_probs, alignment = gen_sample(
            fs_init, fs_next, input,
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if nbest:
            return sample, score, word_probs, alignment
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))

        seq = _translate(x)

        rqueue.put((idx, seq))

    return
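# --- Illustrative sketch, not part of the original code: one plausible way to
# drive a worker like translate_model above with multiprocessing queues. The
# worker's protocol is inferred from the loop above: requests are
# (index, sentence) tuples and a None sentinel stops each process. The names
# `models`, `options`, `sentences` and the flag values are assumptions.
def _example_driver(models, options, sentences, n_process=2, k=5):
    from multiprocessing import Process, Queue

    queue = Queue()
    rqueue = Queue()
    processes = []
    for pid in range(n_process):
        p = Process(target=translate_model,
                    args=(queue, rqueue, pid, models, options, k,
                          True,    # normalize
                          False,   # verbose
                          False,   # nbest
                          False,   # return_alignment
                          False))  # suppress_unk
        p.start()
        processes.append(p)

    # feed jobs: each request is (index, encoded source sentence)
    for idx, x in enumerate(sentences):
        queue.put((idx, x))
    # one sentinel per worker so every process exits its loop
    for _ in range(n_process):
        queue.put(None)

    # collect results and restore the original order
    results = [None] * len(sentences)
    for _ in range(len(sentences)):
        idx, seq = rqueue.get()
        results[idx] = seq
    for p in processes:
        p.join()
    return results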
def encode_model(queue, rqueue, pid, model, options):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _encode(seq):
        # encode the source sentence
        code = f_init(numpy.array(seq).reshape([len(seq), 1]))[1]
        return code

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        cod = _encode(x)
        rqueue.put((idx, cod))

    return
def encode_model(queue, rqueue, pid, model, options):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    f_init, f_next = build_sampler(tparams, options, trng)

    def _encode(seq):
        code = f_init(numpy.array(seq).reshape([len(seq), 1]))[1]
        return code

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        cod = _encode(x)
        rqueue.put((idx, cod))

    return
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose,
                    nbest, return_alignment, suppress_unk):
    from nmt import (build_sampler, gen_sample, load_params,
                     init_params, init_tparams)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=return_alignment)
        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment = gen_sample(
            fs_init, fs_next,
            numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if nbest:
            return sample, score, word_probs, alignment
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))

        seq = _translate(x)

        rqueue.put((idx, seq))

    return
def rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose):
    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        f_log_probs = theano.function(inps, cost)
        fs_log_probs.append(f_log_probs)

    def _score(pairs):
        # sample given an input sequence and obtain scores
        scores = []
        for i, f_log_probs in enumerate(fs_log_probs):
            scores.append(pred_probs(f_log_probs, prepare_data, options[i],
                                     pairs, normalize=normalize))
        return scores

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, \
            tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out:
        for line in nbest_lines:
            linesplit = line.split(' ||| ')
            idx = int(linesplit[0])
            tmp_in.write(lines[idx])
            tmp_out.write(linesplit[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)

        pairs = TextIterator(tmp_in.name, tmp_out.name,
                             options[0]['dictionaries'][0],
                             options[0]['dictionaries'][1],
                             n_words_source=options[0]['n_words_src'],
                             n_words_target=options[0]['n_words'],
                             batch_size=b,
                             maxlen=float('inf'),
                             sort_by_length=False)
        # TODO: sorting by length could be more efficient,
        # but we'd have to synchronize scores with n-best list after

        scores = _score(pairs)

        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str, [s[i] for s in scores]))
            saveto.write('{0} {1}\n'.format(line.strip(), score_str))
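# --- Illustrative note (assumption, not from the original code): rescore_model
# above appears to expect a Moses-style n-best file, one hypothesis per line of
# the form
#     <source index> ||| <hypothesis> ||| <feature scores ...>
# A minimal parse of one such line, mirroring the split done above:
def _parse_nbest_line(line):
    fields = line.split(' ||| ')
    idx = int(fields[0])    # 0-based index into the source file
    hypothesis = fields[1]  # translation hypothesis to be rescored
    return idx, hypothesis

# e.g. _parse_nbest_line('3 ||| das ist ein Test ||| 0.5') -> (3, 'das ist ein Test')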
def translate_model(queue, rqueue, mask_left, mask_right, write_mask, eots,
                    model, options, k, normalize):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    # params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model)  # , params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq, left, right, write, eot):
        # sample given an input sequence and obtain scores
        print left.shape, right.shape, write.shape, len(seq)
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   left[:, :, None], right[:, :, None],
                                   write, eot[:, None],
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        sidx = numpy.argmin(score)
        return sample[sidx]

    for idx, [x, l, r, w, eot] in enumerate(
            zip(queue, mask_left, mask_right, write_mask, eots)):
        # req = queue.get()
        if x is None:
            break

        print idx
        seq = _translate(x, l, r, w, eot)
        rqueue.append(seq)

    return
def translate_model(queue, rqueue, pid, model, options, k, normalize, n_best):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(
            tparams, f_init, f_next,
            numpy.array(seq).reshape([len(seq), 1]),
            options, trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
        )

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        if n_best > 1:
            sidx = numpy.argsort(score)[:n_best]
        else:
            sidx = numpy.argmin(score)
        return numpy.array(sample)[sidx], numpy.array(score)[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, "-", idx
        seq, scores = _translate(x)

        rqueue.put((idx, seq, scores))

    return
def translate_model(queue, rqueue, pid, model, options, k, normalize, n_best):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        if n_best > 1:
            sidx = numpy.argsort(score)[:n_best]
        else:
            sidx = numpy.argmin(score)
        return numpy.array(sample)[sidx], numpy.array(score)[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        seq, scores = _translate(x)

        rqueue.put((idx, seq, scores))

    return
def __init__(self, trained_model):
    # load model model_options
    with open('%s.pkl' % trained_model, 'rb') as f:
        self.options = pkl.load(f)
    logging.info(self.options)

    src_dict = os.path.join(self.options['baseDir'],
                            self.options['dictionaries'][0])
    if len(self.options['dictionaries']) == 1:
        target_dict = None
    else:
        target_dict = os.path.join(self.options['baseDir'],
                                   self.options['dictionaries'][1])

    # load source dictionary and invert
    with open(src_dict, 'rb') as f:
        self.word_dict = pkl.load(f)
    self.word_idict = dict()
    for kk, vv in self.word_dict.iteritems():
        self.word_idict[vv] = kk
    self.word_idict[0] = 'EOS'
    self.word_idict[1] = 'UNK'

    # load target dictionary and invert
    if target_dict is None:
        self.word_dict_trg = self.word_dict
        self.word_idict_trg = self.word_idict
    else:
        with open(target_dict, 'rb') as f:
            self.word_dict_trg = pkl.load(f)
        self.word_idict_trg = dict()
        for kk, vv in self.word_dict_trg.iteritems():
            self.word_idict_trg[vv] = kk
        self.word_idict_trg[0] = 'EOS'
        self.word_idict_trg[1] = 'UNK'

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    self.trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(self.options)

    # load model parameters and set theano shared variables
    self.params = load_params(trained_model, params)
    self.tparams = init_tparams(params)

    # word index
    use_noise = theano.shared(numpy.float32(0.))
    self.f_init, self.f_next = build_sampler(self.tparams, self.options,
                                             self.trng, use_noise)
def translate_model(queue, rqueue, pid, model, options, k, normalize,
                    annotations_only):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    # f_init outs are [init_state (to decoder), ctx (from encoder)]
    # f_next outs are [next_probs, next_sample, next_state] (decoder)
    f_init, f_next = build_sampler(tparams, options, trng, annotations_only)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        if annotations_only:
            next_state, ctx = f_init(numpy.array(seq).reshape([len(seq), 1]))
            return ctx
        else:
            sample, score = gen_sample(tparams, f_init, f_next,
                                       numpy.array(seq).reshape([len(seq), 1]),
                                       options, trng=trng, k=k, maxlen=200,
                                       stochastic=False, argmax=False)

            # normalize scores according to sequence lengths
            if normalize:
                lengths = numpy.array([len(s) for s in sample])
                score = score / lengths

            sidx = numpy.argmin(score)
            return sample[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
def translate_model(queue, rqueue, pid, model, options, k, normalize):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    maxlen = 150
    options['maxlen'] = maxlen
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(seq):
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=maxlen,
                                   stochastic=False)
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        seq = _translate(x)
        rqueue.put((idx, seq))

    return
def get_error(model, test_src, test_target):
    profile = False

    # reload options
    f = open('%s.pkl' % model, 'rb')
    model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)

    # reload parameters
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    dict_src = os.path.join(model_options['baseDir'],
                            model_options['dictionaries'][0])
    if len(model_options['dictionaries']) == 1:
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'],
                                   model_options['dictionaries'][1])

    valid = TextIterator(test_src, test_target,
                         dict_src, dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)

    valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid)
    valid_err = valid_errs.mean()

    logging.info('Valid Error:%s' % (str(valid_err)))
def build_alignment_cg(model, options):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build model
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask]

    # compile a function and return it
    return theano.function(inps, opt_ret['dec_alphas'])
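# --- Illustrative usage sketch (assumption, not from the original code): the
# compiled function returned by build_alignment_cg maps a prepared minibatch
# (x, x_mask, y, y_mask) to the decoder attention weights 'dec_alphas'.
# `prepare_data` and `text_iterator` are assumed helpers from the same codebase.
def _example_alignments(model, options, text_iterator, prepare_data):
    f_align = build_alignment_cg(model, options)
    alignments = []
    for x, y in text_iterator:
        x, x_mask, y, y_mask = prepare_data(x, y)
        # one attention distribution over source positions per target word;
        # in this family of models the shape is typically
        # (target_len, batch, source_len)
        alignments.append(f_align(x, x_mask, y, y_mask))
    return alignments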
def sample(model, dictionary, dictionary_target,
           source_file, ref_file, saveto,
           k=10, normalize=False,
           bleu_script='./data/mteval-v11b.pl', res_to_sgm='./data/plain2sgm'):
    # load model model_options
    with open(model + '.pkl', 'rb') as f:
        options = pkl.load(f)

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk

    val_start_time = time.time()

    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    bleu_score = gen_trans(test_src=source_file, test_ref=ref_file,
                           out_file=saveto,
                           dict_src=dictionary, idict_trg=word_idict_trg,
                           tparams=tparams, f_init=f_init, f_next=f_next,
                           model_options=options,
                           trng=trng, k=10, stochastic=False)

    print(model + ' / ' + source_file + ' / ' + 'test bleu %.4f' % bleu_score)
    print('timestamp {} {}'.format(
        'done', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    sys.stdout.flush()
def translate_model(queue, model, options, k, normalize, d_maxlen=200):
    # random stream and dropout switch required by build_sampler / gen_sample
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=d_maxlen,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        sidx = numpy.argmin(score)
        return sample[sidx]

    rqueue = []
    for req in queue:
        idx, x = req[0], req[1]
        print 'translate-', idx
        seq = _translate(x)
        rqueue.append(seq)

    return rqueue
def main(model, pklmodel, dictionary, dictionary_target, dictionary_chunk,
         source_file, target_file, saveto, ck=5, wk=5, k=20,
         normalize=False, n_process=5, chr_level=False, jointProb=False,
         show_boundary=False):
    print 'load model model_options'
    with open('%s' % pklmodel, 'rb') as f:
        options = pkl.load(f)

    print 'load source dictionary and invert'
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'load target dictionary and invert'
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # dict for chunk label
    worddict_chunk = [None]
    worddict_r_chunk = [None]
    with open(dictionary_chunk, 'rb') as f:
        worddict_chunk = pkl.load(f)
        worddict_r_chunk = dict()
        for kk, vv in worddict_chunk.iteritems():
            worddict_r_chunk[vv] = kk

    def _seqs2wordsByChunk(caps, boundary, chunk, dictionary):
        capsw = []
        for cc, bb, ch in zip(caps, boundary, chunk):
            if cc == 0:
                continue
            if cc < 0:
                continue
            if bb == 0:
                capsw[-1] = capsw[-1] + "_" + (dictionary[cc])
            else:
                capsw.append(dictionary[cc])
        return capsw

    # output in the chunk format:
    # w1, POS, chunk_boundary-chunk_tag
    def _seqs2wordsByChunkFormat(caps, boundary, chunk, dictionary, chunk_dic):
        capsw = []
        current_tag = ''
        for cc, bb, ch in zip(caps, boundary, chunk):
            if cc == 0:
                continue
            if cc < 0:
                continue
            if bb == 0:
                capsw.append(dictionary[cc] + ' ' + 'I-' + chunk_dic[ch])
            else:
                capsw.append(dictionary[cc] + ' ' + 'B-' + chunk_dic[ch])
        return capsw

    # utility function
    def _seqs2words(caps, dictionary):
        capsw = []
        ww = []
        for w in caps:
            if w == 0:
                continue
            ww.append(dictionary[w])
        return ww

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    f_align = build_alignment(tparams, options)

    # begin to read by iterators
    train = TrainingTextIterator(source_file, target_file,
                                 dictionary, dictionary_target,
                                 dictionary_chunk,
                                 n_words_source=30000, n_words_target=30000,
                                 batch_size=1,
                                 max_chunk_len=50, max_word_len=10000)

    boundary_right = 0.0
    tag_right = 0.0
    boundary_total = 0.0
    tag_total = 0.0

    for x, y_chunk, y_cw in train:
        x, x_mask, y_c, y_cw, chunk_indicator, y_mask = \
            prepare_training_data(x, y_chunk, y_cw,
                                  maxlen_chunk=100000, maxlen_cw=100000,
                                  n_words_src=30000, n_words=30000)

        align, chunk_tag, chunk_boundary = f_align(x, x_mask, y_c, y_cw,
                                                   y_mask, chunk_indicator)

        x = x.reshape((x.shape[0], ))
        y_cw = y_cw.reshape((y_cw.shape[0], ))
        y_c = y_c.reshape((y_c.shape[0], ))
        chunk_indicator = chunk_indicator.reshape((chunk_indicator.shape[0], ))

        print '\n'.join(
            _seqs2wordsByChunkFormat(numpy.ndarray.tolist(y_cw),
                                     numpy.ndarray.tolist(chunk_boundary),
                                     numpy.ndarray.tolist(chunk_tag),
                                     word_idict_trg, worddict_r_chunk))

        for gold_boundary, gold_chunk_tag, predict_boundary, predict_chunk_tag in zip(
                numpy.ndarray.tolist(chunk_indicator),
                numpy.ndarray.tolist(y_c),
                numpy.ndarray.tolist(chunk_boundary),
                numpy.ndarray.tolist(chunk_tag)):
            boundary_total += 1
            tag_total += 1
            if gold_boundary == predict_boundary:
                boundary_right += 1
            if gold_chunk_tag == predict_chunk_tag:
                tag_right += 1

    print 'boundary prec: ', boundary_right / boundary_total
    print 'tag prec: ', tag_right / tag_total

    print 'Done'
def main(model, bn_model, dictionary_target, fea, latex, saveto, output, k=5):
    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator(fea, latex, worddicts,
                                         batch_size=1, batch_Imagesize=500000,
                                         maxlen=500, maxImagesize=500000)

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)
    bn_params = init_bn_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    bn_params = load_params(bn_model, bn_params)
    tparams = init_tparams(params)
    bn_tparams = init_tparams(bn_params)

    f_init, f_next = build_sampler(tparams, bn_tparams, options, trng,
                                   use_noise)

    use_noise.set_value(0.)

    fpp_sample = open(saveto, 'w')
    valid_count_idx = 0

    # FIXME: random selection?
    print 'Decoding ... '
    for x, y in valid:
        for xx in x:
            print '%d : %s' % (valid_count_idx + 1,
                               valid_uid_list[valid_count_idx])
            xx_pad = numpy.zeros(
                (xx.shape[0], xx.shape[1], xx.shape[2]),
                dtype='float32')  # input_channels * height * width
            xx_pad[:, :, :] = xx / 255.
            stochastic = False
            sample, score = gen_sample(f_init, f_next,
                                       xx_pad[None, :, :, :],
                                       options, trng=trng, k=10, maxlen=1000,
                                       stochastic=stochastic, argmax=False)

            if stochastic:
                ss = sample
            else:
                score = score / numpy.array([len(s) for s in sample])
                ss = sample[score.argmin()]

            fpp_sample.write(valid_uid_list[valid_count_idx])
            valid_count_idx = valid_count_idx + 1
            for vv in ss:
                if vv == 0:  # <eol>
                    break
                fpp_sample.write(' ' + worddicts_r[vv])
            fpp_sample.write('\n')

    fpp_sample.close()
    print 'test set decode done'

    os.system('python compute-wer.py ' + saveto + ' ' + latex + ' ' + output)
    fpp = open(output)  # %WER 31.63
    stuff = fpp.readlines()
    fpp.close()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
def main(model, dictionary_target, source_fea, source_latex, saveto, wer_file,
         k=5):
    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator_valid(source_fea, source_latex,
                                               worddicts, batch_size=1,
                                               maxlen=2000)

    trng = RandomStreams(1234)

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    f_init, f_next = build_sampler(tparams, options, trng)

    fpp_sample = open(saveto, 'w')
    valid_count_idx = 0

    print 'Decoding...'
    ud_epoch = 0
    ud_epoch_start = time.time()
    for x, y in valid:
        for xx in x:
            print '%d : %s' % (valid_count_idx + 1,
                               valid_uid_list[valid_count_idx])
            xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]),
                                 dtype='float32')
            xx_pad[:xx.shape[0], :] = xx
            stochastic = False
            sample, score = gen_sample(f_init, f_next,
                                       xx_pad[:, None, :],
                                       options, trng=trng, k=k, maxlen=1000,
                                       stochastic=stochastic, argmax=False)

            if stochastic:
                ss = sample
            else:
                score = score / numpy.array([len(s) for s in sample])
                ss = sample[score.argmin()]

            fpp_sample.write(valid_uid_list[valid_count_idx])
            valid_count_idx = valid_count_idx + 1
            for vv in ss:
                if vv == 0:  # <eol>
                    break
                fpp_sample.write(' ' + worddicts_r[vv])
            fpp_sample.write('\n')

    fpp_sample.close()
    ud_epoch = (time.time() - ud_epoch_start) / 60.
    print 'test set decode done, cost time ...', ud_epoch

    os.system('python compute-wer.py ' + saveto + ' ' + source_latex + ' ' +
              wer_file)
    fpp = open(wer_file)
    stuff = fpp.readlines()
    fpp.close()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
def rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights):
    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            print "\t*** Save weight mode ON, alignment matrix will be saved."
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            print "\t*** Save weight mode OFF, alignment matrix will not be saved."
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        # (also collect alignments so the two-value unpacking below works)
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score_this_batch, alignment_this_batch = pred_probs(
                f_log_probs, prepare_data, options[i], pairs,
                normalize=normalize, alignweights=alignweights)
            scores.append(score_this_batch)
            alignments.append(alignment_this_batch)
        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    if alignweights:
        # opening the temporary file.
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, \
            tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out:
        for line in nbest_lines:
            linesplit = line.split(' ||| ')
            idx = int(linesplit[0])  # index from the source file. Starting from 0.
            tmp_in.write(lines[idx])
            tmp_out.write(linesplit[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)

        pairs = TextIterator(tmp_in.name, tmp_out.name,
                             options[0]['dictionaries'][0],
                             options[0]['dictionaries'][1],
                             n_words_source=options[0]['n_words_src'],
                             n_words_target=options[0]['n_words'],
                             batch_size=b,
                             maxlen=float('inf'),
                             sort_by_length=False)
        # TODO: sorting by length could be more efficient,
        # but we'd have to synchronize scores with n-best list after

        scores, alignments = _score(pairs, alignweights)

        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str, [s[i] for s in scores]))
            saveto.write('{0} {1}\n'.format(line.strip(), score_str))

        # optional save weights mode.
        if alignweights:
            for line in alignments:
                align_OUT.write(line + "\n")

    if alignweights:
        combine_source_target_text(source_file, nbest_file, saveto.name,
                                   align_OUT)
        align_OUT.close()
def main(model, dictionary, dictionary_target, source_file, reference_file,
         chr_level=False):
    print 'load model model_options'
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    print 'load source dictionary and invert'
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'load target dictionary and invert'
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seqs2sen(seqs, _dict):
        sen = []
        for w in seqs:
            if w == 0:
                continue
            elif w < 0:
                continue
            sen.append(_dict[w])
        return ' '.join(sen)

    def _send_jobs(fname, _dict, _n_words):
        # translate source sentence into source indices
        sourceIndices = []
        source = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                x = map(lambda w: _dict[w] if w in _dict else 1, words)
                x = map(lambda ii: ii if ii < _n_words else 1, x)
                x += [0]
                sourceIndices.append(x)
                source.append(line)
        return sourceIndices, source

    print 'Force Translating ', source_file, '...'

    print 'Prepare data...',
    ret = _send_jobs(source_file, word_dict, options['n_words_src'])
    sourceIndices = ret[0]
    source = ret[1]
    ret_ref = _send_jobs(reference_file, word_dict_trg, options['n_words'])
    targetIndices = ret_ref[0]
    target = ret_ref[1]

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    force_record = build_force_sampler(tparams, options, use_noise)

    def _translate(seq, trg_seq):
        # sample given an input sequence and obtain translated result
        sampleData = force_record(
            numpy.array(seq).reshape([len(seq), 1]),
            numpy.array(trg_seq).reshape([len(trg_seq), 1]))
        alpha_buffer_record = sampleData[0]
        attention_record = sampleData[1]

        if alpha_buffer_record is None:
            buffer_weight = None
        else:
            buffer_weight = alpha_buffer_record.reshape(
                [len(trg_seq), options['buffer_size']])
        if attention_record is None:
            attention = None
        else:
            attention = attention_record.reshape([len(trg_seq), len(seq)])
        return buffer_weight, attention

    idx = 0
    print 'Done, translating...'
    for x, sSen, y in zip(sourceIndices, source, targetIndices):
        transData = _translate(x, y)
        buffer_weight = transData[0]
        attention = transData[1]

        print 'Sen ', idx, ':', sSen  # source sentence
        tSen = _seqs2sen(y, word_idict_trg)
        print 'translation:', tSen  # target sentence
        idx += 1

        print 'buffer_weight:'
        print_matrix(buffer_weight)
        print 'attention:'
        print_matrix(attention)

    print 'Done'
def translate_model(queue, rqueue, mask_left, mask_right, write_mask, pid,
                    model, options, k, normalize):
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq, left, right, write):
        # sample given an input sequence and obtain scores
        print left.shape, right.shape, write.shape, len(seq)
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   left[:, :, None], right[:, :, None], write,
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        if req is None:
            break
        rem_l = mask_left.get()
        rem_r = mask_right.get()
        rem_w = write_mask.get()

        idx, x = req[0], req[1]
        l = rem_l[1]
        r = rem_r[1]
        w = rem_w[1]
        print pid, '-', idx
        seq = _translate(x, l, r, w)

        rqueue.put((idx, seq))

    return
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalize, verbose, alignweights):
    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            print "\t*** Save weight mode ON, alignment matrix will be saved."
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            print "\t*** Save weight mode OFF, alignment matrix will not be saved."
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score_this_batch = pred_probs(f_log_probs, prepare_data,
                                          options[i], pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score_this_batch)
        return scores

    pairs = TextIterator(source_file.name, target_file.name,
                         options[0]['dictionaries'][0],
                         options[0]['dictionaries'][1],
                         n_words_source=options[0]['n_words_src'],
                         n_words_target=options[0]['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False)
    # TODO: sorting by length could be more efficient,
    # but we'd have to synchronize scores with n-best list after

    scores = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        saveto.write('{0} {1}\n'.format(line.strip(), score_str))

    # optional save weights mode.
    if alignweights:
        # writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in all_alignments:
                align_OUT.write(line + "\n")
            # combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
def main(model, src_dict, trg_dict, src, trg, multibleu, batch_size=60,
         pred_dir='', model_list=False):
    if pred_dir != '' and not os.path.exists(pred_dir):
        os.makedirs(pred_dir)

    if model_list:
        model_list_file = model
        with open(model_list_file) as f:
            model = f.readline().strip()

    # load dictionaries and invert them
    worddicts = [None] * 2
    worddicts_r = [None] * 2
    for ii, dd in enumerate([src_dict, trg_dict]):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    trng = RandomStreams(options['trng'])
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    f_init_2, f_next_2 = build_sampler_2(tparams, options, trng, use_noise)

    iterator = TextIterator(src, trg, src_dict, trg_dict,
                            n_words_source=options['n_words_src'],
                            n_words_target=options['n_words'],
                            batch_size=batch_size,
                            maxlen=2000, shuffle=False, replace=False)

    if not model_list:
        try:
            valid_out, valid_bleu = greedy_decoding(
                options, trg, iterator, worddicts_r, tparams, prepare_data,
                gen_sample_2, f_init_2, f_next_2, trng, multibleu,
                fname=os.path.join(pred_dir,
                                   os.path.basename(model)[:-3] + 'out'),
                maxlen=100, verbose=False)
        except:
            valid_out = ''
            valid_bleu = 0.0
        print valid_out, valid_bleu
    else:
        best_score = 0.
        best_model = ''
        with open(model_list_file) as f:
            for line in f:
                start = time.time()
                model = line.strip()
                if model == '':
                    continue
                params = load_params(model, params)
                for kk, pp in params.iteritems():
                    tparams[kk].set_value(params[kk])

                print model,
                try:
                    valid_out, valid_bleu = greedy_decoding(
                        options, trg, iterator, worddicts_r, tparams,
                        prepare_data, gen_sample_2, f_init_2, f_next_2, trng,
                        multibleu,
                        fname=os.path.join(
                            pred_dir, os.path.basename(model)[:-3] + 'out'),
                        maxlen=100, verbose=False)
                except:
                    valid_out = ''
                    valid_bleu = 0.0
                print valid_out, valid_bleu,
                if valid_bleu > best_score:
                    best_score = valid_bleu
                    best_model = model
                end = time.time()
                print "Time: ", end - start
        print 'Best model: ', best_model
        print 'Best BLEU: ', best_score
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         pkl_file=None, normalize=False, output_attention=False):
    # load model model_options
    if pkl_file is None:
        pkl_file = model + '.pkl'
    with open(pkl_file, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)  # word2id
    word_idict = dict()  # id2word
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname):
        retval = []
        retval_ori = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                words = line.strip().split()
                retval_ori.append(line.strip())
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                x += [0]
                retval.append(x)
        return retval, retval_ori

    print 'Translating ', source_file, '...'
    sys.stdout.flush()
    n_samples, n_samples_src = _send_jobs(source_file)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    # params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, att = gen_sample(tparams, f_init, f_next,
                                        numpy.array(seq).reshape([len(seq), 1]),
                                        options, trng=trng, k=k, maxlen=200,
                                        stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        sidx = numpy.argmin(score)
        # return sample[sidx], att[sidx]
        return sample[sidx], numpy.array(att[sidx])

    def _output_attention(sent_idx, att):
        dirname = saveto + '.attention'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        with open(dirname + '/' + str(sent_idx), 'w') as fp:
            fp.write("%d %d\n" % (att.shape[0], att.shape[1]))
            for row in att:
                fp.write(str(row.argmax()) + " " +
                         ' '.join([str(x) for x in row]) + '\n')

    # translation
    ys = []
    atts = []
    idx = 0
    for x in n_samples:
        y, att = _translate(x)
        ys.append(y)
        atts.append(att)
        print idx
        idx += 1
    trans = _seqs2words(ys)

    # save
    with open(saveto, 'w') as f:
        print >> f, '\n'.join(trans)
    if output_attention:
        with open(saveto + '.att', 'w') as f:
            for idx, (x, y, att) in enumerate(zip(n_samples_src, trans, atts)):
                print >> f, ('%d ||| %s ||| 0 ||| %s ||| %d %d' %
                             (idx, y, x, att.shape[1], att.shape[0]))
                for hehe in att:
                    print >> f, ' '.join([str(x) for x in hehe])
                print >> f
    print 'Done'
def main(model, pklmodel, dictionary, dictionary_target, source_file, saveto,
         ck=5, wk=5, k=20, normalize=False, n_process=5, chr_level=False,
         jointProb=False, show_boundary=False):
    print 'load model model_options'
    with open('%s' % pklmodel, 'rb') as f:
        options = pkl.load(f)

    print 'load source dictionary and invert'
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'load target dictionary and invert'
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seqs2words(caps, boundary, chunk):
        capsw = []
        for cc, bb, ch in zip(caps, boundary, chunk):
            ww = []
            for w, b, c in zip(cc, bb, ch):
                if w == 0:
                    continue
                # if w == -10000:
                #     ww.append('| NOTEND')
                #     continue
                elif w < 0:
                    # ww.append('|' + str(w))
                    continue
                if show_boundary:
                    if b == 1.0:
                        ww.append('|')
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _seqs2wordsByChunk(caps, boundary, chunk):
        capsw = []
        for cc, bb, ch in zip(caps, boundary, chunk):
            ww = []
            for w, b, c in zip(cc, bb, ch):
                if w == 0:
                    continue
                # if w == -10000:
                #     ww.append('| NOTEND')
                #     continue
                elif w < 0:
                    # ww.append('|' + str(w))
                    continue
                if b == 1.0:
                    ww.append('| ' + str(c))
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fname):
        retval = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                x += [0]
                retval.append(x)
        return retval

    print 'Translating ', source_file, '...'

    print 'look up table'
    n_samples = _send_jobs(source_file)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next_chunk, f_next_word = build_sampler(tparams, options, trng,
                                                      use_noise)

    def _translate(seq):
        be_stochastic = False
        # sample given an input sequence and obtain scores
        sample, boundary, chunk, score = gen_sample(
            tparams, f_init, f_next_chunk, f_next_word,
            numpy.array(seq).reshape([len(seq), 1]),
            options, trng=trng, maxlen=200,
            k_chunk=ck, k_word=wk, k=k,
            stochastic=be_stochastic, argmax=True, jointProb=False)

        if be_stochastic:
            return sample

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        # print 'score', score
        # print 'candidates', sample

        sidx = numpy.argmin(score)
        return sample[sidx], boundary[sidx], chunk[sidx]

    ys = []
    yb = []
    yc = []
    idx = 0
    for x in n_samples:
        y, y_boundary, y_chunk = _translate(x)
        ys.append(y)
        yb.append(y_boundary)
        yc.append(y_chunk)
        print idx
        idx += 1

    # print ys
    # print yb

    trans = _seqs2words(ys, yb, yc)
    trans_chunk = _seqs2wordsByChunk(ys, yb, yc)

    with open(saveto, 'w') as f:
        print >> f, '\n'.join(trans)
    with open(saveto + 'chunk', 'w') as f:
        print >> f, '\n'.join(trans_chunk)

    print 'Done'
def main(model_files, dictionary_target, grammar_target, data_path, saveto,
         wer_file, k=5):
    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    grammar = compileGrammar(loadGrammar(grammar_target, worddicts))

    trng = RandomStreams(1234)

    models = []
    # load model model_options
    for model_file in model_files:
        print('Loading model: %s' % model_file)
        with open('%s.pkl' % model_file, 'rb') as f:
            options = pkl.load(f)
        print(options)
        params = init_params(options)
        params = load_params(model_file, params)
        tparams = init_tparams(params)
        f_init, f_next = build_sampler(tparams, options, trng)
        models.append((f_init, f_next, options, 0.8))

    for lm_file in []:
        print('Loading language model: %s' % lm_file)
        f_init, f_next, options = load_language_model(lm_file)
        models.append((f_init, f_next, options, 0.2))

    valid, valid_uid_list = dataIterator_valid(data_path, worddicts,
                                               batch_size=1, maxlen=250)

    fpp_sample = [open('%s.%d' % (saveto, beam), 'w') for beam in range(k)]
    valid_count_idx = 0

    print('Decoding...')
    ud_epoch = 0
    ud_epoch_start = time.time()
    for x, y in valid:
        for xx in x:
            print('%d : %s' % (valid_count_idx + 1,
                               valid_uid_list[valid_count_idx]))
            xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]),
                                 dtype='float32')
            xx_pad[:xx.shape[0], :] = xx
            stochastic = False
            sample, score = gen_sample(models, xx_pad[:, None, :], grammar,
                                       trng=trng, k=k, maxlen=250,
                                       dictlen=len(worddicts),
                                       stochastic=stochastic, argmax=False)
            score = score / numpy.array([len(s) for s in sample])
            sample_rank = numpy.argsort(score)

            for beam in range(k):
                fpp_sample[beam].write(valid_uid_list[valid_count_idx])
                if len(sample) > beam:
                    ss = sample[sample_rank[beam]]
                else:
                    ss = [0]
                for vv in ss:
                    if vv == 0:  # <eol>
                        break
                    fpp_sample[beam].write(' ' + worddicts_r[vv])
                fpp_sample[beam].write('\n')
            valid_count_idx = valid_count_idx + 1

    ud_epoch = (time.time() - ud_epoch_start)
    print 'test set decode done, cost time ...', ud_epoch

    for beam in range(k):
        fpp_sample[beam].flush()
        fpp_sample[beam].close()
        os.system('python compute-wer.py %s.%d %s %s' %
                  (saveto, beam, os.path.join(data_path, "caption.txt"),
                   wer_file))
        fpp = open(wer_file)
        stuff = fpp.readlines()
        fpp.close()
        m = re.search('WER (.*)\n', stuff[0])
        valid_per = 100. * float(m.group(1))
        m = re.search('ExpRate (.*)\n', stuff[1])
        valid_sacc = 100. * float(m.group(1))
        print '%d Valid WER: %.2f%%, ExpRate: %.2f%%' % (beam, valid_per,
                                                         valid_sacc)
def main(model, pklmodel,
         valid_datasets=['../data/dev/newstest2011.en.tok',
                         '../data/dev/newstest2011.fr.tok'],
         dictionaries=['/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
                       '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
         dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
         result_file='./cost.result'):
    # load the dictionaries of both source and target
    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # dict for chunk label
    worddict_chunk = [None]
    worddict_r_chunk = [None]
    with open(dictionary_chunk, 'rb') as f:
        worddict_chunk = pkl.load(f)
        worddict_r_chunk = dict()
        for kk, vv in worddict_chunk.iteritems():
            worddict_r_chunk[vv] = kk
    print worddict_chunk

    print 'load model model_options'
    with open('%s' % pklmodel, 'rb') as f:
        options = pkl.load(f)

    # build valid set
    valid = TrainingTextIterator(valid_datasets[0], valid_datasets[1],
                                 dictionaries[0], dictionaries[1],
                                 dictionary_chunk,
                                 n_words_source=options['n_words_src'],
                                 n_words_target=options['n_words'],
                                 batch_size=options['batch_size'],
                                 max_chunk_len=options['maxlen_chunk'],
                                 max_word_len=options['maxlen_chunk_words'])

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator, \
        opt_ret, \
        cost, cost_cw = \
        build_model(tparams, options)
    inps = [x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    f_log_probs_cw = theano.function(inps, cost_cw, profile=False)
    print 'Done'

    valid_errs, valid_errs_cw = pred_probs(f_log_probs, f_log_probs_cw,
                                           prepare_training_data, options,
                                           valid)
    valid_err = valid_errs.mean()
    valid_err_cw = valid_errs_cw.mean()

    with open(result_file, 'w') as result_file:
        print >> result_file, valid_err, valid_err_cw
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False):
    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score = gen_sample(tparams, f_init, f_next,
                                   numpy.array(seq).reshape([len(seq), 1]),
                                   options, trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        sidx = numpy.argmin(score)
        return sample[sidx]

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    translations = []
    print "start Translating..."
    with open(source_file, 'r') as f:
        for idx, line in enumerate(f):
            if idx % 20 == 0:
                print "%s lines done!" % idx
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
            x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
            x += [0]
            translation = _translate(x)
            translations.append(" ".join(_seqs2words([translation])))

    with open(saveto, 'w') as f:
        print >> f, '\n'.join(translations)
    print "Finish Translating!"
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False, messageOff=False):
    if not messageOff:
        print 'load model model_options'
    if os.path.exists('%s.pkl' % model):
        with open('%s.pkl' % model, 'rb') as f:
            options = pkl.load(f)
    else:
        pklName = model[:model.index('.iter')] + model[model.index('.npz'):]
        with open('%s.pkl' % pklName, 'rb') as f:
            options = pkl.load(f)

    if not messageOff:
        print 'load source dictionary and invert'
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    if not messageOff:
        print 'load target dictionary and invert'
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _index2sens(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    continue
                elif w < 0:
                    continue
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _seqs2sen(seqs):
        sen = []
        for w in seqs:
            if w == 0:
                continue
            elif w < 0:
                continue
            sen.append(word_idict_trg[w])
        return ' '.join(sen)

    def _send_jobs(fname):
        # translate source sentence into indices
        sourceIndices = []
        source = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                x += [0]
                sourceIndices.append(x)
                source.append(line)
        return sourceIndices, source

    if not messageOff:
        print 'Translating ', source_file, '...'
        print 'Prepare data...',
    ret = _send_jobs(source_file)
    sourceIndices = ret[0]
    source = ret[1]

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain translated result
        sampleData = gen_sample(tparams, f_init, f_next,
                                numpy.array(seq).reshape([len(seq), 1]),
                                options, trng=trng, k=k, maxlen=200,
                                return_attention=True,
                                stochastic=False, argmax=False,
                                normalize=normalize)
        sample = sampleData[0]
        score = sampleData[1]
        attention_record = sampleData[2]

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        sidx = numpy.argmin(score)
        if attention_record is None:
            attention = None
        else:
            attention = attention_record[sidx]
        return sample[sidx], attention

    trans = []
    idx = 0
    if not messageOff:
        print 'Done, translating...'
    for x, sSen in zip(sourceIndices, source):
        transData = _translate(x)
        y = transData[0]
        attention = transData[1]
        if not messageOff:
            print 'Sen ', idx, ':', sSen  # source sentence
        y = _seqs2sen(y)
        trans.append(y)
        if not messageOff:
            print 'translation:', y  # translation result
            print 'attention:'
            # if attention is not None:
            print_matrix(attention)
        idx += 1

    with open(saveto, 'w') as f:
        print >> f, '\n'.join(trans)
    print 'Done'
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False):
    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seq2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    # init model
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    trans = []
    att = []
    print 'Translating ', source_file, '...'
    fo = open(saveto, 'w')
    fa = open(saveto + '.att', 'w')
    with open(source_file, 'r') as f:
        n = 0
        for line in f:
            n += 1
            if n % 10 == 0:
                print n
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
            x = map(lambda ii: ii if ii < options['n_words'] else 1, x)
            x += [0]
            # _translate is assumed to be a module-level helper (defined
            # elsewhere in this script) that runs beam search and returns
            # (translation, attention weights)
            y, a = _translate(x, tparams, f_init, f_next, options, trng, k,
                              normalize)
            trans.append(y)
            att.append(a)
            print >> fo, _seq2words(y)
            print _seq2words(y)
            for i, e in enumerate(a):
                for j, p in enumerate(e):
                    print >> fa, '{}-{}-{}'.format(i, j, p),
                print >> fa
    print 'Done'
def main(model, dictionary, dictionary_target, source, target, outfile,
         wordbyword):

    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    """
    # load source dictionary and invert
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'
    """

    valid_noshuf = TextIterator(source, target, dictionary, dictionary_target,
                                n_words_source=options['n_words_src'],
                                n_words_target=options['n_words'],
                                batch_size=options['valid_batch_size'],
                                maxlen=2000, shuffle=False)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, cost_ = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask]

    if wordbyword:
        # per-word costs, dumped as a pickled list
        f_log_probs = theano.function(inps, cost_, profile=profile)
        valid_errs = pred_probs(f_log_probs, prepare_data, options,
                                valid_noshuf, verbose=True, as_list=True)
        with open(outfile, 'wb') as f:
            pkl.dump(valid_errs, f, pkl.HIGHEST_PROTOCOL)
    else:
        # per-sentence costs, saved as a numpy array
        f_log_probs = theano.function(inps, cost, profile=profile)
        valid_errs = pred_probs(f_log_probs, prepare_data, options,
                                valid_noshuf, verbose=True)
        numpy.save(outfile, valid_errs)
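Depending on the wordbyword flag, the scores end up either in a pickle (word-by-word costs) or in a NumPy array of per-sentence costs. A minimal read-back sketch, using hypothetical output file names:

import cPickle as pkl
import numpy

# word-by-word mode: outfile was written with pkl.dump()
with open('scores_wordbyword.pkl', 'rb') as f:
    word_costs = pkl.load(f)  # per-token costs, one entry per sentence pair
print len(word_costs), 'sentence pairs scored'

# sentence mode: outfile was written with numpy.save()
# (numpy appends ".npy" if the name does not already end with it)
sent_costs = numpy.load('scores_sentence.npy')
print 'mean per-sentence cost:', sent_costs.mean()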
def rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs, prepare_data,
                                          options[i], pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)
        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    if alignweights:
        ### opening the temporary file.
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, \
            tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out:
        for line in nbest_lines:
            linesplit = line.split(' ||| ')
            idx = int(linesplit[0])  ## index into the source file, starting from 0
            tmp_in.write(lines[idx])
            tmp_out.write(linesplit[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)

        pairs = TextIterator(tmp_in.name, tmp_out.name,
                             options[0]['dictionaries'][:-1],
                             options[0]['dictionaries'][1],
                             n_words_source=options[0]['n_words_src'],
                             n_words_target=options[0]['n_words'],
                             batch_size=b,
                             maxlen=float('inf'),
                             sort_by_length=False)
        # TODO: sorting by length could be more efficient, but we'd have to
        # synchronize the scores with the n-best list afterwards

        scores, alignments = _score(pairs, alignweights)

        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str, [s[i] for s in scores]))
            saveto.write('{0} {1}\n'.format(line.strip(), score_str))

        ### optional save weights mode.
        if alignweights:
            for line in alignments:
                align_OUT.write(line + "\n")

    if alignweights:
        combine_source_target_text(source_file, nbest_file, saveto.name, align_OUT)
        align_OUT.close()
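The rescorer above consumes a Moses-style n-best list, one hypothesis per line with fields separated by ' ||| '; only the first field (the 0-based index of the source sentence) and the second field (the hypothesis) are used here. A small illustrative line, with made-up content:

# "<source index> ||| <hypothesis> ||| <feature scores> ||| <total score>"
line = '0 ||| das ist ein Test . ||| lm=-12.3 tm=-4.5 ||| -8.9\n'
linesplit = line.split(' ||| ')
idx = int(linesplit[0])    # 0 -> first line of the source file
hypothesis = linesplit[1]  # written to the temporary target-side file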
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalize, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_tparams(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs, prepare_data,
                                          options[i], pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)
        return scores, alignments

    pairs = TextIterator(source_file.name, target_file.name,
                         options[0]['dictionaries'][:-1],
                         options[0]['dictionaries'][1],
                         n_words_source=options[0]['n_words_src'],
                         n_words_target=options[0]['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False)
    # TODO: sorting by length could be more efficient, but we'd want to resort afterwards

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        saveto.write('{0} {1}\n'.format(line.strip(), score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in alignments:
                align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)