def _load_theano(self):
    """
    Build one sampler pair per configured model.

    Theano is imported inside this method; per the original note, this
    entails irrevocable binding to a specific GPU.

    Returns the tuple ``(trng, fs_init, fs_next, gen_sample)``, where the two
    lists hold the ``f_init`` / ``f_next`` functions in model order.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    from nmt import (build_sampler, gen_sample)
    from theano_util import (numpy_floatX, load_params, init_theano_params)

    rng_streams = RandomStreams(1234)
    noise_switch = shared(numpy_floatX(0.))

    init_fns, next_fns = [], []
    for model_path, model_option in zip(self._models, self._options):
        # Request every stored array except the 'adam_*' entries.
        wanted = {}
        for name in numpy.load(model_path).files:
            if not name.startswith('adam_'):
                wanted[name] = 0
        theano_params = init_theano_params(load_params(model_path, wanted))

        # always return alignment at this point
        init_fn, next_fn = build_sampler(
            theano_params, model_option, noise_switch, rng_streams,
            return_alignment=True)
        init_fns.append(init_fn)
        next_fns.append(next_fn)

    return rng_streams, init_fns, next_fns, gen_sample
def translate_model(queue, rqueue, pid, models, options, k, normalize,
                    verbose, nbest, return_alignment, suppress_unk):
    """
    Worker loop: build samplers for every model, then serve queued jobs.

    Requests are ``(idx, x)`` pairs read from `queue`; a ``None`` request
    stops the loop.  Each answer is put on `rqueue` as ``(idx, result)``.
    With `nbest` the result carries every hypothesis, otherwise only the
    entry whose score is smallest.
    """
    from nmt import (build_sampler, gen_sample, load_params, init_params,
                     init_tparams)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared

    rng_streams = RandomStreams(1234)
    noise_switch = shared(numpy.float32(0.))

    init_fns = []
    next_fns = []
    for model_path, model_option in zip(models, options):
        # allocate, load, then wrap the parameters as theano shared variables
        loaded = load_params(model_path, init_params(model_option))
        shared_params = init_tparams(loaded)
        init_fn, next_fn = build_sampler(shared_params, model_option,
                                         noise_switch, rng_streams,
                                         return_alignment=return_alignment)
        init_fns.append(init_fn)
        next_fns.append(next_fn)

    def _translate(seq):
        # one reshaped array per element of seq
        batch_inputs = []
        for source in seq:
            batch_inputs.append(
                numpy.array(source).T.reshape([len(source[0]), len(source), 1]))

        hyps, costs, word_probs, alignment = gen_sample(
            init_fns, next_fns, batch_inputs,
            trng=rng_streams, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            costs = costs / numpy.array([len(hyp) for hyp in hyps])

        if not nbest:
            best = numpy.argmin(costs)
            return hyps[best], costs[best], word_probs[best], alignment[best]
        return hyps, costs, word_probs, alignment

    request = queue.get()
    while request is not None:
        job_id, source = request[0], request[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid,job_id))
        rqueue.put((job_id, _translate(source)))
        request = queue.get()

    return
def encode_model(queue, rqueue, pid, model, options): from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) params = init_params(options) params = load_params(model, params) tparams = init_tparams(params) f_init, f_next = build_sampler(tparams, options, trng) def _encode(seq): code = f_init(numpy.array(seq).reshape([len(seq), 1]))[1] return code while True: req = queue.get() if req is None: break idx, x = req[0], req[1] print pid, '-', idx cod = _encode(x) rqueue.put((idx, cod)) return
def encode_model(queue, rqueue, pid, model, options):
    """
    Serve encoding requests from `queue` until a ``None`` request is read.

    Every request is an ``(idx, x)`` pair; the reply put on `rqueue` is
    ``(idx, code)`` where ``code`` is element 1 of what ``f_init`` returns
    for `x` reshaped to a single column.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    rng_streams = RandomStreams(1234)

    # allocate model parameters, then overwrite them with the stored values
    host_params = init_params(options)
    host_params = load_params(model, host_params)
    shared_params = init_tparams(host_params)

    init_fn, _next_fn = build_sampler(shared_params, options, rng_streams)

    def _encode(source):
        # encode the source sentence
        column = numpy.array(source).reshape([len(source), 1])
        return init_fn(column)[1]

    request = queue.get()
    while request is not None:
        job_id, source = request[0], request[1]
        print(pid, '-', job_id)
        rqueue.put((job_id, _encode(source)))
        request = queue.get()

    return
def encode_model(queue, rqueue, pid, model, options): from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) # word index f_init, f_next = build_sampler(tparams, options, trng) def _encode(seq): # encode the source sentence code = f_init(numpy.array(seq).reshape([len(seq), 1]))[1] return code while True: req = queue.get() if req is None: break idx, x = req[0], req[1] print pid, '-', idx cod = _encode(x) rqueue.put((idx, cod)) return
def translate_model(queue, rqueue, pid, models, options, k, normalize,
                    verbose, nbest, return_alignment, suppress_unk,
                    return_hyp_graph):
    """
    Worker loop: load every model, build its sampler and translate jobs.

    ``(idx, x)`` requests are read from `queue` until ``None`` is received;
    each reply ``(idx, result)`` is put on `rqueue`.  ``result`` is the full
    hypothesis list when `nbest` is set, otherwise the lowest-score entry;
    the hypothesis graph returned by ``gen_sample`` is passed through as the
    last element in both cases.
    """
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared

    rng_streams = RandomStreams(1234)
    noise_switch = shared(numpy.float32(0.))

    init_fns, next_fns = [], []
    for model_path, model_option in zip(models, options):
        # Request every stored array except the 'adam_*' entries.
        wanted = {}
        for name in numpy.load(model_path).files:
            if not name.startswith('adam_'):
                wanted[name] = 0
        shared_params = init_theano_params(load_params(model_path, wanted))

        init_fn, next_fn = build_sampler(shared_params, model_option,
                                         noise_switch, rng_streams,
                                         return_alignment=return_alignment)
        init_fns.append(init_fn)
        next_fns.append(next_fn)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        source = numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1])
        hyps, costs, word_probs, alignment, hyp_graph = gen_sample(
            init_fns, next_fns, source,
            trng=rng_streams, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk,
            return_hyp_graph=return_hyp_graph)

        # normalize scores according to sequence lengths
        if normalize:
            costs = costs / numpy.array([len(hyp) for hyp in hyps])

        if not nbest:
            best = numpy.argmin(costs)
            return (hyps[best], costs[best], word_probs[best],
                    alignment[best], hyp_graph)
        return hyps, costs, word_probs, alignment, hyp_graph

    request = queue.get()
    while request is not None:
        job_id, source = request[0], request[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid,job_id))
        rqueue.put((job_id, _translate(source)))
        request = queue.get()

    return
def translate_model(queue, rqueue, pid, models, options, k, normalize): from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) # allocate model parameters params = [] for i in xrange(len(models)): params.append(init_params(options)) # load model parameters and set theano shared variables tparams = [] for i in xrange(len(params)): params[i] = load_params(models[i], params[i]) tparams.append(init_tparams(params[i])) # word index use_noise = theano.shared(numpy.float32(0.)) f_inits = [] f_nexts = [] for i in xrange(len(tparams)): f_init, f_next = build_sampler(tparams[i], options, trng, use_noise) f_inits.append(f_init) f_nexts.append(f_next) def _translate(seq): use_noise.set_value(0.) # sample given an input sequence and obtain scores sample, score = gen_sample(tparams, f_inits, f_nexts, numpy.array(seq).reshape([len(seq), 1]), options, trng=trng, k=k, maxlen=500, stochastic=False, argmax=False) # normalize scores according to sequence lengths if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths sidx = numpy.argmin(score) return sample[sidx] while True: req = queue.get() if req is None: break idx, x = req[0], req[1] print pid, '-', idx seq = _translate(x) rqueue.put((idx, seq)) return
def _load_theano(self):
    """
    Loads models, sets theano shared variables and builds samplers.

    This entails irrevocable binding to a specific GPU.

    For every (model, option) pair the multi-source configuration is checked
    first: a multi-source model without extra sources is a fatal error
    (logged, then ``sys.exit(1)``); extra sources given to a single-source
    model only produce a warning.

    Returns the tuple ``(trng, fs_init, fs_next, gen_sample)``.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    from nmt import (build_sampler, build_multi_sampler, gen_sample)
    from theano_util import (numpy_floatX, load_params, init_theano_params)

    trng = RandomStreams(1234)
    use_noise = shared(numpy_floatX(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(self._models, self._options):
        is_multisource = option["multisource_type"] is not None
        has_extra_sources = len(option['extra_sources']) != 0

        # check compatibility with multisource
        if is_multisource and not has_extra_sources:
            logging.error(
                "This model is multi-source but no auxiliary source file was provided."
            )
            sys.exit(1)
        elif not is_multisource and has_extra_sources:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                "You provided an auxiliary input but this model is not multi-source. Ignoring extra input."
            )

        # Request every stored array except the 'adam_*' entries.
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # always return alignment at this point
        sampler_builder = build_multi_sampler if is_multisource else build_sampler
        f_init, f_next = sampler_builder(tparams, option, use_noise, trng,
                                         return_alignment=True)

        fs_init.append(f_init)
        fs_next.append(f_next)

    return trng, fs_init, fs_next, gen_sample
def translate_model(queue, rqueue, mask_left, mask_right, write_mask, eots, model, options, k, normalize): from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # allocate model parameters #params = init_params(options) # load model parameters and set theano shared variables params = load_params(model) #, params) tparams = init_tparams(params) # word index f_init, f_next = build_sampler(tparams, options, trng, use_noise) def _translate(seq, left, right, write, eot): # sample given an input sequence and obtain scores print left.shape, right.shape, write.shape, len(seq) sample, score = gen_sample(tparams, f_init, f_next, numpy.array(seq).reshape([len(seq), 1]), left[:, :, None], right[:, :, None], write, eot[:, None], options, trng=trng, k=k, maxlen=200, stochastic=False, argmax=False) # normalize scores according to sequence lengths if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths sidx = numpy.argmin(score) return sample[sidx] for idx, [x, l, r, w, eot] in enumerate( zip(queue, mask_left, mask_right, write_mask, eots)): # req = queue.get() if x is None: break print idx seq = _translate(x, l, r, w, eot) rqueue.append(seq) return
def translate_model(queue, rqueue, pid, model, options, k, normalize, n_best): from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) # word index f_init, f_next = build_sampler(tparams, options, trng) def _translate(seq): # sample given an input sequence and obtain scores sample, score = gen_sample( tparams, f_init, f_next, numpy.array(seq).reshape([len(seq), 1]), options, trng=trng, k=k, maxlen=200, stochastic=False, argmax=False, ) # normalize scores according to sequence lengths if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths if n_best > 1: sidx = numpy.argsort(score)[:n_best] else: sidx = numpy.argmin(score) return numpy.array(sample)[sidx], numpy.array(score)[sidx] while True: req = queue.get() if req is None: break idx, x = req[0], req[1] print pid, "-", idx seq, scores = _translate(x) rqueue.put((idx, seq, scores)) return
def translate_model(queue, rqueue, pid, model, options, k, normalize, n_best): from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams from theano import shared trng = RandomStreams(1234) use_noise = shared(numpy.float32(0.)) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) # word index f_init, f_next = build_sampler(tparams, options, trng, use_noise) def _translate(seq): # sample given an input sequence and obtain scores sample, score = gen_sample(tparams, f_init, f_next, numpy.array(seq).reshape([len(seq), 1]), options, trng=trng, k=k, maxlen=200, stochastic=False, argmax=False) # normalize scores according to sequence lengths if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths if n_best > 1: sidx = numpy.argsort(score)[:n_best] else: sidx = numpy.argmin(score) return numpy.array(sample)[sidx], numpy.array(score)[sidx] while True: req = queue.get() if req is None: break idx, x = req[0], req[1] print pid, '-', idx seq, scores = _translate(x) rqueue.put((idx, seq, scores)) return
def __init__(self, trained_model):
    """
    Load options, vocabularies and parameters for `trained_model`.

    Reads ``<trained_model>.pkl`` for the options, loads the source (and,
    when a second dictionary is listed, target) vocabulary relative to
    ``options['baseDir']``, builds the inverse vocabularies and finally
    builds the sampler functions ``self.f_init`` / ``self.f_next``.
    """
    def _invert(vocab):
        # index -> word, with ids 0 and 1 forced to the special tokens
        inverse = dict((index, word) for word, index in vocab.iteritems())
        inverse[0] = 'EOS'
        inverse[1] = 'UNK'
        return inverse

    # load model model_options
    # NOTE(review): pickle must only be used on trusted files.
    with open('%s.pkl' % trained_model, 'rb') as f:
        self.options = pkl.load(f)
    logging.info(self.options)

    base_dir = self.options['baseDir']
    dict_names = self.options['dictionaries']
    src_dict = os.path.join(base_dir, dict_names[0])
    target_dict = None
    if len(dict_names) != 1:
        target_dict = os.path.join(base_dir, dict_names[1])

    # load source dictionary and invert
    with open(src_dict, 'rb') as f:
        self.word_dict = pkl.load(f)
    self.word_idict = _invert(self.word_dict)

    # load target dictionary and invert
    if target_dict is not None:
        with open(target_dict, 'rb') as f:
            self.word_dict_trg = pkl.load(f)
        self.word_idict_trg = _invert(self.word_dict_trg)
    else:
        # single shared vocabulary: both sides use the same objects
        self.word_dict_trg = self.word_dict
        self.word_idict_trg = self.word_idict

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    self.trng = RandomStreams(1234)

    # allocate model parameters
    host_params = init_params(self.options)

    # load model parameters and set theano shared variables
    self.params = load_params(trained_model, host_params)
    self.tparams = init_tparams(host_params)

    noise_switch = theano.shared(numpy.float32(0.))
    self.f_init, self.f_next = build_sampler(self.tparams, self.options,
                                             self.trng, noise_switch)
def translate_model(queue, rqueue, pid, model, options, k, normalize,
                    annotations_only):
    """
    Worker loop that either translates or only returns encoder output.

    ``(idx, x)`` requests are read from `queue` until ``None`` arrives.  With
    `annotations_only` the reply is the second value returned by ``f_init``;
    otherwise it is the lowest-score sample from ``gen_sample``.  Replies are
    put on `rqueue` as ``(idx, result)``.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    rng_streams = RandomStreams(1234)

    # allocate, load, then wrap the parameters as theano shared variables
    shared_params = init_tparams(load_params(model, init_params(options)))

    # f_init outs are [init_state (to decoder), ctx (from encoder)]
    # f_next outs are [next_probs, next_sample, next_state] (decoder)
    init_fn, next_fn = build_sampler(shared_params, options, rng_streams,
                                     annotations_only)

    def _translate(seq):
        column = numpy.array(seq).reshape([len(seq), 1])

        if annotations_only:
            next_state, ctx = init_fn(column)
            return ctx

        # sample given an input sequence and obtain scores
        hyps, costs = gen_sample(shared_params, init_fn, next_fn, column,
                                 options, trng=rng_streams, k=k, maxlen=200,
                                 stochastic=False, argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            costs = costs / numpy.array([len(hyp) for hyp in hyps])
        return hyps[numpy.argmin(costs)]

    request = queue.get()
    while request is not None:
        job_id, source = request[0], request[1]
        rqueue.put((job_id, _translate(source)))
        request = queue.get()

    return
def translate_model(queue, rqueue, pid, model, options, k, normalize): import theano from theano import tensor from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.), name='use_noise') params = init_params(options) params = load_params(model, params) tparams = init_tparams(params) # word index maxlen = 150 options['maxlen'] = maxlen f_init, f_next = build_sampler(tparams, options, trng) def _translate(seq): sample, score = gen_sample(tparams, f_init, f_next, numpy.array(seq).reshape([len(seq), 1]), options, trng=trng, k=k, maxlen=maxlen, stochastic=False) if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths sidx = numpy.argmin(score) return sample[sidx] while True: req = queue.get() if req == None: break idx, x = req[0], req[1] print pid, '-', idx seq = _translate(x) rqueue.put((idx, seq)) return
def translate_model(queue, rqueue, pid, model, options, k, normalize): import theano from theano import tensor from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.), name='use_noise') params = init_params(options) params = load_params(model, params) tparams = init_tparams(params) # word index maxlen = 150 options['maxlen'] = maxlen f_init, f_next = build_sampler(tparams, options, trng) def _translate(seq): sample, score = gen_sample(tparams, f_init, f_next, numpy.array(seq).reshape([len(seq),1]), options, trng=trng, k=k, maxlen=maxlen, stochastic=False) if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths sidx = numpy.argmin(score) return sample[sidx] while True: req = queue.get() if req == None: break idx, x = req[0], req[1] print pid, '-', idx seq = _translate(x) rqueue.put((idx, seq)) return
def sample(model, dictionary, dictionary_target,
           source_file, ref_file, saveto,
           k=10, normalize=False,
           bleu_script='./data/mteval-v11b.pl', res_to_sgm='./data/plain2sgm'):
    """
    Translate `source_file` with `model` and print the BLEU score.

    Loads ``<model>.pkl`` for the options and `dictionary_target` for the
    target vocabulary, builds the sampler and delegates translation and
    scoring to ``gen_trans``, which receives the beam size `k`.

    NOTE(review): `normalize`, `bleu_script` and `res_to_sgm` are accepted
    but not used in this function - confirm whether ``gen_trans`` should
    receive them.
    """
    # load model model_options
    # NOTE(review): pickle must only be used on trusted files.
    with open(model + '.pkl', 'rb') as f:
        options = pkl.load(f)

    # load target dictionary and invert
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    word_idict_trg = dict((vv, kk) for kk, vv in word_dict_trg.iteritems())

    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    # Pass the caller's beam size; it used to be hard-coded to 10 here.
    bleu_score = gen_trans(test_src=source_file, test_ref=ref_file, out_file=saveto,
                           dict_src=dictionary, idict_trg=word_idict_trg,
                           tparams=tparams, f_init=f_init, f_next=f_next,
                           model_options=options,
                           trng=trng, k=k, stochastic=False)

    print(model + ' / ' + source_file + ' / ' + 'test bleu %.4f' % bleu_score)
    print('timestamp {} {}'.format(
        'done', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    sys.stdout.flush()
def translate_model(queue, model, options, k, normalize, d_maxlen=200): use_noise = theano.shared(numpy.float32(0.)) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) # word index f_init, f_next = build_sampler(tparams, options, trng, use_noise) def _translate(seq): # sample given an input sequence and obtain scores sample, score = gen_sample(tparams, f_init, f_next, numpy.array(seq).reshape([len(seq), 1]), options, trng=trng, k=k, maxlen=d_maxlen, stochastic=False, argmax=False) # normalize scores according to sequence lengths if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths sidx = numpy.argmin(score) return sample[sidx] rqueue = [] for req in queue: idx, x = req[0], req[1] print 'translate-', idx seq = _translate(x) rqueue.append(seq) return rqueue
def main(models, saveto, bpe_file, save_alignment=None, k=5, normalize=False, n_process=5, chr_level=False, verbose=False, nbest=False, suppress_unk=False, a_json=False, print_word_probabilities=False, return_hyp_graph=False):
    """
    Run a blocking TCP translation server on port 12345.

    Steps performed, in order:
      1. load the configuration of every model and the source/target
         dictionaries named by the first model's options;
      2. bind a socket to ``(socket.gethostname(), 12345)``;
      3. load all models and build their samplers in this process;
      4. create the Moses tokenizer/detokenizer and the BPE segmenter;
      5. accept connections forever, handling each one in a ``_listen``
         thread, until KeyboardInterrupt.

    For every request read from a client, the text is sentence-split,
    tokenized and BPE-segmented, translated by `n_process` freshly started
    ``translate_model`` worker processes, detokenized and sent back with the
    ``'@@ '`` BPE markers removed.

    NOTE(review): ``args`` is not a parameter or local of this function;
    presumably it is a module-level argparse namespace - confirm, otherwise
    ``args.codes = ...`` raises NameError.
    NOTE(review): `normalize` is only forwarded to the workers; `saveto` and
    `save_alignment` are written to but never closed here.
    """
    # load model model_options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    # The last dictionary is the target one, all others are source factors.
    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        # Drop entries whose index is outside the source vocabulary size.
        # NOTE(review): deleting while iterating .items() only works where
        # items() returns a list (Python 2).
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seqs2words(cc):
        """Map target ids to words, stopping at the first 0 (end of sentence)."""
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f, processes, queue):
        """
        Convert every line of `f` to factor ids and put ``(idx, x)`` on `queue`.

        Returns ``(number_of_lines, source_sentences)``.  Terminates all
        workers and exits if a word has the wrong number of factors.

        NOTE(review): ``idx`` is unbound when `f` is empty, so the final
        ``idx + 1`` would raise.  Under Python 2 the comprehension variable
        ``f`` also rebinds the parameter `f`.
        """
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                # one id per '|'-separated factor; unknown factors map to 1
                w = [
                    word_dicts[i][f] if f in word_dicts[i] else 1
                    for (i, f) in enumerate(w.split('|'))
                ]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'
                        .format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            # end-of-sentence marker: one 0 per factor
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes(queue):
        """Put one ``None`` sentinel per worker on `queue`."""
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples, processes, queue, rqueue):
        """
        Yield the `n_samples` results from `rqueue` in job-index order.

        Results may arrive out of order; they are buffered in ``trans`` and
        released as soon as the next expected index is available.
        """
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    resp = rqueue.get(True, 5)
                # if queue is empty after 5s, check if processes are still alive
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            # NOTE(review): reuses the outer loop name `idx`;
                            # harmless only because the process exits below.
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}"
                                .format(processes[midx].pid,
                                        processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1),
                                                                   n_samples))
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
        """
        Serve translation requests on connection `c` until "EOT" is received.

        NOTE(review): ``source_file_t[0]`` raises IndexError when the
        tokenized request is empty, and ``detokenized[0]`` raises when the
        translation is empty - confirm whether either can happen.
        """
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        while source_file_t[0] != "EOT":
            # tokenize and BPE-segment every sentence in place
            for i in range(len(source_file_t)):
                source_file_t[i] = bpe.segment(
                    tokenizer.tokenize(source_file_t[i],
                                       return_str=True)).strip()
            print source_file_t
            detokenized = ''

            # A fresh set of queues and worker processes per request.
            queue = Queue()
            rqueue = Queue()
            processes = [None] * n_process
            for midx in xrange(n_process):
                processes[midx] = Process(
                    target=translate_model,
                    args=(queue, rqueue, midx, models, options, k, normalize,
                          verbose, nbest, save_alignment is not None,
                          suppress_unk, return_hyp_graph, fs_init, fs_next))
                processes[midx].start()

            n_samples, source_sentences = _send_jobs(source_file_t, processes,
                                                     queue)
            _finish_processes(queue)

            for i, trans in enumerate(
                    _retrieve_jobs(n_samples, processes, queue, rqueue)):
                print "NEXT SENTENCE:"
                if nbest:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True,
                                          highlight_best=True)
                    # write hypotheses from lowest to highest score
                    order = numpy.argsort(scores)
                    for j in order:
                        if print_word_probabilities:
                            probs = " ||| " + " ".join(
                                "{0}".format(prob) for prob in word_probs[j])
                        else:
                            probs = ""
                        saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                            i, _seqs2words(samples[j]), scores[j], probs))
                        # print alignment matrix for each hypothesis
                        # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos
                        # translation_token_count+eos
                        if save_alignment is not None:
                            if a_json:
                                print_matrix_json(
                                    alignment[j], source_sentences[i],
                                    _seqs2words(samples[j]).split(), i, i + j,
                                    save_alignment)
                            else:
                                save_alignment.write(
                                    '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'
                                    .format(i, _seqs2words(samples[j]),
                                            scores[j],
                                            ' '.join(source_sentences[i]),
                                            len(source_sentences[i]) + 1,
                                            len(samples[j])))
                                print_matrix(alignment[j], save_alignment)
                else:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True,
                                          highlight_best=True)
                    ## TODO: Handle the output here
                    x = _seqs2words(samples)
                    detokenized += detokenizer.detokenize(
                        (x.decode('utf-8') + " ").split(), return_str=True)
                    # capitalise the first character of the accumulated reply
                    detokenized = detokenized[0].upper() + detokenized[1:]
                    ## TODO: End of output handling
                    if print_word_probabilities:
                        for prob in word_probs:
                            saveto.write("{} ".format(prob))
                        saveto.write('\n')
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(alignment, source_sentences[i],
                                              _seqs2words(trans[0]).split(),
                                              i, i, save_alignment)
                        else:
                            save_alignment.write(
                                '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.
                                format(i, _seqs2words(trans[0]), 0,
                                       ' '.join(source_sentences[i]),
                                       len(source_sentences[i]) + 1,
                                       len(trans[0])))
                            print_matrix(alignment, save_alignment)

            # reply with the BPE markers removed, then wait for the next request
            c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
            source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        c.close()
        sys.stderr.write('Done\n')

    def _listen(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe):
        """
        Per-connection handler: read a first message, reply "okay", then run
        ``_parallelized_main`` in a thread and wait for it to finish.
        """
        while True:
            try:
                # Establish connection with client.
                try:
                    print 'Got connection from', addr
                    print "Receiving..."
                    fname = c.recv(4096)
                except socket.error:
                    c.close()
                    print "connection closed"
                    break
                print fname
                c.send("okay")
                try:
                    t = threading.Thread(target=_parallelized_main,
                                         args=(fs_init, fs_next, c, bpe,
                                               tokenizer, detokenizer))
                    t.start()
                    t.join()
                except socket.error:
                    c.close()
                    break
            except KeyboardInterrupt as e:
                LOG.debug('Crtrl+C issued ...')
                LOG.info('Terminating server ...')
                # NOTE(review): the bare except swallows every error raised
                # while shutting the socket down.
                try:
                    c.shutdown(socket.SHUT_RDWR)
                    c.close()
                except:
                    pass
                break

    s = socket.socket()  # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = 12345  # Reserve a port for your service.
    s.bind((host, port))  # Bind to the port

    # Load the models before accepting any client connection.
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []
    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        # (every stored array except the 'adam_*' entries)
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=save_alignment is not None)
        fs_init.append(f_init)
        fs_next.append(f_next)
    # end of model loading

    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()

    # start listening to connections once models are loaded
    args.codes = codecs.open(bpe_file[0], encoding='utf-8')
    bpe = BPE(args.codes, '@@')
    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            # one handler thread per accepted connection
            t = threading.Thread(target=_listen,
                                 args=(c, addr, fs_init, fs_next, tokenizer,
                                       detokenizer, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
def main(model, pklmodel, dictionary, dictionary_target, source_file, saveto, ck=5, wk=5, k=20, normalize=False, n_process=5, chr_level=False, jointProb=False, show_boundary=False): print 'load model model_options' with open('%s' % pklmodel, 'rb') as f: options = pkl.load(f) print 'load source dictionary and invert' with open(dictionary, 'rb') as f: word_dict = pkl.load(f) word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'load target dictionary and invert' with open(dictionary_target, 'rb') as f: word_dict_trg = pkl.load(f) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' # utility function def _seqs2words(caps, boundary, chunk): capsw = [] for cc, bb, ch in zip(caps, boundary, chunk): ww = [] for w, b, c in zip(cc, bb, ch): if w == 0: continue # if w == -10000: # ww.append('| NOTEND') # continue elif w < 0: # ww.append('|' + str(w)) continue if show_boundary: if b == 1.0: ww.append('|') ww.append(word_idict_trg[w]) capsw.append(' '.join(ww)) return capsw def _seqs2wordsByChunk(caps, boundary, chunk): capsw = [] for cc, bb, ch in zip(caps, boundary, chunk): ww = [] for w, b, c in zip(cc, bb, ch): if w == 0: continue # if w == -10000: # ww.append('| NOTEND') # continue elif w < 0: # ww.append('|' + str(w)) continue if b == 1.0: ww.append('| ' + str(c)) ww.append(word_idict_trg[w]) capsw.append(' '.join(ww)) return capsw def _send_jobs(fname): retval = [] with open(fname, 'r') as f: for idx, line in enumerate(f): if chr_level: words = list(line.decode('utf-8').strip()) else: words = line.strip().split() x = map(lambda w: word_dict[w] if w in word_dict else 1, words) x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) x += [0] retval.append(x) return retval print 'Translating ', source_file, '...' 
print 'look up table' n_samples = _send_jobs(source_file) from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) # word index f_init, f_next_chunk, f_next_word = build_sampler(tparams, options, trng, use_noise) def _translate(seq): be_stochastic = False # sample given an input sequence and obtain scores sample, boundary, chunk, score = gen_sample(tparams, f_init, f_next_chunk, f_next_word, numpy.array(seq).reshape( [len(seq), 1]), options, trng=trng, maxlen=200, k_chunk=ck, k_word=wk, k=k, stochastic=be_stochastic, argmax=True, jointProb=False) if be_stochastic: return sample # normalize scores according to sequence lengths if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths # print 'score', score # print 'candidates', sample sidx = numpy.argmin(score) return sample[sidx], boundary[sidx], chunk[sidx] ys = [] yb = [] yc = [] idx = 0 for x in n_samples: y, y_boundary, y_chunk = _translate(x) ys.append(y) yb.append(y_boundary) yc.append(y_chunk) print idx idx += 1 # print ys # print yb trans = _seqs2words(ys, yb, yc) trans_chunk = _seqs2wordsByChunk(ys, yb, yc) with open(saveto, 'w') as f: print >> f, '\n'.join(trans) with open(saveto + 'chunk', 'w') as f: print >> f, '\n'.join(trans_chunk) print 'Done'
def main(model, dictionary, dictionary_target, source_file, saveto, k=5, normalize=False, n_process=5, chr_level=False): # load model model_options with open('%s.pkl' % model, 'rb') as f: options = pkl.load(f) # load source dictionary and invert with open(dictionary, 'rb') as f: word_dict = pkl.load(f) word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' # load target dictionary and invert with open(dictionary_target, 'rb') as f: word_dict_trg = pkl.load(f) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' # utility function def _seq2words(cc): ww = [] for w in cc: if w == 0: break ww.append(word_idict_trg[w]) return ' '.join(ww) #init model from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) params = init_params(options) params = load_params(model, params) tparams = init_tparams(params) # word index f_init, f_next = build_sampler(tparams, options, trng) trans = [] att = [] print 'Translating ', source_file, '...' fo = open(saveto,'w') fa = open(saveto+'.att','w') with open(source_file, 'r') as f: n = 0 for line in f: n += 1 if n%10 == 0: print n if chr_level: words = list(line.decode('utf-8').strip()) else: words = line.strip().split() x = map(lambda w: word_dict[w] if w in word_dict else 1, words) x = map(lambda ii: ii if ii < options['n_words'] else 1, x) x += [0] y,a = _translate(x, tparams, f_init, f_next, options, trng, k, normalize) trans.append(y) att.append(a) print >>fo,_seq2words(y) print _seq2words(y) for i,e in enumerate(a): for j,p in enumerate(e): print >>fa,'{}-{}-{}'.format(i,j,p), print >>fa print 'Done'
def translate_model(queue, rqueue, pid, models, options, k,
                    normalization_alpha, verbose, nbest, return_alignment,
                    suppress_unk, return_hyp_graph, deviceid):
    """Worker loop: build samplers for ``models`` and translate queued jobs.

    Reads ``(idx, x)`` requests from ``queue`` until a ``None`` sentinel is
    received, and puts ``(idx, result)`` on ``rqueue`` for each request.

    Args:
        queue: input queue of ``(idx, x)`` requests; ``None`` stops the loop.
        rqueue: output queue receiving ``(idx, translation)`` tuples.
        pid: identifier used only in the verbose log line.
        models: model parameter files, paired element-wise with ``options``.
        options: per-model option objects passed to ``build_sampler``.
        k: forwarded to ``gen_sample`` as ``k``.
        normalization_alpha: when truthy, each score is divided by
            ``len(sample) ** normalization_alpha``.
        verbose: when true, log ``pid - idx`` to stderr for every request.
        nbest: when true, return all hypotheses instead of the best one.
        return_alignment: forwarded to ``build_sampler`` and ``gen_sample``.
        suppress_unk: forwarded to ``gen_sample``.
        return_hyp_graph: forwarded to ``gen_sample``.
        deviceid: when non-empty, written into ``THEANO_FLAGS`` as the
            ``device`` entry before the theano imports below run.
    """
    # if the --device-list argument is set
    if deviceid != '':
        import os
        device_flag = '%s=%s' % ('device', deviceid)
        # THEANO_FLAGS may be unset; treat that as "no flags" instead of
        # failing with KeyError, and drop empty entries so the joined value
        # never starts with a stray comma.
        theano_flags = [flag
                        for flag in os.environ.get('THEANO_FLAGS', '').split(',')
                        if flag.strip()]
        for i, flag in enumerate(theano_flags):
            if flag.strip().startswith('device'):
                # replace the first existing device entry
                theano_flags[i] = device_flag
                break
        else:
            theano_flags.append(device_flag)
        os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    from theano_util import (floatX, numpy_floatX, load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy_floatX(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters (skipping keys that start with 'adam_')
        # and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=return_alignment)

        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        """Sample for one input and return the n-best list or the best hypothesis."""
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment, hyp_graph = gen_sample(
            fs_init, fs_next,
            numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk,
            return_hyp_graph=return_hyp_graph)

        # normalize scores according to sequence lengths
        if normalization_alpha:
            adjusted_lengths = numpy.array(
                [len(s) ** normalization_alpha for s in sample])
            score = score / adjusted_lengths
        if nbest:
            return sample, score, word_probs, alignment, hyp_graph
        # lowest score wins
        sidx = numpy.argmin(score)
        return (sample[sidx], score[sidx], word_probs[sidx],
                alignment[sidx], hyp_graph)

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
def main(model, dictionary, dictionary_target, source_file, saveto, k=5, pkl_file=None, normalize=False, output_attention=False): # load model model_options if pkl_file is None: pkl_file = model + '.pkl' with open(pkl_file, 'rb') as f: options = pkl.load(f) # load source dictionary and invert with open(dictionary, 'rb') as f: word_dict = pkl.load(f) # word2id word_idict = dict() # id2word for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' # load target dictionary and invert with open(dictionary_target, 'rb') as f: word_dict_trg = pkl.load(f) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' # create input and output queues for processes # utility function def _seqs2words(caps): capsw = [] for cc in caps: ww = [] for w in cc: if w == 0: break ww.append(word_idict_trg[w]) capsw.append(' '.join(ww)) return capsw def _send_jobs(fname): retval = [] retval_ori = [] with open(fname, 'r') as f: for idx, line in enumerate(f): words = line.strip().split() retval_ori.append(line.strip()) x = map(lambda w: word_dict[w] if w in word_dict else 1, words) x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) x += [0] retval.append(x) return retval, retval_ori print 'Translating ', source_file, '...' 
sys.stdout.flush() n_samples, n_samples_src = _send_jobs(source_file) from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # allocate model parameters # params = init_params(options) # load model parameters and set theano shared variables params = load_params(model) tparams = init_tparams(params) # word index f_init, f_next = build_sampler(tparams, options, trng, use_noise) def _translate(seq): # sample given an input sequence and obtain scores sample, score, att = gen_sample(tparams, f_init, f_next, numpy.array(seq).reshape([len(seq), 1]), options, trng=trng, k=k, maxlen=200, stochastic=False, argmax=False) # normalize scores according to sequence lengths if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths sidx = numpy.argmin(score) # return sample[sidx], att[sidx] return sample[sidx], numpy.array(att[sidx]) def _output_attention(sent_idx, att): dirname = saveto + '.attention' if not os.path.exists(dirname): os.mkdir(dirname) with open(dirname + '/' + str(sent_idx), 'w') as fp: fp.write("%d %d\n" % (att.shape[0], att.shape[1])) for row in att: fp.write( str(row.argmax()) + " " + ' '.join([str(x) for x in row]) + '\n') # translation ys = [] atts = [] idx = 0 for x in n_samples: y, att = _translate(x) ys.append(y) atts.append(att) print idx idx += 1 trans = _seqs2words(ys) # save with open(saveto, 'w') as f: print >> f, '\n'.join(trans) if output_attention: with open(saveto + '.att', 'w') as f: for idx, (x, y, att) in enumerate(zip(n_samples_src, trans, atts)): print >> f, ('%d ||| %s ||| 0 ||| %s ||| %d %d' % (idx, y, x, att.shape[1], att.shape[0])) for hehe in att: print >> f, ' '.join([str(x) for x in hehe]) print >> f print 'Done'
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False,messageOff=False):
    """Translate ``source_file`` sentence by sentence and write ``saveto``.

    Progress messages are printed unless ``messageOff`` is true.

    Args:
        model: parameter file path; options are read from ``<model>.pkl``
            or, if that file does not exist, from a name derived by cutting
            out the text between ``'.iter'`` and ``'.npz'``.
        dictionary: pickled source word -> id mapping.
        dictionary_target: pickled target word -> id mapping.
        source_file: text file with one source sentence per line.
        saveto: output file for the translations, one per line.
        k: forwarded to ``gen_sample`` as ``k``.
        normalize: forwarded to ``gen_sample`` and also used below to divide
            scores by hypothesis length.
        n_process: not used in this function.
        chr_level: when true, split lines into characters instead of words.
        messageOff: suppress progress messages when true.
    """
    if not messageOff:
        print 'load model model_options'
    if os.path.exists('%s.pkl' % model):
        with open('%s.pkl' % model, 'rb') as f:
            options = pkl.load(f)
    else:
        # Fallback: drop the '.iter...' part that precedes '.npz' and load
        # the options pickle for that base name instead.
        pklName = model[:model.index('.iter')]+model[model.index('.npz'):]
        with open('%s.pkl' % pklName, 'rb') as f:
            options = pkl.load(f)

    if not messageOff:
        print 'load source dictionary and invert'
    with open(dictionary, 'rb') as f:
        word_dict = pkl.load(f)
    # id -> word for the source side (not read again in this function)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    if not messageOff:
        print 'load target dictionary and invert'
    with open(dictionary_target, 'rb') as f:
        word_dict_trg = pkl.load(f)
    # id -> word for the target side, used to render the output
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _index2sens(caps):
        # Convert several id sequences to sentences; ids that are 0 or
        # negative are skipped rather than ending the sentence.
        # Not called within this function.
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    continue
                elif w < 0:
                    continue
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _seqs2sen(seqs):
        # Convert one id sequence to a sentence, skipping ids <= 0.
        sen = []
        for w in seqs:
            if w == 0:
                continue
            elif w < 0:
                continue
            sen.append(word_idict_trg[w])
        return ' '.join(sen)

    def _send_jobs(fname):# translate source sentence into indices
        # Returns (list of id sequences each ending in 0, list of raw lines).
        sourceIndices = []
        source = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()
                # unknown words and ids >= n_words_src are mapped to 1 (UNK)
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                x += [0]
                sourceIndices.append(x)
                # the raw line is stored unstripped
                source.append(line)
        return sourceIndices , source

    if not messageOff:
        print 'Translating ', source_file, '...'
        print 'Prepare data...',
    ret = _send_jobs(source_file)
    sourceIndices = ret[0]
    source = ret[1]

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng, use_noise)

    def _translate(seq):
        # sample given an input sequence and obtain translated result
        sampleData = gen_sample(tparams, f_init, f_next,
                                numpy.array(seq).reshape([len(seq), 1]),
                                options, trng=trng, k=k, maxlen=200,
                                return_attention=True, stochastic = False,
                                argmax = False, normalize = normalize)
        # gen_sample's result is indexed positionally:
        # 0 = samples, 1 = scores, 2 = attention record
        sample=sampleData[0]
        score=sampleData[1]
        attention_record=sampleData[2]

        # normalize scores according to sequence lengths
        # NOTE(review): `normalize` is also passed to gen_sample above;
        # confirm the scores are not length-normalized twice.
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        # the hypothesis with the lowest score is selected
        sidx = numpy.argmin(score)
        if attention_record is None:
            attention=None
        else:
            attention=attention_record[sidx]
        return sample[sidx], attention

    trans = []
    idx = 0
    if not messageOff:
        print 'Done, translating...'

    for x , sSen in zip(sourceIndices , source):
        transData = _translate(x)
        y=transData[0]
        attention=transData[1]
        if not messageOff:
            print 'Sen ',idx, ':',sSen # source sentence
        y = _seqs2sen(y)
        trans.append(y)
        if not messageOff:
            print 'translation:', y # translation result
            print 'attention:'
            # if attention is not None: print_matrix(attention)
        idx += 1

    with open(saveto, 'w') as f:
        print >>f, '\n'.join(trans)
    print 'Done'
def main(model, dictionary, dictionary_target, source_file, saveto, k=5, batch_size = 1, opt_base=None, normalize=False, output_attention=False): trng = RandomStreams(1234) use_noise = shared(numpy.float32(0.)) #load params if opt_base is None: options = load_config(model) else: options = load_config(opt_base) param_list = numpy.load(model).files param_list = dict.fromkeys( [key for key in param_list if not key.startswith('adam_')], 0) params = load_params(model, param_list, '') tparams = init_theano_params(params) #load dictionary if dictionary is None: dictionary = options['dictionaries'][0] word_dict = load_dict(dictionary) if options['n_words_src']: for key, idx in word_dict.items(): if idx >= options['n_words_src']: del word_dict[key] word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' if dictionary_target is None: dictionary_target = options['dictionaries'][1] word_dict_trg = load_dict(dictionary_target) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' def _send_jobs(fname): retval = [] retval_ori = [] with open(fname, 'r') as f: for idx, line in enumerate(f): words = line.strip().split() if len(words) == 0: continue retval_ori.append(line.strip()) x = map(lambda w: word_dict[w] if w in word_dict else 1, words) x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x) retval.append(x) logging.info('total %s sentences' % len(retval)) return retval, retval_ori sources, sources_ori = _send_jobs(source_file) batches = [] for i in range(len(sources) / batch_size): batches.append(prepare_data(sources[i * batch_size: (i + 1) * batch_size])) if (i + 1) * batch_size < len(sources): batches.append(prepare_data(sources[(i + 1) * batch_size: ])) final_sentences = [] f_init, f_next = build_sampler(tparams, options, use_noise, trng) for batch in batches: samples, scores, word_probs, _, _ = 
gen_sample([f_init], [f_next], batch[0], trng=trng, k=k, maxlen=200, stochastic=False, argmax=False) if normalize: lengths = numpy.array([len(s) for s in samples]) scores = scores / lengths final_words = samples[numpy.argmin(scores)] final_sentences.append(' '.join([word_idict_trg[w] for w in final_words]) + '\n') with open(saveto, 'w') as fout: for sentence in final_sentences: fout.write(sentence) print 'Done'
def main(model, bn_model, dictionary_target, fea, latex, saveto, output, k=5): # load model model_options with open('%s.pkl' % model, 'rb') as f: options = pkl.load(f) # load source dictionary and invert worddicts = load_dict(dictionary_target) worddicts_r = [None] * len(worddicts) for kk, vv in worddicts.iteritems(): worddicts_r[vv] = kk valid, valid_uid_list = dataIterator(fea, latex, worddicts, batch_size=1, batch_Imagesize=500000, maxlen=500, maxImagesize=500000) trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # allocate model parameters params = init_params(options) bn_params = init_bn_params(options) # load model parameters and set theano shared variables params = load_params(model, params) bn_params = load_params(bn_model, bn_params) tparams = init_tparams(params) bn_tparams = init_tparams(bn_params) f_init, f_next = build_sampler(tparams, bn_tparams, options, trng, use_noise) use_noise.set_value(0.) fpp_sample = open(saveto, 'w') valid_count_idx = 0 # FIXME: random selection? print 'Decoding ... ' for x, y in valid: for xx in x: print '%d : %s' % (valid_count_idx + 1, valid_uid_list[valid_count_idx]) xx_pad = numpy.zeros( (xx.shape[0], xx.shape[1], xx.shape[2]), dtype='float32') # input_channels * height * width xx_pad[:, :, :] = xx / 255. 
stochastic = False sample, score = gen_sample(f_init, f_next, xx_pad[None, :, :, :], options, trng=trng, k=10, maxlen=1000, stochastic=stochastic, argmax=False) if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] fpp_sample.write(valid_uid_list[valid_count_idx]) valid_count_idx = valid_count_idx + 1 for vv in ss: if vv == 0: # <eol> break fpp_sample.write(' ' + worddicts_r[vv]) fpp_sample.write('\n') fpp_sample.close() print 'test set decode done' os.system('python compute-wer.py ' + saveto + ' ' + latex + ' ' + output) fpp = open(output) # %WER 31.63 stuff = fpp.readlines() fpp.close() m = re.search('WER (.*)\n', stuff[0]) valid_per = 100. * float(m.group(1)) m = re.search('ExpRate (.*)\n', stuff[1]) valid_sacc = 100. * float(m.group(1)) print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
def main(model, dictionary_target, source_fea, source_latex, saveto, wer_file, k=5): # load model model_options with open('%s.pkl' % model, 'rb') as f: options = pkl.load(f) # load source dictionary and invert worddicts = load_dict(dictionary_target) worddicts_r = [None] * len(worddicts) for kk, vv in worddicts.iteritems(): worddicts_r[vv] = kk valid, valid_uid_list = dataIterator_valid(source_fea, source_latex, worddicts, batch_size=1, maxlen=2000) trng = RandomStreams(1234) params = init_params(options) params = load_params(model, params) tparams = init_tparams(params) f_init, f_next = build_sampler(tparams, options, trng) fpp_sample = open(saveto, 'w') valid_count_idx = 0 print 'Decoding...' ud_epoch = 0 ud_epoch_start = time.time() for x, y in valid: for xx in x: print '%d : %s' % (valid_count_idx + 1, valid_uid_list[valid_count_idx]) xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]), dtype='float32') xx_pad[:xx.shape[0], :] = xx stochastic = False sample, score = gen_sample(f_init, f_next, xx_pad[:, None, :], options, trng=trng, k=k, maxlen=1000, stochastic=stochastic, argmax=False) if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] fpp_sample.write(valid_uid_list[valid_count_idx]) valid_count_idx = valid_count_idx + 1 for vv in ss: if vv == 0: # <eol> break fpp_sample.write(' ' + worddicts_r[vv]) fpp_sample.write('\n') fpp_sample.close() ud_epoch = (time.time() - ud_epoch_start) / 60. print 'test set decode done, cost time ...', ud_epoch os.system('python compute-wer.py ' + saveto + ' ' + source_latex + ' ' + wer_file) fpp = open(wer_file) stuff = fpp.readlines() fpp.close() m = re.search('WER (.*)\n', stuff[0]) valid_per = 100. * float(m.group(1)) m = re.search('ExpRate (.*)\n', stuff[1]) valid_sacc = 100. * float(m.group(1)) print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
def main(model_files, dictionary_target, grammar_target, data_path, saveto,
         wer_file, k=5):
    """Decode with an ensemble of models and score each of the k beam ranks.

    For every sample the k hypotheses are ranked by length-normalized score;
    rank ``beam`` is written to ``<saveto>.<beam>``.  After decoding,
    ``compute-wer.py`` is run once per rank and the parsed WER / ExpRate is
    printed.

    Args:
        model_files: parameter files; options come from ``<file>.pkl``.
        dictionary_target: dictionary path passed to ``load_dict``.
        grammar_target: grammar source passed to ``loadGrammar``.
        data_path: data location passed to ``dataIterator_valid``; the
            reference file is ``<data_path>/caption.txt``.
        saveto: prefix of the per-rank output files.
        wer_file: file written by ``compute-wer.py`` and parsed here.
        k: passed to ``gen_sample`` and used as the number of output files.
    """
    # load dictionary and invert it into a list indexed by id
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    grammar=compileGrammar(loadGrammar(grammar_target,worddicts))

    trng = RandomStreams(1234)

    # each entry is (f_init, f_next, options, <number>); the last element is
    # presumably an ensemble weight -- TODO confirm against gen_sample
    models=[]
    # load model model_options
    for model_file in model_files:
        print('Loading model: %s' % model_file)
        with open('%s.pkl' % model_file, 'rb') as f:
            options = pkl.load(f)
        print(options)
        params = init_params(options)
        params = load_params(model_file, params)
        tparams = init_tparams(params)
        f_init, f_next = build_sampler(tparams, options, trng)
        models.append((f_init,f_next,options,0.8))
    # The list of language-model files is empty, so this loop never runs.
    for lm_file in []:
        print('Loading language model: %s' % lm_file)
        f_init,f_next,options=load_language_model(lm_file)
        models.append((f_init,f_next,options,0.2))

    valid,valid_uid_list = dataIterator_valid(data_path, worddicts, batch_size=1, maxlen=250)

    # one output file per beam rank: <saveto>.0 ... <saveto>.(k-1)
    fpp_sample=[open('%s.%d'%(saveto,beam),'w') for beam in range(k)]

    valid_count_idx=0

    print('Decoding...')
    ud_epoch = 0
    ud_epoch_start = time.time()

    for x,y in valid:
        for xx in x:
            print('%d : %s' % (valid_count_idx+1, valid_uid_list[valid_count_idx]))
            # copy the input and append one trailing all-zero row
            xx_pad = numpy.zeros((xx.shape[0]+1,xx.shape[1]), dtype='float32')
            xx_pad[:xx.shape[0],:] = xx
            stochastic = False
            sample, score = gen_sample(models, xx_pad[:, None, :], grammar,
                                       trng=trng, k=k, maxlen=250,
                                       dictlen=len(worddicts),
                                       stochastic=stochastic, argmax=False)
            # length-normalize and rank ascending (lowest score first)
            score = score / numpy.array([len(s) for s in sample])
            sample_rank=numpy.argsort(score)
            for beam in range(k):
                fpp_sample[beam].write(valid_uid_list[valid_count_idx])
                if len(sample)>beam:
                    ss=sample[sample_rank[beam]]
                else:
                    # fewer than k hypotheses: write the uid with no words
                    ss=[0]

                for vv in ss:
                    if vv == 0: # <eol>
                        break
                    fpp_sample[beam].write(' '+worddicts_r[vv])
                fpp_sample[beam].write('\n')
            valid_count_idx=valid_count_idx+1

    # elapsed decoding time in seconds
    ud_epoch = (time.time() - ud_epoch_start)
    print 'test set decode done, cost time ...', ud_epoch
    for beam in range(k):
        # close the rank's file before the scoring script reads it
        fpp_sample[beam].flush();
        fpp_sample[beam].close();
        # wer_file is rewritten and re-parsed for every rank
        os.system('python compute-wer.py %s.%d %s %s'%(saveto,beam,os.path.join(data_path,"caption.txt"),wer_file))
        fpp=open(wer_file)
        stuff=fpp.readlines()
        fpp.close()
        # first report line carries WER, second carries ExpRate
        m=re.search('WER (.*)\n',stuff[0])
        valid_per=100. * float(m.group(1))
        m=re.search('ExpRate (.*)\n',stuff[1])
        valid_sacc=100. * float(m.group(1))

        print '%d Valid WER: %.2f%%, ExpRate: %.2f%%' % (beam,valid_per,valid_sacc)
def translate_model(queue, rqueue, mask_left, mask_right, write_mask, pid, model, options, k, normalize): from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) # word index f_init, f_next = build_sampler(tparams, options, trng, use_noise) def _translate(seq, left, right, write): # sample given an input sequence and obtain scores print left.shape, right.shape, write.shape, len(seq) sample, score = gen_sample(tparams, f_init, f_next, numpy.array(seq).reshape([len(seq), 1]), left[:, :, None], right[:, :, None], write, options, trng=trng, k=k, maxlen=200, stochastic=False, argmax=False) # normalize scores according to sequence lengths if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths sidx = numpy.argmin(score) return sample[sidx] while True: req = queue.get() if req is None: break rem_l = mask_left.get() rem_r = mask_right.get() rem_w = write_mask.get() idx, x = req[0], req[1] l = rem_l[1] r = rem_r[1] w = rem_w[1] print pid, '-', idx seq = _translate(x, l, r, w) rqueue.put((idx, seq)) return
def main(model, dictionary, dictionary_target, source_file, saveto, k=5, normalize=False, n_process=5, chr_level=False): # load model model_options with open('%s.pkl' % model, 'rb') as f: options = pkl.load(f) # load source dictionary and invert with open(dictionary, 'rb') as f: word_dict = pkl.load(f) word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' # load target dictionary and invert with open(dictionary_target, 'rb') as f: word_dict_trg = pkl.load(f) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams trng = RandomStreams(1234) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) # word index f_init, f_next = build_sampler(tparams, options, trng) def _translate(seq): # sample given an input sequence and obtain scores sample, score = gen_sample(tparams, f_init, f_next, numpy.array(seq).reshape([len(seq), 1]), options, trng=trng, k=k, maxlen=200, stochastic=False, argmax=False) # normalize scores according to sequence lengths if normalize: lengths = numpy.array([len(s) for s in sample]) score = score / lengths sidx = numpy.argmin(score) return sample[sidx] # utility function def _seqs2words(caps): capsw = [] for cc in caps: ww = [] for w in cc: if w == 0: break ww.append(word_idict_trg[w]) capsw.append(' '.join(ww)) return capsw translations = [] print "start Translating..." with open(source_file, 'r') as f: for idx, line in enumerate(f): if idx % 20 == 0: print "%s lines done!" 
% idx if chr_level: words = list(line.decode('utf-8').strip()) else: words = line.strip().split() x = map(lambda w: word_dict[w] if w in word_dict else 1, words) x = map(lambda ii: ii if ii < options['n_words'] else 1, x) x += [0] translation = _translate(x) translations.append(" ".join(_seqs2words([translation]))) with open(saveto, 'w') as f: print >> f, '\n'.join(translations) print "Finish Translating!"