def _load_theano(self):
    """
    Loads models, sets theano shared variables and builds samplers.
    This entails irrevocable binding to a specific GPU.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared

    from nmt import (build_sampler, gen_sample)
    from theano_util import (numpy_floatX, load_params, init_theano_params)

    trng = RandomStreams(1234)
    use_noise = shared(numpy_floatX(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(self._models, self._options):
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # always return alignment at this point
        f_init, f_next = build_sampler(
            tparams, option, use_noise, trng, return_alignment=True)

        fs_init.append(f_init)
        fs_next.append(f_next)

    return trng, fs_init, fs_next, gen_sample
def load_scorer(model, option, alignweights=None):
    # load model parameters and set theano shared variables
    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list)
    tparams = init_theano_params(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, option)
    inps = [x, x_mask, y, y_mask]
    use_noise.set_value(0.)

    if alignweights:
        logging.debug("Save weight mode ON, alignment matrix will be saved.")
        outputs = [cost, opt_ret['dec_alphas']]
        f_log_probs = theano.function(inps, outputs)
    else:
        f_log_probs = theano.function(inps, cost)

    return f_log_probs
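# Hedged usage sketch (added for illustration; not part of the original file):
# how the compiled scorer is typically invoked. The checkpoint path is a
# placeholder, and `load_config`/`prepare_data` are assumed to be available
# from the surrounding codebase with their usual signatures.
def _example_load_scorer_usage():
    option = load_config('model.npz')  # hypothetical checkpoint
    f_log_probs = load_scorer('model.npz', option)
    # prepare_data pads and masks a minibatch of source/target index sequences
    x, x_mask, y, y_mask = prepare_data([[3, 4, 5]], [[6, 7]])
    return f_log_probs(x, x_mask, y, y_mask)  # per-sentence negative log-probs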
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose,
                    nbest, return_alignment, suppress_unk, return_hyp_graph):

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=return_alignment)
        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment, hyp_graph = gen_sample(
            fs_init, fs_next,
            numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk,
            return_hyp_graph=return_hyp_graph)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if nbest:
            return sample, score, word_probs, alignment, hyp_graph
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], \
                alignment[sidx], hyp_graph

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose,
                    nbest, return_alignment, suppress_unk):

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=return_alignment)
        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment = gen_sample(
            fs_init, fs_next,
            numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if nbest:
            return sample, score, word_probs, alignment
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
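# Hedged usage sketch (illustration only; not in the original file): the worker
# above is driven through multiprocessing queues, as the main() functions
# elsewhere in this codebase do. Jobs are (idx, x) tuples; a None sentinel per
# worker shuts the pool down. `models` and `options` are placeholders.
def _example_spawn_translate_workers(models, options, n_process=2):
    from multiprocessing import Process, Queue
    queue, rqueue = Queue(), Queue()
    processes = [Process(target=translate_model,
                         args=(queue, rqueue, midx, models, options,
                               5,       # beam size k
                               True,    # normalize
                               False,   # verbose
                               False,   # nbest
                               False,   # return_alignment
                               False))  # suppress_unk
                 for midx in range(n_process)]
    for p in processes:
        p.start()
    return queue, rqueue, processes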
def _load_theano(self):
    """
    Loads models, sets theano shared variables and builds samplers.
    This entails irrevocable binding to a specific GPU.
    """
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared

    from nmt import (build_sampler, build_multi_sampler, gen_sample)
    from theano_util import (numpy_floatX, load_params, init_theano_params)

    trng = RandomStreams(1234)
    use_noise = shared(numpy_floatX(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(self._models, self._options):
        # check compatibility with multi-source
        if option["multisource_type"] is not None and len(option['extra_sources']) == 0:
            logging.error("This model is multi-source but no auxiliary source file was provided.")
            sys.exit(1)
        elif option["multisource_type"] is None and len(option['extra_sources']) != 0:
            logging.warn("You provided an auxiliary input but this model is not multi-source. Ignoring extra input.")

        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # always return alignment at this point
        if option['multisource_type'] is not None:
            f_init, f_next = build_multi_sampler(tparams, option, use_noise, trng,
                                                 return_alignment=True)
        else:
            f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                           return_alignment=True)

        fs_init.append(f_init)
        fs_next.append(f_next)

    return trng, fs_init, fs_next, gen_sample
def load_scorer(model, option, alignweights=None):
    # load model parameters and set theano shared variables
    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list)
    tparams = init_theano_params(params)

    # compatibility with multi-source
    if 'extra_sources' not in option:
        option['extra_sources'] = []
    if 'multisource_type' not in option:
        option['multisource_type'] = None

    #if 'multisource_type' not in option or option['multisource_type'] is None:
    #    print("building single source model")
    #    trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost = build_model(tparams, option)
    #    inps = [x, x_mask, y, y_mask]
    #else:
    trng, use_noise, xs, x_masks, y, y_mask, opt_ret, cost = build_multisource_model(
        tparams, option)
    #inps = [xs[0], x_masks[0], xs[1], x_masks[1], y, y_mask]
    # list of inputs: (x, x_mask) for each source, then the target pair
    inps = [z for (x, x_mask) in zip(xs, x_masks) for z in (x, x_mask)] + [y, y_mask]

    use_noise.set_value(0.)

    if alignweights:
        logging.debug("Save weight mode ON, alignment matrix will be saved.")
        outputs = [cost]
        if option['multisource_type'] == 'init-decoder':
            extra_encoders = 0
        else:
            extra_encoders = len(option['extra_sources'])
        for i in range(extra_encoders + 1):
            outputs.append(opt_ret['dec_alphas' + str(i)])
        outputs.append(opt_ret['cost_per_word'])
        #if 'multisource_type' not in option or option['multisource_type'] is None:
        #    outputs = [cost, opt_ret['dec_alphas0'], opt_ret['cost_per_word']]
        #else:
        #    outputs = [cost, opt_ret['dec_alphas0'], opt_ret['dec_alphas1'], opt_ret['cost_per_word']]
        f_log_probs = theano.function(inps, outputs)
    else:
        f_log_probs = theano.function(inps, [cost, opt_ret['cost_per_word']])

    return f_log_probs
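# Hedged sketch (added for illustration; not in the original file): calling the
# multi-source scorer with the same input ordering used when it was compiled,
# i.e. (x, x_mask) per encoder followed by the target pair. All tensor names
# are placeholders; the two-output unpacking matches the non-alignweights
# branch above.
def _example_multisource_score(f_log_probs, xs, x_masks, y, y_mask):
    # mirror the interleaving built by the list comprehension in load_scorer
    args = [z for (x, x_mask) in zip(xs, x_masks) for z in (x, x_mask)] + [y, y_mask]
    cost, cost_per_word = f_log_probs(*args)
    return cost, cost_per_word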
def rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs, prepare_data,
                                          options[i], pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    if alignweights:
        ### opening the temporary file.
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, \
            tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out:
        for line in nbest_lines:
            linesplit = line.split(' ||| ')
            idx = int(linesplit[0])  # index into the source file, starting from 0
            tmp_in.write(lines[idx])
            tmp_out.write(linesplit[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)
        pairs = TextIterator(
            tmp_in.name, tmp_out.name,
            options[0]['dictionaries'][:-1],
            options[0]['dictionaries'][-1],
            n_words_source=options[0]['n_words_src'],
            n_words_target=options[0]['n_words'],
            batch_size=b,
            maxlen=float('inf'),
            sort_by_length=False)
        # TODO: sorting by length could be more efficient, but we'd have to
        # synchronize scores with the n-best list afterwards

        scores, alignments = _score(pairs, alignweights)

        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str, [s[i] for s in scores]))
            saveto.write('{0} {1}\n'.format(line.strip(), score_str))

        ### optional save weights mode.
        if alignweights:
            for line in alignments:
                align_OUT.write(line + "\n")

    if alignweights:
        combine_source_target_text(source_file, nbest_file, saveto.name, align_OUT)
        align_OUT.close()
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        map_decay_c=0.,  # L2 regularization penalty towards original weights
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=None,  # source vocabulary size
        n_words_tgt=None,  # target vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
        valid_datasets=['../data/dev/newstest2011.en.tok',
                        '../data/dev/newstest2011.fr.tok'],
        dictionaries=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
        use_dropout=False,
        dropout_embedding=0.2,  # dropout for input embeddings (0: no dropout)
        dropout_hidden=0.5,  # dropout for hidden layers (0: no dropout)
        dropout_source=0,  # dropout source words (0: no dropout)
        dropout_target=0,  # dropout target words (0: no dropout)
        reload_=False,
        overwrite=False,
        external_validation_script=None,
        shuffle_each_epoch=True,
        sort_by_length=True,
        maxibatch_size=20,  # how many minibatches to load at one time
        model_version=0.1):

    # collect the local arguments as model options
    model_options = locals().copy()
    print 'Model options:', model_options

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # if the vocabulary sizes are not set, default to the dictionary sizes
    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words_tgt is None:
        n_words_tgt = len(worddicts[1])
        model_options['n_words_tgt'] = n_words_tgt

    # load the data
    print 'Loading data ...'
    train = TextIterator(datasets[0], datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=batch_size,
                         maxlen=maxlen,
                         shuffle_each_epoch=shuffle_each_epoch,
                         sort_by_length=sort_by_length,
                         maxibatch_size=maxibatch_size)
    valid = TextIterator(valid_datasets[0], valid_datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    # initialize the model parameters
    print 'Init parameters ...'
    params = init_params(model_options)

    # reload the model, so training can resume after an unexpected interruption
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)

    # turn the network's weights and biases into theano shared variables
    tparams = init_theano_params(params)

    # build the model
    print 'Building model ...'
    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    # build the sampler
    if validFreq or sampleFreq:
        print 'Building sampler ...'
        f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c), name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value) ** 2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    f_alpha = theano.function(inps, opt_ret['dec_alphas'])  # alphas
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost,
                                              profile=profile)
    print 'Done'

    # start optimization
    print 'Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []

    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    valid_err = None

    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # prepare the data for training; sentences longer than maxlen
            # have been filtered out, so x is None for an empty minibatch
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words_tgt)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # plot the word alignment matrix
            #print f_alpha(x, x_mask, y, y_mask).shape
            """
            x_word = [worddicts_r[0][idx] for idx in x[:, 0]]
            y_word = [worddicts_r[1][idx] for idx in y[:, 0]]
            print len(x_word), x_word
            print len(y_word), y_word
            shape = f_alpha(x, x_mask, y, y_mask).shape
            for i in range(shape[1]):
                # print sum(f_alpha(x, x_mask, y, y_mask)[i, 0, :])
                mx = sum(y_mask[:, i])
                my = sum(x_mask[:, i])
                align_matrix = f_alpha(x, x_mask, y, y_mask)[:, i, :][0:mx, 0:my]
                align_shape = align_matrix.shape
                scale_ = 20  # image scale
                out_matrix = numpy.ones([scale_ * align_shape[0], scale_ * align_shape[1]])
                for j in range(align_shape[0]):
                    for k in range(align_shape[1]):
                        out_matrix[j * scale_:(j + 1) * scale_, k * scale_:(k + 1) * scale_] *= align_matrix[j, k]
                plt.imshow(100 * out_matrix, plt.cm.gray)
                plt.pause(1)
                plt.show()
            sys.exit(0)
            """

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)

                numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
                json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)
                print 'Done'

                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip_from_theano(tparams))
                    print 'Done'

            # generate some samples with the model and display them
            if sampleFreq and numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score, sample_word_probs, alignment = gen_sample(
                        [f_init], [f_next], x[:, jj][:, None],
                        trng=trng, k=1, maxlen=30,
                        stochastic=stochastic, argmax=False,
                        suppress_unk=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print worddicts_r[-1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print worddicts_r[-1][vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                                   model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

                if external_validation_script:
                    print "Calling external validation script"
                    print 'Saving model...',
                    params = unzip_from_theano(tparams)
                    # uidx is also saved at each validation
                    numpy.savez(saveto + '.dev', history_errs=history_errs,
                                uidx=uidx, **params)
                    json.dump(model_options,
                              open('%s.dev.npz.json' % saveto, 'wb'), indent=2)
                    print 'Done'
                    p = Popen([external_validation_script])

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                           model_options, valid)
        valid_err = valid_errs.mean()
        print 'Valid ', valid_err

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)

    numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs,
                uidx=uidx, **params)

    return valid_err
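# Hedged usage sketch (illustration only; not in the original file): a minimal
# invocation of train() above. All corpus, dictionary, and model paths are
# hypothetical; the remaining hyperparameters use values typical for this
# kind of setup.
def _example_train_call():
    return train(
        saveto='model.npz',
        datasets=['corpus.en.tok', 'corpus.fr.tok'],              # hypothetical
        valid_datasets=['dev.en.tok', 'dev.fr.tok'],              # hypothetical
        dictionaries=['corpus.en.tok.pkl', 'corpus.fr.tok.pkl'],  # hypothetical
        dim_word=500, dim=1024,
        optimizer='adadelta',
        batch_size=80, maxlen=50,
        validFreq=10000, saveFreq=30000, sampleFreq=10000,
        use_dropout=False, reload_=True)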
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         batch_size=1, opt_base=None, normalize=False, output_attention=False):
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # load params
    if opt_base is None:
        options = load_config(model)
    else:
        options = load_config(opt_base)

    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list, '')
    tparams = init_theano_params(params)

    # load dictionary
    if dictionary is None:
        dictionary = options['dictionaries'][0]
    word_dict = load_dict(dictionary)
    if options['n_words_src']:
        for key, idx in word_dict.items():
            if idx >= options['n_words_src']:
                del word_dict[key]
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    if dictionary_target is None:
        dictionary_target = options['dictionaries'][1]
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    def _send_jobs(fname):
        retval = []
        retval_ori = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                words = line.strip().split()
                if len(words) == 0:
                    continue
                retval_ori.append(line.strip())
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                retval.append(x)
        logging.info('total %s sentences' % len(retval))
        return retval, retval_ori

    sources, sources_ori = _send_jobs(source_file)

    # split the sentences into minibatches; the final, possibly smaller,
    # batch is appended after the loop
    batches = []
    num_full = len(sources) / batch_size
    for i in range(num_full):
        batches.append(prepare_data(sources[i * batch_size:(i + 1) * batch_size]))
    if num_full * batch_size < len(sources):
        batches.append(prepare_data(sources[num_full * batch_size:]))

    final_sentences = []
    f_init, f_next = build_sampler(tparams, options, use_noise, trng)
    for batch in batches:
        samples, scores, word_probs, _, _ = gen_sample(
            [f_init], [f_next], batch[0],
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False)
        if normalize:
            lengths = numpy.array([len(s) for s in samples])
            scores = scores / lengths
        final_words = samples[numpy.argmin(scores)]
        final_sentences.append(
            ' '.join([word_idict_trg[w] for w in final_words]) + '\n')

    with open(saveto, 'w') as fout:
        for sentence in final_sentences:
            fout.write(sentence)
    print 'Done'
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            logging.debug("Save weight mode ON, alignment matrix will be saved.")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(
                f_log_probs, prepare_data, options[i], pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(
        source_file.name, target_file.name,
        options[0]['dictionaries'][:-1],
        options[0]['dictionaries'][-1],
        n_words_source=options[0]['n_words_src'],
        n_words_target=options[0]['n_words'],
        batch_size=b,
        maxlen=float('inf'),
        sort_by_length=False)
    # TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for alignment in alignments:
                for line in alignment:
                    align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        factors=1,  # input factors
        dim_per_factor=None,  # list of word vector dimensionalities (one per factor): [250,200,50] for total dimensionality of 500
        encoder='gru',
        decoder='gru_cond',
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        map_decay_c=0.,  # L2 regularization penalty towards original weights
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=None,  # source vocabulary size
        n_words=None,  # target vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=('/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                  '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'),
        valid_datasets=('../data/dev/newstest2011.en.tok',
                        '../data/dev/newstest2011.fr.tok'),
        dictionaries=(
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'),
        use_dropout=False,
        dropout_embedding=0.2,  # dropout for input embeddings (0: no dropout)
        dropout_hidden=0.5,  # dropout for hidden layers (0: no dropout)
        dropout_source=0,  # dropout source words (0: no dropout)
        dropout_target=0,  # dropout target words (0: no dropout)
        reload_=False,
        overwrite=False,
        external_validation_script=None,
        shuffle_each_epoch=True,
        finetune=False,
        finetune_only_last=False,
        sort_by_length=True,
        use_domain_interpolation=False,
        domain_interpolation_min=0.1,
        domain_interpolation_inc=0.1,
        domain_interpolation_indomain_datasets=('indomain.en', 'indomain.fr'),
        maxibatch_size=20,  # how many minibatches to load at one time
        model_version=0.1,  # store version used for training for compatibility
):

    # Model options
    model_options = locals().copy()

    if model_options['dim_per_factor'] is None:
        if factors == 1:
            model_options['dim_per_factor'] = [model_options['dim_word']]
        else:
            sys.stderr.write('Error: if using factored input, you must specify \'dim_per_factor\'\n')
            sys.exit(1)

    # one dictionary per source factor + 1 for target factor
    assert len(dictionaries) == factors + 1
    # each factor embedding has its own dimensionality
    assert len(model_options['dim_per_factor']) == factors
    # dimensionality of factor embeddings sums up to total dimensionality of input embedding vector
    assert sum(model_options['dim_per_factor']) == model_options['dim_word']

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words is None:
        n_words = len(worddicts[1])
        model_options['n_words'] = n_words

    print('Loading data')
    domain_interpolation_cur = None
    if use_domain_interpolation:
        print('Using domain interpolation with initial ratio %s, increase rate %s' %
              (domain_interpolation_min, domain_interpolation_inc))
        domain_interpolation_cur = domain_interpolation_min
        train = DomainInterpolatorTextIterator(
            datasets[0], datasets[1],
            dictionaries[:-1], dictionaries[-1],
            n_words_source=n_words_src,
            n_words_target=n_words,
            batch_size=batch_size,
            maxlen=maxlen,
            shuffle_each_epoch=shuffle_each_epoch,
            sort_by_length=sort_by_length,
            indomain_source=domain_interpolation_indomain_datasets[0],
            indomain_target=domain_interpolation_indomain_datasets[1],
            interpolation_rate=domain_interpolation_cur,
            maxibatch_size=maxibatch_size)
    else:
        train = TextIterator(datasets[0], datasets[1],
                             dictionaries[:-1], dictionaries[-1],
                             n_words_source=n_words_src,
                             n_words_target=n_words,
                             batch_size=batch_size,
                             maxlen=maxlen,
                             skip_empty=True,
                             shuffle_each_epoch=shuffle_each_epoch,
                             sort_by_length=sort_by_length,
                             maxibatch_size=maxibatch_size)

    if valid_datasets and validFreq:
        valid = TextIterator(valid_datasets[0], valid_datasets[1],
                             dictionaries[:-1], dictionaries[-1],
                             n_words_source=n_words_src,
                             n_words_target=n_words,
                             batch_size=valid_batch_size,
                             maxlen=maxlen)
    else:
        valid = None

    comp_start = time.time()

    print('Building model')
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        print('Reloading model parameters')
        params = load_params(saveto, params)

    tparams = init_theano_params(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)

    inps = [x, x_mask, y, y_mask]

    if validFreq or sampleFreq:
        print('Building sampler')
        f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # before any regularizer
    print('Building f_log_probs...', )
    f_log_probs = theano.function(inps, cost, profile=profile)
    print('Done')

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                    name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value) ** 2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    # allow finetuning with fixed embeddings
    if finetune:
        updated_params = OrderedDict(
            [(key, value) for (key, value) in tparams.iteritems()
             if not key.startswith('Wemb')])
    # allow finetuning of only last layer (becomes a linear model training problem)
    elif finetune_only_last:
        updated_params = OrderedDict(
            [(key, value) for (key, value) in tparams.iteritems()
             if key in ['ff_logit_W', 'ff_logit_b']])
    else:
        updated_params = tparams

    print('Computing gradient...', )
    grads = tensor.grad(cost, wrt=itemlist(updated_params))
    print('Done')

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c ** 2),
                              g / tensor.sqrt(g2) * clip_c, g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print('Building optimizers...', )
    f_grad_shared, f_update = eval(optimizer)(lr, updated_params, grads, inps,
                                              cost, profile=profile)
    print('Done')

    print('Total compilation time: {0:.1f}s'.format(time.time() - comp_start))

    print('Optimization')

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []

    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    # save model options
    json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    valid_err = None

    last_disp_samples = 0
    ud_start = time.time()
    p_validation = None
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            last_disp_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # ensure consistency in number of factors
            if len(x) and len(x[0]) and len(x[0][0]) != factors:
                sys.stderr.write('Error: mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(factors, len(x[0][0])))
                sys.exit(1)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen)
            # n_words_src=n_words_src, n_words=n_words)  # TODO: why unused??

            if x is None:
                print('Minibatch with zero sample under length ', maxlen)
                uidx -= 1
                continue

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud = time.time() - ud_start
                wps = (last_disp_samples) / float(ud)
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost,
                      'UD ', ud, "{0:.2f} sentences/s".format(wps))
                ud_start = time.time()
                last_disp_samples = 0

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print('Saving the best model...', )
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx,
                            **params)
                print('Done')

                # save with uidx
                if not overwrite:
                    print('Saving the model at iteration {}...'.format(uidx), )
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip_from_theano(tparams))
                    print('Done')

            # generate some samples with the model and display them
            if sampleFreq and numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[2])):
                    stochastic = True
                    x_current = x[:, :, jj][:, :, None]

                    # remove padding
                    x_current = x_current[:, :x_mask[:, jj].sum(), :]

                    sample, score, sample_word_probs, alignment, hyp_graph = gen_sample(
                        [f_init], [f_next], x_current,
                        trng=trng, k=1, maxlen=30,
                        stochastic=stochastic, argmax=False,
                        suppress_unk=False, return_hyp_graph=False)
                    print('Source ', jj, ': ', )
                    for pos in range(x.shape[1]):
                        if x[0, pos, jj] == 0:
                            break
                        for factor in range(factors):
                            vv = x[factor, pos, jj]
                            if vv in worddicts_r[factor]:
                                sys.stdout.write(worddicts_r[factor][vv])
                            else:
                                sys.stdout.write('UNK')
                            if factor + 1 < factors:
                                sys.stdout.write('|')
                            else:
                                sys.stdout.write(' ')
                    print()
                    print('Truth ', jj, ' : ', )
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print(worddicts_r[-1][vv], )
                        else:
                            print('UNK', )
                    print()
                    print('Sample ', jj, ': ', )
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print(worddicts_r[-1][vv], )
                        else:
                            print('UNK', )
                    print()

            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                                   model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        if use_domain_interpolation and (domain_interpolation_cur < 1.0):
                            domain_interpolation_cur = min(
                                domain_interpolation_cur + domain_interpolation_inc, 1.0)
                            print('No progress on the validation set, increasing domain interpolation rate to %s and resuming from best params' % domain_interpolation_cur)
                            train.adjust_domain_interpolation_rate(domain_interpolation_cur)
                            if best_p is not None:
                                zip_to_theano(best_p, tparams)
                            bad_counter = 0
                        else:
                            print('Early Stop!')
                            estop = True
                            break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print('Valid ', valid_err)

                if external_validation_script:
                    print("Calling external validation script")
                    if p_validation is not None and p_validation.poll() is None:
                        print("Waiting for previous validation run to finish")
                        print("If this takes too long, consider increasing validation interval, reducing validation set size, or speeding up validation by using multiple processes")
                        valid_wait_start = time.time()
                        p_validation.wait()
                        print("Waited for {0:.1f} seconds".format(time.time() - valid_wait_start))
                    print('Saving model...', )
                    params = unzip_from_theano(tparams)
                    numpy.savez(saveto + '.dev', history_errs=history_errs,
                                uidx=uidx, **params)
                    json.dump(model_options,
                              open('%s.dev.npz.json' % saveto, 'wb'), indent=2)
                    print('Done')
                    p_validation = Popen([external_validation_script])

            # finish after this many updates
            if uidx >= finish_after:
                print('Finishing after %d iterations!' % uidx)
                estop = True
                break

        print('Seen %d samples' % n_samples)

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                           model_options, valid)
        valid_err = valid_errs.mean()

        print('Valid ', valid_err)

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)
    numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs,
                uidx=uidx, **params)

    return valid_err
def translate_model(queue, rqueue, pid, models, options, k, normalization_alpha,
                    verbose, nbest, return_alignment, suppress_unk,
                    return_hyp_graph, deviceid):

    # if the --device-list argument is set
    if deviceid != '':
        import os
        theano_flags = os.environ['THEANO_FLAGS'].split(',')
        exist = False
        for i in xrange(len(theano_flags)):
            if theano_flags[i].strip().startswith('device'):
                exist = True
                theano_flags[i] = '%s=%s' % ('device', deviceid)
                break
        if exist == False:
            theano_flags.append('%s=%s' % ('device', deviceid))
        os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    from theano_util import (floatX, numpy_floatX, load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy_floatX(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=return_alignment)
        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment, hyp_graph = gen_sample(
            fs_init, fs_next,
            numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk,
            return_hyp_graph=return_hyp_graph)

        # normalize scores according to sequence lengths
        if normalization_alpha:
            adjusted_lengths = numpy.array([len(s) ** normalization_alpha for s in sample])
            score = score / adjusted_lengths
        if nbest:
            return sample, score, word_probs, alignment, hyp_graph
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], \
                alignment[sidx], hyp_graph

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
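# Hedged usage sketch (illustration only; not in the original file): assigning
# each worker its own device through the deviceid argument, which rewrites
# THEANO_FLAGS before theano is imported in the child process. The device
# names are assumptions; THEANO_FLAGS must already be set in the environment.
def _example_device_list_workers(models, options, device_list=('cuda0', 'cuda1')):
    from multiprocessing import Process, Queue
    queue, rqueue = Queue(), Queue()
    processes = []
    for midx, deviceid in enumerate(device_list):
        p = Process(target=translate_model,
                    args=(queue, rqueue, midx, models, options,
                          12,      # beam size k
                          1.0,     # normalization_alpha
                          False,   # verbose
                          False,   # nbest
                          False,   # return_alignment
                          False,   # suppress_unk
                          False,   # return_hyp_graph
                          deviceid))
        p.start()
        processes.append(p)
    return queue, rqueue, processes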
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose,
                    nbest, return_alignment, suppress_unk, return_hyp_graph):

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    print models

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        #print param_list
        params = load_params(model, param_list)

        # output models in plain text
        numpy.set_printoptions(threshold='nan')
        #for kk, vv in params.iteritems():
        #    print kk
        #    print vv

        tparams = init_theano_params(params)

        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=return_alignment)
        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        sample, score, word_probs, alignment, hyp_graph = gen_sample(
            fs_init, fs_next,
            # factors, time-steps, n-sample
            numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,  # these two select the search method
            return_alignment=return_alignment,
            suppress_unk=suppress_unk,
            return_hyp_graph=return_hyp_graph)

        if normalize:
            # length normalization
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if nbest:
            # return the n-best list
            return sample, score, word_probs, alignment, hyp_graph
        else:
            # return the single best hypothesis
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], \
                alignment[sidx], hyp_graph

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        seq = _translate(x)

        rqueue.put((idx, seq))
def main(models, saveto, bpe_file, save_alignment=None, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False, return_hyp_graph=False):
    # load model options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']

    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    # CAN I MAKE IT INTO A SERVER?
    ###### The following functions should already be a part of serverisation

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f, processes, queue):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                w = [word_dicts[i][f] if f in word_dicts[i] else 1
                     for (i, f) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write('Error: expected {0} factors, but input word has {1}\n'.format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes(queue):
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples, processes, queue, rqueue):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    # if queue is empty after 5s, check if processes are still alive
                    resp = rqueue.get(True, 5)
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write("Error: translate worker process {0} crashed with exitcode {1}".format(processes[midx].pid, processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1), n_samples))
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1

    def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        #print(source_file_t[i])
        while source_file_t[0] != "EOT":
            for i in range(len(source_file_t)):
                # print source_file_t[i].decode('utf-8')
                #pipe = subprocess.Popen("echo " + source_file_t[i] + "| perl truecase.perl --model en-truecase.mdl", shell=True)
                #pipe = subprocess.Popen(["echo", '"' + source_file_t[i] + '"', "|", "perl", "truecase.perl", "--model",
                #                         "en-truecase.mdl"], stdout=subprocess.PIPE)
                #result = pipe.stdout.read()
                #print pipe.communicate()
                #print pipe
                #print pipe.stdout
                #print pipe.stdout.read()
                #print pipe.
                #print "Here"
                #print result
                #source_file_t[i] = subprocess.check_output()
                source_file_t[i] = bpe.segment(
                    tokenizer.tokenize(source_file_t[i], return_str=True)).strip()

            #print "Passed"
            print source_file_t
            detokenized = ''
            queue = Queue()
            rqueue = Queue()
            processes = [None] * n_process
            for midx in xrange(n_process):
                processes[midx] = Process(
                    target=translate_model,
                    args=(queue, rqueue, midx, models, options, k, normalize,
                          verbose, nbest, save_alignment is not None,
                          suppress_unk, return_hyp_graph, fs_init, fs_next))
                processes[midx].start()

            n_samples, source_sentences = _send_jobs(source_file_t, processes, queue)
            _finish_processes(queue)

            #### The model loading takes place in the head of for loop, prolly in _retrieve_jobs
            for i, trans in enumerate(_retrieve_jobs(n_samples, processes, queue, rqueue)):
                print "NEXT SENTENCE:"
                if nbest:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
                    order = numpy.argsort(scores)
                    for j in order:
                        if print_word_probabilities:
                            probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j])
                        else:
                            probs = ""
                        saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                            i, _seqs2words(samples[j]), scores[j], probs))

                        # print alignment matrix for each hypothesis
                        # header: sentence id ||| translation ||| score ||| source |||
                        # source_token_count+eos ||| translation_token_count+eos
                        if save_alignment is not None:
                            if a_json:
                                print_matrix_json(alignment[j], source_sentences[i],
                                                  _seqs2words(samples[j]).split(),
                                                  i, i + j, save_alignment)
                            else:
                                save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                    i, _seqs2words(samples[j]), scores[j],
                                    ' '.join(source_sentences[i]),
                                    len(source_sentences[i]) + 1, len(samples[j])))
                                print_matrix(alignment[j], save_alignment)
                else:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
                    ## TODO: Handle the output here
                    #print((_seqs2words(samples) + "\n").encode('utf-8'))
                    #text.append(_seqs2words(samples) + "\n")
                    x = _seqs2words(samples)
                    #print x[0].upper() + x[1:]
                    detokenized += detokenizer.detokenize(
                        (x.decode('utf-8') + " ").split(), return_str=True)
                    detokenized = detokenized[0].upper() + detokenized[1:]
                    #print "ref this"
                    #print detokenized
                    #detokenized[0] = detokenized[0].upper()
                    #c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
                    ## TODO: End of output handling

                    if print_word_probabilities:
                        for prob in word_probs:
                            saveto.write("{} ".format(prob))
                        saveto.write('\n')
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(alignment, source_sentences[i],
                                              _seqs2words(trans[0]).split(),
                                              i, i, save_alignment)
                        else:
                            save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                i, _seqs2words(trans[0]), 0,
                                ' '.join(source_sentences[i]),
                                len(source_sentences[i]) + 1, len(trans[0])))
                            print_matrix(alignment, save_alignment)

            c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
            source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))

        c.close()
        sys.stderr.write('Done\n')

    def _listen(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe):
        while True:
            try:
                # Establish connection with client.
                try:
                    print 'Got connection from', addr
                    print "Receiving..."
                    fname = c.recv(4096)
                except socket.error:
                    c.close()
                    print "connection closed"
                    break
                print fname
                c.send("okay")
                #if fname == 'exit':
                #    print "Terminating connection with client."
                #    c.close()
                #    break
                #else:
                #t = threading.Thread(target=_parallelized_main, args=(fname, fs_init, fs_next, c))
                try:
                    t = threading.Thread(target=_parallelized_main,
                                         args=(fs_init, fs_next, c, bpe,
                                               tokenizer, detokenizer))
                    t.start()
                    t.join()
                except socket.error:
                    c.close()
                    break
            except KeyboardInterrupt as e:
                LOG.debug('Ctrl+C issued ...')
                LOG.info('Terminating server ...')
                try:
                    c.shutdown(socket.SHUT_RDWR)
                    c.close()
                except:
                    pass
                break

    s = socket.socket()          # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = 12345                 # Reserve a port for your service.
    s.bind((host, port))         # Bind to the port

    # Now wait for client connection.

    # Beginning model loading
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=save_alignment is not None)
        fs_init.append(f_init)
        fs_next.append(f_next)
    # end of model loading

    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()

    # start listening to connections once models are loaded
    args.codes = codecs.open(bpe_file[0], encoding='utf-8')
    bpe = BPE(args.codes, '@@')

    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=_listen,
                                 args=(c, addr, fs_init, fs_next,
                                       tokenizer, detokenizer, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         batch_size=16, opt_base=None, normalize=False, output_attention=False):
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    # load params
    if opt_base is None:
        options = load_config(model)
    else:
        options = load_config(opt_base)

    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list, '')
    tparams = init_theano_params(params)

    # load dictionary
    if dictionary is None:
        dictionary = options['dictionaries'][0]
    word_dict = load_dict(dictionary)
    if options['n_words_src']:
        for key, idx in word_dict.items():
            if idx >= options['n_words_src']:
                del word_dict[key]
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    if dictionary_target is None:
        dictionary_target = options['dictionaries'][1]
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    def _send_jobs(fname):
        retval = []
        retval_ori = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                words = line.strip().split()
                if len(words) == 0:
                    continue
                retval_ori.append(line.strip())
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                retval.append(x)
        logging.info('total %s sentences' % len(retval))
        return retval, retval_ori

    sources, sources_ori = _send_jobs(source_file)

    # split the sentences into minibatches; the final, possibly smaller,
    # batch is appended after the loop
    batches = []
    num_full = len(sources) / batch_size
    for i in range(num_full):
        batches.append(prepare_data(sources[i * batch_size:(i + 1) * batch_size]))
    if num_full * batch_size < len(sources):
        batches.append(prepare_data(sources[num_full * batch_size:]))

    final_sentences = []
    print 'Building beam sampler...',
    f_beam_sample = build_beam_sampler(tparams, options, use_noise, trng, k)
    print 'Done'

    for batch in batches:
        final_word_ids, final_beam_ids, final_beam_scores = f_beam_sample(*batch)
        for i in range(final_word_ids.shape[1]):
            word_ids = process_beam_results(final_word_ids[:, i, :],
                                            final_beam_ids[:, i, :],
                                            final_beam_scores[i])
            word_ids = [[wid for wid in line if wid != 0] for line in word_ids]
            words = [word_idict_trg[wid] for wid in word_ids[0]]
            sentence = ' '.join(words) + '\n'
            final_sentences.append(sentence)
    print len(final_sentences)

    with open(saveto, 'w') as fout:
        for sentence in final_sentences:
            fout.write(sentence)
    print 'Done'
def init(self, model_options):
    """
    Exposes (but Pyro does not see them):
        self.f_init
        self.f_next
        self.f_log_probs
        self.f_grad_shared
        self.f_update
    """
    reload_ = model_options['reload_']
    saveto = model_options['saveto']
    decay_c = model_options['decay_c']
    alpha_c = model_options['alpha_c']
    map_decay_c = model_options['map_decay_c']
    finetune = model_options['finetune']
    finetune_only_last = model_options['finetune_only_last']
    clip_c = model_options['clip_c']
    optimizer = model_options['optimizer']

    comp_start = time.time()

    print 'Building model'
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)

    self.tparams = init_theano_params(params)

    trng, self.use_noise, x, x_mask, y, y_mask, opt_ret, \
        per_sent_neg_log_prob = build_model(self.tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    self.f_init, self.f_next = build_sampler(self.tparams, model_options,
                                             self.use_noise, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    self.f_log_probs = theano.function(inps, per_sent_neg_log_prob,
                                       profile=profile)
    print 'Done'

    # apply a per-sentence weight to the per-sentence cost before averaging
    per_sent_weight = tensor.vector('per_sent_weight', dtype='float32')
    per_sent_weight.tag.test_value = numpy.ones(10).astype('float32')
    cost = (per_sent_neg_log_prob * per_sent_weight).mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in self.tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((
            tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
            opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (MAP training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                    name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in self.tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value)**2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    # allow finetuning with fixed embeddings
    if finetune:
        updated_params = OrderedDict([
            (key, value) for (key, value) in self.tparams.iteritems()
            if not key.startswith('Wemb')
        ])
    # allow finetuning of only the last layer
    # (reduces to training a linear model)
    elif finetune_only_last:
        updated_params = OrderedDict([
            (key, value) for (key, value) in self.tparams.iteritems()
            if key in ['ff_logit_W', 'ff_logit_b']
        ])
    else:
        updated_params = self.tparams

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(updated_params))
    print 'Done'

    # apply gradient clipping
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2),
                              g / tensor.sqrt(g2) * clip_c, g))
        grads = new_grads

    # compile the optimizer; the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    op_map = {
        'adam': optimizers.adam,
        'adadelta': optimizers.adadelta,
        'rmsprop': optimizers.rmsprop,
        'sgd': optimizers.sgd
    }
    inps = inps + [per_sent_weight]
    self.f_grad_shared, self.f_update = op_map[optimizer](
        lr, updated_params, grads, inps, per_sent_neg_log_prob,
        profile=profile)
    print 'Done'

    print 'Total compilation time: {0:.1f}s'.format(time.time() - comp_start)
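# A minimal sketch of one weighted update step using the functions compiled
# by `init` above. Assumptions: `trainer` is an instance on which `init` was
# called, and `x, x_mask, y, y_mask` come from `prepare_data`; the helper
# name and the uniform weights are illustrative only.
def hypothetical_train_step(trainer, x, x_mask, y, y_mask, lrate=0.0001):
    n_sents = x.shape[1]
    # uniform weights reproduce the unweighted mean cost
    weights = numpy.ones(n_sents).astype('float32')
    # f_grad_shared computes per-sentence costs and stores the gradients;
    # f_update then applies them with the given learning rate
    per_sent_cost = trainer.f_grad_shared(x, x_mask, y, y_mask, weights)
    trainer.f_update(lrate)
    return per_sent_cost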
def translate_model_external_embedding(queue, rqueue, pid, models, options,
                                       k, normalize, verbose, nbest,
                                       return_alignment, suppress_unk):

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler_embeddings, gen_sample,
                     init_params_embeddings)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params_embeddings(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler_embeddings(
            tparams, option, use_noise, trng,
            return_alignment=return_alignment)
        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment = gen_sample(
            fs_init, fs_next, seq, trng=trng, k=int(k), maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment, suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score /= lengths
        if nbest:
            return sample, score, word_probs, alignment
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
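# A minimal sketch of how the queue-based worker above is typically wired up
# with multiprocessing: jobs are (index, sequence) tuples, results come back
# on rqueue, and a None sentinel shuts the worker down. The helper name and
# the single-worker, all-flags-off setup are assumptions for illustration.
from multiprocessing import Process, Queue

def hypothetical_spawn_worker(models, options, k):
    queue = Queue()   # jobs: (index, source sequence)
    rqueue = Queue()  # results: (index, translation tuple)
    p = Process(target=translate_model_external_embedding,
                args=(queue, rqueue, 0, models, options, k,
                      False, False, False, False, False))
    p.start()
    return queue, rqueue, p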
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalize, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write(
                "\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # score the input pairs with every model
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs, prepare_data,
                                          options[i], pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)
        return scores, alignments

    pairs = TextIterator(source_file.name, target_file.name,
                         options[0]['dictionaries'][:-1],
                         options[0]['dictionaries'][1],
                         n_words_source=options[0]['n_words_src'],
                         n_words_target=options[0]['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False)
    # TODO: sorting by length could be more efficient,
    # but we'd want to resort afterwards

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        saveto.write('{0} {1}\n'.format(line.strip(), score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in alignments:
                align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
def rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write(
                "\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # score the input pairs with every model
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs, prepare_data,
                                          options[i], pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)
        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    if alignweights:
        ### opening the temporary file.
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, \
            tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out:
        # pair each n-best hypothesis with its source sentence
        for line in nbest_lines:
            linesplit = line.split(' ||| ')
            # index into the source file, starting from 0
            idx = int(linesplit[0])
            tmp_in.write(lines[idx])
            tmp_out.write(linesplit[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)

        pairs = TextIterator(tmp_in.name, tmp_out.name,
                             options[0]['dictionaries'][:-1],
                             options[0]['dictionaries'][1],
                             n_words_source=options[0]['n_words_src'],
                             n_words_target=options[0]['n_words'],
                             batch_size=b,
                             maxlen=float('inf'),
                             sort_by_length=False)
        # TODO: sorting by length could be more efficient, but we'd have to
        # synchronize scores with the n-best list afterwards

        scores, alignments = _score(pairs, alignweights)

        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str, [s[i] for s in scores]))
            saveto.write('{0} {1}\n'.format(line.strip(), score_str))

        ### optional save weights mode.
        if alignweights:
            for line in alignments:
                align_OUT.write(line + "\n")

    if alignweights:
        combine_source_target_text(source_file, nbest_file, saveto.name,
                                   align_OUT)
        align_OUT.close()
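# A minimal sketch of driving the n-best `rescore_model` above. Assumptions:
# a single model, its options loaded with `load_config` as done elsewhere in
# this code, and hypothetical file paths; `rescore_model` expects open file
# objects because it calls .readlines() and uses saveto.name.
def hypothetical_rescore_nbest(model_path, src_path, nbest_path, out_path,
                               b=80):
    options = [load_config(model_path)]
    with open(src_path) as source_file, \
            open(nbest_path) as nbest_file, \
            open(out_path, 'w') as saveto:
        rescore_model(source_file, nbest_file, saveto, [model_path], options,
                      b, normalize=True, verbose=False, alignweights=False)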