def sgd(lr, tparams, grads, inp, cost, profile=False):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup, profile=profile)

    return f_grad_shared, f_update
def rmsprop(lr, tparams, grads, inp, cost, profile=False):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(itemlist(tparams), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
def adadelta(lr, tparams, grads, inp, cost, profile=False):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rg2up,
                                    profile=profile)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
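
# Illustrative usage sketch (not part of the original module): how the pair of
# compiled functions returned by sgd/rmsprop/adadelta above is driven from a
# training loop, mirroring what train() below does. The `optimizer_fn`,
# `batches` and `lrate` arguments are assumptions made for this example only.
def _example_optimizer_usage(optimizer_fn, lr, tparams, grads, inps, cost,
                             batches, lrate=0.01):
    # compile once: f_grad_shared evaluates the cost and caches the gradients
    # in shared variables; f_update applies one parameter update from them
    f_grad_shared, f_update = optimizer_fn(lr, tparams, grads, inps, cost)
    last_cost = None
    for x, x_mask, y, y_mask in batches:
        last_cost = f_grad_shared(x, x_mask, y, y_mask)  # forward/backward pass
        f_update(lrate)                                  # apply the update step
    return last_cost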
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          factors=1,  # input factors
          dim_per_factor=None,  # list of word vector dimensionalities (one per factor): [250,200,50] for total dimensionality of 500
          encoder='gru',
          decoder='gru_cond',
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          map_decay_c=0.,  # L2 regularization penalty towards original weights
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=0.01,  # learning rate
          n_words_src=None,  # source vocabulary size
          n_words=None,  # target vocabulary size
          maxlen=100,  # maximum sequence length
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          datasets=('/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                    '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'),
          valid_datasets=('../data/dev/newstest2011.en.tok',
                          '../data/dev/newstest2011.fr.tok'),
          dictionaries=('/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
                        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'),
          use_dropout=False,
          dropout_embedding=0.2,  # dropout for input embeddings (0: no dropout)
          dropout_hidden=0.5,  # dropout for hidden layers (0: no dropout)
          dropout_source=0,  # dropout source words (0: no dropout)
          dropout_target=0,  # dropout target words (0: no dropout)
          reload_=False,
          overwrite=False,
          external_validation_script=None,
          shuffle_each_epoch=True,
          finetune=False,
          finetune_only_last=False,
          sort_by_length=True,
          use_domain_interpolation=False,
          domain_interpolation_min=0.1,
          domain_interpolation_inc=0.1,
          domain_interpolation_indomain_datasets=('indomain.en', 'indomain.fr'),
          maxibatch_size=20,  # how many minibatches to load at one time
          model_version=0.1,  # store version used for training for compatibility
          ):

    # Model options
    model_options = locals().copy()

    if model_options['dim_per_factor'] is None:
        if factors == 1:
            model_options['dim_per_factor'] = [model_options['dim_word']]
        else:
            sys.stderr.write('Error: if using factored input, you must specify \'dim_per_factor\'\n')
            sys.exit(1)

    assert len(dictionaries) == factors + 1  # one dictionary per source factor + 1 for target factor
    assert len(model_options['dim_per_factor']) == factors  # each factor embedding has its own dimensionality
    assert sum(model_options['dim_per_factor']) == model_options['dim_word']  # dimensionality of factor embeddings sums up to total dimensionality of input embedding vector

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words is None:
        n_words = len(worddicts[1])
        model_options['n_words'] = n_words

    print('Loading data')
    domain_interpolation_cur = None
    if use_domain_interpolation:
        print('Using domain interpolation with initial ratio %s, increase rate %s' %
              (domain_interpolation_min, domain_interpolation_inc))
        domain_interpolation_cur = domain_interpolation_min
        train = DomainInterpolatorTextIterator(
            datasets[0], datasets[1],
            dictionaries[:-1], dictionaries[-1],
            n_words_source=n_words_src, n_words_target=n_words,
            batch_size=batch_size,
            maxlen=maxlen,
            shuffle_each_epoch=shuffle_each_epoch,
            sort_by_length=sort_by_length,
            indomain_source=domain_interpolation_indomain_datasets[0],
            indomain_target=domain_interpolation_indomain_datasets[1],
            interpolation_rate=domain_interpolation_cur,
            maxibatch_size=maxibatch_size)
    else:
        train = TextIterator(datasets[0], datasets[1],
                             dictionaries[:-1], dictionaries[-1],
                             n_words_source=n_words_src, n_words_target=n_words,
                             batch_size=batch_size,
                             maxlen=maxlen,
                             skip_empty=True,
                             shuffle_each_epoch=shuffle_each_epoch,
                             sort_by_length=sort_by_length,
                             maxibatch_size=maxibatch_size)

    if valid_datasets and validFreq:
        valid = TextIterator(valid_datasets[0], valid_datasets[1],
                             dictionaries[:-1], dictionaries[-1],
                             n_words_source=n_words_src, n_words_target=n_words,
                             batch_size=valid_batch_size,
                             maxlen=maxlen)
    else:
        valid = None

    comp_start = time.time()

    print('Building model')
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        print('Reloading model parameters')
        params = load_params(saveto, params)

    tparams = init_theano_params(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)

    inps = [x, x_mask, y, y_mask]

    if validFreq or sampleFreq:
        print('Building sampler')
        f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # before any regularizer
    print('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    print('Done')

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c), name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value) ** 2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    # allow finetuning with fixed embeddings
    if finetune:
        updated_params = OrderedDict([(key, value) for (key, value) in tparams.iteritems()
                                      if not key.startswith('Wemb')])
    # allow finetuning of only last layer (becomes a linear model training problem)
    elif finetune_only_last:
        updated_params = OrderedDict([(key, value) for (key, value) in tparams.iteritems()
                                      if key in ['ff_logit_W', 'ff_logit_b']])
    else:
        updated_params = tparams

    print('Computing gradient...')
    grads = tensor.grad(cost, wrt=itemlist(updated_params))
    print('Done')

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print('Building optimizers...')
    f_grad_shared, f_update = eval(optimizer)(lr, updated_params, grads, inps, cost,
                                              profile=profile)
    print('Done')

    print('Total compilation time: {0:.1f}s'.format(time.time() - comp_start))

    print('Optimization')

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    # save model options
    json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    valid_err = None

    last_disp_samples = 0
    ud_start = time.time()
    p_validation = None
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            last_disp_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # ensure consistency in number of factors
            if len(x) and len(x[0]) and len(x[0][0]) != factors:
                sys.stderr.write('Error: mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(factors, len(x[0][0])))
                sys.exit(1)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen)
            # n_words_src=n_words_src, n_words=n_words)  # TODO: why unused?

            if x is None:
                print('Minibatch with zero sample under length ', maxlen)
                uidx -= 1
                continue

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud = time.time() - ud_start
                wps = last_disp_samples / float(ud)
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud,
                      "{0:.2f} sentences/s".format(wps))
                ud_start = time.time()
                last_disp_samples = 0

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print('Saving the best model...')
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
                print('Done')

                # save with uidx
                if not overwrite:
                    print('Saving the model at iteration {}...'.format(uidx))
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip_from_theano(tparams))
                    print('Done')

            # generate some samples with the model and display them
            if sampleFreq and numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[2])):
                    stochastic = True
                    x_current = x[:, :, jj][:, :, None]

                    # remove padding
                    x_current = x_current[:, :x_mask[:, jj].sum(), :]

                    sample, score, sample_word_probs, alignment, hyp_graph = gen_sample(
                        [f_init], [f_next],
                        x_current,
                        trng=trng, k=1,
                        maxlen=30,
                        stochastic=stochastic,
                        argmax=False,
                        suppress_unk=False,
                        return_hyp_graph=False)
                    print('Source ', jj, ': ')
                    for pos in range(x.shape[1]):
                        if x[0, pos, jj] == 0:
                            break
                        for factor in range(factors):
                            vv = x[factor, pos, jj]
                            if vv in worddicts_r[factor]:
                                sys.stdout.write(worddicts_r[factor][vv])
                            else:
                                sys.stdout.write('UNK')
                            if factor + 1 < factors:
                                sys.stdout.write('|')
                            else:
                                sys.stdout.write(' ')
                    print()
                    print('Truth ', jj, ' : ')
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print(worddicts_r[-1][vv])
                        else:
                            print('UNK')
                    print()
                    print('Sample ', jj, ': ')
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print(worddicts_r[-1][vv])
                        else:
                            print('UNK')
                    print()

            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                                   model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        if use_domain_interpolation and (domain_interpolation_cur < 1.0):
                            domain_interpolation_cur = min(domain_interpolation_cur + domain_interpolation_inc, 1.0)
                            print('No progress on the validation set, increasing domain interpolation rate to %s and resuming from best params' % domain_interpolation_cur)
                            train.adjust_domain_interpolation_rate(domain_interpolation_cur)
                            if best_p is not None:
                                zip_to_theano(best_p, tparams)
                            bad_counter = 0
                        else:
                            print('Early Stop!')
                            estop = True
                            break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print('Valid ', valid_err)

                if external_validation_script:
                    print("Calling external validation script")
                    if p_validation is not None and p_validation.poll() is None:
                        print("Waiting for previous validation run to finish")
                        print("If this takes too long, consider increasing validation interval, reducing validation set size, or speeding up validation by using multiple processes")
                        valid_wait_start = time.time()
                        p_validation.wait()
                        print("Waited for {0:.1f} seconds".format(time.time() - valid_wait_start))
                    print('Saving model...')
                    params = unzip_from_theano(tparams)
                    numpy.savez(saveto + '.dev', history_errs=history_errs,
                                uidx=uidx, **params)
                    json.dump(model_options, open('%s.dev.npz.json' % saveto, 'wb'), indent=2)
                    print('Done')
                    p_validation = Popen([external_validation_script])

            # finish after this many updates
            if uidx >= finish_after:
                print('Finishing after %d iterations!' % uidx)
                estop = True
                break

        print('Seen %d samples' % n_samples)

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                           model_options, valid)
        valid_err = valid_errs.mean()

        print('Valid ', valid_err)

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)

    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
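
# Illustrative sketch (not part of the original script): a minimal call to train()
# with a small configuration. Every path below is a placeholder and the
# hyper-parameter values are examples only, not recommended settings.
def _example_train_call():
    return train(dim_word=256,
                 dim=512,
                 datasets=('corpus.src.tok', 'corpus.trg.tok'),              # placeholder paths
                 valid_datasets=('dev.src.tok', 'dev.trg.tok'),              # placeholder paths
                 dictionaries=('corpus.src.tok.pkl', 'corpus.trg.tok.pkl'),  # placeholder paths
                 optimizer='adadelta',
                 batch_size=32,
                 maxlen=50,
                 validFreq=5000,
                 saveFreq=5000,
                 saveto='example_model.npz')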
def init(self, model_options):
    """Exposes (but Pyro does not see them):
        self.f_init
        self.f_next
        self.f_log_probs
        self.f_grad_shared
        self.f_update
    """
    reload_ = model_options['reload_']
    saveto = model_options['saveto']
    decay_c = model_options['decay_c']
    alpha_c = model_options['alpha_c']
    map_decay_c = model_options['map_decay_c']
    finetune = model_options['finetune']
    finetune_only_last = model_options['finetune_only_last']
    clip_c = model_options['clip_c']
    optimizer = model_options['optimizer']

    comp_start = time.time()

    print 'Building model'
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)

    self.tparams = init_theano_params(params)

    trng, self.use_noise, x, x_mask, y, y_mask, opt_ret, per_sent_neg_log_prob = \
        build_model(self.tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    self.f_init, self.f_next = build_sampler(self.tparams, model_options,
                                             self.use_noise, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    self.f_log_probs = theano.function(inps, per_sent_neg_log_prob,
                                       profile=profile)
    print 'Done'

    # apply per-sentence weights to the per-sentence costs before averaging
    per_sent_weight = tensor.vector('per_sent_weight', dtype='float32')
    per_sent_weight.tag.test_value = numpy.ones(10).astype('float32')
    cost = (per_sent_neg_log_prob * per_sent_weight).mean()  # mean of element-wise product

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in self.tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c), name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in self.tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value) ** 2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    # allow finetuning with fixed embeddings
    if finetune:
        updated_params = OrderedDict([(key, value) for (key, value) in self.tparams.iteritems()
                                      if not key.startswith('Wemb')])
    # allow finetuning of only last layer (becomes a linear model training problem)
    elif finetune_only_last:
        updated_params = OrderedDict([(key, value) for (key, value) in self.tparams.iteritems()
                                      if key in ['ff_logit_W', 'ff_logit_b']])
    else:
        updated_params = self.tparams

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(updated_params))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    op_map = {'adam': optimizers.adam,
              'adadelta': optimizers.adadelta,
              'rmsprop': optimizers.rmsprop,
              'sgd': optimizers.sgd}
    inps = inps + [per_sent_weight]
    self.f_grad_shared, self.f_update = op_map[optimizer](
        lr, updated_params, grads, inps, per_sent_neg_log_prob, profile=profile)
    print 'Done'

    print 'Total compilation time: {0:.1f}s'.format(time.time() - comp_start)
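
# Illustrative sketch (not part of the original class): one weighted update step
# with the functions compiled by init(). Because per_sent_weight is appended to
# `inps`, f_grad_shared expects the weight vector as its final argument. The
# method name and the array arguments are assumptions for this example only;
# it is meant as a sibling method of the same class as init().
def _example_weighted_step(self, x, x_mask, y, y_mask, weights, lrate=1e-4):
    # weights: float32 vector with one entry per sentence in the minibatch
    per_sent_nll = self.f_grad_shared(x, x_mask, y, y_mask, weights)
    self.f_update(lrate)  # apply the update computed from the weighted cost
    return per_sent_nll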
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          map_decay_c=0.,  # L2 regularization penalty towards original weights
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=0.01,  # learning rate
          n_words_src=None,  # source vocabulary size
          n_words_tgt=None,  # target vocabulary size
          maxlen=100,  # maximum sequence length
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          datasets=['/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                    '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
          valid_datasets=['../data/dev/newstest2011.en.tok',
                          '../data/dev/newstest2011.fr.tok'],
          dictionaries=['/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
                        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
          use_dropout=False,
          dropout_embedding=0.2,  # dropout for input embeddings (0: no dropout)
          dropout_hidden=0.5,  # dropout for hidden layers (0: no dropout)
          dropout_source=0,  # dropout source words (0: no dropout)
          dropout_target=0,  # dropout target words (0: no dropout)
          reload_=False,
          overwrite=False,
          external_validation_script=None,
          shuffle_each_epoch=True,
          sort_by_length=True,
          maxibatch_size=20,  # how many minibatches to load at one time
          model_version=0.1,
          ):

    # collect the local arguments as the model options
    model_options = locals().copy()
    print 'Model options:', model_options

    # load the dictionaries and build the inverted (index -> word) versions
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # if the vocabulary sizes are not given, default to the dictionary sizes
    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words_tgt is None:
        n_words_tgt = len(worddicts[1])
        model_options['n_words_tgt'] = n_words_tgt

    # load the training and validation data
    print 'Loading data ...'
    train = TextIterator(datasets[0], datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=batch_size,
                         maxlen=maxlen,
                         shuffle_each_epoch=shuffle_each_epoch,
                         sort_by_length=sort_by_length,
                         maxibatch_size=maxibatch_size)
    valid = TextIterator(valid_datasets[0], valid_datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    # initialize the model parameters
    print 'Init parameters ...'
    params = init_params(model_options)

    # reload a saved model so that training can resume after an interruption
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)

    # turn the network's weights and biases into Theano shared variables
    tparams = init_theano_params(params)

    # build the model
    print 'Building model ...'
    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    # build the sampler
    if validFreq or sampleFreq:
        print 'Building sampler ...'
        f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c), name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value) ** 2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    f_alpha = theano.function(inps, opt_ret['dec_alphas'])  # attention weights (alphas)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost,
                                              profile=profile)
    print 'Done'

    # start the optimization
    print 'Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    valid_err = None

    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)
            # prepare the data for training
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words_tgt)
            # if no sentence shorter than maxlen remains, x comes back as None
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # plot the word alignment matrix
            # print f_alpha(x, x_mask, y, y_mask).shape
            """
            x_word = [worddicts_r[0][idx] for idx in x[:, 0]]
            y_word = [worddicts_r[1][idx] for idx in y[:, 0]]
            print len(x_word), x_word
            print len(y_word), y_word
            shape = f_alpha(x, x_mask, y, y_mask).shape
            for i in range(shape[1]):
                # print sum(f_alpha(x, x_mask, y, y_mask)[i, 0, :])
                mx = sum(y_mask[:, i])
                my = sum(x_mask[:, i])
                align_matrix = f_alpha(x, x_mask, y, y_mask)[:, i, :][0:mx, 0:my]
                align_shape = align_matrix.shape
                scale_ = 20  # image scale
                out_matrix = numpy.ones([scale_ * align_shape[0],
                                         scale_ * align_shape[1]])
                for j in range(align_shape[0]):
                    for k in range(align_shape[1]):
                        out_matrix[j * scale_:(j + 1) * scale_,
                                   k * scale_:(k + 1) * scale_] *= align_matrix[j, k]
                plt.imshow(100 * out_matrix, plt.cm.gray)
                plt.pause(1)
            plt.show()
            sys.exit(0)
            """

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
                json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)
                print 'Done'

                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip_from_theano(tparams))
                    print 'Done'

            # generate some samples with the model and display them
            if sampleFreq and numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score, sample_word_probs, alignment = gen_sample(
                        [f_init], [f_next],
                        x[:, jj][:, None],
                        trng=trng, k=1,
                        maxlen=30,
                        stochastic=stochastic,
                        argmax=False,
                        suppress_unk=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print worddicts_r[-1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print worddicts_r[-1][vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                                   model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

                if external_validation_script:
                    print "Calling external validation script"
                    print 'Saving model...',
                    params = unzip_from_theano(tparams)
                    # uidx is also saved at every validation
                    numpy.savez(saveto + '.dev', history_errs=history_errs,
                                uidx=uidx, **params)
                    json.dump(model_options, open('%s.dev.npz.json' % saveto, 'wb'),
                              indent=2)
                    print 'Done'
                    p = Popen([external_validation_script])

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                           model_options, valid)
        valid_err = valid_errs.mean()

        print 'Valid ', valid_err

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)

    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
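
# Illustrative sketch (not part of the original code): a tidied-up version of the
# commented-out alignment visualisation inside train() above, factored into a
# helper. It assumes the conventions of f_alpha: `alphas` has shape
# (target_len, batch, source_len) and the masks have shape (len, batch).
# matplotlib is assumed to be available, as in the commented-out block.
# Example: _example_plot_alignment(f_alpha(x, x_mask, y, y_mask), x_mask, y_mask)
def _example_plot_alignment(alphas, x_mask, y_mask, sent_idx=0):
    import matplotlib.pyplot as plt
    # crop the attention matrix of one sentence to its unpadded lengths
    trg_len = int(y_mask[:, sent_idx].sum())
    src_len = int(x_mask[:, sent_idx].sum())
    align_matrix = alphas[:trg_len, sent_idx, :src_len]
    plt.imshow(align_matrix, cmap='gray', interpolation='nearest')
    plt.xlabel('source position')
    plt.ylabel('target position')
    plt.show()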