# NOTE: this excerpt assumes the usual module-level imports of this code base
# (not shown here): os, sys, time, json, copy, numpy, theano,
# theano.tensor as tensor, cPickle as pkl, ipdb, plus the project-local
# helpers it calls (nmt, rnn_trg, data_iterator, TextIterator, load_config,
# load_dict, load_params, init_params, init_theano_params, build_model,
# prepare_data, itemlist, unzip_from_theano, zip_to_theano and the
# optimizers such as rmsprop).


def pred_error(f_cost, prepare_data, options, valid):
    """Compute the QE cost on every minibatch of the validation set."""
    error = []
    for x, y, hter in valid:
        # use the prepare_data function handed in by the caller
        x, x_mask, y, y_mask = prepare_data(x, y,
                                            maxlen=options['maxlen'],
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        hter = numpy.array(hter).astype('float32')
        hter = hter.reshape([hter.shape[0], 1])
        error.append(f_cost(x, x_mask, y, y_mask, hter))
    error = numpy.array(error)
    return error
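# Usage sketch (hypothetical helper, mirroring how train() below calls
# pred_error): f_cost is the compiled cost function, nmt.prepare_data the
# batch padder, and `valid` an iterator yielding (source, target, HTER)
# minibatches; the mean over minibatches is the validation error tracked
# for early stopping.
def _demo_pred_error(f_cost, valid, model_options):
    valid_errs = pred_error(f_cost, nmt.prepare_data, model_options, valid)
    return valid_errs.mean()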
def qe_rnn_trg(model='RNN_trg_model/wmt17.de-en.npz',
               datasets=['test/test16.bpe.src', 'test/test16.bpe.mt'],
               save_file='test16.hter.pred'):
    """Predict sentence-level HTER scores with the target-side RNN QE model."""
    options = load_config(model)

    # ------------------- build and reload the QE model -------------------
    params = rnn_trg.init_params(options)   # adapt here for a different QE model
    params = load_params(model, params)
    tparams = init_theano_params(params)

    # -------------------
    trng, use_noise, x, x_mask, y, y_mask, \
        hter, y_pred, cost = rnn_trg.build_model(tparams, options)   # adapt here as well

    inps = [x, x_mask, y, y_mask]
    f_pred = theano.function(inps, y_pred, profile=False)

    test = data_iterator.TextIterator(datasets[0], datasets[1],
                                      options['dictionaries'][0],
                                      options['dictionaries'][1],
                                      n_words_source=options['n_words_src'],
                                      n_words_target=options['n_words_tgt'],
                                      batch_size=options['valid_batch_size'],
                                      maxlen=options['maxlen'],
                                      sort_by_length=False)

    res = []
    n_samples = 0
    for x, y in test:
        x, x_mask, y, y_mask = nmt.prepare_data(x, y,
                                                maxlen=options['maxlen'],
                                                n_words_src=options['n_words_src'],
                                                n_words=options['n_words_tgt'])
        res.extend(list(f_pred(x, x_mask, y, y_mask).flatten()))
        n_samples += x.shape[1]
        print 'processed:', n_samples, 'samples'

    with open('qe/' + save_file, 'w') as fp:
        for hh in res:
            fp.write(str(hh) + '\n')
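# Minimal usage sketch (hypothetical driver; the paths are simply the
# function's own defaults).  Predictions are written, one HTER score per
# line, to 'qe/<save_file>'.
def _demo_qe_rnn_trg():
    qe_rnn_trg(model='RNN_trg_model/wmt17.de-en.npz',
               datasets=['test/test16.bpe.src', 'test/test16.bpe.mt'],
               save_file='test16.hter.pred')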
def train(
        batch_size=80,
        valid_batch_size=80,
        dim=100,
        dim_word=500,
        dispFreq=100,
        saveFreq=3000,
        validFreq=1000,
        saveto='RNN_model/wmt17.en-de.npz',
        datasets=['tuning/train.bpe.en', 'tuning/train.bpe.de'],
        valid_datasets=['tuning/dev.bpe.en', 'tuning/dev.bpe.de'],
        dictionaries=['data/train.bpe.en.json', 'data/train.bpe.de.json'],
        hter=['tuning/train.hter', 'tuning/dev.hter'],
        n_words_src=40000,
        n_words_tgt=40000,
        nmt_model='model/model.npz.best_bleu',
        lrate=0.0001,            # learning rate
        use_dropout=True,
        patience=10,
        max_epochs=5000,
        finish_after=1000000,
        maxibatch_size=20,
        optimizer='rmsprop',
        shuffle_each_epoch=True,
        reload_=True,
        overwrite=False,
        sort_by_length=False,
        maxlen=1000,
        decay_c=0.,              # L2 regularization penalty
        map_decay_c=0.,          # L2 regularization penalty towards original weights
        clip_c=1.0,
        dropout_embedding=0.2,   # dropout for input embeddings (0: no dropout)
        dropout_hidden=0.2,      # dropout for hidden layers (0: no dropout)
        dropout_source=0.1,      # dropout source words (0: no dropout)
        dropout_target=0.1,      # dropout target words (0: no dropout)
        model_version=0.1):

    # collect all local arguments as the model options
    model_options = locals().copy()
    print 'Model options:', model_options

    # load the dictionaries and build the reversed (id -> word) dictionaries
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # if the vocabulary sizes are not given, default to the dictionary sizes
    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words_tgt is None:
        n_words_tgt = len(worddicts[1])
        model_options['n_words_tgt'] = n_words_tgt

    # load the training and validation data
    print 'Loading data ...'
    train = TextIterator(datasets[0], datasets[1], hter[0],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=batch_size,
                         maxlen=maxlen,
                         shuffle_each_epoch=shuffle_each_epoch,
                         sort_by_length=sort_by_length,
                         maxibatch_size=maxibatch_size)

    valid = TextIterator(valid_datasets[0], valid_datasets[1], hter[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    #numpy.random.seed(1234)

    # initialise the model parameters
    print 'Init parameters ...'
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)

    # turn the numpy parameters (W, b) into Theano shared variables
    tparams = init_theano_params(params)

    # build the model
    print 'Building model ...',
    trng, use_noise, x, x_mask, y, y_mask, hter, \
        y_pred, cost = build_model(tparams, model_options)
    print 'Done'

    """
    @function: debugging block (kept for reference)
    print Wt.get_value().shape
    print tparams['W'].get_value().shape
    f_tt = theano.function([x,x_mask,y,y_mask],tt)
    f_emb = theano.function([x,x_mask,y,y_mask],emb)
    f_pred = theano.function([x,x_mask,y,y_mask],y_pred)
    f_cost = theano.function([x,x_mask,y,y_mask,hter],cost)
    for x, y, hter in train:
        # prepare the minibatch
        x, x_mask, y, y_mask = nmt.prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words_tgt)
        hter = numpy.array(hter).astype('float32')
        hter = hter.reshape([hter.shape[0],1])
        print f_pred(x,x_mask,y,y_mask).shape
        print f_cost(x,x_mask,y,y_mask,hter)
        #print f_cost(x,x_mask,y,y_mask,hter)
    sys.exit(0)
    """

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # apply L2 regularisation to loaded model (MAP training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                    name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value)**2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    print 'Building f_pred...',
    inps = [x, x_mask, y, y_mask]
    f_pred = theano.function(inps, y_pred, profile=False)
    print 'Done'

    print 'Building f_cost...',
    inps = [x, x_mask, y, y_mask, hter]
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2),
                              g / tensor.sqrt(g2) * clip_c, g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost,
                                              profile=False)
    print 'Done'

    print 'Start Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    valid_err = None
    fp = open('RNN_model/valid.error', 'w')
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y, hter in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # prepare the minibatch for training
            x, x_mask, y, y_mask = nmt.prepare_data(x, y, maxlen=maxlen,
                                                    n_words_src=n_words_src,
                                                    n_words=n_words_tgt)

            # x is None when every sentence in the minibatch exceeds maxlen
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            hter = numpy.array(hter).astype('float32')
            hter = hter.reshape([hter.shape[0], 1])

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask, hter)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx,
                            **params)
                json.dump(model_options, open('%s.json' % saveto, 'wb'),
                          indent=2)
                print 'Done'

                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip_from_theano(tparams))
                    print 'Done'

            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_error(f_cost, nmt.prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err
                fp.write('valid error: ' + str(valid_err) + '\n')

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs = pred_error(f_cost, nmt.prepare_data, model_options, valid)
        valid_err = valid_errs.mean()

        print 'Valid ', valid_err
        fp.write('Finally error: ' + str(valid_err) + '\n')
    fp.close()

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)

    numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs,
                uidx=uidx, **params)

    return valid_err
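# Minimal sketch of a training run (hypothetical driver; every keyword below
# exists in train()'s signature and the values are its own defaults).  It
# trains the QE regressor on the WMT17 en-de tuning data and keeps the best
# checkpoint at 'RNN_model/wmt17.en-de.npz'.
def _demo_train():
    return train(saveto='RNN_model/wmt17.en-de.npz',
                 datasets=['tuning/train.bpe.en', 'tuning/train.bpe.de'],
                 valid_datasets=['tuning/dev.bpe.en', 'tuning/dev.bpe.de'],
                 dictionaries=['data/train.bpe.en.json', 'data/train.bpe.de.json'],
                 hter=['tuning/train.hter', 'tuning/dev.hter'],
                 batch_size=80, valid_batch_size=80,
                 lrate=0.0001, optimizer='rmsprop', patience=10)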
def encoder_hidden(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """
    @function: extract encoder hidden-state features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, decoderh = build_model(tparams, options)

    # load the data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,            # effectively no length limit
                         sort_by_length=False)   # keep the original sentence order
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0],
                        options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,             # effectively no length limit
                        sort_by_length=False)    # keep the original sentence order

    f_ctx = theano.function([x, x_mask], ctx, name='f_ctx')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the minibatch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        # ctx concatenates forward and backward encoder states (hidden size 1024):
        # keep the backward state at the first position and the forward state at
        # the last position as the sentence-level feature
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)
        with open('features/hidden/train.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.write('\t'.join(map(lambda x: str(x), list(hh_data))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the minibatch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)
        with open('features/hidden/test.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.write('\t'.join(map(lambda x: str(x), list(hh_data))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
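# Hedged helper (not in the original code): the encoder-hidden feature files
# written above are plain text, one sentence per line with tab-separated
# floats, so they can be read back into a numpy matrix like this.
def load_encoderh_features(path='features/hidden/test.en-es.encoderh'):
    feats = []
    with open(path) as fp:
        for line in fp:
            feats.append([float(v) for v in line.strip().split('\t')])
    return numpy.array(feats, dtype='float32')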
def get_qv(model='model/model.npz.best_bleu'):
    """
    @function: extract the per-word quality vectors (quality vector)
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = build_model(tparams, options)

    # load the data
    train = TextIterator(options['datasets'][0], options['datasets'][1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=options['batch_size'],
                         maxlen=1000,            # effectively no length limit
                         sort_by_length=False)   # keep the original sentence order
    dev = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1],
                       options['dictionaries'][0],
                       options['dictionaries'][1],
                       n_words_source=options['n_words_src'],
                       n_words_target=options['n_words_tgt'],
                       batch_size=options['valid_batch_size'],
                       maxlen=1000,              # effectively no length limit
                       sort_by_length=False)     # keep the original sentence order

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')
    #print tparams['ff_logit_W'].get_value().shape   #### (500, 40000)

    n_samples = 0
    for x, y in train:
        # prepare the minibatch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    # element-wise product of the pre-softmax state of sentence j
                    # and the output-embedding column of the reference word
                    qv = tt_[i, j, :].T * Wt[:, index]
                    qv_.append(list(qv))
            with open('qv/train/' + str(n_samples + j) + '.qv.pkl', 'w') as fp:
                pkl.dump(qv_, fp)
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
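# Hedged helper (not in the original code): each 'qv/train/<idx>.qv.pkl' file
# written by get_qv() holds the list of per-word quality vectors of one
# sentence; averaging them gives a fixed-length sentence feature, mirroring
# what extract_qv() below writes directly to its feature files.
def load_sentence_qv(idx, prefix='qv/train/'):
    with open(prefix + str(idx) + '.qv.pkl') as fp:
        qv_ = pkl.load(fp)
    return numpy.array(qv_, dtype='float32').mean(axis=0)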
def alignment(model='model/model.npz.best_bleu',
              train=['test/train.bpe.en', 'test/train.bpe.es'],
              test=['test/test.bpe.en', 'test/test.bpe.es'],
              batch_size=10):
    """
    @function: extract word-alignment (attention) matrices
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load the data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,            # effectively no length limit
                         sort_by_length=False)   # keep the original sentence order
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0],
                        options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,             # effectively no length limit
                        sort_by_length=False)    # keep the original sentence order

    f_align = theano.function([x, x_mask, y, y_mask], opt_ret, name='f_align')

    #################### train #######################
    """
    n_samples = 0
    for x, y in train:
        # prepare the minibatch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        align = f_align(x, x_mask, y, y_mask)['dec_alphas']   # (y, batch_size, x)
        align = align * y_mask[:, :, None]   # zero out padded target positions
        align_shp = align.shape
        for j in range(align_shp[1]):
            row_ = int(numpy.sum(y_mask[:, j]))
            col_ = int(numpy.sum(x_mask[:, j]))
            align_data = align[:row_, j, :col_]   # word-alignment matrix
            with open('features/alignment/train.en-es.word.align', 'a+') as fp:
                for data in align_data:
                    fp.write('\t'.join(map(lambda x: str(x), data)) + '\n')
                fp.write('\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
    """

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the minibatch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        align = f_align(x, x_mask, y, y_mask)['dec_alphas']   # (y, batch_size, x)
        align = align * y_mask[:, :, None]   # zero out padded target positions
        align_shp = align.shape
        for j in range(align_shp[1]):
            row_ = int(numpy.sum(y_mask[:, j]))
            col_ = int(numpy.sum(x_mask[:, j]))
            align_data = align[:row_, j, :col_]   # word-alignment matrix
            with open('features/alignment/test.en-es.word.align', 'a+') as fp:
                for data in align_data:
                    fp.write('\t'.join(map(lambda x: str(x), data)) + '\n')
                fp.write('\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
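# Hedged helper (not in the original code): the alignment file written above
# stores one attention matrix per sentence (rows = target words, columns =
# source words, tab-separated), with matrices separated by blank lines.
def load_alignments(path='features/alignment/test.en-es.word.align'):
    matrices, current = [], []
    with open(path) as fp:
        for line in fp:
            line = line.rstrip('\n')
            if line == '':
                if current:
                    matrices.append(numpy.array(current, dtype='float32'))
                    current = []
            else:
                current.append([float(v) for v in line.split('\t')])
    if current:
        matrices.append(numpy.array(current, dtype='float32'))
    return matrices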
def extract_logprob(model='model/model.npz.best_bleu',
                    train=['test/train.bpe.en', 'test/train.bpe.es'],
                    test=['test/test.bpe.en', 'test/test.bpe.es'],
                    batch_size=10):
    """
    @function: extract log-likelihood features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load the data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,            # effectively no length limit
                         sort_by_length=False)   # keep the original sentence order
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0],
                        options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,             # effectively no length limit
                        sort_by_length=False)    # keep the original sentence order

    f_cost = theano.function([x, x_mask, y, y_mask], cost, name='f_cost')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the minibatch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/train.es-en.logprob', 'a+') as fp:
            fp.write('\n'.join(map(lambda x: str(x), list(logprob))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the minibatch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/test.es-en.logprob', 'a+') as fp:
            fp.write('\n'.join(map(lambda x: str(x), list(logprob))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
def extract_qv(model='model/model.npz.best_bleu',
               train=['test/train.bpe.en', 'test/train.bpe.es'],
               test=['test/test.bpe.en', 'test/test.bpe.es'],
               batch_size=10):
    """
    @function: extract sentence-level quality-vector features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load the data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,            # effectively no length limit
                         sort_by_length=False)   # keep the original sentence order
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0],
                        options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,             # effectively no length limit
                        sort_by_length=False)    # keep the original sentence order

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the minibatch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    # element-wise product of the output-embedding column of the
                    # reference word and the pre-softmax state of sentence j
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            # average the per-word quality vectors into one sentence feature
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))
            with open('features/train.nmt.qv', 'a+') as fp:
                fp.write('\t'.join(qv_) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:   #*****
        # prepare the minibatch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))
            with open('features/test.nmt.qv', 'a+') as fp:   #*****
                fp.write('\t'.join(qv_) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
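# Hedged sketch (not in the original code): the extractors above each write one
# plain-text feature file per data split; a downstream QE regressor would
# typically stack them column-wise into a single matrix.  The file names below
# are the defaults used by the functions above.
def build_feature_matrix(split='test'):
    def read_rows(path):
        # one sentence per line, whitespace/tab-separated floats
        with open(path) as fp:
            return numpy.array([[float(v) for v in line.split()]
                                for line in fp if line.strip()],
                               dtype='float32')
    logprob = read_rows('features/%s.es-en.logprob' % split)           # (n, 1)
    qv = read_rows('features/%s.nmt.qv' % split)                       # (n, d_qv)
    hidden = read_rows('features/hidden/%s.en-es.encoderh' % split)    # (n, d_enc)
    return numpy.concatenate([logprob, qv, hidden], axis=1)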