def init_params(options):
    params = OrderedDict()
    # load the NMT model parameters
    nmt_options = load_config(options['nmt_model'])
    nmt_params = nmt.init_params(nmt_options)
    nmt_params = load_params(options['nmt_model'], nmt_params)
    for key in nmt_params:
        params[key] = nmt_params[key]
    # load the final RNN parameters
    final_options = load_config(options['final_model'])
    final_params = rnn.init_params(final_options)
    final_params = load_params(options['final_model'], final_params)
    for key in final_params:
        params[key] = final_params[key]
    return params
def qe_rnn_trg(model='RNN_trg_model/wmt17.de-en.npz',
               datasets=['test/test16.bpe.src', 'test/test16.bpe.mt'],
               save_file='test16.hter.pred'):
    options = load_config(model)
    # -------------------
    params = rnn_trg.init_params(options)   # changed: call rnn_trg here
    params = load_params(model, params)
    tparams = init_theano_params(params)
    # -------------------
    trng, use_noise, x, x_mask, y, y_mask, \
        hter, y_pred, cost = rnn_trg.build_model(tparams, options)  # changed: call rnn_trg here
    inps = [x, x_mask, y, y_mask]
    f_pred = theano.function(inps, y_pred, profile=False)

    test = data_iterator.TextIterator(datasets[0], datasets[1],
                                      options['dictionaries'][0],
                                      options['dictionaries'][1],
                                      n_words_source=options['n_words_src'],
                                      n_words_target=options['n_words_tgt'],
                                      batch_size=options['valid_batch_size'],
                                      maxlen=options['maxlen'],
                                      sort_by_length=False)
    res = []
    n_samples = 0
    for x, y in test:
        x, x_mask, y, y_mask = nmt.prepare_data(x, y,
                                                maxlen=options['maxlen'],
                                                n_words_src=options['n_words_src'],
                                                n_words=options['n_words_tgt'])
        res.extend(list(f_pred(x, x_mask, y, y_mask).flatten()))
        n_samples += x.shape[1]
        print 'processed:', n_samples, 'samples'

    with open('qe/' + save_file, 'w') as fp:
        for hh in res:
            fp.write(str(hh) + '\n')
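# A hedged convenience wrapper, not part of the original file: it simply calls qe_rnn_trg
# with the defaults shown above, and assumes the WMT17 de-en checkpoint, the BPE'd test16
# files and the 'qe/' output directory already exist on disk.
def predict_test16_hter():
    qe_rnn_trg(model='RNN_trg_model/wmt17.de-en.npz',
               datasets=['test/test16.bpe.src', 'test/test16.bpe.mt'],
               save_file='test16.hter.pred')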
def build_model(tparams, options):
    """
    @first: build the QE model on top of the pre-trained NMT model's f_tt output
    """
    old_options = load_config(options['nmt_model'])
    params = nmt.init_params(old_options)
    params = load_params(options['nmt_model'], params)
    old_tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = nmt.build_model(old_tparams, old_options)

    hter = tensor.matrix('hter', dtype='float32')
    Wt = old_tparams['ff_logit_W']
    #w2v = tensor.matrix('w2v', dtype='float32')

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, 500])  # 500 = dim_word of the NMT model
    emb = emb * tt

    # whether to use dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer((2, n_samples, options['dim']),
                                           use_noise, trng,
                                           retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']),
                                           use_noise, trng,
                                           retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer((n_timesteps, n_samples, 1),
                                              use_noise, trng,
                                              retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout, (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    #if options['use_dropout']:
    #    emb *= source_dropout
    #emb = get_qv_w2c(emb, y, w2v, dim=500)

    proj = gru_layer(tparams, emb, options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)
    hh = proj[0][-1, :, :]

    y_pred = tensor.dot(hh, tparams['final_W'])  # this variant also gives reasonable results
    #y_pred = tensor.nnet.sigmoid(tensor.dot(hh, tparams['W']))

    cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost
def encoder_hidden(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """
    @function: extract encoder hidden-state features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, decoderh = build_model(tparams, options)

    # load the data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,              # large enough to keep every sentence
                         sort_by_length=False)     # keep the original order
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0],
                        options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,               # large enough to keep every sentence
                        sort_by_length=False)      # keep the original order

    f_ctx = theano.function([x, x_mask], ctx, name='f_ctx')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        # sentence representation: backward state at the first position
        # concatenated with the forward state at the last position
        encoderh = numpy.concatenate([encoderh[0, :, 1024:],
                                      encoderh[-1, :, :1024]], axis=1)
        with open('features/hidden/train.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.write('\t'.join(map(str, hh_data)) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    #################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        encoderh = numpy.concatenate([encoderh[0, :, 1024:],
                                      encoderh[-1, :, :1024]], axis=1)
        with open('features/hidden/test.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.write('\t'.join(map(str, hh_data)) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
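# A small numpy illustration, not part of the pipeline, of the sentence representation
# built in encoder_hidden above. It assumes the usual Nematus layout of the encoder
# context ctx: shape (src_len, batch, 2*dim), forward states in the first dim units and
# backward states in the last dim units; the representation keeps the backward state at
# the first position and the forward state at the last position.
def _sentence_repr_demo(src_len=7, batch=2, dim=1024):
    rng = numpy.random.RandomState(0)
    ctx = rng.rand(src_len, batch, 2 * dim).astype('float32')
    rep = numpy.concatenate([ctx[0, :, dim:], ctx[-1, :, :dim]], axis=1)
    assert rep.shape == (batch, 2 * dim)
    return rep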
def build_model(tparams, options):
    """
    @first: build the QE model on top of the NMT model's f_tt output
    """
    nmt_options = load_config(options['nmt_model'])

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, nmt_cost, ctx, tt = nmt.build_model(tparams, nmt_options)  # *** tparams ***

    # drop the parameter ff_logit_b: it is not needed to compute the cost
    tparams.pop('ff_logit_b')

    hter = tensor.matrix('hter', dtype='float32')
    Wt = tparams['ff_logit_W']  # old_tparams['ff_logit_W']

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    emb = emb * tt

    # whether to use dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer((2, n_samples, options['dim']),
                                           use_noise, trng,
                                           retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']),
                                           use_noise, trng,
                                           retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer((n_timesteps, n_samples, 1),
                                              use_noise, trng,
                                              retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout, (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    proj = gru_layer(tparams, emb, options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)
    hh = proj[0][-1, :, :]

    y_pred = tensor.dot(hh, tparams['final_W'])  # this variant also gives reasonable results

    final_cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]
    cost = final_cost

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost
def __init__(self, model_files, configs, model_weights=None):
    """
    Create a ConstrainedTM using Nematus translation models

    Args:
      configs: a list of dicts (one per model), each containing key-->value pairs
        for the arguments supported by `nematus/translate.py`
    """
    if configs is not None:
        assert len(model_files) == len(configs), 'Number of models differs from number of config files'

    trng = RandomStreams(1234)
    # don't use noise
    use_noise = shared(numpy.float32(0.))

    self.eos_token = '<eos>'

    self.fs_init = []
    self.fs_next = []

    # each entry in self.word_dicts is:
    # `{'input_dicts': [...], 'input_idicts': [...], 'output_dict': <dict>, 'output_idict': <dict>}`
    self.word_dicts = []

    if configs is None:
        # Nematus models with new format (no separate config)
        configs = []
        for model in model_files:
            configs.append(load_config(model))
            # backward compatibility
            fill_options(configs[-1])

    for model, config in zip(model_files, configs):
        # fill in any unspecified options in-place
        fill_options(config)
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # load model-specific input and output vocabularies
        # Note: some models have multiple input factors -- if so, we need to split that
        # model's input into factors using the same logic that was used at training time
        # Note: every model's output vocabulary must be exactly the same in order to do
        # ensemble decoding
        self.word_dicts.append(
            self.load_dictionaries(config['dictionaries'],
                                   n_words_src=config.get('n_words_src', None),
                                   n_words_trg=config.get('n_words', None)))

        # WORKING: add passing attention model alignment through GBS
        # f_init, f_next = build_sampler(tparams, config, use_noise, trng,
        #                                return_alignment=config['return_alignment'])
        f_init, f_next = build_sampler(tparams, config, use_noise, trng,
                                       return_alignment=True)

        self.fs_init.append(f_init)
        self.fs_next.append(f_next)

    # Make sure all output dicts have the same number of items
    assert len(set(len(d['output_dict']) for d in self.word_dicts)) == 1, \
        'Output vocabularies must be identical'

    self.num_models = len(self.fs_init)
    if model_weights is None:
        self.model_weights = numpy.ones(self.num_models) / float(self.num_models)
    else:
        assert len(model_weights) == self.num_models, \
            'if you specify weights, there must be one per model'
        self.model_weights = numpy.array(model_weights)
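# A hedged usage sketch (kept as a comment, since the enclosing class is not shown in
# full here): build a single-model "ensemble" from one Nematus checkpoint. ConstrainedTM
# is assumed to be the enclosing class and the path is a placeholder; passing configs=None
# makes the constructor read each model's own saved .json config via load_config.
#
#     tm = ConstrainedTM(model_files=['model/model.npz.best_bleu'], configs=None)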
def get_qv(model='model/model.npz.best_bleu'):
    """
    @function: extract the quality vectors (one per target token)
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = build_model(tparams, options)

    # load the data
    train = TextIterator(options['datasets'][0], options['datasets'][1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=options['batch_size'],
                         maxlen=1000,              # large enough to keep every sentence
                         sort_by_length=False)     # keep the original order
    dev = TextIterator(options['valid_datasets'][0], options['valid_datasets'][1],
                       options['dictionaries'][0],
                       options['dictionaries'][1],
                       n_words_source=options['n_words_src'],
                       n_words_target=options['n_words_tgt'],
                       batch_size=options['valid_batch_size'],
                       maxlen=1000,                # large enough to keep every sentence
                       sort_by_length=False)       # keep the original order

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')
    #print tparams['ff_logit_W'].get_value().shape  #### (500, 40000)

    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    # index tt_ by sample j (not 0) so every sentence in the batch
                    # uses its own decoder states
                    qv = tt_[i, j, :].T * Wt[:, index]
                    qv_.append(list(qv))
            with open('qv/train/' + str(n_samples + j) + '.qv.pkl', 'w') as fp:
                pkl.dump(qv_, fp)
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
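# A minimal numpy sketch, not part of the original code, of how one quality-vector entry
# is formed in get_qv/extract_qv: the decoder's pre-softmax state tt for a target token is
# multiplied element-wise with the column of ff_logit_W belonging to the observed word.
# The shapes (500, 40000) follow the commented shape printout above; the random values and
# the word id are placeholders.
def _qv_demo(dim_word=500, n_words=40000, word_id=123):
    rng = numpy.random.RandomState(0)
    tt_step = rng.rand(dim_word).astype('float32')       # decoder output state for one token
    Wt = rng.rand(dim_word, n_words).astype('float32')   # stands in for ff_logit_W
    qv = tt_step * Wt[:, word_id]                        # element-wise product, shape (dim_word,)
    return qv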
def alignment(model='model/model.npz.best_bleu',
              train=['test/train.bpe.en', 'test/train.bpe.es'],
              test=['test/test.bpe.en', 'test/test.bpe.es'],
              batch_size=10):
    """
    @function: extract word-alignment (attention) features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load the data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,              # large enough to keep every sentence
                         sort_by_length=False)     # keep the original order
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0],
                        options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,               # large enough to keep every sentence
                        sort_by_length=False)      # keep the original order

    f_align = theano.function([x, x_mask, y, y_mask], opt_ret, name='f_align')

    #################### train #######################
    # The train pass below is currently disabled; re-enable it to also dump
    # alignments for the training set.
    """
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        align = f_align(x, x_mask, y, y_mask)['dec_alphas']  # (y, batch_size, x)
        align = align * y_mask[:, :, None]  # mask out the padded target positions
        align_shp = align.shape
        for j in range(align_shp[1]):
            row_ = int(numpy.sum(y_mask[:, j]))
            col_ = int(numpy.sum(x_mask[:, j]))
            align_data = align[:row_, j, :col_]  # word-alignment matrix
            with open('features/alignment/train.en-es.word.align', 'a+') as fp:
                for data in align_data:
                    fp.write('\t'.join(map(str, data)) + '\n')
                fp.write('\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
    """

    #################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        align = f_align(x, x_mask, y, y_mask)['dec_alphas']  # (y, batch_size, x)
        align = align * y_mask[:, :, None]  # mask out the padded target positions
        align_shp = align.shape
        for j in range(align_shp[1]):
            row_ = int(numpy.sum(y_mask[:, j]))
            col_ = int(numpy.sum(x_mask[:, j]))
            align_data = align[:row_, j, :col_]  # word-alignment matrix
            with open('features/alignment/test.en-es.word.align', 'a+') as fp:
                for data in align_data:
                    fp.write('\t'.join(map(str, data)) + '\n')
                fp.write('\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
def word_embedding(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   dev=['test/dev.bpe.en', 'test/dev.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """
    @function: extract word-embedding features
    """
    options = load_config(model)             # load the saved hyper-parameters
    params = init_params(options)
    params = load_params(model, params)      # load the model parameters

    # load the data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,              # large enough to keep every sentence
                         sort_by_length=False)     # keep the original order
    dev = TextIterator(dev[0], dev[1],
                       options['dictionaries'][0],
                       options['dictionaries'][1],
                       n_words_source=options['n_words_src'],
                       n_words_target=options['n_words_tgt'],
                       batch_size=batch_size,
                       maxlen=1000,
                       sort_by_length=False)
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0],
                        options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,
                        sort_by_length=False)

    #################### train #######################
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']
    n_samples = 0
    for x, y in train:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        with open('features/emb/train.es-en.es.emb', 'a+') as fp:
            for x_row in x_emb:
                fp.write('\t'.join(map(str, x_row)) + '\n')
        with open('features/emb/train.es-en.en.emb', 'a+') as fp:
            for y_row in y_emb:
                fp.write('\t'.join(map(str, y_row)) + '\n')
        n_samples += len(x)
        print 'processed:', n_samples, 'samples ...'

    #################### test ########################
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']
    n_samples = 0
    for x, y in test:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        with open('features/emb/test.es-en.es.emb', 'a+') as fp:
            for x_row in x_emb:
                fp.write('\t'.join(map(str, x_row)) + '\n')
        with open('features/emb/test.es-en.en.emb', 'a+') as fp:
            for y_row in y_emb:
                fp.write('\t'.join(map(str, y_row)) + '\n')
        n_samples += len(x)
        print 'processed:', n_samples, 'samples ...'

    #################### dev #########################
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']
    n_samples = 0
    for x, y in dev:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        with open('features/emb/dev.es-en.es.emb', 'a+') as fp:
            for x_row in x_emb:
                fp.write('\t'.join(map(str, x_row)) + '\n')
        with open('features/emb/dev.es-en.en.emb', 'a+') as fp:
            for y_row in y_emb:
                fp.write('\t'.join(map(str, y_row)) + '\n')
        n_samples += len(x)
        print 'processed:', n_samples, 'samples ...'
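# get_emb is not defined in this file. The sketch below is an assumption about its
# behaviour, inferred only from how word_embedding uses it (one feature row per sentence,
# written as a tab-separated line): each sentence's token-id list is mapped to the mean of
# the corresponding embedding rows. The name get_emb_sketch is hypothetical.
def get_emb_sketch(sentences, Wemb):
    rows = []
    for sent in sentences:
        ids = [i for i in sent if 0 <= i < Wemb.shape[0]]  # guard against out-of-range ids
        if len(ids) == 0:
            rows.append(numpy.zeros(Wemb.shape[1], dtype='float32'))
        else:
            rows.append(Wemb[ids].mean(axis=0))
    return rows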
def extract_logprob(model='model/model.npz.best_bleu',
                    train=['test/train.bpe.en', 'test/train.bpe.es'],
                    test=['test/test.bpe.en', 'test/test.bpe.es'],
                    batch_size=10):
    """
    @function: extract log-likelihood (per-sentence cost) features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load the data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,              # large enough to keep every sentence
                         sort_by_length=False)     # keep the original order
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0],
                        options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,               # large enough to keep every sentence
                        sort_by_length=False)      # keep the original order

    f_cost = theano.function([x, x_mask, y, y_mask], cost, name='f_cost')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/train.es-en.logprob', 'a+') as fp:
            fp.write('\n'.join(map(str, list(logprob))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    #################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/test.es-en.logprob', 'a+') as fp:
            fp.write('\n'.join(map(str, list(logprob))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
def extract_qv(model='model/model.npz.best_bleu',
               train=['test/train.bpe.en', 'test/train.bpe.es'],
               test=['test/test.bpe.en', 'test/test.bpe.es'],
               batch_size=10):
    """
    @function: extract the quality vector (averaged over the target tokens)
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load the data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0],
                         options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,              # large enough to keep every sentence
                         sort_by_length=False)     # keep the original order
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0],
                        options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,               # large enough to keep every sentence
                        sort_by_length=False)      # keep the original order

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            qv_ = list(map(str, qv_.mean(axis=0)))
            with open('features/train.nmt.qv', 'a+') as fp:
                fp.write('\t'.join(qv_) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    #################### test ########################
    n_samples = 0
    for x, y in test:  #*****
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            qv_ = list(map(str, qv_.mean(axis=0)))
            with open('features/test.nmt.qv', 'a+') as fp:  #*****
                fp.write('\t'.join(qv_) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
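# A hedged driver sketch, not part of the original script: it assumes extract_logprob and
# extract_qv live in this module, that the default checkpoint model/model.npz.best_bleu
# and the test/*.bpe.* files exist, and that the features/ output directories have been
# created beforehand.
if __name__ == '__main__':
    extract_logprob(batch_size=10)
    extract_qv(batch_size=10)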