Example #1
0
def init_params(options):
    params = OrderedDict()

    #获取nmt的参数
    nmt_options = load_config(options['nmt_model'])
    nmt_params = nmt.init_params(nmt_options)
    nmt_params = load_params(options['nmt_model'], nmt_params)
    for key in nmt_params:
        params[key] = nmt_params[key]

    #获取final_rnn的参数
    final_options = load_config(options['final_model'])
    final_params = rnn.init_params(final_options)
    final_params = load_params(options['final_model'], final_params)
    for key in final_params:
        params[key] = final_params[key]

    return params
Example #2
0
def qe_rnn_trg(model='RNN_trg_model/wmt17.de-en.npz',
               datasets=['test/test16.bpe.src', 'test/test16.bpe.mt'],
               save_file='test16.hter.pred'):

    options = load_config(model)
    #-------------------
    params = rnn_trg.init_params(options)  # 修改此处
    params = load_params(model, params)
    tparams = init_theano_params(params)

    #-------------------
    trng,use_noise,x,x_mask,y,y_mask,\
        hter,y_pred,cost = rnn_trg.build_model(tparams,options) # 修改此处
    inps = [x, x_mask, y, y_mask]
    f_pred = theano.function(inps, y_pred, profile=False)

    test = data_iterator.TextIterator(datasets[0],
                                      datasets[1],
                                      options['dictionaries'][0],
                                      options['dictionaries'][1],
                                      n_words_source=options['n_words_src'],
                                      n_words_target=options['n_words_tgt'],
                                      batch_size=options['valid_batch_size'],
                                      maxlen=options['maxlen'],
                                      sort_by_length=False)

    res = []
    n_samples = 0

    for x, y in test:
        x, x_mask, y, y_mask = nmt.prepare_data(
            x,
            y,
            maxlen=options['maxlen'],
            n_words_src=options['n_words_src'],
            n_words=options['n_words_tgt'])

        res.extend(list(f_pred(x, x_mask, y, y_mask).flatten()))
        n_samples += x.shape[1]
        print 'processed:', n_samples, 'samples'

    with open('qe/' + save_file, 'w') as fp:
        for hh in res:
            fp.writelines(str(hh) + '\n')
Example #3
0
def build_model(tparams, options):
    """
    @first:得到f_tt函数
    """
    old_options = load_config(options['nmt_model'])
    params = nmt.init_params(old_options)
    params = load_params(options['nmt_model'], params)
    old_tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt = nmt.build_model(old_tparams,old_options)

    hter = tensor.matrix('hter', dtype='float32')
    Wt = old_tparams['ff_logit_W']
    #w2v = tensor.matrix('w2v',dtype='float32')

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, 500])
    emb = emb * tt

    #是否使用 dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer((2, n_samples, options['dim']),
                                           use_noise, trng,
                                           retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']),
                                           use_noise, trng,
                                           retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer(
            (n_timesteps, n_samples, 1), use_noise, trng,
            retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout,
                                     (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    #if options['use_dropout']:
    #    emb *= source_dropout
    #emb = get_qv_w2c(emb,y,w2v,dim=500)

    proj = gru_layer(tparams,
                     emb,
                     options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)

    hh = proj[0][-1, :, :]
    y_pred = tensor.dot(hh, tparams['final_W'])  #此时得出的结果也不错

    #y_pred = tensor.nnet.sigmoid(tensor.dot(hh,tparams['W']))
    cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost
Example #4
0
def encoder_hidden(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """
    @function:获得对数似然特征
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, decoderh = build_model(tparams,options)

    #加载数据
    train = TextIterator(
        train[0],
        train[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    test = TextIterator(
        test[0],
        test[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    f_ctx = theano.function([x, x_mask], ctx, name='f_ctx')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)

        with open('features/hidden/train.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x), list(hh_data))) +
                              '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)

        with open('features/hidden/test.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x), list(hh_data))) +
                              '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
Example #5
0
def build_model(tparams, options):
    """
    @first:得到f_tt函数
    """
    nmt_options = load_config(options['nmt_model'])
    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, nmt_cost, ctx, tt = nmt.build_model(tparams,nmt_options) # *** tparams ***

    #删除网络参数ff_logit_b,因为计算cost用不到此参数
    tparams.pop('ff_logit_b')

    hter = tensor.matrix('hter', dtype='float32')
    Wt = tparams['ff_logit_W']  #old_tparams['ff_logit_W']

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    emb = emb * tt

    #是否使用 dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer((2, n_samples, options['dim']),
                                           use_noise, trng,
                                           retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']),
                                           use_noise, trng,
                                           retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer(
            (n_timesteps, n_samples, 1), use_noise, trng,
            retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout,
                                     (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    proj = gru_layer(tparams,
                     emb,
                     options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)

    hh = proj[0][-1, :, :]
    y_pred = tensor.dot(hh, tparams['final_W'])  #此时得出的结果也不错

    final_cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]
    cost = final_cost

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost
Example #6
0
    def __init__(self, model_files, configs, model_weights=None):
        """"
        Create a ConstrainedTM using Nematus translation models

        Args:
          config: a dict containing key-->value for each argument supported by `nematus/translate.py`

        """
        if configs is not None:
            assert len(model_files) == len(
                configs), 'Number of models differs from numer of config files'

        trng = RandomStreams(1234)
        # don't use noise
        use_noise = shared(numpy.float32(0.))

        self.eos_token = '<eos>'

        self.fs_init = []
        self.fs_next = []

        # each entry in self.word_dicts is:
        # `{'input_dicts': [...], 'input_idicts': [...], 'output_dict': <dict>, 'output_idict': <dict>}
        self.word_dicts = []

        if configs is None:
            # Nematus models with new format (no separate config)
            configs = []
            for model in model_files:
                configs.append(load_config(model))
                # backward compatibility
                fill_options(configs[-1])

        for model, config in zip(model_files, configs):
            # fill in any unspecified options in-place
            fill_options(config)
            param_list = numpy.load(model).files
            param_list = dict.fromkeys(
                [key for key in param_list if not key.startswith('adam_')], 0)
            params = load_params(model, param_list)
            tparams = init_theano_params(params)

            # load model-specific input and output vocabularies
            # Note: some models have multiple input factors -- if so, we need to split that model's input into factors
            #   using the same logic that was used at training time
            # Note: every model's output vocabulary must be exactly the same in order to do ensemble decoding
            self.word_dicts.append(
                self.load_dictionaries(config['dictionaries'],
                                       n_words_src=config.get(
                                           'n_words_src', None),
                                       n_words_trg=config.get('n_words',
                                                              None)))

            # WORKING: add passing attention model alignment through GBS
            # f_init, f_next = build_sampler(tparams, config, use_noise, trng,
            #                                return_alignment=config['return_alignment'])
            f_init, f_next = build_sampler(tparams,
                                           config,
                                           use_noise,
                                           trng,
                                           return_alignment=True)

            self.fs_init.append(f_init)
            self.fs_next.append(f_next)

        # Make sure all output dicts have the same number of items
        assert len(
            set(len(d['output_dict']) for d in
                self.word_dicts)) == 1, 'Output vocabularies must be identical'

        self.num_models = len(self.fs_init)

        if model_weights is None:
            self.model_weights = numpy.ones(self.num_models) / float(
                self.num_models)
        else:
            assert len(
                model_weights
            ) == self.num_models, 'if you specify weights, there must be one per model'
            self.model_weights = numpy.array(model_weights)
Example #7
0
def get_qv(model='model/model.npz.best_bleu'):
    """
    @function:获得质量向量(quality vector)
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt = build_model(tparams,options)

    #加载数据
    train = TextIterator(
        options['datasets'][0],
        options['datasets'][1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['batch_size'],
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    dev = TextIterator(
        options['valid_datasets'][0],
        options['valid_datasets'][1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['valid_batch_size'],
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #print tparams['ff_logit_W'].get_value().shape   #### (500,40000)
    n_samples = 0

    for x, y in train:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()

        for j in range(y.shape[1]):

            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = tt_[i, 0, :].T * Wt[:, index]
                    qv_.append(list(qv))

            with open('qv/train/' + str(n_samples + j) + '.qv.pkl', 'w') as fp:
                pkl.dump(qv_, fp)

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
Example #8
0
def alignment(
        model='model/model.npz.best_bleu',
        train=['test/train.bpe.en','test/train.bpe.es'],
        test=['test/test.bpe.en','test/test.bpe.es'],
        batch_size=10
        ):
    """
    @function:获得对数似然特征
    """
    options = load_config(model)
    
    params = init_params(options)
    params = load_params(model, params)
    
    tparams = init_theano_params(params)
    
    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, _ = build_model(tparams,options)
    
    #加载数据
    train = TextIterator(train[0], train[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, #设置尽可能长的长度
                        sort_by_length=False) #设为 False
    
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, #设置尽可能长的长度
                        sort_by_length=False) #设为 False
    
    f_align = theano.function([x,x_mask,y,y_mask],opt_ret,name='f_cost')

    #################### train #######################
    """
    n_samples = 0
    for x, y in train:
            # 准备数据用于训练
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                                n_words_src=options['n_words_src'],
                                                n_words=options['n_words_tgt'])
            align = f_align(x,x_mask,y,y_mask)['dec_alphas'] # (y, batch_size, x)
            align = align * y_mask[:,:,None] # 注意此处技巧
            align_shp = align.shape
            for j in range(align_shp[1]):
                row_ = int(numpy.sum(y_mask[:,j]))
                col_ = int(numpy.sum(x_mask[:,j]))
                align_data = align[:row_,j,:col_] # 词对齐矩阵
                
                with open('features/alignment/train.en-es.word.align','a+') as fp:
                    for data in align_data:
                        fp.writelines('\t'.join(map(lambda x:str(x), data))+'\n')
                    fp.writelines('\n')
                     
            n_samples += y.shape[1]
            print 'processed:',n_samples,'samples ...'
    """
    ################### test ########################
    n_samples = 0
    for x, y in test:
            # 准备数据用于训练
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                                n_words_src=options['n_words_src'],
                                                n_words=options['n_words_tgt'])
            align = f_align(x,x_mask,y,y_mask)['dec_alphas'] # (y, batch_size, x)
            align = align * y_mask[:,:,None] # 注意此处技巧
            align_shp = align.shape
            for j in range(align_shp[1]):
                row_ = int(numpy.sum(y_mask[:,j]))
                col_ = int(numpy.sum(x_mask[:,j]))
                align_data = align[:row_,j,:col_] # 词对齐矩阵
                
                with open('features/alignment/test.en-es.word.align','a+') as fp:
                    for data in align_data:
                        fp.writelines('\t'.join(map(lambda x:str(x), data))+'\n')
                    fp.writelines('\n')
                     
            n_samples += y.shape[1]
            print 'processed:',n_samples,'samples ...'
def word_embedding(
        model='model/model.npz.best_bleu',
        train=['test/train.bpe.en','test/train.bpe.es'],
        dev=['test/dev.bpe.en','test/dev.bpe.es'],
        test=['test/test.bpe.en','test/test.bpe.es'],
        batch_size=10
        ):
    """
    @function:获得词向量
    """
    options = load_config(model) # 加载设置的超参数
    
    params = init_params(options)
    params = load_params(model, params) # 加载模型参数
    
    #加载数据
    train = TextIterator(train[0], train[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, #设置尽可能长的长度
                        sort_by_length=False) #设为 False
    
    dev = TextIterator(dev[0], dev[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, #设置尽可能长的长度
                        sort_by_length=False) #设为 False
    
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, #设置尽可能长的长度
                        sort_by_length=False) #设为 False
    
    #################### train #######################
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']
    
    n_samples = 0
    for x, y in train:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        
        with open('features/emb/train.es-en.es.emb','a+') as fp:
            for x_row in x_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), x_row))+'\n')
        with open('features/emb/train.es-en.en.emb','a+') as fp:
            for y_row in y_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), y_row))+'\n')
                
        n_samples += len(x)
        print 'processed:',n_samples,'samples ...'
    
    ################### test ########################
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']
    
    n_samples = 0
    for x, y in test:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        
        with open('features/emb/test.es-en.es.emb','a+') as fp:
            for x_row in x_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), x_row))+'\n')
        with open('features/emb/test.es-en.en.emb','a+') as fp:
            for y_row in y_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), y_row))+'\n')
                
        n_samples += len(x)
        print 'processed:',n_samples,'samples ...'
    
    ################### dev ########################
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']
    
    n_samples = 0
    for x, y in dev:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        
        with open('features/emb/dev.es-en.es.emb','a+') as fp:
            for x_row in x_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), x_row))+'\n')
        with open('features/emb/dev.es-en.en.emb','a+') as fp:
            for y_row in y_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), y_row))+'\n')
                
        n_samples += len(x)
        print 'processed:',n_samples,'samples ...'
Example #10
0
def extract_logprob(model='model/model.npz.best_bleu',
                    train=['test/train.bpe.en', 'test/train.bpe.es'],
                    test=['test/test.bpe.en', 'test/test.bpe.es'],
                    batch_size=10):
    """
    @function:获得对数似然特征
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, _ = build_model(tparams,options)

    #加载数据
    train = TextIterator(
        train[0],
        train[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    test = TextIterator(
        test[0],
        test[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    f_cost = theano.function([x, x_mask, y, y_mask], cost, name='f_cost')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/train.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x), list(logprob))) +
                          '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/test.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x), list(logprob))) +
                          '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
def extract_qv(model='model/model.npz.best_bleu',
               train=['test/train.bpe.en', 'test/train.bpe.es'],
               test=['test/test.bpe.en', 'test/test.bpe.es'],
               batch_size=10):
    """
    @function:获得质量向量(quality vector)
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, _ = build_model(tparams,options)

    #加载数据
    train = TextIterator(
        train[0],
        train[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    test = TextIterator(
        test[0],
        test[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()

        for j in range(y.shape[1]):

            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))

            with open('features/train.nmt.qv', 'a+') as fp:
                fp.writelines('\t'.join(qv_) + '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:  #*****
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()

        for j in range(y.shape[1]):

            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))

            with open('features/test.nmt.qv', 'a+') as fp:  #*****
                fp.writelines('\t'.join(qv_) + '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'