def init_params(options):
    """Collect the parameters of the NMT model and of the final RNN model.

    For each of the two sub-models the saved configuration is loaded, fresh
    parameters are initialised from it, and the values stored on disk are
    loaded on top.  Both parameter sets are merged into one ordered mapping;
    keys of the final RNN overwrite NMT keys with the same name.

    Args:
        options: mapping that provides the model paths under the keys
            ``'nmt_model'`` and ``'final_model'``.

    Returns:
        An ``OrderedDict`` holding the NMT parameters followed by the final
        RNN parameters.
    """
    merged = OrderedDict()

    # NMT parameters first, final-RNN parameters second.
    for builder, path_key in ((nmt, 'nmt_model'), (rnn, 'final_model')):
        model_path = options[path_key]
        sub_options = load_config(model_path)
        sub_params = load_params(model_path, builder.init_params(sub_options))
        for name in sub_params:
            merged[name] = sub_params[name]

    return merged


# Example #2
# 0
def build_model(tparams, options):
    """Build the HTER regression graph on top of a reloaded NMT model.

    The NMT model stored at ``options['nmt_model']`` is reloaded and its graph
    is rebuilt with ``nmt.build_model``.  For every target token the matching
    column of the NMT ``ff_logit_W`` matrix is gathered and multiplied
    element-wise with the ``tt`` tensor returned by the NMT graph.  The
    resulting sequence is fed through the ``final_encoder`` GRU; the state at
    the last time step is projected with ``tparams['final_W']`` to give the
    prediction, and the cost is the mean absolute error against ``hter``.

    Args:
        tparams: Theano parameters of the regression model; ``'final_W'`` is
            read here and the whole mapping is handed to ``gru_layer`` with
            the prefix ``'final_encoder'``.
        options: option mapping.  ``'nmt_model'`` and ``'use_dropout'`` are
            always read; with dropout enabled ``'dropout_embedding'``,
            ``'dropout_hidden'``, ``'dropout_source'``, ``'model_version'``,
            ``'dim'`` and ``'dim_word'`` are read as well.

    Returns:
        The tuple ``(trng, use_noise, x, x_mask, y, y_mask, hter, y_pred,
        cost)``; the first six items are passed through from
        ``nmt.build_model``.
    """
    # Step 1: rebuild the pretrained NMT graph to get hold of `tt`.
    old_options = load_config(options['nmt_model'])
    params = nmt.init_params(old_options)
    params = load_params(options['nmt_model'], params)
    old_tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = nmt.build_model(old_tparams, old_options)

    hter = tensor.matrix('hter', dtype='float32')
    Wt = old_tparams['ff_logit_W']

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    # Gather one row of Wt.T per target token.  The feature width is taken
    # from Wt itself instead of being hard-coded, so the graph works for any
    # size of ff_logit_W.
    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, Wt.shape[0]])
    emb = emb * tt

    # Dropout masks (all-ones placeholders when dropout is disabled).
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        # Model versions below 0.1 use unscaled dropout.
        scaled = not (options['model_version'] < 0.1)
        rec_dropout = shared_dropout_layer((2, n_samples, options['dim']),
                                           use_noise, trng,
                                           retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']),
                                           use_noise, trng,
                                           retain_probability_emb, scaled)
        # NOTE(review): source_dropout is built but never applied to `emb`
        # below.  It is kept because dropping the call would presumably change
        # the sequence of draws taken from `trng` - confirm before removing.
        source_dropout = shared_dropout_layer(
            (n_timesteps, n_samples, 1), use_noise, trng,
            retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout,
                                     (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    proj = gru_layer(tparams,
                     emb,
                     options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)

    # Use the GRU state at the last time step as the sentence representation.
    hh = proj[0][-1, :, :]
    # Plain linear projection (no sigmoid); the original author noted that
    # this variant already gives decent results.
    y_pred = tensor.dot(hh, tparams['final_W'])

    # Mean absolute error over the batch; the first column is selected.
    cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost


# Example #3
# 0
def _dump_encoder_states(f_ctx, batches, out_path, options, fwd_dim=1024):
    """Append one tab-separated encoder feature line per sentence to a file.

    Args:
        f_ctx: compiled function mapping ``(x, x_mask)`` to the context
            tensor; it is indexed as ``[time, sample, feature]`` here.
        batches: iterable yielding ``(source, target)`` mini-batches.
        out_path: file that the feature lines are appended to.
        options: option mapping; ``'n_words_src'`` and ``'n_words_tgt'`` are
            forwarded to ``prepare_data``.
        fwd_dim: feature index at which the context is split.  Presumably the
            width of the forward encoder state (the context being
            forward + backward states) - TODO confirm against the model.
    """
    n_samples = 0
    # Open the output once per corpus instead of once per mini-batch.
    with open(out_path, 'a') as fp:
        for src, tgt in batches:
            x, x_mask, y, y_mask = prepare_data(
                src,
                tgt,
                maxlen=1000,
                n_words_src=options['n_words_src'],
                n_words=options['n_words_tgt'])
            ctx = f_ctx(x, x_mask)
            # Upper feature half at the first position followed by the lower
            # feature half at the last position.
            features = numpy.concatenate(
                [ctx[0, :, fwd_dim:], ctx[-1, :, :fwd_dim]], axis=1)

            for row in features:
                fp.write('\t'.join(map(str, row)) + '\n')

            n_samples += y.shape[1]
            print('processed: %s samples ...' % n_samples)


def encoder_hidden(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """Export encoder hidden-state features for the train and test corpora.

    The model is loaded from ``model``, the context function is compiled and
    every sentence of both corpora is written as one tab-separated line to
    ``features/hidden/train.en-es.encoderh`` and
    ``features/hidden/test.en-es.encoderh`` respectively (append mode).

    Args:
        model: path of the saved model; also used to load its configuration.
        train: ``[source_file, target_file]`` of the training corpus.
        test: ``[source_file, target_file]`` of the test corpus.
        batch_size: number of sentences per mini-batch.
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    (_trng, _use_noise, x, x_mask, _y, _y_mask,
     _opt_ret, _cost, ctx, _tt, _decoderh) = build_model(tparams, options)

    def open_corpus(paths):
        # Very large maxlen and no length sorting, so that every sentence is
        # kept and the output order matches the input files.
        return TextIterator(
            paths[0],
            paths[1],
            options['dictionaries'][0],
            options['dictionaries'][1],
            n_words_source=options['n_words_src'],
            n_words_target=options['n_words_tgt'],
            batch_size=batch_size,
            maxlen=1000,
            sort_by_length=False)

    train_iter = open_corpus(train)
    test_iter = open_corpus(test)

    f_ctx = theano.function([x, x_mask], ctx, name='f_ctx')

    _dump_encoder_states(f_ctx, train_iter,
                         'features/hidden/train.en-es.encoderh', options)
    _dump_encoder_states(f_ctx, test_iter,
                         'features/hidden/test.en-es.encoderh', options)


# Example #4
# 0
def get_qv(model='model/model.npz.best_bleu'):
    """Dump per-token quality vectors for the training corpus.

    For every target token of every training sentence the ``tt`` output of
    the model at that position is multiplied element-wise with the column of
    ``ff_logit_W`` belonging to the token.  The vectors of one sentence are
    pickled as a list of lists to ``qv/train/<sentence index>.qv.pkl``.

    Args:
        model: path of the saved model; also used to load its configuration.
            The corpus is taken from ``options['datasets']`` of that
            configuration.
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    (_trng, _use_noise, x, x_mask, y, y_mask,
     _opt_ret, _cost, _ctx, tt) = build_model(tparams, options)

    # Very large maxlen and no length sorting, so that every sentence is kept
    # and the file index matches the line number in the corpus.
    train = TextIterator(
        options['datasets'][0],
        options['datasets'][1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['batch_size'],
        maxlen=1000,
        sort_by_length=False)

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    # The weights do not change while extracting, so fetch them only once.
    # The original author noted the shape as (500, 40000).
    Wt = tparams['ff_logit_W'].get_value()

    n_samples = 0
    for src, tgt in train:
        x_, x_mask_, y_, y_mask_ = prepare_data(
            src,
            tgt,
            maxlen=1000,
            n_words_src=options['n_words_src'],
            n_words=options['n_words_tgt'])
        tt_ = f_tt(x_, x_mask_, y_, y_mask_)

        for j in range(y_.shape[1]):
            # Use the tt slice of sample j itself (not of sample 0) and skip
            # padded positions.
            qv_ = [list(tt_[i, j, :] * Wt[:, y_[i][j]])
                   for i in range(y_.shape[0])
                   if y_mask_[i][j] == 1]

            # Pickle streams are binary data, hence 'wb'.
            with open('qv/train/' + str(n_samples + j) + '.qv.pkl',
                      'wb') as fp:
                pkl.dump(qv_, fp)

        n_samples += y_.shape[1]
        print('processed: %s samples ...' % n_samples)


# Example #5
# 0
def _dump_alignments(f_align, batches, out_path, options):
    """Append the word-alignment matrix of every sentence pair to a file.

    Each matrix is written as one tab-separated line per target position and
    is followed by an empty line.

    Args:
        f_align: compiled function taking ``(x, x_mask, y, y_mask)`` whose
            result is indexed with ``'dec_alphas'``; per the original
            author's note that entry has shape ``(y, batch_size, x)``.
        batches: iterable yielding ``(source, target)`` mini-batches.
        out_path: file that the matrices are appended to.
        options: option mapping; ``'n_words_src'`` and ``'n_words_tgt'`` are
            forwarded to ``prepare_data``.
    """
    n_samples = 0
    # Open the output once per corpus instead of once per sentence.
    with open(out_path, 'a') as fp:
        for src, tgt in batches:
            x, x_mask, y, y_mask = prepare_data(
                src, tgt, maxlen=1000,
                n_words_src=options['n_words_src'],
                n_words=options['n_words_tgt'])
            align = f_align(x, x_mask, y, y_mask)['dec_alphas']
            # Zero the rows that belong to padded target positions.
            align = align * y_mask[:, :, None]

            for j in range(align.shape[1]):
                # Sentence lengths are recovered from the masks so that the
                # padding is cut away.
                n_tgt = int(numpy.sum(y_mask[:, j]))
                n_src = int(numpy.sum(x_mask[:, j]))
                for row in align[:n_tgt, j, :n_src]:
                    fp.write('\t'.join(map(str, row)) + '\n')
                fp.write('\n')

            n_samples += y.shape[1]
            print('processed: %s samples ...' % n_samples)


def alignment(
        model='model/model.npz.best_bleu',
        train=['test/train.bpe.en','test/train.bpe.es'],
        test=['test/test.bpe.en','test/test.bpe.es'],
        batch_size=10
        ):
    """Export attention-based word alignments for the test corpus.

    The model is loaded from ``model`` and the alignment matrix of every test
    sentence pair is appended to ``features/alignment/test.en-es.word.align``.

    Args:
        model: path of the saved model; also used to load its configuration.
        train: ``[source_file, target_file]`` of the training corpus.  The
            training pass is disabled, so this argument is accepted for
            compatibility only and is not read.
        test: ``[source_file, target_file]`` of the test corpus.
        batch_size: number of sentences per mini-batch.
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    (_trng, _use_noise, x, x_mask, y, y_mask,
     opt_ret, _cost, _ctx, _tt, _) = build_model(tparams, options)

    # Very large maxlen and no length sorting, so that every sentence is kept
    # and the output order matches the input files.
    test_iter = TextIterator(
        test[0], test[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,
        sort_by_length=False)

    f_align = theano.function([x, x_mask, y, y_mask], opt_ret, name='f_align')

    # Only the test set is exported.  To export the training set as well,
    # build a TextIterator over `train` in the same way and pass it to
    # _dump_alignments with 'features/alignment/train.en-es.word.align'.
    _dump_alignments(f_align, test_iter,
                     'features/alignment/test.en-es.word.align', options)
def _dump_embeddings(batches, Wemb, Wemb_dec, src_path, tgt_path):
    """Append the embedding rows of every mini-batch to two text files.

    Args:
        batches: iterable yielding ``(x, y)`` mini-batches.
        Wemb: embedding matrix handed to ``get_emb`` together with ``x``.
        Wemb_dec: embedding matrix handed to ``get_emb`` together with ``y``.
        src_path: file receiving the rows computed from ``x``.
        tgt_path: file receiving the rows computed from ``y``.
    """
    n_samples = 0
    # Open both outputs once per split instead of once per mini-batch.
    with open(src_path, 'a') as src_fp, open(tgt_path, 'a') as tgt_fp:
        for x, y in batches:
            x_emb = get_emb(x, Wemb)
            y_emb = get_emb(y, Wemb_dec)

            for row in x_emb:
                src_fp.write('\t'.join(map(str, row)) + '\n')
            for row in y_emb:
                tgt_fp.write('\t'.join(map(str, row)) + '\n')

            n_samples += len(x)
            print('processed: %s samples ...' % n_samples)


def word_embedding(
        model='model/model.npz.best_bleu',
        train=['test/train.bpe.en','test/train.bpe.es'],
        dev=['test/dev.bpe.en','test/dev.bpe.es'],
        test=['test/test.bpe.en','test/test.bpe.es'],
        batch_size=10
        ):
    """Export word-embedding features for the train, test and dev corpora.

    The parameters ``'Wemb'`` and ``'Wemb_dec'`` of the saved model are used
    to embed the first and the second side of every mini-batch through
    ``get_emb``.  The rows are appended to
    ``features/emb/<split>.es-en.es.emb`` (first side) and
    ``features/emb/<split>.es-en.en.emb`` (second side).

    Args:
        model: path of the saved model; also used to load its configuration.
        train: ``[source_file, target_file]`` of the training corpus.
        dev: ``[source_file, target_file]`` of the development corpus.
        test: ``[source_file, target_file]`` of the test corpus.
        batch_size: number of sentences per mini-batch.
    """
    options = load_config(model)  # hyper-parameters stored with the model

    params = init_params(options)
    params = load_params(model, params)  # trained parameter values

    def open_corpus(paths):
        # Very large maxlen and no length sorting, so that every sentence is
        # kept and the output order matches the input files.
        return TextIterator(
            paths[0], paths[1],
            options['dictionaries'][0], options['dictionaries'][1],
            n_words_source=options['n_words_src'],
            n_words_target=options['n_words_tgt'],
            batch_size=batch_size,
            maxlen=1000,
            sort_by_length=False)

    # All three iterators are created up front, so a missing corpus file is
    # reported before any output is written.
    train_iter = open_corpus(train)
    dev_iter = open_corpus(dev)
    test_iter = open_corpus(test)

    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']

    # Splits are exported in the order train, test, dev.
    for split_name, batches in (('train', train_iter),
                                ('test', test_iter),
                                ('dev', dev_iter)):
        _dump_embeddings(batches, Wemb, Wemb_dec,
                         'features/emb/' + split_name + '.es-en.es.emb',
                         'features/emb/' + split_name + '.es-en.en.emb')


# Example #7
# 0
def _dump_logprobs(f_cost, batches, out_path, options):
    """Append one cost value per sentence to a text file.

    Args:
        f_cost: compiled function taking ``(x, x_mask, y, y_mask)`` and
            returning an iterable with one value per sentence of the batch.
        batches: iterable yielding ``(source, target)`` mini-batches.
        out_path: file that the values are appended to, one per line.
        options: option mapping; ``'n_words_src'`` and ``'n_words_tgt'`` are
            forwarded to ``prepare_data``.
    """
    n_samples = 0
    # Open the output once per corpus instead of once per mini-batch.
    with open(out_path, 'a') as fp:
        for src, tgt in batches:
            x, x_mask, y, y_mask = prepare_data(
                src,
                tgt,
                maxlen=1000,
                n_words_src=options['n_words_src'],
                n_words=options['n_words_tgt'])
            logprob = f_cost(x, x_mask, y, y_mask)
            fp.write('\n'.join(map(str, list(logprob))) + '\n')

            n_samples += y.shape[1]
            print('processed: %s samples ...' % n_samples)


def extract_logprob(model='model/model.npz.best_bleu',
                    train=['test/train.bpe.en', 'test/train.bpe.es'],
                    test=['test/test.bpe.en', 'test/test.bpe.es'],
                    batch_size=10):
    """Export the log-likelihood feature for the train and test corpora.

    The model is loaded from ``model``, its cost function is compiled and the
    cost of every sentence pair is appended, one value per line, to
    ``features/train.es-en.logprob`` and ``features/test.es-en.logprob``.

    Args:
        model: path of the saved model; also used to load its configuration.
        train: ``[source_file, target_file]`` of the training corpus.
        test: ``[source_file, target_file]`` of the test corpus.
        batch_size: number of sentences per mini-batch.
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    (_trng, _use_noise, x, x_mask, y, y_mask,
     _opt_ret, cost, _ctx, _tt, _) = build_model(tparams, options)

    def open_corpus(paths):
        # Very large maxlen and no length sorting, so that every sentence is
        # kept and the output order matches the input files.
        return TextIterator(
            paths[0],
            paths[1],
            options['dictionaries'][0],
            options['dictionaries'][1],
            n_words_source=options['n_words_src'],
            n_words_target=options['n_words_tgt'],
            batch_size=batch_size,
            maxlen=1000,
            sort_by_length=False)

    train_iter = open_corpus(train)
    test_iter = open_corpus(test)

    f_cost = theano.function([x, x_mask, y, y_mask], cost, name='f_cost')

    _dump_logprobs(f_cost, train_iter, 'features/train.es-en.logprob',
                   options)
    _dump_logprobs(f_cost, test_iter, 'features/test.es-en.logprob', options)
def _dump_quality_vectors(f_tt, Wt, batches, out_path, options):
    """Append one averaged quality vector per sentence to a text file.

    For every non-padded target position the column of ``Wt`` that belongs to
    the target token is multiplied element-wise with the ``f_tt`` output at
    that position; the products of one sentence are averaged over its
    positions and written as a tab-separated line.

    Args:
        f_tt: compiled function taking ``(x, x_mask, y, y_mask)``; its result
            is indexed as ``[position, sample, feature]`` here.
        Wt: weight matrix indexed as ``[feature, token id]``.
        batches: iterable yielding ``(source, target)`` mini-batches.
        out_path: file that the vectors are appended to.
        options: option mapping; ``'n_words_src'`` and ``'n_words_tgt'`` are
            forwarded to ``prepare_data``.
    """
    n_samples = 0
    # Open the output once per corpus instead of once per sentence.
    with open(out_path, 'a') as fp:
        for src, tgt in batches:
            x, x_mask, y, y_mask = prepare_data(
                src,
                tgt,
                maxlen=1000,
                n_words_src=options['n_words_src'],
                n_words=options['n_words_tgt'])
            tt_ = f_tt(x, x_mask, y, y_mask)

            for j in range(y.shape[1]):
                # One product vector per real (non-padded) target token.
                token_qvs = [Wt[:, y[i][j]] * tt_[i, j, :]
                             for i in range(y.shape[0])
                             if y_mask[i][j] == 1]
                sentence_qv = numpy.array(token_qvs).mean(axis=0)
                fp.write('\t'.join(map(str, sentence_qv)) + '\n')

            n_samples += y.shape[1]
            print('processed: %s samples ...' % n_samples)


def extract_qv(model='model/model.npz.best_bleu',
               train=['test/train.bpe.en', 'test/train.bpe.es'],
               test=['test/test.bpe.en', 'test/test.bpe.es'],
               batch_size=10):
    """Export sentence-level quality vectors for the train and test corpora.

    The model is loaded from ``model`` and one averaged quality vector per
    sentence pair is appended to ``features/train.nmt.qv`` and
    ``features/test.nmt.qv``.

    Args:
        model: path of the saved model; also used to load its configuration.
        train: ``[source_file, target_file]`` of the training corpus.
        test: ``[source_file, target_file]`` of the test corpus.
        batch_size: number of sentences per mini-batch.
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    (_trng, _use_noise, x, x_mask, y, y_mask,
     _opt_ret, _cost, _ctx, tt, _) = build_model(tparams, options)

    def open_corpus(paths):
        # Very large maxlen and no length sorting, so that every sentence is
        # kept and the output order matches the input files.
        return TextIterator(
            paths[0],
            paths[1],
            options['dictionaries'][0],
            options['dictionaries'][1],
            n_words_source=options['n_words_src'],
            n_words_target=options['n_words_tgt'],
            batch_size=batch_size,
            maxlen=1000,
            sort_by_length=False)

    train_iter = open_corpus(train)
    test_iter = open_corpus(test)

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    # The weights do not change while extracting, so fetch them only once.
    Wt = tparams['ff_logit_W'].get_value()

    _dump_quality_vectors(f_tt, Wt, train_iter, 'features/train.nmt.qv',
                          options)
    _dump_quality_vectors(f_tt, Wt, test_iter, 'features/test.nmt.qv',
                          options)