Example #1
def build_model(tparams, options):
    """
    @first: obtain the f_tt function
    """
    old_options = load_config(options['nmt_model'])
    params = nmt.init_params(old_options)
    params = load_params(options['nmt_model'], params)
    old_tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt = nmt.build_model(old_tparams,old_options)

    hter = tensor.matrix('hter', dtype='float32')
    Wt = old_tparams['ff_logit_W']
    #w2v = tensor.matrix('w2v',dtype='float32')

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, 500])  # 500 = dim_word of the underlying NMT model
    emb = emb * tt

    # whether to use dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer((2, n_samples, options['dim']),
                                           use_noise, trng,
                                           retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']),
                                           use_noise, trng,
                                           retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer(
            (n_timesteps, n_samples, 1), use_noise, trng,
            retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout,
                                     (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    #if options['use_dropout']:
    #    emb *= source_dropout
    #emb = get_qv_w2c(emb,y,w2v,dim=500)

    proj = gru_layer(tparams,
                     emb,
                     options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)

    hh = proj[0][-1, :, :]
    y_pred = tensor.dot(hh, tparams['final_W'])  # the results obtained here are also reasonably good

    #y_pred = tensor.nnet.sigmoid(tensor.dot(hh,tparams['W']))
    cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost
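
# Usage sketch (not part of the original source; names are illustrative assumptions):
# compile the symbolic graph returned by build_model into Theano functions for
# prediction and for the HTER regression cost.
def compile_qe_functions(tparams, options):
    trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost = \
        build_model(tparams, options)
    use_noise.set_value(0.)  # disable dropout noise outside of training
    f_pred = theano.function([x, x_mask, y, y_mask], y_pred, name='f_pred')
    f_cost = theano.function([x, x_mask, y, y_mask, hter], cost, name='f_cost')
    return f_pred, f_cost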
Example #2
def build_model(tparams, options):
    """
    @first: obtain the f_tt function
    """
    nmt_options = load_config(options['nmt_model'])
    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, nmt_cost, ctx, tt = nmt.build_model(tparams,nmt_options) # *** tparams ***

    # drop the parameter ff_logit_b: it is not needed when computing this cost
    tparams.pop('ff_logit_b')

    hter = tensor.matrix('hter', dtype='float32')
    Wt = tparams['ff_logit_W']  #old_tparams['ff_logit_W']

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    emb = emb * tt

    # whether to use dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer((2, n_samples, options['dim']),
                                           use_noise, trng,
                                           retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']),
                                           use_noise, trng,
                                           retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer(
            (n_timesteps, n_samples, 1), use_noise, trng,
            retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout,
                                     (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    proj = gru_layer(tparams,
                     emb,
                     options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)

    hh = proj[0][-1, :, :]
    y_pred = tensor.dot(hh, tparams['final_W'])  # the results obtained here are also reasonably good

    final_cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]
    cost = final_cost

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost
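
# Training sketch (an assumption, not shown in the original source): take gradients of the
# HTER cost w.r.t. the shared parameters and compile a plain SGD update step.
def compile_sgd_update(tparams, options, lrate=0.001):
    trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost = \
        build_model(tparams, options)
    lrate = numpy.float32(lrate)  # keep the update expressions in float32
    # some NMT parameters may not appear in this cost graph, so ignore disconnected inputs
    grads = tensor.grad(cost, wrt=list(tparams.values()),
                        disconnected_inputs='ignore')
    updates = [(p, p - lrate * g) for p, g in zip(list(tparams.values()), grads)]
    return theano.function([x, x_mask, y, y_mask, hter], cost, updates=updates)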
Example #3
def encoder_hidden(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """
    @function: extract encoder hidden-state features
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, decoderh = build_model(tparams,options)

    # load data
    train = TextIterator(
        train[0],
        train[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  # set the maximum length high enough to keep all sentences
        sort_by_length=False)  # keep the original sentence order (do not sort by length)

    test = TextIterator(
        test[0],
        test[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  # set the maximum length high enough to keep all sentences
        sort_by_length=False)  # keep the original sentence order (do not sort by length)

    f_ctx = theano.function([x, x_mask], ctx, name='f_ctx')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the data batch
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        # concatenate the final states of the bidirectional encoder: the backward state at
        # position 0 and the forward state at the last position (hidden size assumed 1024)
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)

        with open('features/hidden/train.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x), list(hh_data))) +
                              '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the data batch
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)

        with open('features/hidden/test.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x), list(hh_data))) +
                              '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
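
# Sketch (illustrative, not part of the original source): read the tab-separated encoder
# features written above back into a numpy array, one row per sentence.
def load_encoder_features(path='features/hidden/train.en-es.encoderh'):
    rows = []
    with open(path) as fp:
        for line in fp:
            rows.append([float(v) for v in line.strip().split('\t')])
    return numpy.asarray(rows, dtype='float32')  # shape: (n_sentences, 2 * encoder_dim)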
Example #4
def get_qv(model='model/model.npz.best_bleu'):
    """
    @function: obtain the quality vectors
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt = build_model(tparams,options)

    # load data
    train = TextIterator(
        options['datasets'][0],
        options['datasets'][1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['batch_size'],
        maxlen=1000,  # set the maximum length high enough to keep all sentences
        sort_by_length=False)  # keep the original sentence order (do not sort by length)

    dev = TextIterator(
        options['valid_datasets'][0],
        options['valid_datasets'][1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['valid_batch_size'],
        maxlen=1000,  # set the maximum length high enough to keep all sentences
        sort_by_length=False)  # keep the original sentence order (do not sort by length)

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #print tparams['ff_logit_W'].get_value().shape   #### (500,40000)
    n_samples = 0

    for x, y in train:
        # prepare the data batch
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()

        for j in range(y.shape[1]):

            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = tt_[i, j, :].T * Wt[:, index]  # element-wise product with the j-th sample's decoder state
                    qv_.append(list(qv))

            with open('qv/train/' + str(n_samples + j) + '.qv.pkl', 'w') as fp:
                pkl.dump(qv_, fp)

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
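
# Sketch (illustrative, not part of the original source): load one of the per-sentence
# quality-vector files written above; each entry is one per-token vector.
def load_qv(path='qv/train/0.qv.pkl'):
    with open(path, 'rb') as fp:
        qv_sentence = pkl.load(fp)
    return numpy.asarray(qv_sentence, dtype='float32')  # shape: (n_tokens, dim_word)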
Example #5
def alignment(
        model='model/model.npz.best_bleu',
        train=['test/train.bpe.en','test/train.bpe.es'],
        test=['test/test.bpe.en','test/test.bpe.es'],
        batch_size=10
        ):
    """
    @function: extract word-alignment features
    """
    options = load_config(model)
    
    params = init_params(options)
    params = load_params(model, params)
    
    tparams = init_theano_params(params)
    
    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, _ = build_model(tparams,options)
    
    # load data
    train = TextIterator(train[0], train[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, # set the maximum length high enough to keep all sentences
                        sort_by_length=False) # keep the original sentence order (do not sort by length)
    
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, # set the maximum length high enough to keep all sentences
                        sort_by_length=False) # keep the original sentence order (do not sort by length)
    
    f_align = theano.function([x,x_mask,y,y_mask],opt_ret,name='f_align')

    #################### train #######################
    """
    n_samples = 0
    for x, y in train:
            # prepare the data batch
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                                n_words_src=options['n_words_src'],
                                                n_words=options['n_words_tgt'])
            align = f_align(x,x_mask,y,y_mask)['dec_alphas'] # (y, batch_size, x)
            align = align * y_mask[:,:,None] # mask out padded target positions (broadcasting trick)
            align_shp = align.shape
            for j in range(align_shp[1]):
                row_ = int(numpy.sum(y_mask[:,j]))
                col_ = int(numpy.sum(x_mask[:,j]))
                align_data = align[:row_,j,:col_] # word-alignment matrix
                
                with open('features/alignment/train.en-es.word.align','a+') as fp:
                    for data in align_data:
                        fp.writelines('\t'.join(map(lambda x:str(x), data))+'\n')
                    fp.writelines('\n')
                     
            n_samples += y.shape[1]
            print 'processed:',n_samples,'samples ...'
    """
    ################### test ########################
    n_samples = 0
    for x, y in test:
            # prepare the data batch
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                                n_words_src=options['n_words_src'],
                                                n_words=options['n_words_tgt'])
            align = f_align(x,x_mask,y,y_mask)['dec_alphas'] # (y, batch_size, x)
            align = align * y_mask[:,:,None] # mask out padded target positions (broadcasting trick)
            align_shp = align.shape
            for j in range(align_shp[1]):
                row_ = int(numpy.sum(y_mask[:,j]))
                col_ = int(numpy.sum(x_mask[:,j]))
                align_data = align[:row_,j,:col_] # word-alignment matrix
                
                with open('features/alignment/test.en-es.word.align','a+') as fp:
                    for data in align_data:
                        fp.writelines('\t'.join(map(lambda x:str(x), data))+'\n')
                    fp.writelines('\n')
                     
            n_samples += y.shape[1]
            print 'processed:',n_samples,'samples ...'
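
# Sketch (not part of the original source): collapse one soft word-alignment matrix
# (rows = target words, columns = source words) into hard alignments by taking, for
# each target word, the source position with the highest attention weight.
def hard_alignment(align_data):
    return align_data.argmax(axis=1)  # one source index per target word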
Example #6
def main(model,
         dictionary,
         dictionary_target,
         source_file,
         target_file,
         gold_align,
         saveto,
         k=5,
         pkl_file=None,
         normalize=False,
         output_attention=False):
    # load model model_options
    # if pkl_file is None:
    #     pkl_file = model + '.pkl'
    # with open(pkl_file, 'rb') as f:
    #     options = pkl.load(f)

    options = load_config(model)
    options['factor'] = 1
    # load source dictionary and invert
    word_dict = load_dict(dictionary)
    word_idict = dict()  # id2word
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fx_name, fy_name):
        retval = []
        retval_ori = []
        with open(fx_name, 'r') as fx, open(fy_name, 'r') as fy:
            for idx, (line_x, line_y) in enumerate(zip(fx, fy)):
                words = line_x.strip().split()
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                # x += [0]

                words = line_y.strip().split()
                y = map(
                    lambda w: word_dict_trg[w]
                    if w in word_dict_trg else 1, words)
                y = map(lambda ii: ii if ii < options['n_words'] else 1, y)
                # y += [0]

                retval_ori.append((line_x.strip(), line_y.strip()))
                retval.append((x, y))

        return retval, retval_ori

    # load params
    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list)
    tparams = init_theano_params(params)

    # build model
    trng, use_noise, \
    x, x_mask, y, y_mask, \
    opt_ret, \
    cost = \
        build_model(tparams, options)

    inps = [x, x_mask, y, y_mask]

    # compile f_align
    logging.info('Building f_align...')
    # f_align= theano.function(inps, opt_ret['dec_alphas'], profile=profile)
    f_align = theano.function(inps, cost, profile=profile)
    logging.info('Done')

    print 'Processing ', source_file, '...'
    sys.stdout.flush()

    n_samples, n_samples_src = _send_jobs(source_file, target_file)

    atts = []
    idx = 0

    def _prepare_data(x, y):
        # print len(x), len(y)
        x = numpy.array([x]).T
        y = numpy.array([y]).T
        return x[None, :, :], numpy.ones_like(
            x, dtype='float32'), y, numpy.ones_like(y, dtype='float32')

    start_time = datetime.datetime.now()
    words = 0.
    for (x, y) in n_samples:
        # print x
        x, x_mask, y, y_mask = _prepare_data(x, y)

        att = f_align(x, x_mask, y, y_mask)  # (len_y, nsample=1, len_x)
        # att = numpy.squeeze(att, 1)
        # atts.append(att.T)
        # ipdb.set_trace()
        # print idx
        # idx += 1
        # if idx % 100 == 0:
        #     print idx,
        # break
    last = datetime.datetime.now() - start_time
    print last.total_seconds(), len(n_samples) / last.total_seconds()

    def _force_decode(x, y):
        # sample given an input sequence and obtain scores
        att = f_force_decode(numpy.array(x)[:, None], numpy.array(y)[:, None])
        _output_attention(0, att[0].squeeze(1).T)

    def _output_attention(sent_idx, att):
        dirname = saveto + '.attention'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        with open(dirname + '/' + str(sent_idx), 'w') as fp:
            fp.write("%d %d\n" % (att.shape[0], att.shape[1]))
            for row in att:
                # fp.write(str(row.argmax()) + " " + ' '.join([str(x) for x in row]) + '\n')
                fp.write('[' + ','.join([str(x) for x in row]) + '],')
                # fp.write(att)

    if output_attention:
        with open(saveto + '.att', 'w') as f:
            for idx, ((x, y), att) in enumerate(zip(n_samples_src, atts)):
                print >> f, ' '.join([
                    "{}:{}".format(idx + 1,
                                   hehe.argmax() + 1)
                    for idx, hehe in enumerate(att)
                ])
                # print >> f
    with open(saveto + '.att') as f_att, open(gold_align) as f_gold:
        AER = []
        count_S, count_P, len_A, len_S = 0., 0., 0., 0.
        for idx, (cand, gold) in enumerate(zip(f_att, f_gold)):
            aer, count_s, count_p, len_a, len_s = calc_aer(cand, gold)
            AER.append(aer)
            count_S += count_s
            count_P += count_p
            len_A += len_a
            len_S += len_s
        ave_AER = numpy.average(AER)
        overall_AER = 1 - (count_S + count_P) / (len_A + len_S)
        print 'ave_AER ', ave_AER
        print 'overall_AER ', overall_AER
        ipdb.set_trace()
    print 'Done'
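
# Sketch (an assumption; the calc_aer used above is not shown here): Alignment Error Rate
# in the sense of Och & Ney (2003), given candidate links A, sure links S and possible
# links P as sets of (src_idx, tgt_idx) pairs: AER = 1 - (|A & S| + |A & P|) / (|A| + |S|).
def aer_from_sets(A, S, P):
    count_s = len(A & S)
    count_p = len(A & P)
    aer = 1.0 - float(count_s + count_p) / (len(A) + len(S))
    return aer, count_s, count_p, len(A), len(S)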
Example #7
def extract_logprob(model='model/model.npz.best_bleu',
                    train=['test/train.bpe.en', 'test/train.bpe.es'],
                    test=['test/test.bpe.en', 'test/test.bpe.es'],
                    batch_size=10):
    """
    @function: extract log-likelihood features
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, _ = build_model(tparams,options)

    # load data
    train = TextIterator(
        train[0],
        train[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  # set the maximum length high enough to keep all sentences
        sort_by_length=False)  # keep the original sentence order (do not sort by length)

    test = TextIterator(
        test[0],
        test[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  # set the maximum length high enough to keep all sentences
        sort_by_length=False)  # keep the original sentence order (do not sort by length)

    f_cost = theano.function([x, x_mask, y, y_mask], cost, name='f_cost')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the data batch
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/train.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x), list(logprob))) +
                          '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the data batch
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/test.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x), list(logprob))) +
                          '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
Example #8
def extract_qv(model='model/model.npz.best_bleu',
               train=['test/train.bpe.en', 'test/train.bpe.es'],
               test=['test/test.bpe.en', 'test/test.bpe.es'],
               batch_size=10):
    """
    @function: obtain the quality vectors
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, _ = build_model(tparams,options)

    # load data
    train = TextIterator(
        train[0],
        train[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  # set the maximum length high enough to keep all sentences
        sort_by_length=False)  # keep the original sentence order (do not sort by length)

    test = TextIterator(
        test[0],
        test[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  # set the maximum length high enough to keep all sentences
        sort_by_length=False)  # keep the original sentence order (do not sort by length)

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the data batch
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()

        for j in range(y.shape[1]):

            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))

            with open('features/train.nmt.qv', 'a+') as fp:
                fp.writelines('\t'.join(qv_) + '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:  #*****
        # prepare the data batch
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()

        for j in range(y.shape[1]):

            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))

            with open('features/test.nmt.qv', 'a+') as fp:  #*****
                fp.writelines('\t'.join(qv_) + '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
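
# Sketch (an added summary, not part of the original source): the sentence-level feature
# written above is the mean, over the reference tokens, of the element-wise product between
# the decoder state tt and the output-embedding column of each token. A vectorised
# equivalent for one sample j of the batch, i.e. sentence_qv(tt_[:, j, :], y[:, j],
# y_mask[:, j], Wt):
def sentence_qv(tt_sample, y_sample, y_mask_sample, Wt):
    keep = y_mask_sample == 1                                 # drop padded positions
    qv_tokens = Wt[:, y_sample[keep]].T * tt_sample[keep, :]  # (n_tokens, dim_word)
    return qv_tokens.mean(axis=0)                             # vector of length dim_word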