def init_params(options):
    """Assemble the full parameter dict for the QE model.

    Loads the pre-trained NMT parameters first, then the final-RNN
    parameters, preserving insertion order in the OrderedDict.
    """
    params = OrderedDict()

    # Parameters of the pre-trained NMT model.
    nmt_options = load_config(options['nmt_model'])
    params.update(load_params(options['nmt_model'],
                              nmt.init_params(nmt_options)))

    # Parameters of the final RNN (regression) model.
    final_options = load_config(options['final_model'])
    params.update(load_params(options['final_model'],
                              rnn.init_params(final_options)))

    return params
Exemple #2
0
def qe_rnn_trg(model='RNN_trg_model/wmt17.de-en.npz',
               datasets=['test/test16.bpe.src', 'test/test16.bpe.mt'],
               save_file='test16.hter.pred'):

    options = load_config(model)
    #-------------------
    params = rnn_trg.init_params(options)  # 修改此处
    params = load_params(model, params)
    tparams = init_theano_params(params)

    #-------------------
    trng,use_noise,x,x_mask,y,y_mask,\
        hter,y_pred,cost = rnn_trg.build_model(tparams,options) # 修改此处
    inps = [x, x_mask, y, y_mask]
    f_pred = theano.function(inps, y_pred, profile=False)

    test = data_iterator.TextIterator(datasets[0],
                                      datasets[1],
                                      options['dictionaries'][0],
                                      options['dictionaries'][1],
                                      n_words_source=options['n_words_src'],
                                      n_words_target=options['n_words_tgt'],
                                      batch_size=options['valid_batch_size'],
                                      maxlen=options['maxlen'],
                                      sort_by_length=False)

    res = []
    n_samples = 0

    for x, y in test:
        x, x_mask, y, y_mask = nmt.prepare_data(
            x,
            y,
            maxlen=options['maxlen'],
            n_words_src=options['n_words_src'],
            n_words=options['n_words_tgt'])

        res.extend(list(f_pred(x, x_mask, y, y_mask).flatten()))
        n_samples += x.shape[1]
        print 'processed:', n_samples, 'samples'

    with open('qe/' + save_file, 'w') as fp:
        for hh in res:
            fp.writelines(str(hh) + '\n')
Exemple #3
0
def build_model(tparams, options):
    """Build the QE regression graph on top of a frozen pre-trained NMT model.

    Loads the NMT model named by options['nmt_model'], reuses its target-side
    hidden representation `tt`, feeds a derived embedding sequence through a
    GRU encoder ('final_encoder'), and regresses the last hidden state onto
    the HTER score with a linear layer ('final_W').

    Returns:
        (trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost)
        where cost is the mean absolute error between y_pred and hter.
    """
    # Rebuild the pre-trained NMT graph with its own (frozen) parameters.
    old_options = load_config(options['nmt_model'])
    params = nmt.init_params(old_options)
    params = load_params(options['nmt_model'], params)
    old_tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt = nmt.build_model(old_tparams,old_options)

    # Gold HTER scores, shape (n_samples, 1) as fed by the training loop.
    hter = tensor.matrix('hter', dtype='float32')
    # Output-softmax weight matrix of the NMT model, used as target
    # word embeddings below.
    Wt = old_tparams['ff_logit_W']
    #w2v = tensor.matrix('w2v',dtype='float32')

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    # Look up each target word's column of the logit matrix and gate it
    # with the NMT hidden representation tt (elementwise product).
    # NOTE(review): 500 is presumably dim_word of the NMT model — confirm.
    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, 500])
    emb = emb * tt

    # Whether to apply dropout.
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        # Older model versions used unscaled dropout masks.
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer((2, n_samples, options['dim']),
                                           use_noise, trng,
                                           retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer((2, n_samples, options['dim_word']),
                                           use_noise, trng,
                                           retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer(
            (n_timesteps, n_samples, 1), use_noise, trng,
            retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout,
                                     (1, 1, options['dim_word']))
    else:
        # All-ones masks: dropout disabled.
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    #if options['use_dropout']:
    #    emb *= source_dropout
    #emb = get_qv_w2c(emb,y,w2v,dim=500)

    # Encode the gated embedding sequence with the trainable GRU.
    proj = gru_layer(tparams,
                     emb,
                     options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)

    # Last hidden state of the sequence -> linear regression to HTER.
    hh = proj[0][-1, :, :]
    y_pred = tensor.dot(hh, tparams['final_W'])  # linear head also worked well

    #y_pred = tensor.nnet.sigmoid(tensor.dot(hh,tparams['W']))
    # Mean absolute error over the minibatch.
    cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost
Exemple #4
0
def train(
        batch_size=80,
        valid_batch_size=80,
        dim=100,
        dim_word=500,
        dispFreq=100,
        saveFreq=3000,
        validFreq=1000,
        saveto='RNN_model/wmt17.en-de.npz',
        datasets=['tuning/train.bpe.en', 'tuning/train.bpe.de'],
        valid_datasets=['tuning/dev.bpe.en', 'tuning/dev.bpe.de'],
        dictionaries=['data/train.bpe.en.json', 'data/train.bpe.de.json'],
        hter=['tuning/train.hter', 'tuning/dev.hter'],
        n_words_src=40000,
        n_words_tgt=40000,
        nmt_model='model/model.npz.best_bleu',
        lrate=0.0001,  # learning rate
        use_dropout=True,
        patience=10,
        max_epochs=5000,
        finish_after=1000000,
        maxibatch_size=20,
        optimizer='rmsprop',
        shuffle_each_epoch=True,
        reload_=True,
        overwrite=False,
        sort_by_length=False,
        maxlen=1000,
        decay_c=0.,  # L2 regularization penalty
        map_decay_c=0.,  # L2 regularization penalty towards original weights
        clip_c=1.0,
        dropout_embedding=0.2,  # dropout for input embeddings (0: no dropout)
        dropout_hidden=0.2,  # dropout for hidden layers (0: no dropout)
        dropout_source=0.1,  # dropout source words (0: no dropout)
        dropout_target=0.1,  # dropout target words (0: no dropout)
        model_version=0.1):
    """Train the sentence-level QE regression model.

    Builds the graph via build_model(), optimises the MAE cost with the
    named optimizer, periodically saves checkpoints to `saveto`, and
    early-stops on validation error with the given `patience`.
    Returns the final validation error (or 1., 1., 1. on NaN cost).
    """
    # Capture every local argument as the model-options dict.
    # (Must run before any other local is defined.)
    model_options = locals().copy()
    print 'Model options:', model_options

    # Load the vocabularies and build reversed (index -> word) dicts.
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # If the vocabulary sizes are unset, default to the dictionary sizes.
    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words_tgt is None:
        n_words_tgt = len(worddicts[1])
        model_options['n_words_tgt'] = n_words_tgt

    # Load the training and validation data iterators.
    print 'Loading data ...'
    train = TextIterator(datasets[0],
                         datasets[1],
                         hter[0],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=batch_size,
                         maxlen=maxlen,
                         shuffle_each_epoch=shuffle_each_epoch,
                         sort_by_length=sort_by_length,
                         maxibatch_size=maxibatch_size)
    valid = TextIterator(valid_datasets[0],
                         valid_datasets[1],
                         hter[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    #numpy.random.seed(1234)
    # Initialise the model parameters.
    print 'Init parameters ...'
    params = init_params(model_options)

    #reload parameters
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)

    # Wrap the parameters (W, b, ...) into Theano shared variables.
    tparams = init_theano_params(params)

    # Build the computation graph.
    print 'Building model ...',
    trng,use_noise,x,x_mask,y,y_mask,hter, \
        y_pred,cost = build_model(tparams,model_options)
    print 'Done'
    """
    @function:调试
    print Wt.get_value().shape
    print tparams['W'].get_value().shape
     
    f_tt = theano.function([x,x_mask,y,y_mask],tt)
    f_emb = theano.function([x,x_mask,y,y_mask],emb)
    f_pred = theano.function([x,x_mask,y,y_mask],y_pred)
    f_cost = theano.function([x,x_mask,y,y_mask,hter],cost)
    
    
    for x, y, hter in train:
            # 准备数据用于训练
            x, x_mask, y, y_mask = nmt.prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words_tgt)
            hter = numpy.array(hter).astype('float32')
            hter = hter.reshape([hter.shape[0],1])

            print f_pred(x,x_mask,y,y_mask).shape
            print f_cost(x,x_mask,y,y_mask,hter)
            #print f_cost(x,x_mask,y,y_mask,hter)
            
            sys.exit(0)
    """

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                    name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value)**2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    print 'Building f_pred...',
    inps = [x, x_mask, y, y_mask]
    f_pred = theano.function(inps, y_pred, profile=False)
    print 'Done'

    print 'Building f_cost...',
    inps = [x, x_mask, y, y_mask, hter]
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr,
                                              tparams,
                                              grads,
                                              inps,
                                              cost,
                                              profile=False)
    print 'Done'

    print 'Start Optimization'
    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    # NOTE(review): len(train[0]) looks suspect — `train` is a TextIterator,
    # not a list of corpora; confirm these -1 branches are ever exercised.
    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    valid_err = None
    fp = open('RNN_model/valid.error', 'w')

    for eidx in xrange(max_epochs):

        n_samples = 0
        for x, y, hter in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)
            # Prepare the minibatch for training.
            x, x_mask, y, y_mask = nmt.prepare_data(x,
                                                    y,
                                                    maxlen=maxlen,
                                                    n_words_src=n_words_src,
                                                    n_words=n_words_tgt)
            hter = numpy.array(hter).astype('float32')
            hter = hter.reshape([hter.shape[0], 1])

            # Skip minibatches left empty after filtering by maxlen.
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask, hter)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto,
                            history_errs=history_errs,
                            uidx=uidx,
                            **params)
                json.dump(model_options,
                          open('%s.json' % saveto, 'wb'),
                          indent=2)
                print 'Done'

                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx,
                                history_errs=history_errs,
                                uidx=uidx,
                                **unzip_from_theano(tparams))
                    print 'Done'

            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_error(f_cost, nmt.prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err
                fp.writelines('valid error: ' + str(valid_err) + '\n')

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs = pred_error(f_cost, nmt.prepare_data, model_options, valid)
        valid_err = valid_errs.mean()
        print 'Valid ', valid_err

        fp.writelines('Finally error: ' + str(valid_err) + '\n')
        fp.close()

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
Exemple #5
0
def encoder_hidden(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """
    @function:获得对数似然特征
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, decoderh = build_model(tparams,options)

    #加载数据
    train = TextIterator(
        train[0],
        train[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    test = TextIterator(
        test[0],
        test[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    f_ctx = theano.function([x, x_mask], ctx, name='f_ctx')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)

        with open('features/hidden/train.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x), list(hh_data))) +
                              '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)

        with open('features/hidden/test.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x), list(hh_data))) +
                              '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
Exemple #6
0
    def __init__(self, model_files, configs, model_weights=None):
        """
        Create a ConstrainedTM using Nematus translation models.

        Args:
          model_files: list of paths to Nematus model files (.npz)
          configs: list of dicts, one per model, each containing
            key-->value for each argument supported by `nematus/translate.py`;
            may be None for new-format models that embed their config
          model_weights: optional per-model interpolation weights; defaults
            to uniform weights over the ensemble
        """
        if configs is not None:
            assert len(model_files) == len(
                configs), 'Number of models differs from numer of config files'

        trng = RandomStreams(1234)
        # don't use noise
        use_noise = shared(numpy.float32(0.))

        self.eos_token = '<eos>'

        # Per-model sampler functions (initial state / next-step).
        self.fs_init = []
        self.fs_next = []

        # each entry in self.word_dicts is:
        # `{'input_dicts': [...], 'input_idicts': [...], 'output_dict': <dict>, 'output_idict': <dict>}
        self.word_dicts = []

        if configs is None:
            # Nematus models with new format (no separate config)
            configs = []
            for model in model_files:
                configs.append(load_config(model))
                # backward compatibility
                fill_options(configs[-1])

        for model, config in zip(model_files, configs):
            # fill in any unspecified options in-place
            fill_options(config)
            # Build the parameter-name template, dropping optimizer state.
            param_list = numpy.load(model).files
            param_list = dict.fromkeys(
                [key for key in param_list if not key.startswith('adam_')], 0)
            params = load_params(model, param_list)
            tparams = init_theano_params(params)

            # load model-specific input and output vocabularies
            # Note: some models have multiple input factors -- if so, we need to split that model's input into factors
            #   using the same logic that was used at training time
            # Note: every model's output vocabulary must be exactly the same in order to do ensemble decoding
            self.word_dicts.append(
                self.load_dictionaries(config['dictionaries'],
                                       n_words_src=config.get(
                                           'n_words_src', None),
                                       n_words_trg=config.get('n_words',
                                                              None)))

            # WORKING: add passing attention model alignment through GBS
            # f_init, f_next = build_sampler(tparams, config, use_noise, trng,
            #                                return_alignment=config['return_alignment'])
            f_init, f_next = build_sampler(tparams,
                                           config,
                                           use_noise,
                                           trng,
                                           return_alignment=True)

            self.fs_init.append(f_init)
            self.fs_next.append(f_next)

        # Make sure all output dicts have the same number of items
        assert len(
            set(len(d['output_dict']) for d in
                self.word_dicts)) == 1, 'Output vocabularies must be identical'

        self.num_models = len(self.fs_init)

        if model_weights is None:
            # Uniform ensemble weights by default.
            self.model_weights = numpy.ones(self.num_models) / float(
                self.num_models)
        else:
            assert len(
                model_weights
            ) == self.num_models, 'if you specify weights, there must be one per model'
            self.model_weights = numpy.array(model_weights)
Exemple #7
0
def get_qv(model='model/model.npz.best_bleu'):
    """
    @function:获得质量向量(quality vector)
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt = build_model(tparams,options)

    #加载数据
    train = TextIterator(
        options['datasets'][0],
        options['datasets'][1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['batch_size'],
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    dev = TextIterator(
        options['valid_datasets'][0],
        options['valid_datasets'][1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['valid_batch_size'],
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #print tparams['ff_logit_W'].get_value().shape   #### (500,40000)
    n_samples = 0

    for x, y in train:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()

        for j in range(y.shape[1]):

            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = tt_[i, 0, :].T * Wt[:, index]
                    qv_.append(list(qv))

            with open('qv/train/' + str(n_samples + j) + '.qv.pkl', 'w') as fp:
                pkl.dump(qv_, fp)

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
Exemple #8
0
def alignment(
        model='model/model.npz.best_bleu',
        train=['test/train.bpe.en','test/train.bpe.es'],
        test=['test/test.bpe.en','test/test.bpe.es'],
        batch_size=10
        ):
    """Extract soft word-alignment matrices from the NMT attention weights
    and append them (one tab-separated row per target word, blank line
    between sentences) to features/alignment/*.word.align."""
    options = load_config(model)
    
    params = init_params(options)
    params = load_params(model, params)
    
    tparams = init_theano_params(params)
    
    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, _ = build_model(tparams,options)
    
    # Data iterators; maxlen is very large so nothing is filtered,
    # sorting disabled to keep corpus order.
    train = TextIterator(train[0], train[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,
                        sort_by_length=False)
    
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,
                        sort_by_length=False)
    
    f_align = theano.function([x,x_mask,y,y_mask],opt_ret,name='f_cost')

    #################### train #######################
    """
    n_samples = 0
    for x, y in train:
            # 准备数据用于训练
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                                n_words_src=options['n_words_src'],
                                                n_words=options['n_words_tgt'])
            align = f_align(x,x_mask,y,y_mask)['dec_alphas'] # (y, batch_size, x)
            align = align * y_mask[:,:,None] # 注意此处技巧
            align_shp = align.shape
            for j in range(align_shp[1]):
                row_ = int(numpy.sum(y_mask[:,j]))
                col_ = int(numpy.sum(x_mask[:,j]))
                align_data = align[:row_,j,:col_] # 词对齐矩阵
                
                with open('features/alignment/train.en-es.word.align','a+') as fp:
                    for data in align_data:
                        fp.writelines('\t'.join(map(lambda x:str(x), data))+'\n')
                    fp.writelines('\n')
                     
            n_samples += y.shape[1]
            print 'processed:',n_samples,'samples ...'
    """
    ################### test ########################
    n_samples = 0
    for x, y in test:
            # Prepare the minibatch.
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                                n_words_src=options['n_words_src'],
                                                n_words=options['n_words_tgt'])
            align = f_align(x,x_mask,y,y_mask)['dec_alphas'] # (y, batch_size, x)
            align = align * y_mask[:,:,None] # zero out padded target positions
            align_shp = align.shape
            for j in range(align_shp[1]):
                # True (unpadded) target / source lengths of sentence j.
                row_ = int(numpy.sum(y_mask[:,j]))
                col_ = int(numpy.sum(x_mask[:,j]))
                align_data = align[:row_,j,:col_] # word-alignment matrix of sentence j
                
                with open('features/alignment/test.en-es.word.align','a+') as fp:
                    for data in align_data:
                        fp.writelines('\t'.join(map(lambda x:str(x), data))+'\n')
                    fp.writelines('\n')
                     
            n_samples += y.shape[1]
            print 'processed:',n_samples,'samples ...'
def word_embedding(
        model='model/model.npz.best_bleu',
        train=['test/train.bpe.en','test/train.bpe.es'],
        dev=['test/dev.bpe.en','test/dev.bpe.es'],
        test=['test/test.bpe.en','test/test.bpe.es'],
        batch_size=10
        ):
    """
    @function:获得词向量
    """
    options = load_config(model) # 加载设置的超参数
    
    params = init_params(options)
    params = load_params(model, params) # 加载模型参数
    
    #加载数据
    train = TextIterator(train[0], train[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, #设置尽可能长的长度
                        sort_by_length=False) #设为 False
    
    dev = TextIterator(dev[0], dev[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, #设置尽可能长的长度
                        sort_by_length=False) #设为 False
    
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'], n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000, #设置尽可能长的长度
                        sort_by_length=False) #设为 False
    
    #################### train #######################
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']
    
    n_samples = 0
    for x, y in train:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        
        with open('features/emb/train.es-en.es.emb','a+') as fp:
            for x_row in x_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), x_row))+'\n')
        with open('features/emb/train.es-en.en.emb','a+') as fp:
            for y_row in y_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), y_row))+'\n')
                
        n_samples += len(x)
        print 'processed:',n_samples,'samples ...'
    
    ################### test ########################
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']
    
    n_samples = 0
    for x, y in test:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        
        with open('features/emb/test.es-en.es.emb','a+') as fp:
            for x_row in x_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), x_row))+'\n')
        with open('features/emb/test.es-en.en.emb','a+') as fp:
            for y_row in y_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), y_row))+'\n')
                
        n_samples += len(x)
        print 'processed:',n_samples,'samples ...'
    
    ################### dev ########################
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']
    
    n_samples = 0
    for x, y in dev:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        
        with open('features/emb/dev.es-en.es.emb','a+') as fp:
            for x_row in x_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), x_row))+'\n')
        with open('features/emb/dev.es-en.en.emb','a+') as fp:
            for y_row in y_emb: 
                fp.writelines('\t'.join(map(lambda x:str(x), y_row))+'\n')
                
        n_samples += len(x)
        print 'processed:',n_samples,'samples ...'
Exemple #10
0
def extract_logprob(model='model/model.npz.best_bleu',
                    train=['test/train.bpe.en', 'test/train.bpe.es'],
                    test=['test/test.bpe.en', 'test/test.bpe.es'],
                    batch_size=10):
    """
    @function:获得对数似然特征
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, _ = build_model(tparams,options)

    #加载数据
    train = TextIterator(
        train[0],
        train[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    test = TextIterator(
        test[0],
        test[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    f_cost = theano.function([x, x_mask, y, y_mask], cost, name='f_cost')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/train.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x), list(logprob))) +
                          '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/test.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x), list(logprob))) +
                          '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
def extract_qv(model='model/model.npz.best_bleu',
               train=['test/train.bpe.en', 'test/train.bpe.es'],
               test=['test/test.bpe.en', 'test/test.bpe.es'],
               batch_size=10):
    """
    @function:获得质量向量(quality vector)
    """
    options = load_config(model)

    params = init_params(options)
    params = load_params(model, params)

    tparams = init_theano_params(params)

    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt, _ = build_model(tparams,options)

    #加载数据
    train = TextIterator(
        train[0],
        train[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    test = TextIterator(
        test[0],
        test[1],
        options['dictionaries'][0],
        options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,  #设置尽可能长的长度
        sort_by_length=False)  #设为 False

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()

        for j in range(y.shape[1]):

            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))

            with open('features/train.nmt.qv', 'a+') as fp:
                fp.writelines('\t'.join(qv_) + '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:  #*****
        # 准备数据用于训练
        x, x_mask, y, y_mask = prepare_data(x,
                                            y,
                                            maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()

        for j in range(y.shape[1]):

            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))

            with open('features/test.nmt.qv', 'a+') as fp:  #*****
                fp.writelines('\t'.join(qv_) + '\n')

        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'