Example #1
    def _load_theano(self):
        """
        Loads models, sets theano shared variables and builds samplers.
        This entails irrevocable binding to a specific GPU.
        """

        from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
        from theano import shared

        from nmt import (build_sampler, gen_sample)
        from theano_util import (numpy_floatX, load_params, init_theano_params)

        trng = RandomStreams(1234)
        use_noise = shared(numpy_floatX(0.))

        fs_init = []
        fs_next = []

        for model, option in zip(self._models, self._options):
            param_list = numpy.load(model).files
            param_list = dict.fromkeys(
                [key for key in param_list if not key.startswith('adam_')], 0)
            params = load_params(model, param_list)
            tparams = init_theano_params(params)

            # always return alignment at this point
            f_init, f_next = build_sampler(
                tparams, option, use_noise, trng, return_alignment=True)

            fs_init.append(f_init)
            fs_next.append(f_next)

        return trng, fs_init, fs_next, gen_sample
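
A minimal sketch of how the returned tuple might be consumed, modeled on the _translate helper in Example #3 below; the method name, the seq argument, and the beam size are assumptions, and gen_sample's return arity varies between the versions shown on this page:

    def _sample_once(self, seq, k=5):
        # seq: a batch of source sentences, each a list of word indices
        trng, fs_init, fs_next, gen_sample = self._load_theano()
        x = numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1])
        sample, score, word_probs, alignment, hyp_graph = gen_sample(
            fs_init, fs_next, x, trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False, return_alignment=True,
            return_hyp_graph=False)
        lengths = numpy.array([len(s) for s in sample])
        # return the hypothesis with the best length-normalized score
        return sample[numpy.argmin(score / lengths)]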
Example #2
def load_scorer(model, option, alignweights=None):

    # load model parameters and set theano shared variables
    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list)
    tparams = init_theano_params(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, option)
    inps = [x, x_mask, y, y_mask]
    use_noise.set_value(0.)

    if alignweights:
        logging.debug("Save weight mode ON, alignment matrix will be saved.")
        outputs = [cost, opt_ret['dec_alphas']]
        f_log_probs = theano.function(inps, outputs)
    else:
        f_log_probs = theano.function(inps, cost)

    return f_log_probs
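
For reference, a hedged sketch of driving the compiled scorer; load_config and prepare_data exist elsewhere in this codebase (cf. Examples #9 and #11), and model_path plus the two batches are placeholder assumptions:

option = load_config(model_path)           # hypothetical model path
f_log_probs = load_scorer(model_path, option)
x, x_mask, y, y_mask = prepare_data(source_batch, target_batch)
costs = f_log_probs(x, x_mask, y, y_mask)  # one cost per sentence pair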
Example #3
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose, nbest, return_alignment, suppress_unk, return_hyp_graph):

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):


        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys([key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # build the sampling functions
        f_init, f_next = build_sampler(tparams, option, use_noise, trng, return_alignment=return_alignment)

        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment, hyp_graph = gen_sample(fs_init, fs_next,
                                   numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
                                   trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False, return_alignment=return_alignment,
                                   suppress_unk=suppress_unk, return_hyp_graph=return_hyp_graph)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if nbest:
            return sample, score, word_probs, alignment, hyp_graph
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx], hyp_graph

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid,idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
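
The loop above fixes a simple protocol: the parent puts (idx, seq) tuples on queue, sends one None per worker as a shutdown sentinel, and collects (idx, translation) tuples from rqueue. A parent-side sketch under those assumptions (models, options, and input_sequences are placeholders):

from multiprocessing import Process, Queue

queue, rqueue = Queue(), Queue()
workers = [Process(target=translate_model,
                   args=(queue, rqueue, pid, models, options,
                         5,                     # k: beam size
                         True,                  # normalize
                         False, False,          # verbose, nbest
                         False, False, False))  # return_alignment, suppress_unk, return_hyp_graph
           for pid in range(4)]
for w in workers:
    w.start()
for idx, x in enumerate(input_sequences):
    queue.put((idx, x))
for _ in workers:
    queue.put(None)  # sentinel: one per worker
results = dict(rqueue.get() for _ in input_sequences)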
Example #4
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose, nbest, return_alignment, suppress_unk):

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):

        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_theano_params(params)

        # build the sampling functions
        f_init, f_next = build_sampler(tparams, option, use_noise, trng, return_alignment=return_alignment)

        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment = gen_sample(fs_init, fs_next,
                                   numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
                                   trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False, return_alignment=return_alignment, suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if nbest:
            return sample, score, word_probs, alignment
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid,idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
Example #5
    def _load_theano(self):
        """
        Loads models, sets theano shared variables and builds samplers.
        This entails irrevocable binding to a specific GPU.
        """
        from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
        from theano import shared

        from nmt import (build_sampler, build_multi_sampler, gen_sample)
        from theano_util import (numpy_floatX, load_params, init_theano_params)

        trng = RandomStreams(1234)
        use_noise = shared(numpy_floatX(0.))

        fs_init = []
        fs_next = []

        for model, option in zip(self._models, self._options):

            # check compatibility with multisource
            if option["multisource_type"] is not None and len(
                    option['extra_sources']) == 0:
                logging.error(
                    "This model is multi-source but no auxiliary source file was provided."
                )
                sys.exit(1)
            elif option["multisource_type"] is None and len(
                    option['extra_sources']) != 0:
                logging.warn(
                    "You provided an auxiliary input but this model is not multi-source. Ignoring extra input."
                )

            param_list = numpy.load(model).files
            param_list = dict.fromkeys(
                [key for key in param_list if not key.startswith('adam_')], 0)
            params = load_params(model, param_list)
            tparams = init_theano_params(params)

            # always return alignment at this point
            if option['multisource_type'] is not None:
                f_init, f_next = build_multi_sampler(tparams,
                                                     option,
                                                     use_noise,
                                                     trng,
                                                     return_alignment=True)
            else:
                f_init, f_next = build_sampler(tparams,
                                               option,
                                               use_noise,
                                               trng,
                                               return_alignment=True)

            fs_init.append(f_init)
            fs_next.append(f_next)

        return trng, fs_init, fs_next, gen_sample
Example #6
def load_scorer(model, option, alignweights=None):
    # load model parameters and set theano shared variables
    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list)
    tparams = init_theano_params(params)

    # compatibility with multi-source
    if 'extra_sources' not in option:
        option['extra_sources'] = []
    if 'multisource_type' not in option:
        option['multisource_type'] = None

    # NB: single-source models are scored through the same multi-source code path
    trng, use_noise, xs, x_masks, y, y_mask, opt_ret, cost = build_multisource_model(
        tparams, option)

    # interleave (x, x_mask) for each source, then append the target inputs
    inps = [z for (x, x_mask) in zip(xs, x_masks)
            for z in (x, x_mask)] + [y, y_mask]

    use_noise.set_value(0.)

    if alignweights:
        logging.debug("Save weight mode ON, alignment matrix will be saved.")

        outputs = [cost]
        if option['multisource_type'] == 'init-decoder':
            extra_encoders = 0
        else:
            extra_encoders = len(option['extra_sources'])
        for i in range(extra_encoders + 1):
            outputs.append(opt_ret['dec_alphas' + str(i)])
        outputs.append(opt_ret['cost_per_word'])

        f_log_probs = theano.function(inps, outputs)
    else:
        f_log_probs = theano.function(inps, [cost, opt_ret['cost_per_word']])

    return f_log_probs
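
With the interleaved ordering built above, a model with one auxiliary source unrolls inps to [x0, x0_mask, x1, x1_mask, y, y_mask], so a call would look like this (a sketch; the prepared arrays and model_path are assumptions):

f_log_probs = load_scorer(model_path, option)
cost, cost_per_word = f_log_probs(x0, x0_mask, x1, x1_mask, y, y_mask)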
Example #7
def rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):

        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write(
                "\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs,
                                          prepare_data,
                                          options[i],
                                          pairs,
                                          normalize=normalize,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    if alignweights:  ### opening the temporary file.
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    with tempfile.NamedTemporaryFile(
            prefix='rescore-tmpin') as tmp_in, tempfile.NamedTemporaryFile(
                prefix='rescore-tmpout') as tmp_out:
        for line in nbest_lines:
            linesplit = line.split(' ||| ')
            idx = int(linesplit[0])  # 0-based index into the source file
            tmp_in.write(lines[idx])
            tmp_out.write(linesplit[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)
        pairs = TextIterator(
            tmp_in.name,
            tmp_out.name,
            options[0]['dictionaries'][:-1],
            options[0]['dictionaries'][1],
            n_words_source=options[0]['n_words_src'],
            n_words_target=options[0]['n_words'],
            batch_size=b,
            maxlen=float('inf'),
            sort_by_length=False
        )  #TODO: sorting by length could be more efficient, but we'd have to synchronize scores with n-best list after

        scores, alignments = _score(pairs, alignweights)

        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str, [s[i] for s in scores]))
            saveto.write('{0} {1}\n'.format(line.strip(), score_str))

        ### optional save weights mode.
        if alignweights:
            for line in alignments:
                align_OUT.write(line + "\n")
    if alignweights:
        combine_source_target_text(source_file, nbest_file, saveto.name,
                                   align_OUT)
        align_OUT.close()
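
The n-best input parsed above follows the Moses convention "index ||| hypothesis ||| feature scores ||| total score"; for illustration (contents invented):

# 0 ||| the house is small ||| lm: -12.3 ||| -4.1
# 0 ||| the house is tiny ||| lm: -13.7 ||| -4.9
# 1 ||| he drinks coffee ||| lm: -9.8 ||| -3.2
#
# Field 0 is the 0-based index into source_file; field 1 is the hypothesis
# that gets rescored.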
Example #8
def train(
    dim_word=100,  # word vector dimensionality
    dim=1000,  # the number of LSTM units
    patience=10,  # early stopping patience
    max_epochs=5000,
    finish_after=10000000,  # finish after this many updates
    dispFreq=100,
    decay_c=0.,  # L2 regularization penalty
    map_decay_c=0., # L2 regularization penalty towards original weights
    alpha_c=0.,  # alignment regularization
    clip_c=-1.,  # gradient clipping threshold
    lrate=0.01,  # learning rate
    n_words_src=None,  # source vocabulary size
    n_words_tgt=None,  # target vocabulary size
    maxlen=100,  # maximum length of the description
    optimizer='rmsprop',
    batch_size=16,
    valid_batch_size=16,
    saveto='model.npz',
    validFreq=1000,
    saveFreq=1000,   # save the parameters after every saveFreq updates
    sampleFreq=100,   # generate some samples after every sampleFreq updates
    datasets=[
        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
    valid_datasets=['../data/dev/newstest2011.en.tok',
                    '../data/dev/newstest2011.fr.tok'],
    dictionaries=[
        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
    use_dropout=False,
    dropout_embedding=0.2, # dropout for input embeddings (0: no dropout)
    dropout_hidden=0.5, # dropout for hidden layers (0: no dropout)
    dropout_source=0, # dropout source words (0: no dropout)
    dropout_target=0, # dropout target words (0: no dropout)
    reload_=False,
    overwrite=False,
    external_validation_script=None,
    shuffle_each_epoch=True,
    sort_by_length=True,
    maxibatch_size=20, #How many minibatches to load at one time
    model_version = 0.1
    ):
    # copy the local arguments into the model options dict
    model_options = locals().copy()
    print 'Model options:', model_options

    # load the dictionaries and invert them
    worddicts = [None]*len(dictionaries)
    worddicts_r = [None]*len(dictionaries)
    for ii,dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk,vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk
    
    # if the vocabulary sizes are not set, default to the dictionary sizes
    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words_tgt is None:
        n_words_tgt = len(worddicts[1])
        model_options['n_words_tgt'] = n_words_tgt
    
    # load the training and validation data
    print 'Loading data ...'
    train = TextIterator(datasets[0],datasets[1],
                        dictionaries[0],dictionaries[1],
                        n_words_source=n_words_src,
                        n_words_target=n_words_tgt,
                        batch_size=batch_size,
                        maxlen=maxlen,
                        shuffle_each_epoch=shuffle_each_epoch,
                        sort_by_length=sort_by_length,
                        maxibatch_size=maxibatch_size)
    valid = TextIterator(valid_datasets[0], valid_datasets[1],
                        dictionaries[0], dictionaries[1],
                        n_words_source=n_words_src, n_words_target=n_words_tgt,
                        batch_size=valid_batch_size,
                        maxlen=maxlen)
    
    # initialize the model parameters
    print 'Init parameters ...'
    params = init_params(model_options)
    
    # reload saved parameters so training can resume after an unexpected interruption
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto,params)
    
    # turn the network parameters (W, b) into theano shared variables
    tparams = init_theano_params(params)
    
    # build the model
    print 'Building model ...'
    
    trng,use_noise,x,x_mask,y,y_mask,\
        opt_ret, cost, ctx, tt = build_model(tparams,model_options)
    
    inps = [x, x_mask, y, y_mask]

    # build the sampler
    if validFreq or sampleFreq:
        print 'Building sampler ...'
        f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c), name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name= kk + "_init")
            weight_map_decay += ((vv -init_value) ** 2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay
    
    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    f_alpha = theano.function(inps, opt_ret['dec_alphas']) # alphas
    print 'Done'
    
    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'
    
    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c**2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, profile=profile)
    print 'Done'
    
    # start optimization
    print 'Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    valid_err = None

    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)
            # prepare the minibatch for training
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words_tgt)
            # x is None when the minibatch contains no sentence under maxlen
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)
            
            # plot the word alignment matrix (debugging block below is disabled)
            #print f_alpha(x, x_mask, y, y_mask).shape
            """
            x_word = [worddicts_r[0][idx] for idx in x[:,0]]
            y_word = [worddicts_r[1][idx] for idx in y[:,0]]
            print len(x_word), x_word
            print len(y_word), y_word
            shape = f_alpha(x, x_mask, y, y_mask).shape
            for i in range(shape[1]):
                # print sum(f_alpha(x, x_mask, y, y_mask)[i,0,:])
                mx = sum(y_mask[:,i])
                my = sum(x_mask[:,i])
                align_matrix = f_alpha(x, x_mask, y, y_mask)[:,i,:][0:mx,0:my]
                align_shape = align_matrix.shape
                scale_ = 20 # image scale (pixels per alignment cell)
                out_matrix = numpy.ones([scale_*align_shape[0],scale_*align_shape[1]])
                for j in range(align_shape[0]):
                    for k in range(align_shape[1]):
                        out_matrix[j*scale_:(j+1)*scale_,k*scale_:(k+1)*scale_] *= align_matrix[j,k]
                
                plt.imshow(100*out_matrix, plt.cm.gray)
                plt.pause(1)
                
            plt.show()
            sys.exit(0)
            """
            
            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
                json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)
                print 'Done'

                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip_from_theano(tparams))
                    print 'Done'


            # generate some samples with the model and display them
            
            if sampleFreq and numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score, sample_word_probs, alignment = gen_sample([f_init], [f_next],
                                               x[:, jj][:, None],
                                               trng=trng, k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False,
                                               suppress_unk=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:,jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK'
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print worddicts_r[-1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print worddicts_r[-1][vv],
                        else:
                            print 'UNK',
                    print
            
            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

                if external_validation_script:
                    print "Calling external validation script"
                    print 'Saving  model...',
                    params = unzip_from_theano(tparams)
                    # uidx is also saved at every validation
                    numpy.savez(saveto + '.dev', history_errs=history_errs, uidx=uidx, **params)
                    json.dump(model_options, open('%s.dev.npz.json' % saveto, 'wb'), indent=2)
                    print 'Done'
                    p = Popen([external_validation_script])

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
        valid_err = valid_errs.mean()

        print 'Valid ', valid_err

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
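
A minimal invocation sketch for the trainer above (all paths are placeholders):

valid_err = train(
    dim_word=256, dim=512,
    datasets=['corpus.en.tok', 'corpus.fr.tok'],
    valid_datasets=['dev.en.tok', 'dev.fr.tok'],
    dictionaries=['corpus.en.tok.pkl', 'corpus.fr.tok.pkl'],
    batch_size=32, maxlen=50,
    saveto='model.npz', reload_=True)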
Example #9
def main(model, dictionary, dictionary_target, source_file, saveto, k=5,
         batch_size=1, opt_base=None, normalize=False, output_attention=False):
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    #load params
    if opt_base is None:
        options = load_config(model)
    else:
        options = load_config(opt_base)

    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list, '')
    tparams = init_theano_params(params)

    #load dictionary
    if dictionary is None:
        dictionary = options['dictionaries'][0]
    word_dict = load_dict(dictionary)

    if options['n_words_src']:
        for key, idx in word_dict.items():
            if idx >= options['n_words_src']:
                del word_dict[key]
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    if dictionary_target is None:
        dictionary_target = options['dictionaries'][1]
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    def _send_jobs(fname):
        retval = []
        retval_ori = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                words = line.strip().split()
                if len(words) == 0:
                    continue
                retval_ori.append(line.strip())
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                retval.append(x)
        logging.info('total %s sentences' % len(retval))
        return retval, retval_ori

    sources, sources_ori = _send_jobs(source_file)

    batches = []
    n_full = len(sources) / batch_size  # integer division under python 2
    for i in range(n_full):
        batches.append(prepare_data(sources[i * batch_size: (i + 1) * batch_size]))
    if n_full * batch_size < len(sources):  # trailing partial batch
        batches.append(prepare_data(sources[n_full * batch_size:]))
    final_sentences = []
    f_init, f_next = build_sampler(tparams, options, use_noise, trng)
    for batch in batches:
        samples, scores, word_probs, _, _ = gen_sample([f_init], [f_next],
                                                       batch[0],
                                                       trng=trng, k=k, maxlen=200,
                                                       stochastic=False, argmax=False)
        if normalize:
            lengths = numpy.array([len(s) for s in samples])
            scores = scores / lengths
        final_words = samples[numpy.argmin(scores)]
        final_sentences.append(' '.join([word_idict_trg[w] for w in final_words]) + '\n')

    with open(saveto, 'w') as fout:
        for sentence in final_sentences:
            fout.write(sentence)
    print 'Done'
Example #10
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):

        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            logging.debug(
                "Save weight mode ON, alignment matrix will be saved.")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(
                f_log_probs,
                prepare_data,
                options[i],
                pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(
        source_file.name,
        target_file.name,
        options[0]['dictionaries'][:-1],
        options[0]['dictionaries'][-1],
        n_words_source=options[0]['n_words_src'],
        n_words_target=options[0]['n_words'],
        batch_size=b,
        maxlen=float('inf'),
        sort_by_length=False
    )  #TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in alignments:  # returned by _score above
                align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
Example #11
File: nmt.py Project: sohuren/DL4MT
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        factors=1,  # input factors
        dim_per_factor=None,  # list of word vector dimensionalities (one per factor): [250,200,50] for total dimensionality of 500
        encoder='gru',
        decoder='gru_cond',
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        map_decay_c=0.,  # L2 regularization penalty towards original weights
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=None,  # source vocabulary size
        n_words=None,  # target vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=('/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                  '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'),
        valid_datasets=('../data/dev/newstest2011.en.tok',
                        '../data/dev/newstest2011.fr.tok'),
        dictionaries=(
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'),
        use_dropout=False,
        dropout_embedding=0.2,  # dropout for input embeddings (0: no dropout)
        dropout_hidden=0.5,  # dropout for hidden layers (0: no dropout)
        dropout_source=0,  # dropout source words (0: no dropout)
        dropout_target=0,  # dropout target words (0: no dropout)
        reload_=False,
        overwrite=False,
        external_validation_script=None,
        shuffle_each_epoch=True,
        finetune=False,
        finetune_only_last=False,
        sort_by_length=True,
        use_domain_interpolation=False,
        domain_interpolation_min=0.1,
        domain_interpolation_inc=0.1,
        domain_interpolation_indomain_datasets=('indomain.en', 'indomain.fr'),
        maxibatch_size=20,  #How many minibatches to load at one time
        model_version=0.1,  #store version used for training for compatibility
):

    # Model options
    model_options = locals().copy()

    if model_options['dim_per_factor'] is None:
        if factors == 1:
            model_options['dim_per_factor'] = [model_options['dim_word']]
        else:
            sys.stderr.write(
                'Error: if using factored input, you must specify \'dim_per_factor\'\n'
            )
            sys.exit(1)

    # one dictionary per source factor + 1 for the target
    assert len(dictionaries) == factors + 1
    # each factor embedding has its own dimensionality
    assert len(model_options['dim_per_factor']) == factors
    # the factor embedding dimensionalities must sum to the total input embedding size
    assert sum(model_options['dim_per_factor']) == model_options['dim_word']

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words is None:
        n_words = len(worddicts[1])
        model_options['n_words'] = n_words

    print('Loading data')
    domain_interpolation_cur = None
    if use_domain_interpolation:
        print(
            'Using domain interpolation with initial ratio %s, increase rate %s'
            % (domain_interpolation_min, domain_interpolation_inc))
        domain_interpolation_cur = domain_interpolation_min
        train = DomainInterpolatorTextIterator(
            datasets[0],
            datasets[1],
            dictionaries[:-1],
            dictionaries[1],
            n_words_source=n_words_src,
            n_words_target=n_words,
            batch_size=batch_size,
            maxlen=maxlen,
            shuffle_each_epoch=shuffle_each_epoch,
            sort_by_length=sort_by_length,
            indomain_source=domain_interpolation_indomain_datasets[0],
            indomain_target=domain_interpolation_indomain_datasets[1],
            interpolation_rate=domain_interpolation_cur,
            maxibatch_size=maxibatch_size)
    else:
        train = TextIterator(datasets[0],
                             datasets[1],
                             dictionaries[:-1],
                             dictionaries[-1],
                             n_words_source=n_words_src,
                             n_words_target=n_words,
                             batch_size=batch_size,
                             maxlen=maxlen,
                             skip_empty=True,
                             shuffle_each_epoch=shuffle_each_epoch,
                             sort_by_length=sort_by_length,
                             maxibatch_size=maxibatch_size)

    if valid_datasets and validFreq:
        valid = TextIterator(valid_datasets[0],
                             valid_datasets[1],
                             dictionaries[:-1],
                             dictionaries[-1],
                             n_words_source=n_words_src,
                             n_words_target=n_words,
                             batch_size=valid_batch_size,
                             maxlen=maxlen)
    else:
        valid = None

    comp_start = time.time()

    print('Building model')
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print('Reloading model parameters')
        params = load_params(saveto, params)

    tparams = init_theano_params(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)

    inps = [x, x_mask, y, y_mask]

    if validFreq or sampleFreq:
        print('Building sampler')
        f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # before any regularizer
    print('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    print('Done')

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                    name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value)**2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    # allow finetuning of only the last layer (becomes a linear model training problem)
    if finetune_only_last:
        updated_params = OrderedDict([(key, value)
                                      for (key, value) in tparams.iteritems()
                                      if key in ['ff_logit_W', 'ff_logit_b']])
    # allow finetuning with fixed embeddings
    elif finetune:
        updated_params = OrderedDict([(key, value)
                                      for (key, value) in tparams.iteritems()
                                      if not key.startswith('Wemb')])
    else:
        updated_params = tparams

    print('Computing gradient...')
    grads = tensor.grad(cost, wrt=itemlist(updated_params))
    print('Done')

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')

    print('Building optimizers...')
    f_grad_shared, f_update = eval(optimizer)(lr,
                                              updated_params,
                                              grads,
                                              inps,
                                              cost,
                                              profile=profile)
    print('Done')

    print('Total compilation time: {0:.1f}s'.format(time.time() - comp_start))

    print('Optimization')

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    # save model options
    json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    valid_err = None

    last_disp_samples = 0
    ud_start = time.time()
    p_validation = None
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            last_disp_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # ensure consistency in number of factors
            if len(x) and len(x[0]) and len(x[0][0]) != factors:
                sys.stderr.write(
                    'Error: mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'
                    .format(factors, len(x[0][0])))
                sys.exit(1)

            x, x_mask, y, y_mask = prepare_data(
                x, y, maxlen=maxlen
            )  # n_words_src=n_words_src, n_words=n_words) # TODO: why unused??

            if x is None:
                print('Minibatch with zero sample under length ', maxlen)
                uidx -= 1
                continue

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud = time.time() - ud_start
                wps = (last_disp_samples) / float(ud)
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ',
                      ud, "{0:.2f} sentences/s".format(wps))
                ud_start = time.time()
                last_disp_samples = 0

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print('Saving the best model...')
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto,
                            history_errs=history_errs,
                            uidx=uidx,
                            **params)
                print('Done')

                # save with uidx
                if not overwrite:
                    print('Saving the model at iteration {}...'.format(uidx))
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx,
                                history_errs=history_errs,
                                uidx=uidx,
                                **unzip_from_theano(tparams))
                    print('Done')

            # generate some samples with the model and display them
            if sampleFreq and numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[2])):
                    stochastic = True
                    x_current = x[:, :, jj][:, :, None]

                    # remove padding
                    x_current = x_current[:, :x_mask[:, jj].sum(), :]

                    sample, score, sample_word_probs, alignment, hyp_graph = gen_sample(
                        [f_init], [f_next],
                        x_current,
                        trng=trng,
                        k=1,
                        maxlen=30,
                        stochastic=stochastic,
                        argmax=False,
                        suppress_unk=False,
                        return_hyp_graph=False)
                    print('Source ', jj, ': ', end='')
                    for pos in range(x.shape[1]):
                        if x[0, pos, jj] == 0:
                            break
                        for factor in range(factors):
                            vv = x[factor, pos, jj]
                            if vv in worddicts_r[factor]:
                                sys.stdout.write(worddicts_r[factor][vv])
                            else:
                                sys.stdout.write('UNK')
                            if factor + 1 < factors:
                                sys.stdout.write('|')
                            else:
                                sys.stdout.write(' ')
                    print()
                    print('Truth ', jj, ' : ', end='')
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print(worddicts_r[-1][vv], end=' ')
                        else:
                            print('UNK', end=' ')
                    print()
                    print('Sample ', jj, ': ', end='')
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print(worddicts_r[-1][vv], end=' ')
                        else:
                            print('UNK', end=' ')
                    print()

            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                                   model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        if use_domain_interpolation and (
                                domain_interpolation_cur < 1.0):
                            domain_interpolation_cur = min(
                                domain_interpolation_cur +
                                domain_interpolation_inc, 1.0)
                            print(
                                'No progress on the validation set, increasing domain interpolation rate to %s and resuming from best params'
                                % domain_interpolation_cur)
                            train.adjust_domain_interpolation_rate(
                                domain_interpolation_cur)
                            if best_p is not None:
                                zip_to_theano(best_p, tparams)
                            bad_counter = 0
                        else:
                            print('Early Stop!')
                            estop = True
                            break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print('Valid ', valid_err)

                if external_validation_script:
                    print("Calling external validation script")
                    if p_validation is not None and p_validation.poll() is None:
                        print("Waiting for previous validation run to finish")
                        print(
                            "If this takes too long, consider increasing validation interval, reducing validation set size, or speeding up validation by using multiple processes"
                        )
                        valid_wait_start = time.time()
                        p_validation.wait()
                        print("Waited for {0:.1f} seconds".format(
                            time.time() - valid_wait_start))
                    print('Saving model...')
                    params = unzip_from_theano(tparams)
                    numpy.savez(saveto + '.dev',
                                history_errs=history_errs,
                                uidx=uidx,
                                **params)
                    json.dump(model_options,
                              open('%s.dev.npz.json' % saveto, 'wb'),
                              indent=2)
                    print('Done')
                    p_validation = Popen([external_validation_script])

            # finish after this many updates
            if uidx >= finish_after:
                print('Finishing after %d iterations!' % uidx)
                estop = True
                break

        print('Seen %d samples' % n_samples)

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                           model_options, valid)
        valid_err = valid_errs.mean()

        print('Valid ', valid_err)

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
Example #12
def translate_model(queue, rqueue, pid, models, options, k, normalization_alpha, verbose, nbest, return_alignment, suppress_unk, return_hyp_graph, deviceid):

    # if the --device-list argument is set
    if deviceid != '':
        import os
        theano_flags = os.environ['THEANO_FLAGS'].split(',')
        exist = False
        for i in xrange(len(theano_flags)):
            if theano_flags[i].strip().startswith('device'):
                exist = True
                theano_flags[i] = '%s=%s' % ('device', deviceid)
                break
        if not exist:
            theano_flags.append('%s=%s' % ('device', deviceid))
        os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    from theano_util import (floatX, numpy_floatX, load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy_floatX(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys([key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng, return_alignment=return_alignment)

        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment, hyp_graph = gen_sample(fs_init, fs_next,
                                   numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
                                   trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False, return_alignment=return_alignment,
                                   suppress_unk=suppress_unk, return_hyp_graph=return_hyp_graph)

        # normalize scores according to sequence lengths
        if normalization_alpha:
            adjusted_lengths = numpy.array([len(s) ** normalization_alpha for s in sample])
            score = score / adjusted_lengths
        if nbest:
            return sample, score, word_probs, alignment, hyp_graph
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx], hyp_graph

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid,idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
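Because Theano binds to a device the moment it is imported, the THEANO_FLAGS rewrite above must run in each worker process before any theano import. A minimal sketch of a parent spawning one worker per GPU this way (the device names and worker body are illustrative, and it assumes the parent itself has not yet imported theano):

import os
from multiprocessing import Process, Queue

def worker(queue, rqueue, pid, deviceid):
    # must happen before any theano import in this process
    flags = [f for f in os.environ.get('THEANO_FLAGS', '').split(',')
             if f and not f.strip().startswith('device')]
    flags.append('device=' + deviceid)
    os.environ['THEANO_FLAGS'] = ','.join(flags)
    import theano  # binds to the requested device here
    # ... build samplers and consume the queue as in translate_model() ...

if __name__ == '__main__':
    queue, rqueue = Queue(), Queue()
    for pid, dev in enumerate(['cuda0', 'cuda1']):
        Process(target=worker, args=(queue, rqueue, pid, dev)).start()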
Ejemplo n.º 13
0
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose,
                    nbest, return_alignment, suppress_unk, return_hyp_graph):
    from theano_util import (load_params, init_theano_params)
    from nrg import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []
    print models

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)

        # print full parameter matrices when dumping models as plain text
        numpy.set_printoptions(threshold=numpy.nan)
        tparams = init_theano_params(params)

        f_init, f_next = build_sampler(tparams,
                                       option,
                                       use_noise,
                                       trng,
                                       return_alignment=return_alignment)

        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        sample, score, word_probs, alignment, hyp_graph = gen_sample(
            fs_init,
            fs_next,
            # factors, time-steps, n-sample
            numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
            trng=trng,
            k=k,
            maxlen=200,
            stochastic=False,
            argmax=False,  # beam search: no sampling, no greedy argmax
            return_alignment=return_alignment,
            suppress_unk=suppress_unk,
            return_hyp_graph=return_hyp_graph)
        if normalize:  # length normalization
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths

        if nbest:  # return n-best
            return sample, score, word_probs, alignment, hyp_graph
        else:  # return the top best
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[
                sidx], hyp_graph

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        seq = _translate(x)

        rqueue.put((idx, seq))
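All of the translate_model variants above share the same queue protocol: jobs arrive as (index, sequence) pairs and a None sentinel ends the loop. A self-contained sketch of that protocol, with a dummy worker standing in for the real sampler:

from multiprocessing import Process, Queue

def echo_worker(queue, rqueue):
    # stands in for translate_model(): consume jobs until the None sentinel
    while True:
        req = queue.get()
        if req is None:
            break
        idx, x = req
        rqueue.put((idx, list(reversed(x))))  # dummy "translation"

if __name__ == '__main__':
    queue, rqueue = Queue(), Queue()
    p = Process(target=echo_worker, args=(queue, rqueue))
    p.start()
    jobs = [[1, 4, 9], [7, 2]]          # toy word-index sequences
    for idx, x in enumerate(jobs):
        queue.put((idx, x))
    queue.put(None)                     # sentinel ends the worker loop
    results = sorted(rqueue.get() for _ in jobs)  # reorder by index
    p.join()
    print(results)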
Ejemplo n.º 14
0
def main(models,
         saveto,
         bpe_file,
         save_alignment=None,
         k=5,
         normalize=False,
         n_process=5,
         chr_level=False,
         verbose=False,
         nbest=False,
         suppress_unk=False,
         a_json=False,
         print_word_probabilities=False,
         return_hyp_graph=False):
    # load model options
    options = []
    for model in models:
        options.append(load_config(model))

        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']

    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    # TODO: turn this translation loop into a proper server

    # The helper functions below make up the server-side translation workflow

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f, processes, queue):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                w = [
                    word_dicts[i][factor] if factor in word_dicts[i] else 1
                    for (i, factor) in enumerate(w.split('|'))
                ]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'
                        .format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes(queue):
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples, processes, queue, rqueue):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    resp = rqueue.get(True, 5)
                # if queue is empty after 5s, check if processes are still alive
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}"
                                .format(processes[midx].pid,
                                        processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1),
                                                                  n_samples))
            while out_idx < n_samples and trans[out_idx] is not None:
                yield trans[out_idx]
                out_idx += 1

    def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        while source_file_t[0] != "EOT":
            # tokenize and BPE-segment each received sentence
            for i in range(len(source_file_t)):
                source_file_t[i] = bpe.segment(
                    tokenizer.tokenize(source_file_t[i],
                                       return_str=True)).strip()
            print source_file_t
            detokenized = ''
            queue = Queue()
            rqueue = Queue()
            processes = [None] * n_process
            for midx in xrange(n_process):
                processes[midx] = Process(
                    target=translate_model,
                    args=(queue, rqueue, midx, models, options, k, normalize,
                          verbose, nbest, save_alignment is not None,
                          suppress_unk, return_hyp_graph, fs_init, fs_next))
                processes[midx].start()

            n_samples, source_sentences = _send_jobs(source_file_t, processes,
                                                     queue)
            _finish_processes(queue)
            # retrieve translations as workers finish, in input order
            for i, trans in enumerate(
                    _retrieve_jobs(n_samples, processes, queue, rqueue)):
                print "NEXT SENTENCE:"
                if nbest:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph,
                                          detailed=True,
                                          highlight_best=True)
                    order = numpy.argsort(scores)
                    for j in order:
                        if print_word_probabilities:
                            probs = " ||| " + " ".join(
                                "{0}".format(prob) for prob in word_probs[j])
                        else:
                            probs = ""
                        saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                            i, _seqs2words(samples[j]), scores[j], probs))
                        # print alignment matrix for each hypothesis
                        # header: sentence id ||| translation ||| score ||| source |||
                        #         source_token_count+eos translation_token_count+eos
                        if save_alignment is not None:
                            if a_json:
                                print_matrix_json(
                                    alignment[j], source_sentences[i],
                                    _seqs2words(samples[j]).split(), i, i + j,
                                    save_alignment)
                            else:
                                save_alignment.write(
                                    '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'
                                    .format(i, _seqs2words(samples[j]),
                                            scores[j],
                                            ' '.join(source_sentences[i]),
                                            len(source_sentences[i]) + 1,
                                            len(samples[j])))
                                print_matrix(alignment[j], save_alignment)
                else:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph,
                                          detailed=True,
                                          highlight_best=True)
                    # output handling: detokenize and capitalize the hypothesis
                    x = _seqs2words(samples)
                    detokenized += detokenizer.detokenize(
                        (x.decode('utf-8') + " ").split(), return_str=True)
                    detokenized = detokenized[0].upper() + detokenized[1:]
                    if print_word_probabilities:
                        for prob in word_probs:
                            saveto.write("{} ".format(prob))
                        saveto.write('\n')
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(alignment, source_sentences[i],
                                              _seqs2words(trans[0]).split(), i,
                                              i, save_alignment)
                        else:
                            save_alignment.write(
                                '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.
                                format(i, _seqs2words(trans[0]), 0,
                                       ' '.join(source_sentences[i]),
                                       len(source_sentences[i]) + 1,
                                       len(trans[0])))
                            print_matrix(alignment, save_alignment)
            c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
            source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        c.close()
        sys.stderr.write('Done\n')

    def _listen(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe):
        while True:
            try:  # Establish connection with client.
                try:
                    print 'Got connection from', addr
                    print "Receiving..."
                    fname = c.recv(4096)
                except socket.error:
                    c.close()
                    print "connection closed"
                    break
                print fname
                c.send("okay")
                try:
                    t = threading.Thread(target=_parallelized_main,
                                         args=(fs_init, fs_next, c, bpe,
                                               tokenizer, detokenizer))
                    t.start()
                    t.join()
                except socket.error:
                    c.close()
                    break
            except KeyboardInterrupt as e:
                LOG.debug('Ctrl+C issued ...')
                LOG.info('Terminating server ...')
                try:
                    c.shutdown(socket.SHUT_RDWR)
                    c.close()
                except:
                    pass
                break

    s = socket.socket()          # create a socket object
    host = socket.gethostname()  # get local machine name
    port = 12345                 # reserve a port for the service
    s.bind((host, port))         # bind to the port and wait for client connections

    # Beginning model loading
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(
            tparams,
            option,
            use_noise,
            trng,
            return_alignment=(save_alignment is not None))

        fs_init.append(f_init)
        fs_next.append(f_next)
    # end of model loading
    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()
    # start listening to connections once models are loaded
    codes = codecs.open(bpe_file[0], encoding='utf-8')
    bpe = BPE(codes, '@@')
    while True:
        try:
            s.listen(5)
            print("Waiting for connections and stuff...")
            c, addr = s.accept()
            t = threading.Thread(target=_listen,
                                 args=(c, addr, fs_init, fs_next, tokenizer,
                                       detokenizer, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
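The server above listens on socket.gethostname() port 12345, answers the initial handshake with "okay", then translates chunks until it receives "EOT". A minimal client sketch under exactly those assumptions (the server must already be running; the handshake payload is arbitrary):

import socket

s = socket.socket()
s.connect((socket.gethostname(), 12345))

s.send('session'.encode('utf-8'))           # handshake; server answers "okay"
print(s.recv(4096).decode('utf-8'))

s.send(u'Hello world .'.encode('utf-8'))    # one chunk of source text
print(s.recv(4096).decode('utf-8'))         # detokenized translation

s.send('EOT'.encode('utf-8'))               # ends the server-side loop
s.close()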
Ejemplo n.º 15
0
def main(model,
         dictionary,
         dictionary_target,
         source_file,
         saveto,
         k=5,
         batch_size=16,
         opt_base=None,
         normalize=False,
         output_attention=False):
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    #load params
    if opt_base is None:
        options = load_config(model)
    else:
        options = load_config(opt_base)

    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list, '')
    tparams = init_theano_params(params)

    #load dictionary
    if dictionary is None:
        dictionary = options['dictionaries'][0]
    word_dict = load_dict(dictionary)

    if options['n_words_src']:
        for key, idx in word_dict.items():
            if idx >= options['n_words_src']:
                del word_dict[key]
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    if dictionary_target is None:
        dictionary_target = options['dictionaries'][1]
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    def _send_jobs(fname):
        retval = []
        retval_ori = []
        with open(fname, 'r') as f:
            for idx, line in enumerate(f):
                words = line.strip().split()
                if len(words) == 0:
                    continue
                retval_ori.append(line.strip())
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                retval.append(x)
        logging.info('total %s sentences' % len(retval))
        return retval, retval_ori

    sources, sources_ori = _send_jobs(source_file)

    batches = []
    n_full = len(sources) // batch_size
    for i in range(n_full):
        batches.append(
            prepare_data(sources[i * batch_size:(i + 1) * batch_size]))
    if n_full * batch_size < len(sources):
        batches.append(prepare_data(sources[n_full * batch_size:]))
    final_sentences = []
    print 'Building beam sampler...',
    f_beam_sample = build_beam_sampler(tparams, options, use_noise, trng, k)
    print 'Done'
    for batch in batches:
        final_word_ids, final_beam_ids, final_beam_scores = f_beam_sample(
            *batch)
        for i in range(final_word_ids.shape[1]):
            word_ids = process_beam_results(final_word_ids[:, i, :],
                                            final_beam_ids[:, i, :],
                                            final_beam_scores[i])
            word_ids = [[wid for wid in line if wid != 0] for line in word_ids]
            words = [word_idict_trg[wid] for wid in word_ids[0]]
            sentence = ' '.join(words) + '\n'
            final_sentences.append(sentence)
            print len(final_sentences)

    with open(saveto, 'w') as fout:
        for sentence in final_sentences:
            fout.write(sentence)
    print 'Done'
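The batching above follows the usual fixed-step slice pattern: full batches plus a shorter remainder. A tiny stand-alone sketch of the same pattern:

def make_batches(items, batch_size):
    # consecutive slices of batch_size, keeping the shorter remainder
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

print(make_batches(list(range(10)), 4))  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]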
Ejemplo n.º 16
0
    def init(self, model_options):
        """Exposes: (but Pyro does not see them)
            self.f_init
            self.f_next
            self.f_log_probs
            self.f_grad_shared
            self.f_update
        """

        reload_ = model_options['reload_']
        saveto = model_options['saveto']
        decay_c = model_options['decay_c']
        alpha_c = model_options['alpha_c']
        map_decay_c = model_options['map_decay_c']
        finetune = model_options['finetune']
        finetune_only_last = model_options['finetune_only_last']
        clip_c = model_options['clip_c']
        optimizer = model_options['optimizer']

        comp_start = time.time()

        print 'Building model'
        params = init_params(model_options)
        # reload parameters
        if reload_ and os.path.exists(saveto):
            print 'Reloading model parameters'
            params = load_params(saveto, params)

        self.tparams = init_theano_params(params)

        trng, self.use_noise, x, x_mask, y, y_mask, opt_ret, per_sent_neg_log_prob = build_model(
            self.tparams, model_options)

        inps = [x, x_mask, y, y_mask]

        self.f_init, self.f_next = build_sampler(self.tparams, model_options,
                                                 self.use_noise, trng)

        # before any regularizer
        print 'Building f_log_probs...',
        self.f_log_probs = theano.function(inps,
                                           per_sent_neg_log_prob,
                                           profile=profile)
        print 'Done'

        # apply per-sentence weight to cost_vec before averaging
        per_sent_weight = tensor.vector('per_sent_weight', dtype='float32')
        per_sent_weight.tag.test_value = numpy.ones(10).astype('float32')
        cost = (per_sent_neg_log_prob *
                per_sent_weight).mean()  # mean of elem-wise multiply

        # apply L2 regularization on weights
        if decay_c > 0.:
            decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
            weight_decay = 0.
            for kk, vv in self.tparams.iteritems():
                weight_decay += (vv**2).sum()
            weight_decay *= decay_c
            cost += weight_decay

        # regularize the alpha weights
        if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
            alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
            alpha_reg = alpha_c * ((
                tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None]
                - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
            cost += alpha_reg

        # apply L2 regularisation to loaded model (map training)
        if map_decay_c > 0:
            map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                        name="map_decay_c")
            weight_map_decay = 0.
            for kk, vv in self.tparams.iteritems():
                init_value = theano.shared(vv.get_value(), name=kk + "_init")
                weight_map_decay += ((vv - init_value)**2).sum()
            weight_map_decay *= map_decay_c
            cost += weight_map_decay

        # allow finetuning with fixed embeddings
        if finetune:
            updated_params = OrderedDict([
                (key, value) for (key, value) in self.tparams.iteritems()
                if not key.startswith('Wemb')
            ])
        elif finetune_only_last:  # allow finetuning of only last layer (becomes a linear model training problem)
            updated_params = OrderedDict([
                (key, value) for (key, value) in self.tparams.iteritems()
                if key in ['ff_logit_W', 'ff_logit_b']
            ])
        else:
            updated_params = self.tparams

        print 'Computing gradient...',
        grads = tensor.grad(cost, wrt=itemlist(updated_params))
        print 'Done'

        # apply gradient clipping here
        if clip_c > 0.:
            g2 = 0.
            for g in grads:
                g2 += (g**2).sum()
            new_grads = []
            for g in grads:
                new_grads.append(
                    tensor.switch(g2 > (clip_c**2),
                                  g / tensor.sqrt(g2) * clip_c, g))
            grads = new_grads

        # compile the optimizer, the actual computational graph is compiled here
        lr = tensor.scalar(name='lr')

        print 'Building optimizers...',
        op_map = {
            'adam': optimizers.adam,
            'adadelta': optimizers.adadelta,
            'rmsprop': optimizers.rmsprop,
            'sgd': optimizers.sgd
        }
        inps = inps + [
            per_sent_weight,
        ]
        self.f_grad_shared, self.f_update = op_map[optimizer](
            lr,
            updated_params,
            grads,
            inps,
            per_sent_neg_log_prob,
            profile=profile)
        print 'Done'

        print 'Total compilation time: {0:.1f}s'.format(time.time() -
                                                        comp_start)
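The clip_c block above rescales every gradient by a common factor once the global L2 norm exceeds the threshold, rather than clipping each tensor separately. The same rule in plain numpy (toy values):

import numpy

def clip_global_norm(grads, clip_c):
    # rescale all gradients together when the global L2 norm exceeds clip_c
    g2 = sum((g ** 2).sum() for g in grads)
    if g2 > clip_c ** 2:
        grads = [g * clip_c / numpy.sqrt(g2) for g in grads]
    return grads

print(clip_global_norm([numpy.array([3.0, 4.0])], 1.0))  # [array([0.6, 0.8])]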
Ejemplo n.º 17
0
def translate_model_external_embedding(queue, rqueue, pid, models, options, k,
                                       normalize, verbose, nbest,
                                       return_alignment, suppress_unk):

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler_embeddings, gen_sample,
                     init_params_embeddings)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):

        # allocate model parameters
        params = init_params_embeddings(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler_embeddings(
            tparams,
            option,
            use_noise,
            trng,
            return_alignment=return_alignment)

        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment = gen_sample(
            fs_init,
            fs_next,
            seq,
            trng=trng,
            k=int(k),
            maxlen=200,
            stochastic=False,
            argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score /= lengths
        if nbest:
            return sample, score, word_probs, alignment
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
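Since gen_sample returns costs (negative log-probabilities), numpy.argmin selects the most probable hypothesis; with the normalize flag the comparison switches to per-word cost, which can favour longer outputs. Toy numbers:

import numpy

sample = [[4, 7, 0], [4, 7, 9, 2, 0]]   # two hypotheses (target word ids)
score = numpy.array([3.0, 4.0])         # costs: lower is better

lengths = numpy.array([len(s) for s in sample])
normalized = score / lengths            # [1.0, 0.8]

print(numpy.argmin(score))              # 0: best raw cost
print(numpy.argmin(normalized))         # 1: best per-word cost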
Ejemplo n.º 18
0
def rescore_model(source_file, target_file, saveto, models, options, b, normalize, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalize=normalize, alignweights = alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(source_file.name, target_file.name,
                    options[0]['dictionaries'][:-1], options[0]['dictionaries'][1],
                     n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'],
                     batch_size=b,
                     maxlen=float('inf'),
                     sort_by_length=False) #TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str,[s[i] for s in scores]))
        saveto.write('{0} {1}\n'.format(line.strip(), score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in alignments:
                align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file, saveto.name, align_OUT)
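Each output line written above is the original target line followed by one score per ensemble member. A sketch of reading such a line back (the values are toy):

n_models = 2                              # assumed ensemble size
line = 'the cat sat -12.3 -11.9\n'        # toy rescored output line
parts = line.split()
scores = [float(s) for s in parts[-n_models:]]
sentence = ' '.join(parts[:-n_models])
print('{0} -> mean cost {1}'.format(sentence, sum(scores) / n_models))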
Ejemplo n.º 19
0
def rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # allocate model parameters
        params = init_params(option)

        # load model parameters and set theano shared variables
        params = load_params(model, params)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            sys.stderr.write("\t*** Save weight mode ON, alignment matrix will be saved.\n")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalize=normalize, alignweights = alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    lines = source_file.readlines()
    nbest_lines = nbest_file.readlines()

    if alignweights: ### opening the temporary file.
        temp_name = saveto.name + ".json"
        align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name)

    with tempfile.NamedTemporaryFile(prefix='rescore-tmpin') as tmp_in, tempfile.NamedTemporaryFile(prefix='rescore-tmpout') as tmp_out:
        for line in nbest_lines:
            linesplit = line.split(' ||| ')
            idx = int(linesplit[0])  # index into the source file, starting from 0
            tmp_in.write(lines[idx])
            tmp_out.write(linesplit[1] + '\n')

        tmp_in.seek(0)
        tmp_out.seek(0)
        pairs = TextIterator(tmp_in.name, tmp_out.name,
                        options[0]['dictionaries'][:-1], options[0]['dictionaries'][1],
                         n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False) #TODO: sorting by length could be more efficient, but we'd have to synchronize scores with n-best list after


        scores, alignments = _score(pairs, alignweights)

        for i, line in enumerate(nbest_lines):
            score_str = ' '.join(map(str,[s[i] for s in scores]))
            saveto.write('{0} {1}\n'.format(line.strip(), score_str))

        ### optional save weights mode.
        if alignweights:
            for line in alignments:
                align_OUT.write(line + "\n")
    if alignweights:
        combine_source_target_text(source_file, nbest_file, saveto.name, align_OUT)
        align_OUT.close()
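The rescorer above depends on the Moses-style ' ||| '-separated n-best format, where field 0 is the 0-based index into the source file and field 1 is the hypothesis. A parsing sketch (the entry is a toy value):

nbest_line = '3 ||| the cat sat on the mat ||| 0.17 -5.42\n'  # toy entry

fields = nbest_line.split(' ||| ')
src_idx = int(fields[0])        # 0-based index into the source file
hypothesis = fields[1].strip()
print('{0}: {1}'.format(src_idx, hypothesis))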