Example #1
    def load_dictionaries(self,
                          dictionary_files,
                          n_words_src=None,
                          n_words_trg=None):
        """
        Load the input dictionaries and output dictionary for a model. Note the `n_words_src` kwarg is here to
        maintain compatibility with the dictionary loading logic in Nematus.

        Args:
          dictionary_files: list of strings which are paths to *.json Nematus dictionary files

        Returns:
          input_dicts, input_idicts, output_dict, output_idict
        """
        def load_utf8_dict(filename):
            with codecs.open(filename, 'rb', encoding='utf8') as f:
                return {k.encode('utf8'): v for k, v in json.load(f).items()}

        input_dict_files = dictionary_files[:-1]
        output_dict_file = dictionary_files[-1]

        # load source dictionary and invert
        input_dicts = []
        input_idicts = []
        for dictionary in input_dict_files:
            input_dict = load_utf8_dict(dictionary) if self.is_utf8(
                dictionary) else load_dict(dictionary)
            if n_words_src is not None:
                for key, idx in input_dict.items():
                    if idx >= n_words_src:
                        del input_dict[key]
            input_idict = dict()
            for kk, vv in input_dict.iteritems():
                input_idict[vv] = kk
            input_idict[0] = '<eos>'
            input_idict[1] = 'UNK'
            input_dicts.append(input_dict)
            input_idicts.append(input_idict)

        # load target dictionary and invert
        output_dict = load_utf8_dict(output_dict_file) if self.is_utf8(
            output_dict_file) else load_dict(output_dict_file)
        if n_words_trg is not None:
            for key, idx in output_dict.items():
                if idx >= n_words_trg:
                    del output_dict[key]
        output_idict = dict()
        for kk, vv in output_dict.iteritems():
            output_idict[vv] = kk
        output_idict[0] = '<eos>'
        output_idict[1] = 'UNK'

        return {
            'input_dicts': input_dicts,
            'input_idicts': input_idicts,
            'output_dict': output_dict,
            'output_idict': output_idict,
            'src_size': n_words_src,
            'trg_size': n_words_trg
        }
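
A minimal usage sketch, assuming the enclosing class (called `TranslationModel` here as a placeholder) exposes the `load_dictionaries` method above and the *.json paths point to real Nematus dictionaries:

# `TranslationModel` and the dictionary paths are placeholders, not part of the snippet above
model = TranslationModel()
dicts = model.load_dictionaries(['vocab.src.json', 'vocab.trg.json'],
                                n_words_src=30000,
                                n_words_trg=30000)
# look up the index of a source token, falling back to 1 (UNK)
src_idx = dicts['input_dicts'][0].get('house', 1)
# map a target index back to its token string
trg_token = dicts['output_idict'].get(2, 'UNK')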
Example #2
    def __init__(self,
                 source,
                 target,
                 hter,
                 source_dict,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=False,
                 maxibatch_size=20):
        # shuffle the order of the files at the start of every epoch
        if shuffle_each_epoch:
            shuffle.main([source, target, hter])
            self.source = fopen(source + '.shuf')
            self.target = fopen(target + '.shuf')
            self.hter = fopen(hter + '.shuf')
        else:
            self.source = fopen(source)
            self.target = fopen(target)
            self.hter = fopen(hter)

        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for key, idx in self.source_dict.items():
                if idx >= self.n_words_source:
                    del self.source_dict[key]
        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.hter_buffer = []

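        # k is the number of sentences read into the buffers at a time (one "maxibatch")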
        self.k = batch_size * maxibatch_size
        self.end_of_data = False
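
A minimal usage sketch of the iterator above, assuming the corpus and dictionary paths are placeholders for real files; as in Example #4, iterating it yields (source, target, hter) minibatches:

# placeholder paths; each batch is a list of word-index sequences plus a list of HTER scores
train = TextIterator('train.bpe.en', 'train.bpe.de', 'train.hter',
                     'train.bpe.en.json', 'train.bpe.de.json',
                     batch_size=64, maxlen=100,
                     n_words_source=30000, n_words_target=30000,
                     shuffle_each_epoch=True)
for x, y, hter in train:
    print len(x), len(y), len(hter)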
Example #3
def initialize(model_options, pyro_port, pyro_name, pyro_key):
    if model_options['dim_per_factor'] is None:
        if model_options['factors'] == 1:
            model_options['dim_per_factor'] = [model_options['dim_word']]
        else:
            sys.stderr.write(
                'Error: if using factored input, you must specify \'dim_per_factor\'\n'
            )
            sys.exit(1)

    assert (len(model_options['dictionaries']) == model_options['factors'] + 1)
    # one dictionary per source factor + 1 for target factor
    assert (len(model_options['dim_per_factor']) == model_options['factors'])
    # each factor embedding has its own dimensionality
    assert (
        sum(model_options['dim_per_factor']) == model_options['dim_word']
    )  # dimensionality of factor embeddings sums up to total dimensionality of input embedding vector

    # load dictionaries and invert them
    worddicts = [None] * len(model_options['dictionaries'])
    for ii, dd in enumerate(model_options['dictionaries']):
        worddicts[ii] = load_dict(dd)

    if model_options['n_words_src'] is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if model_options['n_words'] is None:
        n_words = len(worddicts[1])
        model_options['n_words'] = n_words

    print 'Initializing remote Theano server'
    # In order to transfer numpy objects across the network, must use pickle as Pyro Serializer.
    # Also requires various environment flags (PYRO_SERIALIZERS_ACCEPTED, PYRO_SERIALIZER)
    #   for both name server and server.
    Pyro4.config.SERIALIZER = 'pickle'
    Pyro4.config.NS_PORT = pyro_port
    remote = Pyro4.Proxy("PYRONAME:{0}".format(pyro_name))
    remote._pyroHmacKey = pyro_key

    remote.init(model_options)
    return remote
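
A minimal call sketch, assuming a Pyro4 name server and the remote Theano server are already running with PYRO_SERIALIZERS_ACCEPTED=pickle and PYRO_SERIALIZER=pickle exported in their environment; every value below is a placeholder:

# a single-factor options dict with just the fields initialize() reads; paths are placeholders
model_options = {
    'factors': 1,
    'dim_word': 500,
    'dim_per_factor': None,       # defaults to [dim_word] for a single factor
    'dictionaries': ['vocab.src.json', 'vocab.trg.json'],
    'n_words_src': None,          # filled in from the source dictionary size
    'n_words': None,              # filled in from the target dictionary size
}
remote = initialize(model_options, pyro_port=9090,
                    pyro_name='nematus.remote', pyro_key='shared-secret')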
Example #4
def train(
        batch_size=80,
        valid_batch_size=80,
        dim=100,
        dim_word=500,
        dispFreq=100,
        saveFreq=3000,
        validFreq=1000,
        saveto='RNN_model/wmt17.en-de.npz',
        datasets=['tuning/train.bpe.en', 'tuning/train.bpe.de'],
        valid_datasets=['tuning/dev.bpe.en', 'tuning/dev.bpe.de'],
        dictionaries=['data/train.bpe.en.json', 'data/train.bpe.de.json'],
        hter=['tuning/train.hter', 'tuning/dev.hter'],
        n_words_src=40000,
        n_words_tgt=40000,
        nmt_model='model/model.npz.best_bleu',
        lrate=0.0001,  # learning rate
        use_dropout=True,
        patience=10,
        max_epochs=5000,
        finish_after=1000000,
        maxibatch_size=20,
        optimizer='rmsprop',
        shuffle_each_epoch=True,
        reload_=True,
        overwrite=False,
        sort_by_length=False,
        maxlen=1000,
        decay_c=0.,  # L2 regularization penalty
        map_decay_c=0.,  # L2 regularization penalty towards original weights
        clip_c=1.0,
        dropout_embedding=0.2,  # dropout for input embeddings (0: no dropout)
        dropout_hidden=0.2,  # dropout for hidden layers (0: no dropout)
        dropout_source=0.1,  # dropout source words (0: no dropout)
        dropout_target=0.1,  # dropout target words (0: no dropout)
        model_version=0.1):

    # capture the local arguments as the model options dict
    model_options = locals().copy()
    print 'Model options:', model_options

    # load the dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # if the vocabulary sizes are not given, default them to the full dictionary sizes
    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words_tgt is None:
        n_words_tgt = len(worddicts[1])
        model_options['n_words_tgt'] = n_words_tgt

    # load the data
    print 'Loading data ...'
    train = TextIterator(datasets[0],
                         datasets[1],
                         hter[0],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=batch_size,
                         maxlen=maxlen,
                         shuffle_each_epoch=shuffle_each_epoch,
                         sort_by_length=sort_by_length,
                         maxibatch_size=maxibatch_size)
    valid = TextIterator(valid_datasets[0],
                         valid_datasets[1],
                         hter[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    #numpy.random.seed(1234)
    # initialize the model parameters
    print 'Init parameters ...'
    params = init_params(model_options)

    #reload parameters
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)

    # turn the network weights W, b into Theano shared variables
    tparams = init_theano_params(params)

    # build the model
    print 'Building model ...',
    trng, use_noise, x, x_mask, y, y_mask, hter, \
        y_pred, cost = build_model(tparams, model_options)
    print 'Done'
    """
    @function: debugging
    print Wt.get_value().shape
    print tparams['W'].get_value().shape
     
    f_tt = theano.function([x,x_mask,y,y_mask],tt)
    f_emb = theano.function([x,x_mask,y,y_mask],emb)
    f_pred = theano.function([x,x_mask,y,y_mask],y_pred)
    f_cost = theano.function([x,x_mask,y,y_mask,hter],cost)
    
    
    for x, y, hter in train:
            # prepare data for training
            x, x_mask, y, y_mask = nmt.prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words_tgt)
            hter = numpy.array(hter).astype('float32')
            hter = hter.reshape([hter.shape[0],1])

            print f_pred(x,x_mask,y,y_mask).shape
            print f_cost(x,x_mask,y,y_mask,hter)
            #print f_cost(x,x_mask,y,y_mask,hter)
            
            sys.exit(0)
    """

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                    name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value)**2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    print 'Building f_pred...',
    inps = [x, x_mask, y, y_mask]
    f_pred = theano.function(inps, y_pred, profile=False)
    print 'Done'

    print 'Building f_cost...',
    inps = [x, x_mask, y, y_mask, hter]
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
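    # global-norm clipping: if the L2 norm of all gradients exceeds clip_c,
    # every gradient is rescaled by clip_c / norm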
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr,
                                              tparams,
                                              grads,
                                              inps,
                                              cost,
                                              profile=False)
    print 'Done'

    print 'Start Optimization'
    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    valid_err = None
    fp = open('RNN_model/valid.error', 'w')

    for eidx in xrange(max_epochs):

        n_samples = 0
        for x, y, hter in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)
            # prepare data for training
            x, x_mask, y, y_mask = nmt.prepare_data(x,
                                                    y,
                                                    maxlen=maxlen,
                                                    n_words_src=n_words_src,
                                                    n_words=n_words_tgt)
            hter = numpy.array(hter).astype('float32')
            hter = hter.reshape([hter.shape[0], 1])

            # prepare_data returns None when no sentence in the minibatch is shorter than maxlen
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask, hter)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto,
                            history_errs=history_errs,
                            uidx=uidx,
                            **params)
                json.dump(model_options,
                          open('%s.json' % saveto, 'wb'),
                          indent=2)
                print 'Done'

                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx,
                                history_errs=history_errs,
                                uidx=uidx,
                                **unzip_from_theano(tparams))
                    print 'Done'

            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_error(f_cost, nmt.prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err
                fp.write('valid error: ' + str(valid_err) + '\n')

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs = pred_error(f_cost, nmt.prepare_data, model_options, valid)
        valid_err = valid_errs.mean()
        print 'Valid ', valid_err

        fp.write('Final error: ' + str(valid_err) + '\n')
        fp.close()

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
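
A minimal invocation sketch, assuming the default dataset, dictionary and NMT-model paths in the signature above exist on disk; only a few settings are overridden here:

if __name__ == '__main__':
    final_valid_err = train(batch_size=64,
                            max_epochs=20,
                            reload_=False,
                            saveto='RNN_model/wmt17.en-de.npz')
    print 'Final validation error:', final_valid_err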
Example #5
def sample_par(lines,
               model_options,
               f_init,
               f_next,
               beam_size=3,
               suppress_unk=True):
    dictionaries = model_options['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if model_options['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= model_options['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww).replace('@@ ', '')

    seqs = []
    for idx, line in enumerate(lines):
        words = line.strip().split()

        x = []
        for w in words:
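            # each token may carry several '|'-separated factors; factors missing from the
            # dictionary map to index 1 (UNK)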
            w = [
                word_dicts[i][f] if f in word_dicts[i] else 1
                for (i, f) in enumerate(w.split('|'))
            ]
            if len(w) != model_options['factors']:
                raise Exception(
                    'Error: expected {0} factors, but input word has {1}\n'.
                    format(model_options['factors'], len(w)))
            x.append(w)

        x += [[0] * model_options['factors']]
        seqs.append(x)

    seqs_y_dummy = [[x[0] for x in element] for element in seqs]
    sequences, xmask, dummy_y, ymask = prepare_data(seqs, seqs_y_dummy)

    print 'Calling gen_par_sample'
    t0 = time.time()
    parsample, parscore, parword_probs = gen_par_sample(
        [f_init], [f_next],
        sequences,
        xmask,
        k=beam_size,
        maxlen=200,
        suppress_unk=suppress_unk)
    print 'gen_par_sample returned, took %.1f seconds' % (time.time() - t0)

    t0 = time.time()
    compare_samples = []
    for i in range(len(seqs)):
        mask_size = int(round(np.sum(xmask[:, i])))
        seq = sequences[:, :mask_size, i:i + 1]
        print 'calling gen_sample'
        sample, score, word_probs, _, _ = gen_sample(
            [f_init], [f_next],
            seq,
            k=beam_size,
            maxlen=200,
            stochastic=False,
            suppress_unk=suppress_unk)
        compare_samples += sample

    print 'iterative gen_sample took %.1f seconds' % (time.time() - t0)

    sample_words = []
    for sents in parsample:
        #sample_words.append([_seqs2words(cand) for cand in sents])
        sample_words += sents

    return sample_words, compare_samples
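
A minimal usage sketch, assuming `model_options`, `f_init` and `f_next` come from a model that was loaded and compiled elsewhere (in the Theano Nematus codebase this is typically done via build_sampler); with a single input factor each token is a plain word, otherwise tokens take the form "word|factor1|factor2":

lines = ['das ist ein Haus', 'guten Morgen']   # placeholder source sentences
par_words, iter_samples = sample_par(lines, model_options, f_init, f_next,
                                     beam_size=5, suppress_unk=True)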