Code Example #1
def sgd(lr, tparams, grads, inp, cost, profile=False):
    # shared variables that cache the gradients between the two function calls
    gshared = [
        theano.shared(p.get_value() * 0., name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    # compute the cost and, as a side effect, store the gradients in gshared
    f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile)

    # vanilla SGD step on the cached gradients: p <- p - lr * g
    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup, profile=profile)

    return f_grad_shared, f_update
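
Each of these optimizers returns a pair of compiled Theano functions: `f_grad_shared` runs the forward and backward pass on a minibatch and caches the gradients in shared variables, and `f_update` applies the cached gradients to the parameters. A minimal driver sketch, assuming the functions were built with `inp = [x, x_mask, y, y_mask]`; the minibatch arrays and `lrate` here are assumed to be supplied by the caller, as in the training loops further below:

# hypothetical training step; x, x_mask, y, y_mask and lrate are assumed inputs
cost = f_grad_shared(x, x_mask, y, y_mask)  # forward/backward; gradients cached
f_update(lrate)                             # p <- p - lrate * cached gradient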
Code Example #2
def rmsprop(lr, tparams, grads, inp, cost, profile=False):
    zipped_grads = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    running_grads = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad' % k)
        for k, p in tparams.iteritems()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
        for k, p in tparams.iteritems()
    ]

    # cache raw gradients and running means of g and g**2
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp,
                                    cost,
                                    updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_updir' % k)
        for k, p in tparams.iteritems()
    ]
    # momentum (0.9) step scaled by the centered RMS of recent gradients
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg**2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(itemlist(tparams), updir_new)]
    f_update = theano.function([lr], [],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
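
This is the centered RMSProp variant popularized by Graves (2013): a momentum (0.9) step scaled by the RMS of recent gradients with the running mean subtracted. Note that the base step size 1e-4 is hard-coded and the `lr` argument is never used, which is why `f_update` is compiled with `on_unused_input='ignore'`. A scalar sketch of one update, with assumed values, to make the recurrences concrete:

g, rg, rg2, ud, p = 0.5, 0.0, 0.0, 0.0, 1.0                # assumed gradient/state
rg = 0.95 * rg + 0.05 * g                                  # running mean of g
rg2 = 0.95 * rg2 + 0.05 * g ** 2                           # running mean of g**2
ud = 0.9 * ud - 1e-4 * g / (rg2 - rg ** 2 + 1e-4) ** 0.5   # momentum step
p = p + ud                                                 # parameter update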
Code Example #3
def adadelta(lr, tparams, grads, inp, cost, profile=False):
    zipped_grads = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    running_up2 = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_rup2' % k)
        for k, p in tparams.iteritems()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
        for k, p in tparams.iteritems()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp,
                                    cost,
                                    updates=zgup + rg2up,
                                    profile=profile)

    # AdaDelta step: -sqrt(E[dx^2] + eps) / sqrt(E[g^2] + eps) * g
    updir = [
        -tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
        for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)
    ]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud**2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
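
This implements Zeiler's (2012) AdaDelta: each gradient is rescaled by the ratio of the RMS of past parameter updates to the RMS of past gradients, so no learning rate needs tuning; here too `lr` is accepted but unused (hence `on_unused_input='ignore'`). A scalar sketch of one update with rho=0.95 and eps=1e-6, on assumed values:

g, rg2, ru2, p = 0.5, 0.0, 0.0, 1.0                        # assumed gradient/state
rg2 = 0.95 * rg2 + 0.05 * g ** 2                           # accumulate E[g**2]
ud = -((ru2 + 1e-6) ** 0.5) / ((rg2 + 1e-6) ** 0.5) * g    # parameter delta
ru2 = 0.95 * ru2 + 0.05 * ud ** 2                          # accumulate E[dx**2]
p = p + ud                                                 # apply the update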
Code Example #4
File: nmt.py  Project: sohuren/DL4MT
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        factors=1,  # input factors
        dim_per_factor=None,  # list of word vector dimensionalities (one per factor): [250,200,50] for total dimensionality of 500
        encoder='gru',
        decoder='gru_cond',
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        map_decay_c=0.,  # L2 regularization penalty towards original weights
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=None,  # source vocabulary size
        n_words=None,  # target vocabulary size
        maxlen=100,  # maximum sentence length
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=('/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                  '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'),
        valid_datasets=('../data/dev/newstest2011.en.tok',
                        '../data/dev/newstest2011.fr.tok'),
        dictionaries=(
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'),
        use_dropout=False,
        dropout_embedding=0.2,  # dropout for input embeddings (0: no dropout)
        dropout_hidden=0.5,  # dropout for hidden layers (0: no dropout)
        dropout_source=0,  # dropout source words (0: no dropout)
        dropout_target=0,  # dropout target words (0: no dropout)
        reload_=False,
        overwrite=False,
        external_validation_script=None,
        shuffle_each_epoch=True,
        finetune=False,
        finetune_only_last=False,
        sort_by_length=True,
        use_domain_interpolation=False,
        domain_interpolation_min=0.1,
        domain_interpolation_inc=0.1,
        domain_interpolation_indomain_datasets=('indomain.en', 'indomain.fr'),
        maxibatch_size=20,  # how many minibatches to load at one time
        model_version=0.1,  # store version used for training, for compatibility
):

    # Model options
    model_options = locals().copy()

    if model_options['dim_per_factor'] is None:
        if factors == 1:
            model_options['dim_per_factor'] = [model_options['dim_word']]
        else:
            sys.stderr.write(
                'Error: if using factored input, you must specify \'dim_per_factor\'\n'
            )
            sys.exit(1)

    # one dictionary per source factor + 1 for target factor
    assert len(dictionaries) == factors + 1
    # each factor embedding has its own dimensionality
    assert len(model_options['dim_per_factor']) == factors
    # factor embedding dimensionalities must sum to the total input embedding size
    assert sum(model_options['dim_per_factor']) == model_options['dim_word']

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words is None:
        n_words = len(worddicts[1])
        model_options['n_words'] = n_words

    print('Loading data')
    domain_interpolation_cur = None
    if use_domain_interpolation:
        print(
            'Using domain interpolation with initial ratio %s, increase rate %s'
            % (domain_interpolation_min, domain_interpolation_inc))
        domain_interpolation_cur = domain_interpolation_min
        train = DomainInterpolatorTextIterator(
            datasets[0],
            datasets[1],
            dictionaries[:-1],
            dictionaries[-1],
            n_words_source=n_words_src,
            n_words_target=n_words,
            batch_size=batch_size,
            maxlen=maxlen,
            shuffle_each_epoch=shuffle_each_epoch,
            sort_by_length=sort_by_length,
            indomain_source=domain_interpolation_indomain_datasets[0],
            indomain_target=domain_interpolation_indomain_datasets[1],
            interpolation_rate=domain_interpolation_cur,
            maxibatch_size=maxibatch_size)
    else:
        train = TextIterator(datasets[0],
                             datasets[1],
                             dictionaries[:-1],
                             dictionaries[-1],
                             n_words_source=n_words_src,
                             n_words_target=n_words,
                             batch_size=batch_size,
                             maxlen=maxlen,
                             skip_empty=True,
                             shuffle_each_epoch=shuffle_each_epoch,
                             sort_by_length=sort_by_length,
                             maxibatch_size=maxibatch_size)

    if valid_datasets and validFreq:
        valid = TextIterator(valid_datasets[0],
                             valid_datasets[1],
                             dictionaries[:-1],
                             dictionaries[-1],
                             n_words_source=n_words_src,
                             n_words_target=n_words,
                             batch_size=valid_batch_size,
                             maxlen=maxlen)
    else:
        valid = None

    comp_start = time.time()

    print('Building model')
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print('Reloading model parameters')
        params = load_params(saveto, params)

    tparams = init_theano_params(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)

    inps = [x, x_mask, y, y_mask]

    if validFreq or sampleFreq:
        print('Building sampler')
        f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # before any regularizer
    print('Building f_log_probs...', end=' ')
    f_log_probs = theano.function(inps, cost, profile=profile)
    print('Done')

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                    name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value)**2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay

    # allow finetuning with fixed embeddings
    if finetune:
        updated_params = OrderedDict([(key, value)
                                      for (key, value) in tparams.iteritems()
                                      if not key.startswith('Wemb')])
    # allow finetuning of only the last layer (becomes a linear model training problem)
    elif finetune_only_last:
        updated_params = OrderedDict([(key, value)
                                      for (key, value) in tparams.iteritems()
                                      if key in ['ff_logit_W', 'ff_logit_b']])
    else:
        updated_params = tparams

    print('Computing gradient...', end=' ')
    grads = tensor.grad(cost, wrt=itemlist(updated_params))
    print('Done')

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')

    print('Building optimizers...', end=' ')
    f_grad_shared, f_update = eval(optimizer)(lr,
                                              updated_params,
                                              grads,
                                              inps,
                                              cost,
                                              profile=profile)
    print('Done')

    print('Total compilation time: {0:.1f}s'.format(time.time() - comp_start))

    print('Optimization')

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    # save model options
    json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    valid_err = None

    last_disp_samples = 0
    ud_start = time.time()
    p_validation = None
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            last_disp_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # ensure consistency in number of factors
            if len(x) and len(x[0]) and len(x[0][0]) != factors:
                sys.stderr.write(
                    'Error: mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'
                    .format(factors, len(x[0][0])))
                sys.exit(1)

            x, x_mask, y, y_mask = prepare_data(
                x, y, maxlen=maxlen
            )  # n_words_src=n_words_src, n_words=n_words) # TODO: why unused??

            if x is None:
                print('Minibatch with zero sample under length ', maxlen)
                uidx -= 1
                continue

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud = time.time() - ud_start
                wps = (last_disp_samples) / float(ud)
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ',
                      ud, "{0:.2f} sentences/s".format(wps))
                ud_start = time.time()
                last_disp_samples = 0

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print('Saving the best model...', end=' ')
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto,
                            history_errs=history_errs,
                            uidx=uidx,
                            **params)
                print('Done')

                # save with uidx
                if not overwrite:
                    print('Saving the model at iteration {}...'.format(uidx), end=' ')
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx,
                                history_errs=history_errs,
                                uidx=uidx,
                                **unzip_from_theano(tparams))
                    print('Done')

            # generate some samples with the model and display them
            if sampleFreq and numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[2])):
                    stochastic = True
                    x_current = x[:, :, jj][:, :, None]

                    # remove padding
                    x_current = x_current[:, :x_mask[:, jj].sum(), :]

                    sample, score, sample_word_probs, alignment, hyp_graph = gen_sample(
                        [f_init], [f_next],
                        x_current,
                        trng=trng,
                        k=1,
                        maxlen=30,
                        stochastic=stochastic,
                        argmax=False,
                        suppress_unk=False,
                        return_hyp_graph=False)
                    print('Source ', jj, ': ', end='')
                    for pos in range(x.shape[1]):
                        if x[0, pos, jj] == 0:
                            break
                        for factor in range(factors):
                            vv = x[factor, pos, jj]
                            if vv in worddicts_r[factor]:
                                sys.stdout.write(worddicts_r[factor][vv])
                            else:
                                sys.stdout.write('UNK')
                            if factor + 1 < factors:
                                sys.stdout.write('|')
                            else:
                                sys.stdout.write(' ')
                    print()
                    print('Truth ', jj, ' : ', end='')
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print(worddicts_r[-1][vv], end=' ')
                        else:
                            print('UNK', end=' ')
                    print()
                    print('Sample ', jj, ': ', end='')
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print(worddicts_r[-1][vv], end=' ')
                        else:
                            print('UNK', end=' ')
                    print()

            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                                   model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        if use_domain_interpolation and (
                                domain_interpolation_cur < 1.0):
                            domain_interpolation_cur = min(
                                domain_interpolation_cur +
                                domain_interpolation_inc, 1.0)
                            print(
                                'No progress on the validation set, increasing domain interpolation rate to %s and resuming from best params'
                                % domain_interpolation_cur)
                            train.adjust_domain_interpolation_rate(
                                domain_interpolation_cur)
                            if best_p is not None:
                                zip_to_theano(best_p, tparams)
                            bad_counter = 0
                        else:
                            print('Early Stop!')
                            estop = True
                            break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print('Valid ', valid_err)

                if external_validation_script:
                    print("Calling external validation script")
                    if p_validation is not None and p_validation.poll() is None:
                        print("Waiting for previous validation run to finish")
                        print(
                            "If this takes too long, consider increasing validation interval, reducing validation set size, or speeding up validation by using multiple processes"
                        )
                        valid_wait_start = time.time()
                        p_validation.wait()
                        print("Waited for {0:.1f} seconds".format(
                            time.time() - valid_wait_start))
                    print('Saving model...', end=' ')
                    params = unzip_from_theano(tparams)
                    numpy.savez(saveto + '.dev',
                                history_errs=history_errs,
                                uidx=uidx,
                                **params)
                    json.dump(model_options,
                              open('%s.dev.npz.json' % saveto, 'wb'),
                              indent=2)
                    print('Done')
                    p_validation = Popen([external_validation_script])

            # finish after this many updates
            if uidx >= finish_after:
                print('Finishing after %d iterations!' % uidx)
                estop = True
                break

        print('Seen %d samples' % n_samples)

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                           model_options, valid)
        valid_err = valid_errs.mean()

        print('Valid ', valid_err)

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
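
A hypothetical invocation of this entry point; every path and hyperparameter below is a placeholder, not a value taken from the project:

valid_err = train(
    dim_word=256,
    dim=512,
    datasets=('train.en.tok', 'train.fr.tok'),              # placeholder corpora
    valid_datasets=('dev.en.tok', 'dev.fr.tok'),
    dictionaries=('train.en.tok.pkl', 'train.fr.tok.pkl'),
    optimizer='adadelta',
    batch_size=80,
    maxlen=50,
    reload_=True,
    use_dropout=True)

With the default factors=1, `dictionaries` must hold factors + 1 = 2 entries (one source dictionary, one target dictionary), and `dim_per_factor` defaults to `[dim_word]`, satisfying the assertions at the top of the function.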
Code Example #5
    def init(self, model_options):
        """Exposes: (but Pyro does not see them)
            self.f_init
            self.f_next
            self.f_log_probs
            self.f_grad_shared
            self.f_update
        """

        reload_ = model_options['reload_']
        saveto = model_options['saveto']
        decay_c = model_options['decay_c']
        alpha_c = model_options['alpha_c']
        map_decay_c = model_options['map_decay_c']
        finetune = model_options['finetune']
        finetune_only_last = model_options['finetune_only_last']
        clip_c = model_options['clip_c']
        optimizer = model_options['optimizer']

        comp_start = time.time()

        print 'Building model'
        params = init_params(model_options)
        # reload parameters
        if reload_ and os.path.exists(saveto):
            print 'Reloading model parameters'
            params = load_params(saveto, params)

        self.tparams = init_theano_params(params)

        trng, self.use_noise, x, x_mask, y, y_mask, opt_ret, per_sent_neg_log_prob = build_model(
            self.tparams, model_options)

        inps = [x, x_mask, y, y_mask]

        self.f_init, self.f_next = build_sampler(self.tparams, model_options,
                                                 self.use_noise, trng)

        # before any regularizer
        print 'Building f_log_probs...',
        self.f_log_probs = theano.function(inps,
                                           per_sent_neg_log_prob,
                                           profile=profile)
        print 'Done'

        # apply per-sentence weight to cost_vec before averaging
        per_sent_weight = tensor.vector('per_sent_weight', dtype='float32')
        per_sent_weight.tag.test_value = numpy.ones(10).astype('float32')
        cost = (per_sent_neg_log_prob *
                per_sent_weight).mean()  # mean of elem-wise multiply

        # apply L2 regularization on weights
        if decay_c > 0.:
            decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
            weight_decay = 0.
            for kk, vv in self.tparams.iteritems():
                weight_decay += (vv**2).sum()
            weight_decay *= decay_c
            cost += weight_decay

        # regularize the alpha weights
        if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
            alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
            alpha_reg = alpha_c * ((
                tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None]
                - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
            cost += alpha_reg

        # apply L2 regularisation to loaded model (map training)
        if map_decay_c > 0:
            map_decay_c = theano.shared(numpy.float32(map_decay_c),
                                        name="map_decay_c")
            weight_map_decay = 0.
            for kk, vv in self.tparams.iteritems():
                init_value = theano.shared(vv.get_value(), name=kk + "_init")
                weight_map_decay += ((vv - init_value)**2).sum()
            weight_map_decay *= map_decay_c
            cost += weight_map_decay

        # allow finetuning with fixed embeddings
        if finetune:
            updated_params = OrderedDict([
                (key, value) for (key, value) in self.tparams.iteritems()
                if not key.startswith('Wemb')
            ])
        elif finetune_only_last:  # allow finetuning of only last layer (becomes a linear model training problem)
            updated_params = OrderedDict([
                (key, value) for (key, value) in self.tparams.iteritems()
                if key in ['ff_logit_W', 'ff_logit_b']
            ])
        else:
            updated_params = self.tparams

        print 'Computing gradient...',
        grads = tensor.grad(cost, wrt=itemlist(updated_params))
        print 'Done'

        # apply gradient clipping here
        if clip_c > 0.:
            g2 = 0.
            for g in grads:
                g2 += (g**2).sum()
            new_grads = []
            for g in grads:
                new_grads.append(
                    tensor.switch(g2 > (clip_c**2),
                                  g / tensor.sqrt(g2) * clip_c, g))
            grads = new_grads

        # compile the optimizer, the actual computational graph is compiled here
        lr = tensor.scalar(name='lr')

        print 'Building optimizers...',
        op_map = {
            'adam': optimizers.adam,
            'adadelta': optimizers.adadelta,
            'rmsprop': optimizers.rmsprop,
            'sgd': optimizers.sgd
        }
        inps = inps + [
            per_sent_weight,
        ]
        self.f_grad_shared, self.f_update = op_map[optimizer](
            lr,
            updated_params,
            grads,
            inps,
            per_sent_neg_log_prob,
            profile=profile)
        print 'Done'

        print 'Total compilation time: {0:.1f}s'.format(time.time() -
                                                        comp_start)
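
Because `per_sent_weight` is appended to `inps` before the optimizer is compiled, the resulting `f_grad_shared` takes one extra vector argument relative to the plain training loops above, and returns the per-sentence negative log probabilities. A minimal sketch of a weighted update step, assuming the batch arrays come from `prepare_data`, every sentence is weighted 1.0, and `lrate` is supplied by the caller:

# hypothetical weighted step; x, x_mask, y, y_mask and lrate are assumed inputs
psw = numpy.ones(y.shape[1], dtype='float32')        # one weight per sentence
per_sent_cost = self.f_grad_shared(x, x_mask, y, y_mask, psw)
self.f_update(lrate)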
Code Example #6
def train(
    dim_word=100,  # word vector dimensionality
    dim=1000,  # the number of LSTM units
    patience=10,  # early stopping patience
    max_epochs=5000,
    finish_after=10000000,  # finish after this many updates
    dispFreq=100,
    decay_c=0.,  # L2 regularization penalty
    map_decay_c=0., # L2 regularization penalty towards original weights
    alpha_c=0.,  # alignment regularization
    clip_c=-1.,  # gradient clipping threshold
    lrate=0.01,  # learning rate
    n_words_src=None,  # source vocabulary size
    n_words_tgt=None,  # target vocabulary size
    maxlen=100,  # maximum sentence length
    optimizer='rmsprop',
    batch_size=16,
    valid_batch_size=16,
    saveto='model.npz',
    validFreq=1000,
    saveFreq=1000,   # save the parameters after every saveFreq updates
    sampleFreq=100,   # generate some samples after every sampleFreq
    datasets=[
        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
    valid_datasets=['../data/dev/newstest2011.en.tok',
                    '../data/dev/newstest2011.fr.tok'],
    dictionaries=[
        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
    use_dropout=False,
    dropout_embedding=0.2, # dropout for input embeddings (0: no dropout)
    dropout_hidden=0.5, # dropout for hidden layers (0: no dropout)
    dropout_source=0, # dropout source words (0: no dropout)
    dropout_target=0, # dropout target words (0: no dropout)
    reload_=False,
    overwrite=False,
    external_validation_script=None,
    shuffle_each_epoch=True,
    sort_by_length=True,
    maxibatch_size=20,  # how many minibatches to load at one time
    model_version=0.1
    ):
    # capture all local arguments as the model options
    model_options = locals().copy()
    print 'Model options:', model_options
    
    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        worddicts[ii] = load_dict(dd)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk
    
    # if the vocabulary sizes are not set, default to the dictionary sizes
    if n_words_src is None:
        n_words_src = len(worddicts[0])
        model_options['n_words_src'] = n_words_src
    if n_words_tgt is None:
        n_words_tgt = len(worddicts[1])
        model_options['n_words_tgt'] = n_words_tgt
    
    # load data
    print 'Loading data ...'
    train = TextIterator(datasets[0], datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words_tgt,
                         batch_size=batch_size,
                         maxlen=maxlen,
                         shuffle_each_epoch=shuffle_each_epoch,
                         sort_by_length=sort_by_length,
                         maxibatch_size=maxibatch_size)
    valid = TextIterator(valid_datasets[0], valid_datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src, n_words_target=n_words_tgt,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)
    
    # initialize model parameters
    print 'Init parameters ...'
    params = init_params(model_options)

    # reload the model so training can resume after an unexpected interruption
    if reload_ and os.path.exists(saveto):
        print 'Reloading model parameters'
        params = load_params(saveto, params)
    
    # make the network weights W and b Theano shared variables
    tparams = init_theano_params(params)

    # build the model
    print 'Building model ...'

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = build_model(tparams, model_options)

    inps = [x, x_mask, y, y_mask]

    # build the sampler
    if validFreq or sampleFreq:
        print 'Building sampler ...'
        f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # apply L2 regularisation to loaded model (map training)
    if map_decay_c > 0:
        map_decay_c = theano.shared(numpy.float32(map_decay_c), name="map_decay_c")
        weight_map_decay = 0.
        for kk, vv in tparams.iteritems():
            init_value = theano.shared(vv.get_value(), name=kk + "_init")
            weight_map_decay += ((vv - init_value) ** 2).sum()
        weight_map_decay *= map_decay_c
        cost += weight_map_decay
    
    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    f_alpha = theano.function(inps, opt_ret['dec_alphas']) # alphas
    print 'Done'
    
    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'
    
    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c**2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, profile=profile)
    print 'Done'
    
    # start optimization
    print 'Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    valid_err = None

    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)
            # prepare the minibatch for training
            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words_tgt)
            # x is None when no sentence in the minibatch survives the maxlen filter
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)
            
            # plot the word alignment matrix (debugging code, disabled)
            #print f_alpha(x, x_mask, y, y_mask).shape
            """
            x_word = [worddicts_r[0][idx] for idx in x[:,0]]
            y_word = [worddicts_r[1][idx] for idx in y[:,0]]
            print len(x_word), x_word
            print len(y_word), y_word
            shape = f_alpha(x, x_mask, y, y_mask).shape
            for i in range(shape[1]):
                # print sum(f_alpha(x, x_mask, y, y_mask)[i,0,:])
                mx = sum(y_mask[:,i])
                my = sum(x_mask[:,i])
                align_matrix = f_alpha(x, x_mask, y, y_mask)[:,i,:][0:mx,0:my]
                align_shape = align_matrix.shape
                scale_ = 20 # image scaling factor
                out_matrix = numpy.ones([scale_*align_shape[0],scale_*align_shape[1]])
                for j in range(align_shape[0]):
                    for k in range(align_shape[1]):
                        out_matrix[j*scale_:(j+1)*scale_,k*scale_:(k+1)*scale_] *= align_matrix[j,k]
                
                plt.imshow(100*out_matrix, plt.cm.gray)
                plt.pause(1)
                
            plt.show()
            sys.exit(0)
            """
            
            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip_from_theano(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
                json.dump(model_options, open('%s.json' % saveto, 'wb'), indent=2)
                print 'Done'

                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip_from_theano(tparams))
                    print 'Done'


            # generate some samples with the model and display them
            
            if sampleFreq and numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score, sample_word_probs, alignment = gen_sample([f_init], [f_next],
                                               x[:, jj][:, None],
                                               trng=trng, k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False,
                                               suppress_unk=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:,jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print worddicts_r[-1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[-1]:
                            print worddicts_r[-1][vv],
                        else:
                            print 'UNK',
                    print
            
            # validate model on validation set and early stop if necessary
            if valid and validFreq and numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip_from_theano(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

                if external_validation_script:
                    print "Calling external validation script"
                    print 'Saving model...',
                    params = unzip_from_theano(tparams)
                    # uidx is also saved at each validation, for resuming
                    numpy.savez(saveto + '.dev', history_errs=history_errs, uidx=uidx, **params)
                    json.dump(model_options, open('%s.dev.npz.json' % saveto, 'wb'), indent=2)
                    print 'Done'
                    p = Popen([external_validation_script])

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zip_to_theano(best_p, tparams)

    if valid:
        use_noise.set_value(0.)
        valid_errs, alignment = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
        valid_err = valid_errs.mean()

        print 'Valid ', valid_err

    if best_p is not None:
        params = copy.copy(best_p)
    else:
        params = unzip_from_theano(tparams)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
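
The gradient clipping used in these training loops rescales the whole gradient to norm clip_c whenever its global norm exceeds clip_c, leaving the direction unchanged. A small numpy sketch of the same rule on made-up values:

import numpy
grads = [numpy.array([3.0, 4.0])]                  # hypothetical gradient, global norm 5
clip_c = 1.0
g2 = sum((g ** 2).sum() for g in grads)            # squared global norm
if g2 > clip_c ** 2:
    grads = [g / numpy.sqrt(g2) * clip_c for g in grads]  # rescale to norm clip_c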