Example #1
    def get_sfg_updates(self, X_sym, y_sym, params, cost,
                        learning_rate, momentum):
        gparams = T.grad(cost, params)
        updates = OrderedDict()
        from sfg import SFG
        from theano.ifelse import ifelse
        if not hasattr(self, "sfg_"):
            # Shared step counter and period for the slow/fast update scheme.
            self.count_ = theano.shared(0)
            self.slow_freq_ = 20
            self.sfg_ = SFG(params, gparams)

        slow_updates, fast_updates = self.sfg_.updates(learning_rate,
                                                       momentum,
                                                       epsilon=0.0001,
                                                       momentum_clipping=None)
        # Apply the slow update on every slow_freq_-th step, the fast update
        # on all other steps.
        for param in slow_updates.keys():
            updates[param] = ifelse(T.eq(self.count_, self.slow_freq_ - 1),
                                    slow_updates[param],
                                    fast_updates[param])
        updates[self.count_] = T.mod(self.count_ + 1, self.slow_freq_)
        return updates
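
The counter above applies the slow SFG update only on every slow_freq_-th call and the fast update otherwise. A minimal pure-Python sketch of that schedule (the values are hypothetical and not taken from the SFG module):

slow_freq = 20
count = 0
for step in range(1, 41):
    use_slow = (count == slow_freq - 1)      # same test as T.eq(count_, slow_freq_ - 1)
    count = (count + 1) % slow_freq          # same rollover as T.mod(count_ + 1, slow_freq_)
    print(step, "slow" if use_slow else "fast")
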
Example #2
from collections import OrderedDict

import theano
import theano.tensor as T


class TrainingMixin(object):
    def get_sgd_updates(self, X_sym, y_sym, params, cost, learning_rate,
                        momentum):
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        for n, (param, gparam) in enumerate(zip(params, gparams)):
            velocity = self.momentum_velocity_[n]
            # Classical momentum: velocity <- momentum * velocity - lr * grad,
            # then step the parameter along the velocity.
            update_step = momentum * velocity - learning_rate * gparam
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step

        return updates

    def _norm_constraint(self, param, update_step, max_col_norm):
        # Max column norm constraint: rescale columns of the stepped 2D
        # parameter so no column norm exceeds max_col_norm. The scaled param
        # and scaled step sum to the constrained stepped parameter.
        stepped_param = param + update_step
        if param.get_value(borrow=True).ndim == 2:
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, max_col_norm)
            scale = desired_norms / (1e-7 + col_norms)
            new_param = param * scale
            new_update_step = update_step * scale
        else:
            new_param = param
            new_update_step = update_step
        return new_param, new_update_step

    def get_clip_sgd_updates(self, X_sym, y_sym, params, cost, learning_rate,
                             momentum, rescale=5.):
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        # Gradient clipping: rescale the whole gradient so its global L2 norm
        # is at most `rescale`.
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)
        for n, (param, gparam) in enumerate(zip(params, gparams)):
            # Clip the raw gradient, not the momentum term; if the norm is not
            # finite, fall back to a small pseudo-gradient proportional to the
            # parameter itself.
            gparam = T.switch(not_finite, 0.1 * param,
                              gparam * (scaling_num / scaling_den))
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - learning_rate * gparam
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step
        return updates

    def get_clip_rmsprop_updates(self, X_sym, y_sym, params, cost,
                                 learning_rate, momentum, rescale=5.):
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        if not hasattr(self, "running_average_"):
            self.running_square_ = [0.] * len(gparams)
            self.running_avg_ = [0.] * len(gparams)
            self.updates_storage_ = [0.] * len(gparams)

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        # Gradient clipping, same scheme as in get_clip_sgd_updates.
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)
        for n, (param, gparam) in enumerate(zip(params, gparams)):
            gparam = T.switch(not_finite, 0.1 * param,
                              gparam * (scaling_num / scaling_den))
            # Centered running statistics: a running mean of squared gradients
            # and a running mean of gradients give an estimate of the
            # gradient's RMS, which normalises the step size.
            combination_coeff = 0.9
            minimum_grad = 1e-4
            old_square = self.running_square_[n]
            new_square = combination_coeff * old_square + (
                1. - combination_coeff) * T.sqr(gparam)
            old_avg = self.running_avg_[n]
            new_avg = combination_coeff * old_avg + (
                1. - combination_coeff) * gparam
            rms_grad = T.sqrt(new_square - new_avg ** 2)
            rms_grad = T.maximum(rms_grad, minimum_grad)
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - learning_rate * (
                gparam / rms_grad)
            self.running_square_[n] = new_square
            self.running_avg_[n] = new_avg
            self.updates_storage_[n] = update_step
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step

        return updates

    def get_sfg_updates(self, X_sym, y_sym, params, cost,
                        learning_rate, momentum):
        gparams = T.grad(cost, params)
        updates = OrderedDict()
        from sfg import SFG
        from theano.ifelse import ifelse
        if not hasattr(self, "sfg_"):
            # Shared step counter and period for the slow/fast update scheme.
            self.count_ = theano.shared(0)
            self.slow_freq_ = 20
            self.sfg_ = SFG(params, gparams)

        slow_updates, fast_updates = self.sfg_.updates(learning_rate,
                                                       momentum,
                                                       epsilon=0.0001,
                                                       momentum_clipping=None)
        # Apply the slow update on every slow_freq_-th step, the fast update
        # on all other steps.
        for param in slow_updates.keys():
            updates[param] = ifelse(T.eq(self.count_, self.slow_freq_ - 1),
                                    slow_updates[param],
                                    fast_updates[param])
        updates[self.count_] = T.mod(self.count_ + 1, self.slow_freq_)
        return updates
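
A small NumPy check of the rescaling used in get_clip_sgd_updates and get_clip_rmsprop_updates above: the gradient is scaled so its global L2 norm never exceeds rescale (the arrays below are made-up values for illustration).

import numpy as np

rescale = 5.0
grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]       # global norm = 13
grad_norm = np.sqrt(sum((g ** 2).sum() for g in grads))
scale = rescale / max(rescale, grad_norm)
clipped = [g * scale for g in grads]
print(np.sqrt(sum((g ** 2).sum() for g in clipped)))        # ~5.0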
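
Likewise, a one-parameter sketch of the centered running statistics in get_clip_rmsprop_updates: the running mean square and running mean of the gradient yield an RMS estimate that divides the step (the gradient values are hypothetical).

import numpy as np

combination_coeff, minimum_grad = 0.9, 1e-4
running_square = running_avg = 0.0
for gradient in [1.0, 0.5, -0.2, 0.8]:
    running_square = combination_coeff * running_square + (1. - combination_coeff) * gradient ** 2
    running_avg = combination_coeff * running_avg + (1. - combination_coeff) * gradient
    rms_grad = max(np.sqrt(running_square - running_avg ** 2), minimum_grad)
    print(gradient / rms_grad)       # normalised gradient used for the step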
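
Finally, a hypothetical usage sketch of TrainingMixin (LinearModel, its attributes, and the data setup are assumptions made for illustration, not part of the snippets above): a model mixes in the class and compiles a Theano training function from the returned updates dictionary.

import numpy as np
import theano
import theano.tensor as T

class LinearModel(TrainingMixin):
    # Hypothetical model: a single weight vector fit by squared error.
    def __init__(self, n_in):
        rng = np.random.RandomState(0)
        self.w = theano.shared(rng.randn(n_in).astype(theano.config.floatX))
        self.params = [self.w]

X_sym = T.matrix("X")
y_sym = T.vector("y")
model = LinearModel(n_in=3)
cost = T.mean((T.dot(X_sym, model.w) - y_sym) ** 2)
updates = model.get_sgd_updates(X_sym, y_sym, model.params, cost,
                                learning_rate=0.01, momentum=0.9)
train = theano.function([X_sym, y_sym], cost, updates=updates)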