from collections import OrderedDict

import theano
import theano.tensor as T
from theano.ifelse import ifelse


class TrainingMixin(object):
    def get_sgd_updates(self, X_sym, y_sym, params, cost,
                        learning_rate, momentum):
        # Plain SGD with momentum.
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        for n, (param, gparam) in enumerate(zip(params, gparams)):
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - learning_rate * gparam
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step
        return updates

    def _norm_constraint(self, param, update_step, max_col_norm):
        # Rescale columns of 2D weight matrices whose post-step norm would
        # exceed max_col_norm; other parameters pass through unchanged.
        stepped_param = param + update_step
        if param.get_value(borrow=True).ndim == 2:
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, max_col_norm)
            scale = desired_norms / (1e-7 + col_norms)
            new_param = param * scale
            new_update_step = update_step * scale
        else:
            new_param = param
            new_update_step = update_step
        return new_param, new_update_step

    def get_clip_sgd_updates(self, X_sym, y_sym, params, cost,
                             learning_rate, momentum, rescale=5.):
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        # Gradient clipping: rescale the whole gradient when its L2 norm
        # exceeds `rescale`, and zero out non-finite gradients.
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)

        for n, (param, gparam) in enumerate(zip(params, gparams)):
            # Clip the gradient directly, not the momentum-smoothed step.
            gparam = T.switch(not_finite, 0.1 * param,
                              gparam * (scaling_num / scaling_den))
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - learning_rate * gparam
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step
        return updates

    def get_clip_rmsprop_updates(self, X_sym, y_sym, params, cost,
                                 learning_rate, momentum, rescale=5.):
        gparams = T.grad(cost, params)
        updates = OrderedDict()

        if not hasattr(self, "running_square_"):
            self.running_square_ = [0.] * len(gparams)
            self.running_avg_ = [0.] * len(gparams)
            self.updates_storage_ = [0.] * len(gparams)

        if not hasattr(self, "momentum_velocity_"):
            self.momentum_velocity_ = [0.] * len(gparams)

        # Gradient clipping, as in get_clip_sgd_updates.
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), gparams)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)

        for n, (param, gparam) in enumerate(zip(params, gparams)):
            gparam = T.switch(not_finite, 0.1 * param,
                              gparam * (scaling_num / scaling_den))
            combination_coeff = 0.9
            minimum_grad = 1e-4
            # Running estimates of the squared gradient and the gradient,
            # used to normalize the step (RMSProp with momentum).
            old_square = self.running_square_[n]
            new_square = combination_coeff * old_square + (
                1. - combination_coeff) * T.sqr(gparam)
            old_avg = self.running_avg_[n]
            new_avg = combination_coeff * old_avg + (
                1. - combination_coeff) * gparam
            rms_grad = T.sqrt(new_square - new_avg ** 2)
            rms_grad = T.maximum(rms_grad, minimum_grad)
            velocity = self.momentum_velocity_[n]
            update_step = momentum * velocity - learning_rate * (
                gparam / rms_grad)
            self.running_square_[n] = new_square
            self.running_avg_[n] = new_avg
            self.updates_storage_[n] = update_step
            self.momentum_velocity_[n] = update_step
            updates[param] = param + update_step
        return updates

    def get_sfg_updates(self, X_sym, y_sym, params, cost,
                        learning_rate, momentum):
        gparams = T.grad(cost, params)
        updates = OrderedDict()
        from sfg import SFG

        if not hasattr(self, "sfg_"):
            # Shared counter used to alternate between fast and slow updates.
            self.count_ = theano.shared(0)
            self.slow_freq_ = 20
            self.sfg_ = SFG(params, gparams)

        slow_updates, fast_updates = self.sfg_.updates(
            learning_rate, momentum, epsilon=0.0001, momentum_clipping=None)

        for param in slow_updates.keys():
            # Apply the slow update every slow_freq_ steps, the fast update
            # otherwise.
            updates[param] = ifelse(T.eq(self.count_, self.slow_freq_ - 1),
                                    slow_updates[param], fast_updates[param])
        updates[self.count_] = T.mod(self.count_ + 1, self.slow_freq_)
        return updates
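# Usage sketch (not part of the original code): a minimal, hypothetical
# linear model showing how a class that mixes in TrainingMixin would build
# its update rules and compile a training function. The class name
# `TinyRegressor` and its internals are assumptions for illustration; only
# the TrainingMixin call is taken from the class above.
import numpy as np


class TinyRegressor(TrainingMixin):
    def __init__(self, n_in, n_out, learning_rate=0.01, momentum=0.9):
        X_sym = T.matrix('X')
        y_sym = T.matrix('y')
        W = theano.shared(np.zeros((n_in, n_out),
                                   dtype=theano.config.floatX))
        b = theano.shared(np.zeros((n_out,), dtype=theano.config.floatX))
        params = [W, b]
        cost = T.mean(T.sqr(T.dot(X_sym, W) + b - y_sym))
        # Any of the get_*_updates methods could be used here; each returns
        # an OrderedDict mapping shared variables to their new values.
        updates = self.get_clip_sgd_updates(X_sym, y_sym, params, cost,
                                            learning_rate, momentum)
        self.fit_function = theano.function([X_sym, y_sym], cost,
                                            updates=updates)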