# Imports assumed by the methods below. The module paths follow the
# NeuPy/Theano codebase these snippets come from and may differ by version.
import numpy as np
import theano
import theano.tensor as T
from theano.ifelse import ifelse

from neupy.utils import asfloat
from neupy.algorithms.utils import (count_parameters, iter_parameters,
                                    parameters2vector,
                                    setup_parameter_updates)
from neupy.algorithms.gd.hessian import find_hessian_and_gradient
from neupy.algorithms.gd.lev_marq import compute_jaccobian


def init_param_updates(self, layer, parameter):
    epoch = self.variables.epoch
    prev_first_moment = parameter.prev_first_moment
    prev_weighted_inf_norm = parameter.prev_weighted_inf_norm

    step = self.variables.step
    beta1 = self.beta1
    beta2 = self.beta2

    # Track the full Hessian of the error function alongside the
    # regular Adamax updates.
    n_parameters = count_parameters(self)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian_inverse')
    parameters = list(iter_parameters(self))
    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    gradient = T.grad(self.variables.error_func, wrt=parameter)

    # Moving average of the gradient and exponentially weighted
    # infinity norm of past gradients (Adamax).
    first_moment = beta1 * prev_first_moment + (1 - beta1) * gradient
    weighted_inf_norm = T.maximum(beta2 * prev_weighted_inf_norm,
                                  T.abs_(gradient))

    # Bias-corrected parameter delta.
    parameter_delta = (
        (1 / (1 - beta1 ** epoch)) *
        (first_moment / (weighted_inf_norm + self.epsilon))
    )

    return [
        (prev_first_moment, first_moment),
        (prev_weighted_inf_norm, weighted_inf_norm),
        (parameter, parameter - step * parameter_delta),
        (self.variables.hessian, hessian_matrix),
    ]
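
# A minimal NumPy sketch of the plain Adamax update that the Theano graph
# above builds, with the Hessian tracking omitted. The function name and the
# default hyperparameter values are illustrative assumptions, not part of
# the class above.
def adamax_step(param, grad, first_moment, weighted_inf_norm, epoch,
                step=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Moving average of the gradient.
    first_moment = beta1 * first_moment + (1 - beta1) * grad
    # Exponentially weighted infinity norm of past gradients.
    weighted_inf_norm = np.maximum(beta2 * weighted_inf_norm, np.abs(grad))
    # Bias-corrected update, mirroring the parameter_delta expression above.
    delta = (first_moment / (weighted_inf_norm + epsilon)) / (1 - beta1 ** epoch)
    return param - step * delta, first_moment, weighted_inf_norm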

def init_param_updates(self, layer, parameter):
    epoch = self.variables.epoch
    prev_first_moment = parameter.prev_first_moment
    prev_second_moment = parameter.prev_second_moment

    step = self.variables.step
    beta1 = asfloat(self.beta1)
    beta2 = asfloat(self.beta2)
    epsilon = asfloat(self.epsilon)

    gradient = T.grad(self.variables.error_func, wrt=parameter)

    # Track the full Hessian of the error function alongside the
    # regular Adam updates.
    n_parameters = count_parameters(self)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian_inverse')
    parameters = list(iter_parameters(self))
    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    # First and second moment estimates (Adam).
    first_moment = (beta1 * prev_first_moment
                    + asfloat(1. - beta1) * gradient)
    second_moment = (beta2 * prev_second_moment
                     + asfloat(1. - beta2) * gradient ** 2)

    # Bias correction compensates for the zero initialization
    # of the moment estimates.
    first_moment_bias_corrected = first_moment / (1. - beta1 ** epoch)
    second_moment_bias_corrected = second_moment / (1. - beta2 ** epoch)

    # Adam divides the corrected first moment by the root of the
    # corrected second moment.
    parameter_delta = first_moment_bias_corrected / (
        T.sqrt(second_moment_bias_corrected) + epsilon)

    return [
        (prev_first_moment, first_moment),
        (prev_second_moment, second_moment),
        (parameter, parameter - step * parameter_delta),
        (self.variables.hessian, hessian_matrix),
    ]
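
# A minimal NumPy sketch of the Adam step computed above, with the Hessian
# tracking omitted. The function name and default values are illustrative
# assumptions.
def adam_step(param, grad, first_moment, second_moment, epoch,
              step=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    first_moment = beta1 * first_moment + (1 - beta1) * grad
    second_moment = beta2 * second_moment + (1 - beta2) * grad ** 2
    # Bias-corrected moment estimates.
    first_corrected = first_moment / (1 - beta1 ** epoch)
    second_corrected = second_moment / (1 - beta2 ** epoch)
    new_param = param - step * first_corrected / (np.sqrt(second_corrected)
                                                  + epsilon)
    return new_param, first_moment, second_moment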

def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    # Increase mu when the error went up, decrease it otherwise.
    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    mse_for_each_sample = T.mean(
        (network_output - prediction_func) ** 2,
        axis=1)

    params = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    # Track the full Hessian of the error function alongside the
    # Levenberg-Marquardt updates.
    n_parameters = count_parameters(self)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian_inverse')
    parameters = list(iter_parameters(self))
    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    # Damped Gauss-Newton step: (J^T J + mu * I)^(-1) J^T r
    J = compute_jaccobian(mse_for_each_sample, params)
    n_params = J.shape[1]
    updated_params = param_vector - T.nlinalg.matrix_inverse(
        J.T.dot(J) + new_mu * T.eye(n_params)).dot(
        J.T).dot(mse_for_each_sample)

    updates = [
        (mu, new_mu),
        (self.variables.hessian, hessian_matrix),
    ]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
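
# A minimal NumPy sketch of the damped Gauss-Newton step that the Theano
# expression above assembles. Using np.linalg.solve instead of an explicit
# matrix inverse is a numerically safer equivalent; the function name is an
# illustrative assumption.
def levenberg_marquardt_step(param_vector, jacobian, residuals, mu):
    n_params = jacobian.shape[1]
    # (J^T J + mu * I) x = J^T r, solved instead of inverted.
    damped_hessian = jacobian.T.dot(jacobian) + mu * np.eye(n_params)
    return param_vector - np.linalg.solve(damped_hessian,
                                          jacobian.T.dot(residuals))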

def init_param_updates(self, layer, parameter):
    # Track the full Hessian of the error function alongside the
    # regular RMSProp updates.
    n_parameters = count_parameters(self)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian_inverse')
    parameters = list(iter_parameters(self))
    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    prev_mean_squred_grad = parameter.prev_mean_squred_grad
    step = self.variables.step

    gradient = T.grad(self.variables.error_func, wrt=parameter)

    # Moving average of the squared gradient scales each step (RMSProp).
    mean_squred_grad = (self.decay * prev_mean_squred_grad
                        + (1 - self.decay) * gradient ** 2)
    parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

    return [
        (prev_mean_squred_grad, mean_squred_grad),
        (parameter, parameter - step * parameter_delta),
        (self.variables.hessian, hessian_matrix),
    ]
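
# A minimal NumPy sketch of the RMSProp step above, with the Hessian
# tracking omitted. The function name and default values are illustrative
# assumptions.
def rmsprop_step(param, grad, mean_squared_grad,
                 step=0.001, decay=0.95, epsilon=1e-7):
    # Moving average of the squared gradient.
    mean_squared_grad = decay * mean_squared_grad + (1 - decay) * grad ** 2
    delta = grad / np.sqrt(mean_squared_grad + epsilon)
    return param - step * delta, mean_squared_grad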