# The snippets below are `init_train_updates` methods from different
# Theano-based training algorithms. They assume the usual imports:
#
#   import numpy as np
#   import theano
#   import theano.tensor as T
#   from theano.ifelse import ifelse
#   from theano.tensor import slinalg
#
# plus the library's internal helpers (iter_parameters, parameters2vector,
# count_parameters, parameter_values, setup_parameter_updates,
# compute_jaccobian / compute_jacobian, find_hessian_and_gradient,
# line_search, asfloat).

def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    # Increase mu when the error got worse compared to the previous
    # epoch, otherwise decrease it (assuming mu_update_factor > 1).
    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    mse_for_each_sample = T.mean(
        (network_output - prediction_func) ** 2,
        axis=1
    )

    params = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    J = compute_jaccobian(mse_for_each_sample, params)
    n_params = J.shape[1]

    # Damped Gauss-Newton step: (J^T J + mu * I)^-1 J^T e
    updated_params = param_vector - T.nlinalg.matrix_inverse(
        J.T.dot(J) + new_mu * T.eye(n_params)
    ).dot(J.T).dot(mse_for_each_sample)

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
def init_train_updates(self):
    step = self.variables.step
    min_eigval = self.min_eigval
    parameters = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    second_derivatives = []
    for parameter, gradient in zip(parameters, gradients):
        second_derivative = T.grad(gradient.sum(), wrt=parameter)
        second_derivatives.append(second_derivative.flatten())

    hessian_diag = T.concatenate(second_derivatives)
    # Replace diagonal values whose magnitude falls below min_eigval
    # with +/- min_eigval (keeping the sign), so that the division
    # below stays numerically stable.
    hessian_diag = T.switch(
        T.abs_(hessian_diag) < min_eigval,
        T.switch(
            hessian_diag < 0,
            -min_eigval,
            min_eigval,
        ),
        hessian_diag
    )

    # Dividing the gradient by the Hessian diagonal elementwise is the
    # same as inverting the diagonal Hessian (taking the reciprocal of
    # each diagonal element) and multiplying it by the gradient. This
    # form is less obvious, but it runs faster.
    updated_parameters = (
        param_vector - step * full_gradient / hessian_diag
    )
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
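A quick numerical check of the comment above, as a self-contained NumPy sketch (the vectors are made up for illustration): dividing by the diagonal elementwise gives the same result as inverting the full diagonal matrix and multiplying by the gradient.

import numpy as np

gradient = np.array([0.4, -1.2, 0.7])
hessian_diag = np.array([2.0, 0.5, 4.0])

# Elementwise division, as in the code above.
fast = gradient / hessian_diag

# Explicit inverse of the diagonal Hessian: same answer,
# but it builds an n-by-n matrix for no benefit.
slow = np.linalg.inv(np.diag(hessian_diag)).dot(gradient)

assert np.allclose(fast, slow)  # both give [0.2, -2.4, 0.175]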
def init_train_updates(self):
    step = self.variables.step
    previous_delta = self.variables.prev_delta
    previous_gradient = self.variables.prev_gradient

    n_parameters = count_parameters(self)
    parameters = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    beta = self.update_function(previous_gradient, full_gradient,
                                previous_delta)
    # Restart with a plain gradient step once every n_parameters
    # epochs; otherwise follow the conjugate direction.
    parameter_delta = ifelse(
        T.eq(T.mod(self.variables.epoch, n_parameters), 1),
        -full_gradient,
        -full_gradient + beta * previous_delta
    )
    updated_parameters = param_vector + step * parameter_delta

    updates = [
        (previous_gradient, full_gradient),
        (previous_delta, parameter_delta),
    ]
    parameter_updates = setup_parameter_updates(parameters,
                                                updated_parameters)
    updates.extend(parameter_updates)

    return updates
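`self.update_function` computes the conjugate coefficient beta from the previous gradient, the new gradient, and the previous update direction. As a minimal sketch, here is what a Fletcher-Reeves variant with that signature could look like; the function name and the epsilon guard are illustrative, not necessarily the library's exact code.

import theano.tensor as T

def fletcher_reeves(old_gradient, new_gradient, old_delta):
    # beta = ||g_new||^2 / ||g_old||^2; the small constant keeps the
    # division safe if the previous gradient happens to be zero.
    return (
        new_gradient.dot(new_gradient) /
        (old_gradient.dot(old_gradient) + 1e-7)
    )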
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    se_for_each_sample = (
        (network_output - prediction_func) ** 2
    ).ravel()

    params = parameter_values(self.connection)
    param_vector = parameters2vector(self)

    J = compute_jacobian(se_for_each_sample, params)
    n_params = J.shape[1]

    updated_params = param_vector - slinalg.solve(
        J.T.dot(J) + new_mu * T.eye(n_params),
        J.T.dot(se_for_each_sample)
    )

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
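Compared with the earlier Levenberg-Marquardt version, this one solves the damped normal equations with `slinalg.solve` instead of materializing the matrix inverse. A self-contained NumPy sketch of the same idea (shapes and values made up for illustration):

import numpy as np

rng = np.random.RandomState(0)
J = rng.randn(100, 10)          # Jacobian: 100 samples, 10 parameters
errors = rng.randn(100)
mu = 0.1

A = J.T.dot(J) + mu * np.eye(10)
b = J.T.dot(errors)

# Solving the linear system directly is faster and numerically more
# stable than forming the inverse and then multiplying.
step_solve = np.linalg.solve(A, b)
step_inverse = np.linalg.inv(A).dot(b)

assert np.allclose(step_solve, step_inverse)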
def init_train_updates(self):
    network_input = self.variables.network_input
    network_output = self.variables.network_output
    inv_hessian = self.variables.inv_hessian
    prev_params = self.variables.prev_params
    prev_full_gradient = self.variables.prev_full_gradient

    params = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=params)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    new_inv_hessian = ifelse(
        T.eq(self.variables.epoch, 1),
        inv_hessian,
        self.update_function(inv_hessian,
                             param_vector - prev_params,
                             full_gradient - prev_full_gradient)
    )
    param_delta = -new_inv_hessian.dot(full_gradient)

    def prediction(step):
        # TODO: I need to update this ugly solution later
        updated_params = param_vector + step * param_delta

        layer_input = network_input
        start_pos = 0

        for layer in self.layers:
            for param in layer.parameters:
                end_pos = start_pos + param.size
                parameter_name, parameter_id = param.name.split('_')
                setattr(layer, parameter_name, T.reshape(
                    updated_params[start_pos:end_pos],
                    param.shape
                ))
                start_pos = end_pos

            layer_input = layer.output(layer_input)
        return layer_input

    def phi(step):
        return self.error(network_output, prediction(step))

    def derphi(step):
        error_func = self.error(network_output, prediction(step))
        return T.grad(error_func, wrt=step)

    step = asfloat(line_search(phi, derphi))
    updated_params = param_vector + step * param_delta

    updates = setup_parameter_updates(params, updated_params)
    updates.extend([
        (inv_hessian, new_inv_hessian),
        (prev_params, param_vector),
        (prev_full_gradient, full_gradient),
    ])

    return updates
def init_train_updates(self):
    n_parameters = count_parameters(self)
    parameters = list(iter_parameters(self))
    param_vector = parameters2vector(self)
    penalty_const = asfloat(self.penalty_const)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters
    )
    hessian_inverse = T.nlinalg.matrix_inverse(
        hessian_matrix + penalty_const * T.eye(n_parameters)
    )
    updated_parameters = param_vector - hessian_inverse.dot(full_gradient)
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
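To make the update above concrete, here is a worked toy example in NumPy: one regularized Newton step on a two-parameter quadratic (all numbers chosen for illustration).

import numpy as np

# Quadratic error: f(w) = 0.5 * w^T H w - b^T w, so grad = H w - b.
H = np.array([[2.0, 0.0],
              [0.0, 4.0]])
b = np.array([2.0, 4.0])
w = np.zeros(2)
penalty_const = 0.1

gradient = H.dot(w) - b                        # [-2, -4]
damped_hessian = H + penalty_const * np.eye(2)
w_new = w - np.linalg.solve(damped_hessian, gradient)

# Without the penalty the step would land exactly on the minimum [1, 1];
# the small penalty shrinks it slightly: ~[0.952, 0.976].
print(w_new)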
def init_train_updates(self):
    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = parameters2vector(self)
    penalty_const = asfloat(self.penalty_const)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters
    )
    updated_parameters = param_vector - slinalg.solve(
        hessian_matrix + penalty_const * T.eye(n_parameters),
        full_gradient
    )
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    mse_for_each_sample = T.mean(
        (network_output - prediction_func) ** 2,
        axis=1
    )

    params = list(iter_parameters(self))
    param_vector = parameters2vector(self)

    # Extra block: build a shared variable that stores the full Hessian
    # of the error function, so it can be read out after training.
    # Only the Hessian is used below; the gradient is discarded.
    n_parameters = count_parameters(self)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian'
    )
    parameters = list(iter_parameters(self))
    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters
    )

    J = compute_jaccobian(mse_for_each_sample, params)
    n_params = J.shape[1]

    updated_params = param_vector - T.nlinalg.matrix_inverse(
        J.T.dot(J) + new_mu * T.eye(n_params)
    ).dot(J.T).dot(mse_for_each_sample)

    # The update pairs must form a flat list of (variable, expression)
    # tuples; nesting a list inside it would break theano.function.
    updates = [
        (mu, new_mu),
        (self.variables.hessian, hessian_matrix),
    ]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
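After at least one training epoch, the stored Hessian can be read back from the shared variable. A brief usage sketch, assuming `lmnet` is an instance of the modified algorithm above (the variable name is illustrative):

# theano.shared values can be read out as NumPy arrays at any time.
hessian = lmnet.variables.hessian.get_value()
print(hessian.shape)  # (n_parameters, n_parameters)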
def init_train_updates(self):
    network_input = self.variables.network_input
    network_output = self.variables.network_output
    inv_hessian = self.variables.inv_hessian
    prev_params = self.variables.prev_params
    prev_full_gradient = self.variables.prev_full_gradient

    params = parameter_values(self.connection)
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=params)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    new_inv_hessian = ifelse(
        T.eq(self.variables.epoch, 1),
        inv_hessian,
        self.update_function(inv_hessian,
                             param_vector - prev_params,
                             full_gradient - prev_full_gradient)
    )
    param_delta = -new_inv_hessian.dot(full_gradient)
    layers_and_parameters = list(iter_parameters(self.layers))

    def prediction(step):
        updated_params = param_vector + step * param_delta

        # This trick allows us to replace shared variables
        # with Theano variables and get output from the network.
        start_pos = 0
        for layer, attrname, param in layers_and_parameters:
            end_pos = start_pos + param.size
            updated_param_value = T.reshape(
                updated_params[start_pos:end_pos],
                param.shape
            )
            setattr(layer, attrname, updated_param_value)
            start_pos = end_pos

        output = self.connection.output(network_input)

        # Put the shared-variable parameters back in place.
        for layer, attrname, param in layers_and_parameters:
            setattr(layer, attrname, param)

        return output

    def phi(step):
        return self.error(network_output, prediction(step))

    def derphi(step):
        error_func = self.error(network_output, prediction(step))
        return T.grad(error_func, wrt=step)

    step = asfloat(line_search(phi, derphi))
    updated_params = param_vector + step * param_delta

    updates = setup_parameter_updates(params, updated_params)
    updates.extend([
        (inv_hessian, new_inv_hessian),
        (prev_params, param_vector),
        (prev_full_gradient, full_gradient),
    ])

    return updates
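Here `self.update_function` receives the current inverse Hessian approximation together with the parameter and gradient differences. As a minimal sketch, this is what a textbook BFGS update with that signature looks like, written in NumPy for readability (not necessarily the library's exact implementation):

import numpy as np

def bfgs_update(inv_hessian, param_delta, gradient_delta):
    # Standard BFGS update of the inverse Hessian approximation:
    # H+ = (I - rho*s*y^T) H (I - rho*y*s^T) + rho*s*s^T
    # where s = param_delta, y = gradient_delta, rho = 1 / (y^T s).
    n = len(param_delta)
    identity = np.eye(n)
    rho = 1.0 / gradient_delta.dot(param_delta)
    left = identity - rho * np.outer(param_delta, gradient_delta)
    right = identity - rho * np.outer(gradient_delta, param_delta)
    return (left.dot(inv_hessian).dot(right) +
            rho * np.outer(param_delta, param_delta))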