def init_train_updates(self):
    step = self.variables.step
    inv_min_eigval = 1 / self.min_eigval
    parameters = parameter_values(self.connection)
    param_vector = make_single_vector(parameters)

    gradients = tf.gradients(self.variables.error_func, parameters)
    full_gradient = make_single_vector(gradients)

    second_derivatives = []
    for parameter, gradient in zip(parameters, gradients):
        second_derivative, = tf.gradients(gradient, parameter)
        second_derivatives.append(flatten(second_derivative))

    hessian_diag = tf.concat(second_derivatives, axis=0)

    # It's easier to clip the inverse Hessian rather than the Hessian itself.
    inv_hessian_diag = tf.clip_by_value(
        # The inverse of a diagonal matrix is easy to compute with
        # an elementwise inverse operation.
        1 / hessian_diag,
        -inv_min_eigval,
        inv_min_eigval,
    )

    updates = setup_parameter_updates(
        parameters,
        param_vector - step * full_gradient * inv_hessian_diag)

    return updates
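For reference, a minimal NumPy sketch of the same diagonal update rule, assuming the gradient `g` and the Hessian diagonal `h` have already been computed for a hypothetical parameter vector `w` (the names here are chosen for illustration only):

import numpy as np

def hessian_diag_step(w, g, h, step=1.0, min_eigval=1e-2):
    # Elementwise inverse of the Hessian diagonal, clipped so that
    # near-zero second derivatives cannot blow up the update.
    inv_min_eigval = 1 / min_eigval
    inv_h = np.clip(1 / h, -inv_min_eigval, inv_min_eigval)
    return w - step * g * inv_h

# Toy quadratic error 0.5 * sum(a * w ** 2): the gradient is a * w and the
# Hessian diagonal is a, so a full Newton step lands at the minimum.
a = np.array([4.0, 0.5, 9.0])
w = np.array([1.0, -2.0, 3.0])
print(hessian_diag_step(w, a * w, a))  # ~[0. 0. 0.]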
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = tf.where(
        tf.less(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    err_for_each_sample = flatten((network_output - prediction_func) ** 2)

    params = parameter_values(self.connection)
    param_vector = make_single_vector(params)

    J = compute_jacobian(err_for_each_sample, params)
    J_T = tf.transpose(J)
    n_params = J.shape[1]

    parameter_update = tf.matrix_solve(
        tf.matmul(J_T, J) + new_mu * tf.eye(n_params.value),
        tf.matmul(J_T, tf.expand_dims(err_for_each_sample, 1))
    )
    updated_params = param_vector - flatten(parameter_update)

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
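The Levenberg-Marquardt step itself, stripped of the TensorFlow bookkeeping, can be sketched in NumPy as follows, assuming a hypothetical Jacobian `J` of the per-sample errors `e` with respect to the parameter vector `w` is already available:

import numpy as np

def levenberg_marquardt_step(w, J, e, mu):
    # Solve (J^T J + mu * I) * delta = J^T e and step against delta.
    n_params = J.shape[1]
    A = J.T @ J + mu * np.eye(n_params)
    delta = np.linalg.solve(A, J.T @ e)
    return w - delta

A small `mu` makes the step close to a Gauss-Newton step, while a large `mu` shrinks it toward a short gradient-descent-like step, which is why `mu` grows when the error went up and shrinks when it went down.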
def find_hessian_and_gradient(error_function, parameters):
    """
    Compute the Hessian matrix and the gradient vector of the error
    function with respect to the network parameters.

    Parameters
    ----------
    error_function : Tensorflow variable
        Error function that will be differentiated.

    parameters : list of Tensorflow variables
        Neural network parameters (e.g. weights, biases).

    Returns
    -------
    tuple of Tensorflow variables
        Hessian matrix and gradient vector.
    """
    gradients = tf.gradients(error_function, parameters)
    full_gradient = make_single_vector(gradients)

    # Length of the flattened gradient, i.e. the total number of parameters.
    full_gradient_shape = tf.shape(full_gradient)
    n_samples = full_gradient_shape[0]

    def compute_gradient_per_value(index, result):
        gradients = tf.gradients(full_gradient[index], parameters)
        hessian = make_single_vector(gradients)
        return (index + 1, result.write(index, hessian))

    _, hessian = tf.while_loop(
        lambda index, _: index < n_samples,
        compute_gradient_per_value,
        [
            tf.constant(0, tf.int32),
            tf.TensorArray(tf.float32, size=n_samples),
        ])

    return hessian.stack(), full_gradient
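The while loop above differentiates every entry of the flattened gradient one more time, which is the row-by-row definition of the Hessian. A small NumPy sanity check of that idea, using finite differences of a known gradient (everything here is illustrative, not part of the library):

import numpy as np

def numeric_hessian(grad_fn, w, eps=1e-5):
    # Differentiate each gradient component once more, row by row,
    # mirroring what the tf.while_loop does symbolically.
    n = w.size
    H = np.zeros((n, n))
    for i in range(n):
        shift = np.zeros(n)
        shift[i] = eps
        H[i] = (grad_fn(w + shift) - grad_fn(w - shift)) / (2 * eps)
    return H

A = np.array([[3.0, 1.0], [1.0, 2.0]])
grad_fn = lambda w: A @ w  # gradient of 0.5 * w^T A w
print(numeric_hessian(grad_fn, np.array([1.0, -1.0])))  # ~A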
def init_train_updates(self):
    inv_hessian = self.variables.inv_hessian
    prev_params = self.variables.prev_params
    prev_full_gradient = self.variables.prev_full_gradient

    params = parameter_values(self.connection)
    param_vector = make_single_vector(params)

    gradients = tf.gradients(self.variables.error_func, params)
    full_gradient = make_single_vector(gradients)

    new_inv_hessian = tf.where(
        tf.equal(self.variables.epoch, 1),
        inv_hessian,
        self.update_function(
            inv_H=inv_hessian,
            delta_w=param_vector - prev_params,
            delta_grad=full_gradient - prev_full_gradient,
            epsilon=self.epsilon))

    param_delta = -dot(new_inv_hessian, full_gradient)
    step = self.find_optimal_step(param_vector, param_delta)
    updated_params = param_vector + step * param_delta
    updates = setup_parameter_updates(params, updated_params)

    # We have to compute these values first, otherwise parallelization
    # in TensorFlow can mix up the update order and, for example, the
    # previous gradient can end up equal to the current gradient. This
    # happens because TensorFlow tries to execute operations in parallel.
    required_variables = [new_inv_hessian, param_vector, full_gradient]
    with tf.control_dependencies(required_variables):
        updates.extend([
            inv_hessian.assign(new_inv_hessian),
            prev_params.assign(param_vector),
            prev_full_gradient.assign(full_gradient),
        ])

    return updates
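`self.update_function` is one of the quasi-Newton inverse Hessian update rules. As a point of reference, here is a NumPy sketch of the BFGS variant (only one of the rules the attribute may point to), with `epsilon` used purely as a hypothetical guard against a tiny denominator:

import numpy as np

def bfgs_inverse_update(inv_H, delta_w, delta_grad, epsilon=1e-7):
    # BFGS update of the inverse Hessian estimate:
    # H_inv <- (I - rho*s*y^T) H_inv (I - rho*y*s^T) + rho*s*s^T
    # with s = delta_w, y = delta_grad and rho = 1 / (y^T s).
    rho = 1.0 / np.maximum(delta_grad @ delta_w, epsilon)
    identity = np.eye(delta_w.size)
    V = identity - rho * np.outer(delta_w, delta_grad)
    return V @ inv_H @ V.T + rho * np.outer(delta_w, delta_w)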
def init_train_updates(self):
    step = self.variables.step
    epoch = self.variables.epoch
    previous_delta = self.variables.prev_delta
    previous_gradient = self.variables.prev_gradient

    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = make_single_vector(parameters)

    gradients = tf.gradients(self.variables.error_func, parameters)
    full_gradient = make_single_vector(gradients)

    beta = self.update_function(
        previous_gradient, full_gradient, previous_delta, self.epsilon)

    parameter_delta = tf.where(
        tf.equal(tf.mod(epoch, n_parameters), 1),
        -full_gradient,
        -full_gradient + beta * previous_delta)

    step = self.find_optimal_step(param_vector, parameter_delta)
    updated_parameters = param_vector + step * parameter_delta
    updates = setup_parameter_updates(parameters, updated_parameters)

    # We have to compute these values first, otherwise parallelization
    # in TensorFlow can mix up the update order and, for example, the
    # previous gradient can end up equal to the current gradient. This
    # happens because TensorFlow tries to execute operations in parallel.
    with tf.control_dependencies([full_gradient, parameter_delta]):
        updates.extend([
            previous_gradient.assign(full_gradient),
            previous_delta.assign(parameter_delta),
        ])

    return updates
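`self.update_function` here returns the conjugate-gradient coefficient beta. A minimal NumPy sketch of the whole direction update, using the Polak-Ribiere formula as one possible choice of beta and restarting with steepest descent once per `n_parameters` epochs, as in the code above:

import numpy as np

def conjgrad_direction(gradient, prev_gradient, prev_delta,
                       epoch, n_parameters, epsilon=1e-7):
    # Restart with plain steepest descent once every n_parameters epochs.
    if epoch % n_parameters == 1:
        return -gradient

    # Polak-Ribiere coefficient; other beta formulas are equally valid.
    beta = gradient @ (gradient - prev_gradient)
    beta /= (prev_gradient @ prev_gradient + epsilon)
    return -gradient + beta * prev_delta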
def init_train_updates(self):
    penalty_const = asfloat(self.penalty_const)

    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = make_single_vector(parameters)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    parameter_update = tf.matrix_solve(
        hessian_matrix + penalty_const * tf.eye(n_parameters),
        tf.reshape(full_gradient, [-1, 1]))

    updated_parameters = param_vector - flatten(parameter_update)
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
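Stripped down to NumPy, the regularized Newton step computed above looks like this, assuming the Hessian `H` and gradient `g` are already evaluated at the current parameter vector `w`:

import numpy as np

def newton_step(w, H, g, penalty_const=0.1):
    # Solve (H + penalty * I) * delta = g; the penalty keeps the linear
    # system solvable even when the Hessian is singular or badly scaled.
    delta = np.linalg.solve(H + penalty_const * np.eye(w.size), g)
    return w - delta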
def compute_gradient_per_value(index, result):
    # Loop body used inside compute_jacobian's tf.while_loop: `values` and
    # `parameters` come from the enclosing scope, and each iteration writes
    # one row of the Jacobian (the gradient of a single per-sample error
    # value with respect to all parameters).
    gradients = tf.gradients(values[index], parameters)
    full_gradient = make_single_vector(gradients)
    return (index + 1, result.write(index, full_gradient))
def compute_gradient_per_value(index, result):
    # Same loop body, but for the Hessian (see find_hessian_and_gradient
    # above): each iteration differentiates one component of the gradient
    # again, producing one row of the Hessian matrix.
    gradients = tf.gradients(full_gradient[index], parameters)
    hessian = make_single_vector(gradients)
    return (index + 1, result.write(index, hessian))