def init_train_updates(self):
    step = self.variables.step
    previous_delta = self.variables.prev_delta
    previous_gradient = self.variables.prev_gradient

    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = T.concatenate([param.flatten() for param in parameters])

    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    beta = self.update_function(previous_gradient, full_gradient,
                                previous_delta)
    # Reset the search direction to the negative gradient every
    # ``n_parameters`` epochs, otherwise add the conjugate term.
    parameter_delta = ifelse(
        T.eq(T.mod(self.variables.epoch, n_parameters), 1),
        -full_gradient,
        -full_gradient + beta * previous_delta)

    updated_parameters = param_vector + step * parameter_delta

    updates = [
        (previous_gradient, full_gradient),
        (previous_delta, parameter_delta),
    ]
    parameter_updates = setup_parameter_updates(parameters,
                                                updated_parameters)
    updates.extend(parameter_updates)

    return updates
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    # When the error grew since the previous epoch, multiply ``mu``
    # by the update factor, otherwise divide it by the same factor.
    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    se_for_each_sample = (
        (network_output - prediction_func) ** 2
    ).ravel()

    params = parameter_values(self.connection)
    param_vector = parameters2vector(self)

    J = compute_jacobian(se_for_each_sample, params)
    n_params = J.shape[1]

    updated_params = param_vector - slinalg.solve(
        J.T.dot(J) + new_mu * T.eye(n_params),
        J.T.dot(se_for_each_sample)
    )

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = ifelse(
        T.lt(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    se_for_each_sample = ((network_output - prediction_func) ** 2).ravel()

    params = parameter_values(self.connection)
    param_vector = T.concatenate([param.flatten() for param in params])

    J = compute_jacobian(se_for_each_sample, params)
    n_params = J.shape[1]

    updated_params = param_vector - slinalg.solve(
        J.T.dot(J) + new_mu * T.eye(n_params),
        J.T.dot(se_for_each_sample))

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
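# The ``compute_jacobian`` helper used above is not shown here. Below is a
# minimal sketch of what it is assumed to compute: one Jacobian row per
# per-sample error, built from the gradients with respect to every
# parameter. The name ``compute_jacobian_sketch`` is hypothetical and the
# body is an assumption, not the library's exact implementation.
import theano
import theano.tensor as T

def compute_jacobian_sketch(errors, parameters):
    def jacobian_row(index, errs, *params):
        # Gradient of a single per-sample error, flattened into one row.
        gradients = T.grad(errs[index], wrt=list(params))
        return T.concatenate([gradient.flatten() for gradient in gradients])

    J, _ = theano.scan(
        jacobian_row,
        sequences=T.arange(errors.shape[0]),
        non_sequences=[errors] + list(parameters),
    )
    return J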
def init_train_updates(self):
    step = self.variables.step
    min_eigval = self.min_eigval
    parameters = parameter_values(self.connection)
    param_vector = T.concatenate([param.flatten() for param in parameters])

    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    second_derivatives = []
    for parameter, gradient in zip(parameters, gradients):
        second_derivative = T.grad(gradient.sum(), wrt=parameter)
        second_derivatives.append(second_derivative.flatten())

    hessian_diag = T.concatenate(second_derivatives)
    # Keep every diagonal element away from zero, preserving its sign.
    hessian_diag = T.switch(
        T.abs_(hessian_diag) < min_eigval,
        T.switch(
            hessian_diag < 0,
            -min_eigval,
            min_eigval,
        ),
        hessian_diag)

    # Dividing the gradient elementwise by the Hessian diagonal is the
    # same as inverting the diagonal Hessian (taking the reciprocal of
    # each diagonal element) and multiplying it by the gradient. This
    # operation is less explicit, but it works faster.
    updated_parameters = (param_vector -
                          step * full_gradient / hessian_diag)
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
def init_train_updates(self):
    step = self.variables.step
    inv_min_eigval = 1 / self.min_eigval
    parameters = parameter_values(self.connection)
    param_vector = make_single_vector(parameters)

    gradients = tf.gradients(self.variables.error_func, parameters)
    full_gradient = make_single_vector(gradients)

    second_derivatives = []
    for parameter, gradient in zip(parameters, gradients):
        second_derivative, = tf.gradients(gradient, parameter)
        second_derivatives.append(flatten(second_derivative))

    hessian_diag = tf.concat(second_derivatives, axis=0)

    # It's easier to clip the inverse Hessian rather than
    # the Hessian itself.
    inv_hessian_diag = tf.clip_by_value(
        # The inverse of a diagonal matrix is easy to compute
        # with an elementwise reciprocal.
        1 / hessian_diag,
        -inv_min_eigval,
        inv_min_eigval,
    )
    updates = setup_parameter_updates(
        parameters,
        param_vector - step * full_gradient * inv_hessian_diag)

    return updates
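# A rough NumPy illustration (with hypothetical values) of why the inverse
# of the Hessian diagonal is clipped instead of the diagonal itself:
# near-zero curvature would otherwise produce huge steps, while the
# clipping bounds every per-parameter scale by 1 / min_eigval.
import numpy as np

hessian_diag = np.array([2.0, 1e-8, -1e-8, 0.5])
inv_min_eigval = 1 / 0.01  # assuming min_eigval = 0.01

inv_hessian_diag = np.clip(1 / hessian_diag, -inv_min_eigval, inv_min_eigval)
# -> array([  0.5,  100., -100.,    2.])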
def init_train_updates(self):
    updates = super(LeakStepAdaptation, self).init_train_updates()

    alpha = self.alpha
    beta = self.beta
    leak_size = self.leak_size

    step = self.variables.step
    leak_average = self.variables.leak_average

    parameters = parameter_values(self.connection)
    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    leak_average_update = (
        (1 - leak_size) * leak_average + leak_size * full_gradient
    )
    new_step = step + alpha * step * (
        beta * leak_average_update.norm(L=2) - step
    )

    updates.extend([
        (leak_average, leak_average_update),
        (step, new_step),
    ])

    return updates
def init_train_updates(self):
    updates = super(LeakStepAdaptation, self).init_train_updates()

    alpha = self.alpha
    beta = self.beta
    leak_size = self.leak_size

    step = self.variables.step
    leak_average = self.variables.leak_average

    parameters = parameter_values(self.connection)
    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    leak_average_update = ((1 - leak_size) * leak_average +
                           leak_size * full_gradient)
    new_step = step + alpha * step * (
        beta * leak_average_update.norm(L=2) - step)

    updates.extend([
        (leak_average, leak_average_update),
        (step, new_step),
    ])

    return updates
def init_train_updates(self):
    network_output = self.variables.network_output
    prediction_func = self.variables.train_prediction_func
    last_error = self.variables.last_error
    error_func = self.variables.error_func
    mu = self.variables.mu

    new_mu = tf.where(
        tf.less(last_error, error_func),
        mu * self.mu_update_factor,
        mu / self.mu_update_factor,
    )

    err_for_each_sample = flatten((network_output - prediction_func) ** 2)

    params = parameter_values(self.connection)
    param_vector = make_single_vector(params)

    J = compute_jacobian(err_for_each_sample, params)
    J_T = tf.transpose(J)
    n_params = J.shape[1]

    parameter_update = tf.matrix_solve(
        tf.matmul(J_T, J) + new_mu * tf.eye(n_params.value),
        tf.matmul(J_T, tf.expand_dims(err_for_each_sample, 1))
    )
    updated_params = param_vector - flatten(parameter_update)

    updates = [(mu, new_mu)]
    parameter_updates = setup_parameter_updates(params, updated_params)
    updates.extend(parameter_updates)

    return updates
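# A tiny NumPy sketch (with made-up toy values) of the Levenberg-Marquardt
# step solved above: a large mu pushes the update towards a small gradient
# descent step, a small mu towards the Gauss-Newton step.
import numpy as np

J = np.array([[1.0, 2.0],
              [0.5, -1.0],
              [2.0, 0.0]])           # Jacobian of per-sample errors
errors = np.array([0.1, -0.3, 0.2])  # per-sample errors
mu = 0.01

delta = np.linalg.solve(J.T @ J + mu * np.eye(2), J.T @ errors)
updated_params = np.zeros(2) - delta  # current parameters assumed to be zero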
def init_train_updates(self):
    updates = super(LeakStepAdaptation, self).init_train_updates()

    alpha = asfloat(self.alpha)
    beta = asfloat(self.beta)
    leak_size = asfloat(self.leak_size)

    step = self.variables.step
    leak_average = self.variables.leak_average

    parameters = parameter_values(self.connection)
    gradients = tf.gradients(self.variables.error_func, parameters)
    full_gradient = tf.concat([flatten(grad) for grad in gradients], axis=0)

    leak_average_update = ((1 - leak_size) * leak_average +
                           leak_size * full_gradient)
    new_step = step + alpha * step * (
        beta * tf.norm(leak_average_update) - step)

    updates.extend([
        (leak_average, leak_average_update),
        (step, new_step),
    ])

    return updates
def init_train_updates(self):
    network_inputs = self.variables.network_inputs
    network_output = self.variables.network_output
    inv_hessian = self.variables.inv_hessian
    prev_params = self.variables.prev_params
    prev_full_gradient = self.variables.prev_full_gradient

    params = parameter_values(self.connection)
    param_vector = T.concatenate([param.flatten() for param in params])

    gradients = T.grad(self.variables.error_func, wrt=params)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    new_inv_hessian = ifelse(
        T.eq(self.variables.epoch, 1),
        inv_hessian,
        self.update_function(inv_hessian,
                             param_vector - prev_params,
                             full_gradient - prev_full_gradient))

    param_delta = -new_inv_hessian.dot(full_gradient)
    layers_and_parameters = list(iter_parameters(self.layers))

    def prediction(step):
        updated_params = param_vector + step * param_delta

        # This trick allows us to replace shared variables with Theano
        # expressions and read the network output for the candidate step.
        start_pos = 0
        for layer, attrname, param in layers_and_parameters:
            end_pos = start_pos + param.size
            updated_param_value = T.reshape(
                updated_params[start_pos:end_pos],
                param.shape)
            setattr(layer, attrname, updated_param_value)
            start_pos = end_pos

        output = self.connection.output(*network_inputs)

        # Restore the original shared variables as parameters.
        for layer, attrname, param in layers_and_parameters:
            setattr(layer, attrname, param)

        return output

    def phi(step):
        return self.error(network_output, prediction(step))

    def derphi(step):
        error_func = self.error(network_output, prediction(step))
        return T.grad(error_func, wrt=step)

    step = asfloat(line_search(phi, derphi))
    updated_params = param_vector + step * param_delta

    updates = setup_parameter_updates(params, updated_params)
    updates.extend([
        (inv_hessian, new_inv_hessian),
        (prev_params, param_vector),
        (prev_full_gradient, full_gradient),
    ])

    return updates
def init_train_updates(self):
    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = T.concatenate([param.flatten() for param in parameters])
    penalty_const = asfloat(self.penalty_const)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)
    # Regularize the Hessian with the penalty term before solving
    # for the Newton step.
    updated_parameters = param_vector - slinalg.solve(
        hessian_matrix + penalty_const * T.eye(n_parameters),
        full_gradient)
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
def init_train_updates(self):
    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = T.concatenate([param.flatten() for param in parameters])
    penalty_const = asfloat(self.penalty_const)

    print(n_parameters)
    self.variables.hessian = theano.shared(
        value=asfloat(np.zeros((n_parameters, n_parameters))),
        name='hessian_inverse')

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)
    updated_parameters = hessian_matrix

    updates = setup_parameter_updates([self.variables.hessian],
                                      updated_parameters)

    return updates
def init_train_updates(self):
    penalty_const = asfloat(self.penalty_const)

    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = make_single_vector(parameters)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters)

    parameter_update = tf.matrix_solve(
        hessian_matrix + penalty_const * tf.eye(n_parameters),
        tf.reshape(full_gradient, [-1, 1]))

    updated_parameters = param_vector - flatten(parameter_update)
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
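# A small NumPy sketch (hypothetical numbers) of the role of
# ``penalty_const`` above: adding it to the diagonal keeps the Hessian
# well conditioned, so the Newton step can be solved even when the raw
# Hessian is nearly singular.
import numpy as np

hessian = np.array([[2.0, 2.0],
                    [2.0, 2.0 + 1e-12]])  # almost singular
gradient = np.array([0.4, -0.2])
penalty_const = 0.1

newton_step = np.linalg.solve(hessian + penalty_const * np.eye(2), gradient)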
def init_train_updates(self):
    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = parameters2vector(self)
    penalty_const = asfloat(self.penalty_const)

    hessian_matrix, full_gradient = find_hessian_and_gradient(
        self.variables.error_func, parameters
    )
    updated_parameters = param_vector - slinalg.solve(
        hessian_matrix + penalty_const * T.eye(n_parameters),
        full_gradient
    )
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
def init_train_updates(self):
    step = self.variables.step
    previous_delta = self.variables.prev_delta
    previous_gradient = self.variables.prev_gradient

    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    beta = self.update_function(previous_gradient, full_gradient,
                                previous_delta)
    parameter_delta = ifelse(
        T.eq(T.mod(self.variables.epoch, n_parameters), 1),
        -full_gradient,
        -full_gradient + beta * previous_delta
    )
    updated_parameters = param_vector + step * parameter_delta

    updates = [(previous_gradient, full_gradient),
               (previous_delta, parameter_delta)]
    parameter_updates = setup_parameter_updates(parameters,
                                                updated_parameters)
    updates.extend(parameter_updates)

    return updates
def init_layer_updates(self, layer):
    """
    Initialize updates for the training function, in Theano format,
    that will be triggered after each training epoch for each layer.

    Parameters
    ----------
    layer : object
        Any layer that inherits from the layers.BaseLayer class.

    Returns
    -------
    list
        Updates that are acceptable by ``theano.function``. It should
        be a list that contains tuples with 2 elements. The first one
        is the parameter that will be updated after the epoch and the
        second one is the update rule for this parameter. For example,
        the parameter could be a layer's weight or bias.
    """
    updates = []

    for parameter in parameter_values(layer):
        updates.extend(self.init_param_updates(layer, parameter))

    updates.extend(layer.updates)
    return updates
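# A minimal sketch of the update format described in the docstring above,
# assuming a plain gradient descent rule. This particular
# ``init_param_updates`` body is illustrative only, not the library's
# exact implementation.
import theano.tensor as T

def init_param_updates(self, layer, parameter):
    step = self.variables.step
    gradient = T.grad(self.variables.error_func, wrt=parameter)
    # One (shared_variable, update_expression) tuple per parameter.
    return [(parameter, parameter - step * gradient)]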
def init_train_updates(self):
    step = self.variables.step
    min_eigval = self.min_eigval
    parameters = parameter_values(self.connection)
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=parameters)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    second_derivatives = []
    for parameter, gradient in zip(parameters, gradients):
        second_derivative = T.grad(gradient.sum(), wrt=parameter)
        second_derivatives.append(second_derivative.flatten())

    hessian_diag = T.concatenate(second_derivatives)
    hessian_diag = T.switch(
        T.abs_(hessian_diag) < min_eigval,
        T.switch(
            hessian_diag < 0,
            -min_eigval,
            min_eigval,
        ),
        hessian_diag
    )

    # Dividing the gradient elementwise by the Hessian diagonal is the
    # same as inverting the diagonal Hessian (taking the reciprocal of
    # each diagonal element) and multiplying it by the gradient. This
    # operation is less explicit, but it works faster.
    updated_parameters = (
        param_vector - step * full_gradient / hessian_diag
    )
    updates = setup_parameter_updates(parameters, updated_parameters)

    return updates
def init_train_updates(self):
    inv_hessian = self.variables.inv_hessian
    prev_params = self.variables.prev_params
    prev_full_gradient = self.variables.prev_full_gradient

    params = parameter_values(self.connection)
    param_vector = make_single_vector(params)

    gradients = tf.gradients(self.variables.error_func, params)
    full_gradient = make_single_vector(gradients)

    new_inv_hessian = tf.where(
        tf.equal(self.variables.epoch, 1),
        inv_hessian,
        self.update_function(inv_H=inv_hessian,
                             delta_w=param_vector - prev_params,
                             delta_grad=full_gradient - prev_full_gradient,
                             epsilon=self.epsilon))

    param_delta = -dot(new_inv_hessian, full_gradient)
    step = self.find_optimal_step(param_vector, param_delta)
    updated_params = param_vector + step * param_delta
    updates = setup_parameter_updates(params, updated_params)

    # We have to compute these values first, otherwise parallelization
    # in tensorflow can mix the update order and, for example, the
    # previous gradient can become equal to the current gradient value.
    # It happens because tensorflow tries to execute operations
    # in parallel.
    required_variables = [new_inv_hessian, param_vector, full_gradient]
    with tf.control_dependencies(required_variables):
        updates.extend([
            inv_hessian.assign(new_inv_hessian),
            prev_params.assign(param_vector),
            prev_full_gradient.assign(full_gradient),
        ])

    return updates
def init_train_updates(self):
    step = self.variables.step
    epoch = self.variables.epoch
    previous_delta = self.variables.prev_delta
    previous_gradient = self.variables.prev_gradient

    n_parameters = count_parameters(self.connection)
    parameters = parameter_values(self.connection)
    param_vector = make_single_vector(parameters)

    gradients = tf.gradients(self.variables.error_func, parameters)
    full_gradient = make_single_vector(gradients)

    beta = self.update_function(previous_gradient, full_gradient,
                                previous_delta, self.epsilon)

    parameter_delta = tf.where(
        tf.equal(tf.mod(epoch, n_parameters), 1),
        -full_gradient,
        -full_gradient + beta * previous_delta)

    step = self.find_optimal_step(param_vector, parameter_delta)
    updated_parameters = param_vector + step * parameter_delta
    updates = setup_parameter_updates(parameters, updated_parameters)

    # We have to compute these values first, otherwise parallelization
    # in tensorflow can mix the update order and, for example, the
    # previous gradient can become equal to the current gradient value.
    # It happens because tensorflow tries to execute operations
    # in parallel.
    with tf.control_dependencies([full_gradient, parameter_delta]):
        updates.extend([
            previous_gradient.assign(full_gradient),
            previous_delta.assign(parameter_delta),
        ])

    return updates
def init_train_updates(self):
    network_input = self.variables.network_input
    network_output = self.variables.network_output
    inv_hessian = self.variables.inv_hessian
    prev_params = self.variables.prev_params
    prev_full_gradient = self.variables.prev_full_gradient

    params = parameter_values(self.connection)
    param_vector = parameters2vector(self)

    gradients = T.grad(self.variables.error_func, wrt=params)
    full_gradient = T.concatenate([grad.flatten() for grad in gradients])

    new_inv_hessian = ifelse(
        T.eq(self.variables.epoch, 1),
        inv_hessian,
        self.update_function(inv_hessian,
                             param_vector - prev_params,
                             full_gradient - prev_full_gradient)
    )
    param_delta = -new_inv_hessian.dot(full_gradient)
    layers_and_parameters = list(iter_parameters(self.layers))

    def prediction(step):
        updated_params = param_vector + step * param_delta

        # This trick allows us to replace shared variables with Theano
        # expressions and read the network output for the candidate step.
        start_pos = 0
        for layer, attrname, param in layers_and_parameters:
            end_pos = start_pos + param.size
            updated_param_value = T.reshape(
                updated_params[start_pos:end_pos],
                param.shape
            )
            setattr(layer, attrname, updated_param_value)
            start_pos = end_pos

        output = self.connection.output(network_input)

        # Put the shared variables back in place of the parameters.
        for layer, attrname, param in layers_and_parameters:
            setattr(layer, attrname, param)

        return output

    def phi(step):
        return self.error(network_output, prediction(step))

    def derphi(step):
        error_func = self.error(network_output, prediction(step))
        return T.grad(error_func, wrt=step)

    step = asfloat(line_search(phi, derphi))
    updated_params = param_vector + step * param_delta

    updates = setup_parameter_updates(params, updated_params)
    updates.extend([
        (inv_hessian, new_inv_hessian),
        (prev_params, param_vector),
        (prev_full_gradient, full_gradient),
    ])

    return updates