class LazyLearningMixin(SharedDocs):
    """
    Mixin for lazy learning Neural Network algorithms.

    Notes
    -----
    - Network uses lazy learning, which means that the network doesn't
      need iterative training. It just stores parameters and uses them
      to make predictions.

    Methods
    -------
    train(input_train, target_train)
        Network just stores all the information about the data and
        uses it for prediction.
    """
    step = WithdrawProperty()
    show_epoch = WithdrawProperty()
    shuffle_data = WithdrawProperty()
    train_end_signal = WithdrawProperty()
    epoch_end_signal = WithdrawProperty()

    def __init__(self, *args, **kwargs):
        self.input_train = None
        self.target_train = None
        super(LazyLearningMixin, self).__init__(*args, **kwargs)

    def train(self, input_train, target_train):
        if input_train.shape[0] != target_train.shape[0]:
            raise ValueError("Number of samples in the input and target "
                             "datasets is different")

        self.input_train = input_train
        self.target_train = target_train
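# Hedged usage sketch (an addition, not part of the original source): GRNN is
# assumed here as one of the NeuPy estimators built on lazy learning, so
# ``train`` only stores the data and all of the work happens in ``predict``.
import numpy as np
from neupy import algorithms

x_train = np.array([[1.0], [2.0], [3.0]])
y_train = np.array([[1.0], [4.0], [9.0]])

grnn = algorithms.GRNN(std=0.5, verbose=False)
grnn.train(x_train, y_train)            # no iterative optimization, data is stored
print(grnn.predict(np.array([[2.5]])))  # prediction uses the stored samples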
class ConjugateGradient(WolfeLineSearchForStep, BaseOptimizer):
    """
    Conjugate Gradient algorithm.

    Parameters
    ----------
    update_function : ``fletcher_reeves``, ``polak_ribiere``,\
    ``hentenes_stiefel``, ``dai_yuan``, ``liu_storey``
        Update function. Defaults to ``fletcher_reeves``.

    epsilon : float
        Ensures computational stability during the division in the
        ``update_function`` when the denominator is a very small number.
        Defaults to ``1e-7``.

    {WolfeLineSearchForStep.Parameters}
    {BaseOptimizer.network}
    {BaseOptimizer.loss}
    {BaseOptimizer.show_epoch}
    {BaseOptimizer.shuffle_data}
    {BaseOptimizer.signals}
    {BaseOptimizer.verbose}
    {BaseOptimizer.regularizer}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Examples
    --------
    >>> from sklearn import datasets, preprocessing
    >>> from sklearn.model_selection import train_test_split
    >>> from neupy import algorithms, layers
    >>>
    >>> dataset = datasets.load_boston()
    >>> data, target = dataset.data, dataset.target
    >>>
    >>> data_scaler = preprocessing.MinMaxScaler()
    >>> target_scaler = preprocessing.MinMaxScaler()
    >>>
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     data_scaler.fit_transform(data),
    ...     target_scaler.fit_transform(target),
    ...     test_size=0.15
    ... )
    >>>
    >>> cgnet = algorithms.ConjugateGradient(
    ...     network=[
    ...         layers.Input(13),
    ...         layers.Sigmoid(50),
    ...         layers.Sigmoid(1),
    ...     ],
    ...     update_function='fletcher_reeves',
    ...     verbose=False
    ... )
    >>>
    >>> cgnet.train(x_train, y_train, epochs=100)
    >>> y_predict = cgnet.predict(x_test).round(1)
    >>>
    >>> real = target_scaler.inverse_transform(y_test)
    >>> predicted = target_scaler.inverse_transform(y_predict)

    References
    ----------
    [1] Jorge Nocedal, Stephen J. Wright, Numerical Optimization.
        Chapter 5, Conjugate Gradient Methods, p. 101-133
    """
    epsilon = NumberProperty(default=1e-7, minval=0)
    update_function = ChoiceProperty(
        default='fletcher_reeves',
        choices={
            'fletcher_reeves': fletcher_reeves,
            'polak_ribiere': polak_ribiere,
            'hentenes_stiefel': hentenes_stiefel,
            'liu_storey': liu_storey,
            'dai_yuan': dai_yuan,
        }
    )
    step = WithdrawProperty()

    def init_functions(self):
        n_parameters = self.network.n_parameters

        self.variables.update(
            prev_delta=tf.Variable(
                tf.zeros([n_parameters]),
                name="conj-grad/prev-delta",
                dtype=tf.float32,
            ),
            prev_gradient=tf.Variable(
                tf.zeros([n_parameters]),
                name="conj-grad/prev-gradient",
                dtype=tf.float32,
            ),
            iteration=tf.Variable(
                asfloat(self.last_epoch),
                name='conj-grad/current-iteration',
                dtype=tf.float32
            ),
        )
        super(ConjugateGradient, self).init_functions()

    def init_train_updates(self):
        iteration = self.variables.iteration
        previous_delta = self.variables.prev_delta
        previous_gradient = self.variables.prev_gradient

        n_parameters = self.network.n_parameters
        variables = self.network.variables
        parameters = [var for var in variables.values() if var.trainable]
        param_vector = make_single_vector(parameters)

        gradients = tf.gradients(self.variables.loss, parameters)
        full_gradient = make_single_vector(gradients)

        beta = self.update_function(
            previous_gradient, full_gradient, previous_delta, self.epsilon)

        parameter_delta = tf.where(
            tf.equal(tf.mod(iteration, n_parameters), 0),
            -full_gradient,
            -full_gradient + beta * previous_delta
        )

        step = self.find_optimal_step(param_vector, parameter_delta)
        updated_parameters = param_vector + step * parameter_delta
        updates = setup_parameter_updates(parameters, updated_parameters)

        # We have to compute these values first, otherwise
        # parallelization in tensorflow can mix the update order
        # and, for example, the previous gradient can become equal
        # to the current gradient value. It happens because
        # tensorflow tries to execute operations in parallel.
        with tf.control_dependencies([full_gradient, parameter_delta]):
            updates.extend([
                previous_gradient.assign(full_gradient),
                previous_delta.assign(parameter_delta),
                iteration.assign(iteration + 1),
            ])

        return updates
class QuasiNewton(StepSelectionBuiltIn, GradientDescent):
    """
    Quasi-Newton algorithm optimization.

    Parameters
    ----------
    update_function : {{'bfgs', 'dfp', 'psb', 'sr1'}}
        Update function. Defaults to ``bfgs``.

    h0_scale : float
        Default Hessian matrix is an identity matrix. The ``h0_scale``
        parameter scales the identity matrix. Defaults to ``1``.

    {GradientDescent.connection}
    {GradientDescent.error}
    {GradientDescent.show_epoch}
    {GradientDescent.shuffle_data}
    {GradientDescent.epoch_end_signal}
    {GradientDescent.train_end_signal}
    {GradientDescent.verbose}
    {GradientDescent.addons}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs'
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    update_function = ChoiceProperty(default='bfgs', choices={
        'bfgs': bfgs,
        'dfp': dfp,
        'psb': psb,
        'sr1': sr1,
    })
    h0_scale = NumberProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_variables(self):
        super(QuasiNewton, self).init_variables()
        n_params = count_parameters(self.connection)

        self.variables.update(
            inv_hessian=theano.shared(
                name='algo:quasi-newton/matrix:inv-hessian',
                value=asfloat(self.h0_scale * np.eye(int(n_params))),
            ),
            prev_params=theano.shared(
                name='algo:quasi-newton/vector:prev-params',
                value=asfloat(np.zeros(n_params)),
            ),
            prev_full_gradient=theano.shared(
                name='algo:quasi-newton/vector:prev-full-gradient',
                value=asfloat(np.zeros(n_params)),
            ),
        )

    def init_train_updates(self):
        network_inputs = self.variables.network_inputs
        network_output = self.variables.network_output

        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = parameter_values(self.connection)
        param_vector = T.concatenate([param.flatten() for param in params])

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1),
            inv_hessian,
            self.update_function(inv_hessian,
                                 param_vector - prev_params,
                                 full_gradient - prev_full_gradient))

        param_delta = -new_inv_hessian.dot(full_gradient)
        layers_and_parameters = list(iter_parameters(self.layers))

        def prediction(step):
            updated_params = param_vector + step * param_delta

            # This trick allows us to replace shared variables
            # with theano variables and get output from the network.
            start_pos = 0
            for layer, attrname, param in layers_and_parameters:
                end_pos = start_pos + param.size
                updated_param_value = T.reshape(
                    updated_params[start_pos:end_pos],
                    param.shape
                )
                setattr(layer, attrname, updated_param_value)
                start_pos = end_pos

            output = self.connection.output(*network_inputs)

            # Restore previous parameters
            for layer, attrname, param in layers_and_parameters:
                setattr(layer, attrname, param)

            return output

        def phi(step):
            return self.error(network_output, prediction(step))

        def derphi(step):
            error_func = self.error(network_output, prediction(step))
            return T.grad(error_func, wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta

        updates = setup_parameter_updates(params, updated_params)
        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates
class LevenbergMarquardt(StepSelectionBuiltIn, BaseGradientDescent):
    """
    The Levenberg-Marquardt algorithm is a variation of Newton's method.
    It minimizes the MSE error. The algorithm approximates the Hessian
    matrix using the dot product between two jacobian matrices.

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    - Network minimizes only the Mean Squared Error (MSE) loss function.

    - Efficient for small training datasets, because it computes the
      gradient per each sample separately.

    - Efficient for small-sized networks.

    Parameters
    ----------
    {BaseGradientDescent.connection}

    mu : float
        Damping term that controls the inversion of the ``J.T * J``
        matrix. Defaults to ``0.01``.

    mu_update_factor : float
        Factor that decreases ``mu`` when an update decreases the error,
        and increases ``mu`` by the same factor otherwise.
        Defaults to ``1.2``.

    error : {{``mse``}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.

    {BaseGradientDescent.show_epoch}
    {BaseGradientDescent.shuffle_data}
    {BaseGradientDescent.epoch_end_signal}
    {BaseGradientDescent.train_end_signal}
    {BaseGradientDescent.verbose}
    {BaseGradientDescent.addons}

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> lmnet = algorithms.LevenbergMarquardt((2, 3, 1))
    >>> lmnet.train(x_train, y_train)

    See Also
    --------
    :network:`BaseGradientDescent` : BaseGradientDescent algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=1.2, minval=1)
    error = ChoiceProperty(default='mse', choices={'mse': errors.mse})

    step = WithdrawProperty()

    def init_variables(self):
        super(LevenbergMarquardt, self).init_variables()
        self.variables.update(
            mu=tf.Variable(self.mu, name='lev-marq/mu'),
            last_error=tf.Variable(np.nan, name='lev-marq/last-error'),
        )

    def init_train_updates(self):
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = tf.where(
            tf.less(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        err_for_each_sample = flatten((network_output - prediction_func) ** 2)

        params = parameter_values(self.connection)
        param_vector = make_single_vector(params)

        J = compute_jacobian(err_for_each_sample, params)
        J_T = tf.transpose(J)
        n_params = J.shape[1]

        parameter_update = tf.matrix_solve(
            tf.matmul(J_T, J) + new_mu * tf.eye(n_params.value),
            tf.matmul(J_T, tf.expand_dims(err_for_each_sample, 1))
        )
        updated_params = param_vector - flatten(parameter_update)

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates

    def on_epoch_start_update(self, epoch):
        super(LevenbergMarquardt, self).on_epoch_start_update(epoch)

        last_error = self.errors.last()
        if last_error is not None:
            self.variables.last_error.load(last_error, tensorflow_session())
class B(A):
    prop = WithdrawProperty()
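# A minimal plain-Python analogue (an assumption, not the actual NeuPy
# implementation): WithdrawProperty is used when a subclass wants to remove
# an option that the parent class exposes, as in the ``B(A)`` example above.
class WithdrawnOption(object):
    def __set_name__(self, owner, name):
        self.name = name

    def __get__(self, instance, owner):
        raise AttributeError(
            "Option '{}' was withdrawn in {}".format(self.name, owner.__name__))


class Parent(object):
    step = 0.1  # configurable in the parent class


class Child(Parent):
    step = WithdrawnOption()  # no longer available on the child class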
class QuasiNewton(WolfeLineSearchForStep, BaseGradientDescent):
    """
    Quasi-Newton algorithm. On every iteration the quasi-Newton method
    approximates the inverse Hessian matrix with iterative updates. It
    doesn't have a ``step`` parameter. Instead, the algorithm applies a
    line search for the step value that satisfies the strong Wolfe
    condition. Parameters that control the Wolfe search start with the
    ``wolfe_`` prefix.

    Parameters
    ----------
    update_function : ``bfgs``, ``dfp``, ``sr1``
        Update function for the iterative inverse hessian matrix
        approximation. Defaults to ``bfgs``.

        - ``bfgs`` - A rank 2 formula update. It can suffer from
          round-off error and inaccurate line searches.

        - ``dfp`` - DFP is a method very similar to BFGS. It's a rank 2
          formula update. It can suffer from round-off error and
          inaccurate line searches.

        - ``sr1`` - Symmetric rank 1 (SR1). Generates an update for the
          inverse hessian matrix by adding a symmetric rank-1 matrix.
          It's possible that there is no rank 1 update for the matrix;
          in this case the update won't be applied and the original
          inverse hessian will be returned.

    h0_scale : float
        Default Hessian matrix is an identity matrix. The ``h0_scale``
        parameter scales the identity matrix. Defaults to ``1``.

    epsilon : float
        Controls numerical stability for the ``update_function``
        parameter. Defaults to ``1e-7``.

    {WolfeLineSearchForStep.Parameters}
    {BaseGradientDescent.connection}
    {BaseGradientDescent.error}
    {BaseGradientDescent.show_epoch}
    {BaseGradientDescent.shuffle_data}
    {BaseGradientDescent.epoch_end_signal}
    {BaseGradientDescent.train_end_signal}
    {BaseGradientDescent.verbose}
    {BaseGradientDescent.addons}

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs'
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    References
    ----------
    [1] Yang Ding, Enkeleida Lushi, Qingguo Li,
        Investigation of quasi-Newton methods for unconstrained
        optimization.
        http://people.math.sfu.ca/~elushi/project_833.pdf

    [2] Jorge Nocedal, Stephen J. Wright, Numerical Optimization.
        Chapter 6, Quasi-Newton Methods, p. 135-163
    """
    update_function = ChoiceProperty(default='bfgs', choices={
        'bfgs': bfgs,
        'dfp': dfp,
        'sr1': sr1,
    })
    epsilon = NumberProperty(default=1e-7, minval=0)
    h0_scale = NumberProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_variables(self):
        super(QuasiNewton, self).init_variables()
        n_parameters = count_parameters(self.connection)

        self.variables.update(
            inv_hessian=tf.Variable(
                asfloat(self.h0_scale) * tf.eye(n_parameters),
                name="quasi-newton/inv-hessian",
                dtype=tf.float32,
            ),
            prev_params=tf.Variable(
                tf.zeros([n_parameters]),
                name="quasi-newton/prev-params",
                dtype=tf.float32,
            ),
            prev_full_gradient=tf.Variable(
                tf.zeros([n_parameters]),
                name="quasi-newton/prev-full-gradient",
                dtype=tf.float32,
            ),
        )

    def init_train_updates(self):
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = parameter_values(self.connection)
        param_vector = make_single_vector(params)

        gradients = tf.gradients(self.variables.error_func, params)
        full_gradient = make_single_vector(gradients)

        new_inv_hessian = tf.where(
            tf.equal(self.variables.epoch, 1),
            inv_hessian,
            self.update_function(
                inv_H=inv_hessian,
                delta_w=param_vector - prev_params,
                delta_grad=full_gradient - prev_full_gradient,
                epsilon=self.epsilon))

        param_delta = -dot(new_inv_hessian, full_gradient)
        step = self.find_optimal_step(param_vector, param_delta)
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        # We have to compute these values first, otherwise
        # parallelization in tensorflow can mix the update order
        # and, for example, the previous gradient can become equal
        # to the current gradient value. It happens because
        # tensorflow tries to execute operations in parallel.
        required_variables = [new_inv_hessian, param_vector, full_gradient]
        with tf.control_dependencies(required_variables):
            updates.extend([
                inv_hessian.assign(new_inv_hessian),
                prev_params.assign(param_vector),
                prev_full_gradient.assign(full_gradient),
            ])

        return updates
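# Hedged NumPy sketch of the BFGS inverse-hessian update that the
# ``update_function='bfgs'`` option refers to; the actual ``bfgs`` helper in
# NeuPy may handle the ``epsilon`` safeguard differently.
import numpy as np

def bfgs_inv_hessian_update(inv_H, delta_w, delta_grad, epsilon=1e-7):
    # delta_w = w_new - w_old, delta_grad = grad_new - grad_old
    n = len(delta_w)
    denominator = np.dot(delta_grad, delta_w)
    rho = 1.0 / max(denominator, epsilon)  # guards against division by ~0

    identity = np.eye(n)
    left = identity - rho * np.outer(delta_w, delta_grad)
    right = identity - rho * np.outer(delta_grad, delta_w)

    # Rank-2 update: new_inv_H = left * inv_H * right + rho * s * s^T
    return left.dot(inv_H).dot(right) + rho * np.outer(delta_w, delta_w)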
class Hessian(BaseOptimizer):
    """
    Hessian gradient descent optimization, also known as Newton's method.
    This algorithm uses the second-order derivative (hessian matrix) in
    order to choose the correct step during the training iteration.
    Because of this, the method doesn't have a ``step`` parameter.

    Parameters
    ----------
    penalty_const : float
        Inverse hessian could be a singular matrix. For this reason, the
        algorithm includes a penalty that adds the identity matrix,
        multiplied by the defined constant, to the hessian matrix.
        Defaults to ``1``.

    {BaseOptimizer.network}
    {BaseOptimizer.loss}
    {BaseOptimizer.regularizer}
    {BaseOptimizer.show_epoch}
    {BaseOptimizer.shuffle_data}
    {BaseOptimizer.signals}
    {BaseOptimizer.verbose}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Notes
    -----
    - Method requires all training data during propagation, which means
      it cannot be trained with mini-batches.

    - This method calculates the full hessian matrix, which means it
      will compute a matrix with NxN parameters, where N = number of
      parameters in the network.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.Hessian(network)
    >>> optimizer.train(x_train, y_train)

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_train_updates(self):
        penalty_const = asfloat(self.penalty_const)
        n_parameters = self.network.n_parameters

        variables = self.network.variables
        parameters = [var for var in variables.values() if var.trainable]
        param_vector = make_single_vector(parameters)

        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.loss, parameters
        )
        parameter_update = tf.matrix_solve(
            hessian_matrix + penalty_const * tf.eye(n_parameters),
            tf.reshape(full_gradient, [-1, 1])
        )
        updated_parameters = param_vector - flatten(parameter_update)
        updates = setup_parameter_updates(parameters, updated_parameters)

        return updates
class Deconvolution(Convolution):
    """
    Deconvolution layer. It's commonly called like this in the
    literature, but it's just the gradient of the convolution and not an
    actual deconvolution.

    Parameters
    ----------
    {Convolution.size}
    {Convolution.padding}
    {Convolution.stride}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Shape of the weight will be equal to
        ``(filter rows, filter columns, output channels, input channels)``.
        Default initialization methods can be found
        :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}
    {BaseLayer.Parameters}

    Methods
    -------
    {ParameterBasedLayer.Methods}

    Examples
    --------
    >>> from neupy import layers
    >>>
    >>> layers.join(
    ...     layers.Input((28, 28, 3)),
    ...     layers.Convolution((3, 3, 16)),
    ...     layers.Deconvolution((3, 3, 1)),
    ... )

    Attributes
    ----------
    {ParameterBasedLayer.Attributes}
    """
    dilation = WithdrawProperty()

    def output_shape_per_dim(self, *args, **kwargs):
        return deconv_output_shape(*args, **kwargs)

    @property
    def weight_shape(self):
        # Compared to the regular convolution, weights
        # have switched input and output channels.
        return as_tuple(self.size, self.input_shape[-1])

    def output(self, input_value):
        input_shape = tf.shape(input_value)

        # We need to get information about the output shape from the
        # input tensor's shape, because for some inputs we might have
        # height and width specified as None and the shape value won't
        # be computed for these dimensions.
        output_shape = self.find_output_from_input_shape(
            tf.unstack(input_shape[1:]))

        batch_size = input_shape[0]
        padding = self.padding

        if isinstance(self.padding, (list, tuple)):
            height_pad, width_pad = self.padding

            # VALID option will make sure that
            # deconvolution won't use any padding.
            padding = 'VALID'

            # conv2d_transpose doesn't know about extra paddings that we
            # added in the convolution. For this reason we have to expand
            # our expected output shape and later we will remove these
            # paddings manually after the transposed convolution.
            output_shape = (
                output_shape[0] + 2 * height_pad,
                output_shape[1] + 2 * width_pad,
                output_shape[2],
            )

        output = tf.nn.conv2d_transpose(
            input_value, self.weight,
            as_tuple(batch_size, output_shape),
            as_tuple(1, self.stride, 1),
            padding, data_format="NHWC"
        )

        if isinstance(self.padding, (list, tuple)):
            h_pad, w_pad = self.padding

            if h_pad > 0:
                output = output[:, h_pad:-h_pad, :, :]

            if w_pad > 0:
                output = output[:, :, w_pad:-w_pad, :]

        if self.bias is not None:
            bias = tf.reshape(self.bias, (1, 1, 1, -1))
            output += bias

        return output
class LevenbergMarquardt(BaseOptimizer):
    """
    The Levenberg-Marquardt algorithm is a variation of Newton's method.
    It minimizes the MSE error. The algorithm approximates the Hessian
    matrix using the dot product between two jacobian matrices.

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    - Network minimizes only the Mean Squared Error (MSE) loss function.

    - Efficient for small training datasets, because it computes the
      gradient per each sample separately.

    - Efficient for small-sized networks.

    Parameters
    ----------
    {BaseOptimizer.network}

    mu : float
        Damping term that controls the inversion of the ``J.T * J``
        matrix. Defaults to ``0.01``.

    mu_update_factor : float
        Factor that decreases ``mu`` when an update decreases the error,
        and increases ``mu`` by the same factor otherwise.
        Defaults to ``1.2``.

    loss : {{``mse``}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.

    {BaseOptimizer.show_epoch}
    {BaseOptimizer.shuffle_data}
    {BaseOptimizer.signals}
    {BaseOptimizer.verbose}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.LevenbergMarquardt(network)
    >>> optimizer.train(x_train, y_train)

    See Also
    --------
    :network:`BaseOptimizer` : BaseOptimizer algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=1.2, minval=1)
    loss = ChoiceProperty(default='mse', choices={'mse': objectives.mse})

    step = WithdrawProperty()
    regularizer = WithdrawProperty()

    def init_functions(self):
        self.variables.update(
            mu=tf.Variable(self.mu, name='lev-marq/mu'),
            last_error=tf.Variable(np.nan, name='lev-marq/last-error'),
        )
        super(LevenbergMarquardt, self).init_functions()

    def init_train_updates(self):
        training_outputs = self.network.training_outputs
        last_error = self.variables.last_error
        error_func = self.variables.loss
        mu = self.variables.mu

        new_mu = tf.where(
            tf.less(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        err_for_each_sample = flatten((self.target - training_outputs) ** 2)

        variables = self.network.variables
        params = [var for var in variables.values() if var.trainable]
        param_vector = make_single_vector(params)

        J = compute_jacobian(err_for_each_sample, params)
        J_T = tf.transpose(J)
        n_params = J.shape[1]

        parameter_update = tf.matrix_solve(
            tf.matmul(J_T, J) + new_mu * tf.eye(n_params.value),
            tf.matmul(J_T, tf.expand_dims(err_for_each_sample, 1)))

        updated_params = param_vector - flatten(parameter_update)

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates

    def one_training_update(self, X_train, y_train):
        if self.errors.train:
            last_error = self.errors.train[-1]
            self.variables.last_error.load(last_error, tensorflow_session())

        return super(LevenbergMarquardt, self).one_training_update(
            X_train, y_train)
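# Hedged NumPy sketch of the Levenberg-Marquardt update computed above:
# solve (J^T J + mu * I) * dw = J^T * e and subtract ``dw`` from the
# parameter vector; names are illustrative, not part of the NeuPy API.
import numpy as np

def levenberg_marquardt_step(param_vector, jacobian, sample_errors, mu):
    n_params = jacobian.shape[1]
    damped_gram = jacobian.T.dot(jacobian) + mu * np.eye(n_params)
    delta = np.linalg.solve(damped_gram, jacobian.T.dot(sample_errors))
    return param_vector - delta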
class RBFKMeans(StepSelectionBuiltIn, BaseNetwork):
    """
    Radial basis function K-means for clustering.

    Parameters
    ----------
    n_clusters : int
        Number of clusters.

    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Attributes
    ----------
    centers : array-like with shape (n_clusters, n_features)
        Cluster centers.

    Methods
    -------
    train(input_train, epsilon=1e-5, epochs=100)
        Trains network.

    {BaseSkeleton.predict}
    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy.algorithms import RBFKMeans
    >>>
    >>> data = np.array([
    ...     [0.11, 0.20],
    ...     [0.25, 0.32],
    ...     [0.64, 0.60],
    ...     [0.12, 0.42],
    ...     [0.70, 0.73],
    ...     [0.30, 0.27],
    ...     [0.43, 0.81],
    ...     [0.44, 0.87],
    ...     [0.12, 0.92],
    ...     [0.56, 0.67],
    ...     [0.36, 0.35],
    ... ])
    >>> rbfk_net = RBFKMeans(n_clusters=2, verbose=False)
    >>> rbfk_net.train(data, epsilon=1e-5)
    >>> rbfk_net.centers
    array([[ 0.228     ,  0.312     ],
           [ 0.48166667,  0.76666667]])
    >>>
    >>> new_data = np.array([[0.1, 0.1], [0.9, 0.9]])
    >>> rbfk_net.predict(new_data)
    array([[ 0.],
           [ 1.]])
    """
    n_clusters = IntProperty(minval=2)

    step = WithdrawProperty()

    def __init__(self, **options):
        self.centers = None
        super(RBFKMeans, self).__init__(**options)

    def predict(self, input_data):
        input_data = format_data(input_data)

        centers = self.centers
        classes = np.zeros((input_data.shape[0], 1))

        for i, value in enumerate(input_data):
            classes[i] = np.argmin(norm(centers - value, axis=1))

        return classes

    def train_epoch(self, input_train, target_train):
        centers = self.centers
        old_centers = centers.copy()
        output_train = self.predict(input_train)

        for i, center in enumerate(centers):
            positions = np.argwhere(output_train[:, 0] == i)

            if not np.any(positions):
                continue

            class_data = np.take(input_train, positions, axis=0)
            centers[i, :] = (1 / len(class_data)) * np.sum(class_data, axis=0)

        return np.abs(old_centers - centers)

    def train(self, input_train, epsilon=1e-5, epochs=100):
        n_clusters = self.n_clusters
        input_train = format_data(input_train)
        n_samples = input_train.shape[0]

        if n_samples <= n_clusters:
            raise ValueError(
                "Number of samples in the dataset is less than the "
                "specified number of clusters. Got {} samples, expected "
                "at least {} (for {} clusters)"
                "".format(n_samples, n_clusters + 1, n_clusters))

        self.centers = input_train[:n_clusters, :].copy()
        super(RBFKMeans, self).train(
            input_train, epsilon=epsilon, epochs=epochs)
class Hessian(StepSelectionBuiltIn, GradientDescent):
    """
    Hessian gradient descent optimization. This GD algorithm variation
    uses second-derivative information, which helps choose a better
    gradient direction and, as a consequence, a better weight update
    after each epoch.

    Parameters
    ----------
    penalty_const : float
        Inverse hessian could be a singular matrix. For this reason, the
        algorithm includes a penalty that adds the identity matrix,
        multiplied by the defined constant, to the hessian matrix.
        Defaults to ``1``.

    {GradientDescent.connection}
    {GradientDescent.error}
    {GradientDescent.show_epoch}
    {GradientDescent.shuffle_data}
    {GradientDescent.epoch_end_signal}
    {GradientDescent.train_end_signal}
    {GradientDescent.verbose}
    {GradientDescent.addons}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Hessian((2, 3, 1))
    >>> mnet.train(x_train, y_train)

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_train_updates(self):
        n_parameters = count_parameters(self.connection)
        parameters = parameter_values(self.connection)
        param_vector = T.concatenate(
            [param.flatten() for param in parameters])
        penalty_const = asfloat(self.penalty_const)

        self.variables.hessian = theano.shared(
            value=asfloat(np.zeros((n_parameters, n_parameters))),
            name='hessian_inverse')

        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters)

        updated_parameters = hessian_matrix
        updates = setup_parameter_updates(
            [self.variables.hessian], updated_parameters)

        return updates
class LevenbergMarquardt(StepSelectionBuiltIn, GradientDescent):
    """
    Levenberg-Marquardt algorithm.

    Notes
    -----
    - Network minimizes only the Mean Squared Error function.

    - Efficient for small training datasets, because it computes the
      gradient per each sample separately.

    - Efficient for small-sized networks.

    Parameters
    ----------
    {GradientDescent.connection}

    mu : float
        Damping term that controls the inversion of the ``J.T * J``
        matrix. Defaults to ``0.01``.

    mu_update_factor : float
        Factor that decreases ``mu`` when an update decreases the error,
        and increases ``mu`` by the same factor otherwise.
        Defaults to ``1.2``.

    error : {{``mse``}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.

    {GradientDescent.show_epoch}
    {GradientDescent.shuffle_data}
    {GradientDescent.epoch_end_signal}
    {GradientDescent.train_end_signal}
    {GradientDescent.verbose}
    {GradientDescent.addons}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> lmnet = algorithms.LevenbergMarquardt((2, 3, 1))
    >>> lmnet.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=1.2, minval=1)
    error = ChoiceProperty(default='mse', choices={'mse': errors.mse})

    step = WithdrawProperty()

    def init_variables(self):
        super(LevenbergMarquardt, self).init_variables()
        self.variables.update(
            mu=theano.shared(name='lev-marq/mu', value=asfloat(self.mu)),
            last_error=theano.shared(name='lev-marq/last-error',
                                     value=np.nan),
        )

    def init_train_updates(self):
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = ifelse(
            T.lt(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        se_for_each_sample = ((network_output - prediction_func) ** 2).ravel()

        params = parameter_values(self.connection)
        param_vector = T.concatenate([param.flatten() for param in params])

        J = compute_jacobian(se_for_each_sample, params)
        n_params = J.shape[1]

        updated_params = param_vector - slinalg.solve(
            J.T.dot(J) + new_mu * T.eye(n_params),
            J.T.dot(se_for_each_sample))

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates

    def on_epoch_start_update(self, epoch):
        super(LevenbergMarquardt, self).on_epoch_start_update(epoch)

        last_error = self.errors.last()
        if last_error is not None:
            self.variables.last_error.set_value(last_error)
class Hessian(StepSelectionBuiltIn, BaseGradientDescent):
    """
    Hessian gradient descent optimization, also known as Newton's method.
    This algorithm uses the second-order derivative (hessian matrix) in
    order to choose the correct step during the training iteration.
    Because of this, the method doesn't have a ``step`` parameter.

    Parameters
    ----------
    penalty_const : float
        Inverse hessian could be a singular matrix. For this reason, the
        algorithm includes a penalty that adds the identity matrix,
        multiplied by the defined constant, to the hessian matrix.
        Defaults to ``1``.

    {BaseGradientDescent.connection}
    {BaseGradientDescent.error}
    {BaseGradientDescent.show_epoch}
    {BaseGradientDescent.shuffle_data}
    {BaseGradientDescent.epoch_end_signal}
    {BaseGradientDescent.train_end_signal}
    {BaseGradientDescent.verbose}
    {BaseGradientDescent.addons}

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    - This method calculates the full hessian matrix, which means it
      will compute a matrix with NxN parameters, where N = number of
      parameters in the network.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Hessian((2, 3, 1))
    >>> mnet.train(x_train, y_train)

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_train_updates(self):
        penalty_const = asfloat(self.penalty_const)

        n_parameters = count_parameters(self.connection)
        parameters = parameter_values(self.connection)
        param_vector = make_single_vector(parameters)

        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters)

        parameter_update = tf.matrix_solve(
            hessian_matrix + penalty_const * tf.eye(n_parameters),
            tf.reshape(full_gradient, [-1, 1]))

        updated_parameters = param_vector - flatten(parameter_update)
        updates = setup_parameter_updates(parameters, updated_parameters)

        return updates
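# Hedged NumPy sketch of the penalized Newton step used in
# ``init_train_updates`` above: w_new = w - (H + c * I)^{-1} * g,
# where ``c`` corresponds to ``penalty_const``; names are illustrative.
import numpy as np

def newton_step(param_vector, hessian, gradient, penalty_const=1.0):
    n_params = len(param_vector)
    penalized_hessian = hessian + penalty_const * np.eye(n_params)
    return param_vector - np.linalg.solve(penalized_hessian, gradient)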