class GaussianNoise(BaseLayer):
    """
    Add gaussian noise to the input value. Mean and standard
    deviation are layer's parameters.

    Noise is added only while the layer's ``training_state`` is
    truthy; otherwise the input is returned unchanged.

    Parameters
    ----------
    std : float
        Standard deviation of the gaussian noise. The property is
        declared with ``minval=0``. Defaults to ``1``.

    mean : float
        Mean of the gaussian noise. Defaults to ``0``.
    """
    std = NumberProperty(default=1, minval=0)
    mean = NumberProperty(default=0)

    def __init__(self, std=1, **options):
        # ``std`` remains the first positional argument, but it now has
        # the documented default, so ``GaussianNoise()`` works as the
        # docstring and the property declaration promise.
        options['std'] = std
        super(GaussianNoise, self).__init__(**options)

    def output(self, input_value):
        """
        Return ``input_value`` with gaussian noise added during
        training, or ``input_value`` itself otherwise.
        """
        if not self.training_state:
            return input_value

        theano_random = theano_random_stream()
        noise = theano_random.normal(
            size=input_value.shape,
            avg=self.mean,
            std=self.std,
        )
        return input_value + noise

    def __repr__(self):
        classname = self.__class__.__name__
        return "{}(mean={}, std={})".format(classname, self.mean, self.std)
class GaussianNoise(Identity):
    """
    Add gaussian noise to the input value. Mean and standard
    deviation of the noise can be controlled from the layer's
    parameters.

    It's important to note that output from the layer is controlled
    by the ``training`` parameter in the ``output`` method. Noise is
    added only in cases when ``training=True`` is propagated through
    the network, otherwise the layer acts as an identity.

    Parameters
    ----------
    std : float
        Standard deviation of the gaussian noise. The property is
        declared with ``minval=0``. Defaults to ``1``.

    mean : float
        Mean of the gaussian noise. Defaults to ``0``.

    {Identity.name}

    Methods
    -------
    {Identity.Methods}

    Attributes
    ----------
    {Identity.Attributes}

    Examples
    --------
    >>> from neupy.layers import *
    >>> network = join(
    ...     Input(10),
    ...     Relu(5) >> GaussianNoise(std=0.1),
    ...     Relu(5) >> GaussianNoise(std=0.1),
    ...     Sigmoid(1),
    ... )
    >>> network
    (?, 10) -> [... 6 layers ...] -> (?, 1)
    """
    mean = NumberProperty()
    std = NumberProperty(minval=0)

    def __init__(self, mean=0, std=1, name=None):
        # The defaults used to be swapped (``mean=1, std=0``): with
        # those values the layer added a constant 1 and no randomness.
        # They now match the documented ``mean=0`` / ``std=1``.
        super(GaussianNoise, self).__init__(name=name)
        self.mean = mean
        self.std = std

    def output(self, input_value, training=False):
        """
        Return ``input_value`` plus gaussian noise when ``training``
        is truthy, otherwise return ``input_value`` unchanged.
        """
        if not training:
            return input_value

        noise = tf.random_normal(
            shape=tf.shape(input_value),
            mean=self.mean,
            stddev=self.std,
        )
        return input_value + noise
class Relu(ActivationLayer):
    """
    Layer that applies the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        Slope applied to the negative values. Any non-zero
        ``alpha`` makes the layer behave like a leaky ReLu.
        Defaults to ``0``.

    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}
    """
    alpha = NumberProperty(default=0, minval=0)

    def activation_function(self, input_value):
        # ``alpha`` is converted with ``asfloat`` before it is handed
        # over to theano's rectifier.
        negative_slope = asfloat(self.alpha)
        rectified = T.nnet.relu(input_value, negative_slope)
        return rectified
class Elu(ActivationLayer):
    """
    Layer that applies the exponential linear unit (ELU)
    activation function.

    Parameters
    ----------
    alpha : float
        Controls the exponential decrease rate for the negative
        values. Defaults to ``1``.

    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    References
    ----------
    .. [1] http://arxiv.org/pdf/1511.07289v3.pdf
    """
    alpha = NumberProperty(default=1, minval=0)

    def activation_function(self, input_value):
        # ``alpha`` is converted with ``asfloat`` before it is handed
        # over to theano's ELU implementation.
        negative_scale = asfloat(self.alpha)
        activated = T.nnet.elu(input_value, negative_scale)
        return activated
class LVQ2(LVQ):
    """
    Learning Vector Quantization 2 (LVQ2) algorithm.
    Improved version for the LVQ algorithm.

    Parameters
    ----------
    epsilon : float
        Ratio between the two closest subclasses that
        triggers the double weight update. Defaults to ``0.1``.

    {LVQ.Parameters}

    Notes
    -----
    {LVQ.Notes}
    """
    epsilon = NumberProperty(default=0.1)

    def train_epoch(self, input_train, target_train):
        """
        Run one pass over the training samples, updating the
        prototypes in ``self.weight`` in place.

        Returns ``1 - n_correct_predictions / n_samples``, where a
        prediction counts as correct when the class of the closest
        prototype equals the target.
        """
        weight = self.weight
        epsilon = self.epsilon
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(input_train, target_train):
            # Read on every sample on purpose: the value is looked up
            # again for each update rather than cached once per epoch.
            step = self.training_step
            output = euclid_distance(input_row, weight)
            winner_subclasses = n_argmin(output, n=2, axis=1)

            top1_subclass, top2_subclass = winner_subclasses
            top1_class = subclass_to_class[top1_subclass]
            top2_class = subclass_to_class[top2_subclass]

            top1_weight_update = input_row - weight[top1_subclass, :]
            is_correct_prediction = (top1_class == target)

            closest_dist, runner_up_dist = output[0, winner_subclasses]
            # Double update: the winner is wrong, the runner-up is right
            # and both distances fall inside the ``epsilon`` window.
            double_update_condition_satisfied = (
                not is_correct_prediction and
                (top2_class == target) and
                closest_dist > ((1 - epsilon) * runner_up_dist) and
                runner_up_dist < ((1 + epsilon) * closest_dist)
            )

            if double_update_condition_satisfied:
                # The runner-up prototype is addressed by its subclass
                # index. Indexing by ``top2_class`` (a class label) read
                # the wrong weight row whenever a class owns more than
                # one prototype.
                top2_weight_update = input_row - weight[top2_subclass, :]
                weight[top1_subclass, :] -= step * top1_weight_update
                weight[top2_subclass, :] += step * top2_weight_update

            elif is_correct_prediction:
                weight[top1_subclass, :] += step * top1_weight_update

            else:
                weight[top1_subclass, :] -= step * top1_weight_update

            n_correct_predictions += is_correct_prediction

        n_samples = len(input_train)
        return 1 - n_correct_predictions / n_samples
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.95``.

    epsilon : float
        Value added under the square root of the update's
        denominator. The property is declared with ``minval=0``.
        Defaults to ``1e-5``.

    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        """
        Attach a zero-initialised shared variable to every layer
        parameter. It stores the running mean of squared gradients.
        """
        super(RMSProp, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_mean_squred_grad = theano.shared(
                    name="prev_mean_squred_grad_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        """
        Build the list of ``(shared_variable, new_value)`` update
        pairs for a single parameter.
        """
        # The earlier revision also allocated an
        # ``n_parameters x n_parameters`` "hessian_inverse" shared
        # variable and built a symbolic Hessian on every call. None of
        # that was used by the update rule, so it was removed.
        prev_mean_squred_grad = parameter.prev_mean_squred_grad
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        mean_squred_grad = (
            self.decay * prev_mean_squred_grad +
            (1 - self.decay) * gradient**2
        )
        parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

        return [
            (prev_mean_squred_grad, mean_squred_grad),
            (parameter, parameter - step * parameter_delta),
        ]
class GaussianNoise(BaseLayer):
    """
    Add gaussian noise to the input value. Mean and standard
    deviation are layer's parameters.

    Noise is added only while the layer's ``training_state`` is
    truthy; otherwise the input is returned unchanged.

    Parameters
    ----------
    std : float
        Standard deviation of the gaussian noise. The property is
        declared with ``minval=0``. Defaults to ``1``.

    mean : float
        Mean of the gaussian noise. Defaults to ``0``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    std = NumberProperty(default=1, minval=0)
    mean = NumberProperty(default=0)

    def __init__(self, mean=0, std=1, **options):
        # The constructor defaults used to be swapped (``mean=1``,
        # ``std=0``), which contradicted both the property defaults
        # declared above and the docstring, and produced a constant
        # shift of 1 with no randomness.
        super(GaussianNoise, self).__init__(mean=mean, std=std, **options)

    def output(self, input_value):
        """
        Return ``input_value`` with gaussian noise added during
        training, or ``input_value`` itself otherwise.
        """
        if not self.training_state:
            return input_value

        noise = tf.random_normal(
            shape=tf.shape(input_value),
            mean=self.mean,
            stddev=self.std,
        )
        return input_value + noise

    def __repr__(self):
        classname = self.__class__.__name__
        return "{}(mean={}, std={})".format(classname, self.mean, self.std)
class Relu(ActivationLayer):
    """
    Layer that applies the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        Slope applied to the negative values. Any non-zero
        ``alpha`` makes the layer behave like a leaky ReLu.
        Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    Examples
    --------
    Feedforward Neural Networks (FNN)

    >>> from neupy.layers import *
    >>> network = Input(10) > Relu(20) > Relu(1)

    Convolutional Neural Networks (CNN)

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((32, 32, 3)),
    ...     Convolution((3, 3, 16)) > Relu(),
    ...     Convolution((3, 3, 32)) > Relu(),
    ...     Reshape(),
    ...     Softmax(10),
    ... )
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        # A non-zero ``alpha`` selects the leaky variant; a zero
        # ``alpha`` falls through to the plain rectifier.
        negative_slope = self.alpha
        if negative_slope != 0:
            return tf.nn.leaky_relu(input_value, asfloat(negative_slope))
        return tf.nn.relu(input_value)
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.95``.

    epsilon : float
        Value added under the square root of the update's
        denominator. The property is declared with ``minval=0``.
        Defaults to ``1e-5``.

    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.RMSProp((2, 3, 1))
    >>> mnet.train(x_train, y_train)
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_param_updates(self, layer, parameter):
        """
        Build the ``(shared_variable, new_value)`` update pairs for
        one parameter. The running mean of squared gradients lives
        in a shared variable created here with the parameter's shape.
        """
        learning_rate = self.variables.step

        shape = T.shape(parameter).eval()
        running_average = theano.shared(
            name="{}/prev-mean-squared-grad".format(parameter.name),
            value=asfloat(np.zeros(shape)),
        )

        grad = T.grad(self.variables.error_func, wrt=parameter)
        new_running_average = (
            self.decay * running_average +
            (1 - self.decay) * grad**2
        )
        scaled_grad = grad / T.sqrt(new_running_average + self.epsilon)

        running_average_update = (running_average, new_running_average)
        parameter_update = (parameter, parameter - learning_rate * scaled_grad)
        return [running_average_update, parameter_update]
class SearchThenConverge(SingleStepConfigurable):
    """
    Algorithm decreases the learning step after each epoch.

    Parameters
    ----------
    reduction_freq : int
        The parameter controls the frequency of the step reduction
        with respect to epochs. Defaults to ``100`` epochs.
        Can't be less than ``1``. Smaller value means that the
        step decreases faster.

    rate_coefitient : float
        Second important parameter to control the rate of error
        reduction. Defaults to ``0.2``

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     verbose=False,
    ...     addons=[algorithms.SearchThenConverge]
    ... )
    >>>

    See Also
    --------
    :network:`StepDecay`
    """
    reduction_freq = IntProperty(minval=1, default=100)
    rate_coefitient = NumberProperty(default=0.2)

    def init_train_updates(self):
        """
        Extend the parent's training updates with one that
        recomputes the step variable from the current epoch.
        """
        updates = super(SearchThenConverge, self).init_train_updates()

        initial_step = asfloat(self.step)
        freq = asfloat(self.reduction_freq)

        # Epoch counter expressed in units of ``reduction_freq``.
        scaled_epoch = self.variables.epoch / freq
        rate_term = 1 + (self.rate_coefitient / initial_step) * scaled_epoch

        numerator = initial_step * rate_term
        denominator = rate_term + freq * scaled_epoch**2
        new_step = numerator / denominator

        updates.append((self.variables.step, new_step))
        return updates
class ZCA(BaseSkeleton):
    """
    ZCA (zero-phase component analysis) whitening.

    Parameters
    ----------
    regularization : float
        Regularization parameter. It is added to the singular
        values before taking the square root. Defaults to ``1e-5``.

    Attributes
    ----------
    mean : 1D array
        Mean for each feature.

    components : array-like
        ZCA components.

    Methods
    -------
    train(data)
        Train ZCA.

    fit(X, \*args, \*\*kwargs)
        Alias for ``train`` that returns the instance.

    transform(data)
        Transform input data.
    """
    regularization = NumberProperty(default=1e-5, minval=0)

    def __init__(self, regularization=1e-5, **options):
        self.regularization = regularization
        # Both stay ``None`` until ``train`` has been called.
        self.mean = None
        self.components = None
        super(ZCA, self).__init__(**options)

    def fit(self, X, *args, **kwargs):
        """Train on ``X`` and return ``self``."""
        self.train(X, *args, **kwargs)
        return self

    def train(self, data):
        """
        Estimate the per-feature mean and the whitening matrix
        from ``data`` (converted with ``as_array2d``).
        """
        data = as_array2d(data)
        self.mean = data.mean(axis=0)
        data = data - self.mean

        # The covariance estimate averages over samples (rows). The
        # earlier code divided by the number of features, which scaled
        # the covariance by n_samples / n_features and left the
        # transformed data without identity covariance.
        n_samples = data.shape[0]
        sigma = np.dot(data.T, data) / n_samples
        U, S, _ = np.linalg.svd(sigma)

        self.components = (U / np.sqrt(S + self.regularization)).dot(U.T)

    def transform(self, data):
        """
        Center ``data`` with the stored mean and project it on the
        stored components. The result has the shape of the input.

        Raises
        ------
        NotTrainedException
            When called before ``train``.
        """
        if self.mean is None or self.components is None:
            raise NotTrainedException("Train ZCA before use it.")

        data_shape = data.shape
        data = as_array2d(data)
        data_transformed = data - self.mean
        data_transformed = np.dot(data_transformed, self.components.T)
        return data_transformed.reshape(data_shape)
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.95``.

    epsilon : float
        Value added under the square root of the update's
        denominator. The property is declared with ``minval=0``.
        Defaults to ``1e-5``.

    {MinibatchGradientDescent.batch_size}

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {ConstructableNetwork.error}

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    {BaseNetwork.plot_errors}
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        """
        Attach a zero-initialised shared variable to every layer
        parameter. It stores the running mean of squared gradients.
        """
        super(RMSProp, self).init_layers()

        all_parameters = (
            parameter
            for layer in self.layers
            for parameter in layer.parameters
        )
        for parameter in all_parameters:
            zeros = np.zeros(T.shape(parameter).eval())
            parameter.prev_mean_squred_grad = theano.shared(
                name="prev_mean_squred_grad_" + parameter.name,
                value=asfloat(zeros),
            )

    def init_param_updates(self, layer, parameter):
        """
        Build the ``(shared_variable, new_value)`` update pairs
        for a single parameter.
        """
        running_average = parameter.prev_mean_squred_grad
        learning_rate = self.variables.step
        grad = T.grad(self.variables.error_func, wrt=parameter)

        new_running_average = (
            self.decay * running_average +
            (1 - self.decay) * grad**2
        )
        scaled_grad = grad / T.sqrt(new_running_average + self.epsilon)

        running_average_update = (running_average, new_running_average)
        parameter_update = (parameter, parameter - learning_rate * scaled_grad)
        return [running_average_update, parameter_update]
class SearchThenConverge(SingleStep):
    """
    Algorithm minimizes the learning step. Similar to
    :network:`SimpleStepMinimization`, but with a more
    complicated step update rule.

    Parameters
    ----------
    epochs_step_minimizator : int
        The parameter controls the frequency of the step reduction
        with respect to epochs. Defaults to ``100`` epochs.
        Can't be less than ``1``. Smaller value means that the
        step decreases faster.

    rate_coefitient : float
        Second important parameter to control the rate of error
        reduction. Defaults to ``0.2``

    Attributes
    ----------
    {first_step}

    Warns
    -----
    {bp_depending}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.Backpropagation(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     verbose=False,
    ...     optimizations=[algorithms.SearchThenConverge]
    ... )
    >>>

    See Also
    --------
    :network:`SimpleStepMinimization`
    """
    epochs_step_minimizator = NonNegativeIntProperty(min_size=1, default=100)
    rate_coefitient = NumberProperty(default=0.2)

    def after_weight_update(self, input_train, target_train):
        """
        Run the parent hook, then recompute ``self.step`` from
        ``self.first_step`` and the current ``self.epoch``.
        """
        super(SearchThenConverge, self).after_weight_update(
            input_train, target_train)

        initial_step = self.first_step
        minimizator = self.epochs_step_minimizator

        # NOTE(review): both operands look like integers; this relies
        # on true division being in effect for the module - confirm.
        scaled_epoch = self.epoch / minimizator
        rate_term = (self.rate_coefitient / initial_step) * scaled_epoch

        numerator = initial_step * (1 + rate_term)
        denominator = 1 + rate_term + minimizator * scaled_epoch**2
        self.step = numerator / denominator
class MaxNormRegularization(WeightUpdateConfigurable):
    """
    Max-norm regularization algorithm clips the norm of
    a parameter when it exceeds the maximum limit.

    .. code-block:: python

        if norm(weight) > max_norm:
            weight = max_norm * weight / norm(weight)

    .. raw:: html

        <br>

    Warns
    -----
    {WeightUpdateConfigurable.Warns}

    Parameters
    ----------
    max_norm : int, float
        Any parameter that has norm greater than this value
        will be clipped. Defaults to ``10``.

    Examples
    --------
    >>> from neupy import algorithms
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     max_norm=4,
    ...     addons=[algorithms.MaxNormRegularization]
    ... )

    References
    ----------
    [1] N. Srivastava, G. Hinton, A. Krizhevsky, I. Sutskever,
        R. Salakhutdinov. Dropout: A Simple Way to Prevent Neural
        Networks from Overfitting.
        http://jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf
    """
    max_norm = NumberProperty(default=10, minval=0)

    def init_param_updates(self, layer, parameter):
        """
        Take the parent's update pairs and pass the new value
        planned for ``parameter`` through ``max_norm_clip``.
        """
        parent_updates = super(
            MaxNormRegularization, self
        ).init_param_updates(layer, parameter)

        # Keyed by the variable being updated, so the entry for
        # ``parameter`` can be replaced with its clipped version.
        new_values = dict(parent_updates)
        clipped = max_norm_clip(new_values[parameter], self.max_norm)
        new_values[parameter] = clipped

        return list(new_values.items())
class Adagrad(MinibatchGradientDescent):
    """
    Adagrad algorithm.

    Parameters
    ----------
    epsilon : float
        Value added under the square root of the update's
        denominator. The property is declared with ``minval=0``.
        Defaults to ``1e-5``.

    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}
    """
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        """
        Attach a zero-initialised shared variable to every layer
        parameter. It accumulates the sum of squared gradients.
        """
        super(Adagrad, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_mean_squred_grad = theano.shared(
                    name="prev_mean_squred_grad_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        """
        Build the list of ``(shared_variable, new_value)`` update
        pairs for a single parameter.
        """
        prev_mean_squred_grad = parameter.prev_mean_squred_grad
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        mean_squred_grad = prev_mean_squred_grad + gradient**2
        # Adagrad scales the gradient down by the root of the
        # accumulated squared gradients. The earlier code multiplied
        # instead, which makes the updates grow as gradients accumulate.
        parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

        return [
            (prev_mean_squred_grad, mean_squred_grad),
            (parameter, parameter - step * parameter_delta),
        ]
class Relu(ActivationLayer):
    """
    Layer that applies the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        Slope applied to the negative values. Any non-zero
        ``alpha`` makes the layer behave like a leaky ReLu.
        Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        # A non-zero ``alpha`` selects the leaky variant; a zero
        # ``alpha`` falls through to the plain rectifier.
        negative_slope = self.alpha
        if negative_slope != 0:
            return tf.nn.leaky_relu(input_value, asfloat(negative_slope))
        return tf.nn.relu(input_value)
class WolfeLineSearchForStep(StepSelectionBuiltIn, Configurable):
    """
    Class that has all functions required in order to apply line search
    over the step parameter that is used during the network training.

    Parameters
    ----------
    wolfe_maxiter : int
        Controls maximum number of iterations during the line search
        that identifies optimal step size during the weight update stage.
        Defaults to ``20``.

    wolfe_c1 : float
        Parameter for Armijo condition rule. It's used during the line
        search that identifies optimal step size during the weight
        update stage. Defaults ``1e-4``.

    wolfe_c2 : float
        Parameter for curvature condition rule. It's used during the
        line search that identifies optimal step size during the weight
        update stage. Defaults ``0.9``.
    """
    wolfe_maxiter = IntProperty(default=20, minval=0)
    wolfe_c1 = NumberProperty(default=1e-4, minval=0)
    wolfe_c2 = NumberProperty(default=0.9, minval=0)

    def find_optimal_step(self, parameter_vector, parameter_update):
        """
        Run ``line_search`` along ``parameter_update`` starting from
        ``parameter_vector`` and return whatever it returns.

        Parameters
        ----------
        parameter_vector
            All network parameters flattened into a single vector.

        parameter_update
            Update direction with the same layout as
            ``parameter_vector``.
        """
        network_inputs = self.variables.network_inputs
        network_output = self.variables.network_output
        layers_and_parameters = list(iter_parameters(self.layers))

        def prediction(step):
            step = asfloat(step)
            updated_params = parameter_vector + step * parameter_update

            # This trick allows us to temporarily replace the layers'
            # parameters with slices of the updated vector and get
            # the output from the network.
            start_pos = 0
            try:
                for layer, attrname, param in layers_and_parameters:
                    end_pos = start_pos + get_variable_size(param)
                    updated_param_value = tf.reshape(
                        updated_params[start_pos:end_pos],
                        param.shape
                    )
                    setattr(layer, attrname, updated_param_value)
                    start_pos = end_pos

                output = self.connection.output(*network_inputs)
            finally:
                # Restore previous parameters even when building the
                # output fails, otherwise the layers would be left
                # holding the temporary tensors.
                for layer, attrname, param in layers_and_parameters:
                    setattr(layer, attrname, param)

            return output

        def phi(step):
            # Error of the network evaluated at the given step size.
            return self.error(network_output, prediction(step))

        def derphi(step):
            # Derivative of ``phi`` with respect to the step size.
            error_func = phi(step)
            gradient, = tf.gradients(error_func, step)
            return gradient

        return line_search(
            phi, derphi,
            self.wolfe_maxiter,
            self.wolfe_c1,
            self.wolfe_c2,
        )
class BaseNetwork(BaseSkeleton):
    """
    Base class for Neural Network algorithms.

    Parameters
    ----------
    step : float
        Learning rate, defaults to ``0.1``.

    show_epoch : int
        This property controls how often the network will display
        information about training. It has to be defined as positive
        integer. For instance, number ``100`` mean that network shows
        summary at 1st, 100th, 200th, 300th ... and last epochs.
        Defaults to ``1``.

    shuffle_data : bool
        If it's ``True`` then training data will be shuffled before
        the training. Defaults to ``False``.

    signals : dict, list or function
        Function that will be triggered after certain events during
        the training.

    {Verbose.Parameters}

    Methods
    -------
    {BaseSkeleton.fit}

    predict(X)
        Propagates input ``X`` through the network and
        returns produced output.

    plot_errors(logx=False, show=True, **figkwargs)
        Using errors collected during the training this method
        generates plot that can give additional insight into the
        performance reached during the training.

    Attributes
    ----------
    errors : list
        Information about errors. It has two main attributes, namely
        ``train`` and ``valid``. These attributes provide access to
        the training and validation errors respectively.

    last_epoch : int
        Value equals to the last trained epoch. After initialization
        it is equal to ``0``.

    n_updates_made : int
        Number of training updates applied to the network.
    """
    step = NumberProperty(default=0.1, minval=0)
    show_epoch = IntProperty(minval=1, default=1)
    shuffle_data = Property(default=False, expected_type=bool)
    signals = Property(expected_type=object)

    def __init__(self, *args, **options):
        super(BaseNetwork, self).__init__(*args, **options)

        self.last_epoch = 0
        self.n_updates_made = 0
        self.errors = base_signals.ErrorCollector()

        # Built-in signals come first, user-defined ones are appended.
        signals = list(
            as_tuple(
                base_signals.ProgressbarSignal(),
                base_signals.PrintLastErrorSignal(),
                self.errors,
                self.signals,
            ))

        for i, signal in enumerate(signals):
            if inspect.isfunction(signal):
                # Plain functions are wrapped into an epoch-end signal.
                signals[i] = base_signals.EpochEndSignal(signal)

            elif inspect.isclass(signal):
                # Classes are instantiated without arguments.
                signals[i] = signal()

        self.events = Events(network=self, signals=signals)

    def one_training_update(self, X_train, y_train=None):
        """
        Apply a single training update using one batch of data.
        Has to be implemented by a subclass.

        Parameters
        ----------
        X_train : array-like
            Batch of input samples.

        y_train : array-like or None
            Batch of targets.

        Returns
        -------
        Value that ``train`` reports as the training error
        for this update.
        """
        raise NotImplementedError()

    def score(self, X, y):
        """
        Evaluate the network on ``X`` and ``y``. Has to be implemented
        by a subclass; ``train`` reports the returned value as the
        validation error.
        """
        raise NotImplementedError()

    def plot_errors(self, logx=False, show=True, **figkwargs):
        return plot_optimizer_errors(
            optimizer=self,
            logx=logx,
            show=show,
            **figkwargs
        )

    def train(self, X_train, y_train=None, X_test=None, y_test=None,
              epochs=100, batch_size=None):
        """
        Method train neural network.

        Parameters
        ----------
        X_train : array-like

        y_train : array-like or None

        X_test : array-like or None

        y_test : array-like or None

        epochs : int
            Defaults to ``100``.

        batch_size : int or None
            When it's ``None`` (or any other falsy value) the
            ``batch_size`` attribute of the network is used, if
            the network has one. Defaults to ``None``.

        Raises
        ------
        ValueError
            When the number of epochs, converted to an integer,
            is not positive.
        """
        # Convert first and validate afterwards, otherwise a value such
        # as ``0.5`` passes the check and silently trains zero epochs.
        epochs = int(epochs)

        if epochs <= 0:
            raise ValueError("Number of epochs needs to be a positive number")

        first_epoch = self.last_epoch + 1
        batch_size = batch_size or getattr(self, 'batch_size', None)

        self.events.trigger(
            name='train_start',
            X_train=X_train,
            y_train=y_train,
            epochs=epochs,
            batch_size=batch_size,
            store_data=False,
        )

        try:
            for epoch in range(first_epoch, first_epoch + epochs):
                self.events.trigger('epoch_start')

                self.last_epoch = epoch
                iterator = iters.minibatches(
                    (X_train, y_train),
                    batch_size,
                    self.shuffle_data,
                )

                for X_batch, y_batch in iterator:
                    self.events.trigger('update_start')
                    update_start_time = time.time()

                    train_error = self.one_training_update(X_batch, y_batch)
                    self.n_updates_made += 1

                    self.events.trigger(
                        name='train_error',
                        value=train_error,
                        eta=time.time() - update_start_time,
                        epoch=epoch,
                        n_updates=self.n_updates_made,
                        n_samples=iters.count_samples(X_batch),
                        store_data=True,
                    )
                    self.events.trigger('update_end')

                # Validation runs once per epoch and only when
                # test inputs were provided.
                if X_test is not None:
                    test_start_time = time.time()
                    validation_error = self.score(X_test, y_test)
                    self.events.trigger(
                        name='valid_error',
                        value=validation_error,
                        eta=time.time() - test_start_time,
                        epoch=epoch,
                        n_updates=self.n_updates_made,
                        n_samples=iters.count_samples(X_test),
                        store_data=True,
                    )

                self.events.trigger('epoch_end')

        except StopTraining as err:
            # ``StopTraining`` ends the training early; it is logged
            # and not re-raised.
            self.logs.message(
                "TRAIN",
                "Epoch #{} was stopped. Message: {}".format(epoch, str(err)))

        self.events.trigger('train_end')
class ConjugateGradient(WolfeLineSearchForStep, BaseOptimizer):
    """
    Conjugate Gradient algorithm.

    Parameters
    ----------
    update_function : ``fletcher_reeves``, ``polak_ribiere``,\
    ``hentenes_stiefel``, ``dai_yuan``, ``liu_storey``
        Update function. Defaults to ``fletcher_reeves``.

    epsilon : float
        Ensures computational stability during the division in
        ``update_function`` when denominator is very small number.
        Defaults to ``1e-7``.

    {WolfeLineSearchForStep.Parameters}

    {BaseOptimizer.network}

    {BaseOptimizer.loss}

    {BaseOptimizer.show_epoch}

    {BaseOptimizer.shuffle_data}

    {BaseOptimizer.signals}

    {BaseOptimizer.verbose}

    {BaseOptimizer.regularizer}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Examples
    --------
    >>> from sklearn import datasets, preprocessing
    >>> from sklearn.model_selection import train_test_split
    >>> from neupy import algorithms, layers
    >>>
    >>> dataset = datasets.load_boston()
    >>> data, target = dataset.data, dataset.target
    >>>
    >>> data_scaler = preprocessing.MinMaxScaler()
    >>> target_scaler = preprocessing.MinMaxScaler()
    >>>
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     data_scaler.fit_transform(data),
    ...     target_scaler.fit_transform(target),
    ...     test_size=0.15
    ... )
    >>>
    >>> cgnet = algorithms.ConjugateGradient(
    ...     network=[
    ...         layers.Input(13),
    ...         layers.Sigmoid(50),
    ...         layers.Sigmoid(1),
    ...     ],
    ...     update_function='fletcher_reeves',
    ...     verbose=False
    ... )
    >>>
    >>> cgnet.train(x_train, y_train, epochs=100)
    >>> y_predict = cgnet.predict(x_test).round(1)
    >>>
    >>> real = target_scaler.inverse_transform(y_test)
    >>> predicted = target_scaler.inverse_transform(y_predict)

    References
    ----------
    [1] Jorge Nocedal, Stephen J. Wright, Numerical Optimization.
        Chapter 5, Conjugate Gradient Methods, p. 101-133
    """
    epsilon = NumberProperty(default=1e-7, minval=0)
    update_function = ChoiceProperty(
        default='fletcher_reeves',
        choices={
            'fletcher_reeves': fletcher_reeves,
            'polak_ribiere': polak_ribiere,
            'hentenes_stiefel': hentenes_stiefel,
            'liu_storey': liu_storey,
            'dai_yuan': dai_yuan,
        }
    )
    step = WithdrawProperty()

    def init_functions(self):
        """
        Register the state variables of the algorithm (previous
        delta, previous gradient, iteration counter) and then
        delegate to the parent implementation.
        """
        n_parameters = self.network.n_parameters

        # Created in the same order as before:
        # previous delta, previous gradient, iteration counter.
        prev_delta = tf.Variable(
            tf.zeros([n_parameters]),
            name="conj-grad/prev-delta",
            dtype=tf.float32,
        )
        prev_gradient = tf.Variable(
            tf.zeros([n_parameters]),
            name="conj-grad/prev-gradient",
            dtype=tf.float32,
        )
        iteration = tf.Variable(
            asfloat(self.last_epoch),
            name='conj-grad/current-iteration',
            dtype=tf.float32
        )

        self.variables.update(
            prev_delta=prev_delta,
            prev_gradient=prev_gradient,
            iteration=iteration,
        )
        super(ConjugateGradient, self).init_functions()

    def init_train_updates(self):
        """
        Build the list of operations that perform one conjugate
        gradient update and refresh the stored state.
        """
        state = self.variables
        iteration = state.iteration
        last_delta = state.prev_delta
        last_gradient = state.prev_gradient

        n_parameters = self.network.n_parameters
        trainable = [
            variable
            for variable in self.network.variables.values()
            if variable.trainable
        ]
        flat_parameters = make_single_vector(trainable)

        flat_gradient = make_single_vector(
            tf.gradients(state.loss, trainable))

        beta = self.update_function(
            last_gradient, flat_gradient,
            last_delta, self.epsilon)

        # Every ``n_parameters`` iterations the direction is reset to
        # the plain negative gradient.
        is_restart = tf.equal(tf.mod(iteration, n_parameters), 0)
        direction = tf.where(
            is_restart,
            -flat_gradient,
            -flat_gradient + beta * last_delta
        )

        step = self.find_optimal_step(flat_parameters, direction)
        new_parameters = flat_parameters + step * direction
        updates = setup_parameter_updates(trainable, new_parameters)

        # We have to compute these values first, otherwise
        # parallelization, in tensorflow, can mix update order
        # and, for example, previous gradient can be equal to
        # current gradient value. It happens because tensorflow
        # tries to execute operations in parallel.
        with tf.control_dependencies([flat_gradient, direction]):
            updates.append(last_gradient.assign(flat_gradient))
            updates.append(last_delta.assign(direction))
            updates.append(iteration.assign(iteration + 1))

        return updates
class Adamax(MinibatchGradientDescent):
    """
    AdaMax algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.9``.

    beta2 : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.999``.

    epsilon : float
        Value added to the denominator of the update. The property
        is declared with ``minval=0``. Defaults to ``1e-8``.

    step : float
        Learning rate, defaults to ``0.001``.

    {MinibatchGradientDescent.batch_size}

    {GradientDescent.addons}

    {ConstructableNetwork.connection}

    {ConstructableNetwork.error}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {SupervisedLearning.train}

    {BaseSkeleton.fit}

    {BaseNetwork.plot_errors}
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-8, minval=0)

    def init_layers(self):
        """
        Attach two zero-initialised shared variables to every layer
        parameter: the first moment estimate and the weighted
        infinity norm.
        """
        super(Adamax, self).init_layers()

        all_parameters = (
            parameter
            for layer in self.layers
            for parameter in layer.parameters
        )
        for parameter in all_parameters:
            shape = T.shape(parameter).eval()
            parameter.prev_first_moment = theano.shared(
                name="prev_first_moment_" + parameter.name,
                value=asfloat(np.zeros(shape)),
            )
            parameter.prev_weighted_inf_norm = theano.shared(
                name="prev_weighted_inf_norm_" + parameter.name,
                value=asfloat(np.zeros(shape)),
            )

    def init_param_updates(self, layer, parameter):
        """
        Build the ``(shared_variable, new_value)`` update pairs
        for a single parameter.
        """
        epoch = self.variables.epoch
        learning_rate = self.variables.step
        beta1, beta2 = self.beta1, self.beta2

        old_moment = parameter.prev_first_moment
        old_inf_norm = parameter.prev_weighted_inf_norm

        grad = T.grad(self.variables.error_func, wrt=parameter)

        new_moment = beta1 * old_moment + (1 - beta1) * grad
        new_inf_norm = T.maximum(beta2 * old_inf_norm, T.abs_(grad))

        # Correction factor that depends on the current epoch.
        bias_correction = 1 / (1 - beta1**epoch)
        normalized_moment = new_moment / (new_inf_norm + self.epsilon)
        delta = bias_correction * normalized_moment

        return [
            (old_moment, new_moment),
            (old_inf_norm, new_inf_norm),
            (parameter, parameter - learning_rate * delta),
        ]
class LVQ(BaseNetwork):
    """
    Learning Vector Quantization (LVQ) algorithm.

    Notes
    -----
    - Input data needs to be normalized, because LVQ uses
      Euclidean distance to find clusters.

    - Training error is just a ratio of misclassified samples

    Parameters
    ----------
    n_inputs : int
        Number of input units. It should be equal to the
        number of features in the input data set.

    n_subclasses : int, None
        Defines total number of subclasses. Values should be greater
        or equal to the number of classes. ``None`` will set up number
        of subclasses equal to the number of classes. Defaults to ``None``
        (or the same as ``n_classes``).

    n_classes : int
        Number of classes in the data set.

    prototypes_per_class : list, None
        Defines number of prototypes per each class. For instance,
        if ``n_classes=3`` and ``n_subclasses=8`` then there are
        can be 3 subclasses for the first class, 3 for the second one
        and 2 for the third one (3 + 3 + 2 == 8). The following example
        can be specified as ``prototypes_per_class=[3, 3, 2]``.

        There are two rules that apply to this parameter:

        1. ``sum(prototypes_per_class) == n_subclasses``

        2. ``len(prototypes_per_class) == n_classes``

        The ``None`` value will distribute approximately equal
        number of subclasses per each class. It's approximately,
        because in cases when ``n_subclasses % n_classes != 0``
        there is no way to distribute equal number of subclasses
        per each class.

        Defaults to ``None``.

    weight : numpy array, Initializer or None
        Predefined prototypes. Array is expected to store one
        prototype per row, i.e. to have shape
        ``(n_subclasses, n_inputs)``. Initializer will be sampled
        with this shape. With ``None`` prototypes will be selected
        from the training data during the first ``train`` call.
        Defaults to ``None``.

    {BaseNetwork.step}

    n_updates_to_stepdrop : int or None
        If this options is not equal to ``None`` then after every
        update LVQ reduces step size and do it until number of
        applied updates would reach the ``n_updates_to_stepdrop``
        value. The minimum possible step size defined in the
        ``minstep`` parameter.

        Be aware that number of updates is not the same as number
        of epochs. LVQ applies update after each propagated sample
        through the network. Relations between this parameter and
        maximum number of epochs is following

        .. code-block:: python

            n_updates_to_stepdrop = n_samples * n_max_epochs

        If parameter equal to ``None`` then step size wouldn't be
        reduced after each update.

        Defaults to ``None``.

    minstep : float
        Step size would never be lower than this value. This
        property useful only in case if ``n_updates_to_stepdrop``
        is not ``None``. Defaults to ``1e-5``.

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {BaseSkeleton.fit}
    """
    n_inputs = IntProperty(minval=1)
    n_subclasses = IntProperty(minval=2, default=None, allow_none=True)
    n_classes = IntProperty(minval=2)

    prototypes_per_class = TypedListProperty(allow_none=True, default=None)
    weight = Property(expected_type=(np.ndarray, init.Initializer),
                      allow_none=True, default=None)

    n_updates_to_stepdrop = IntProperty(default=None, allow_none=True,
                                        minval=1)
    minstep = NumberProperty(minval=0, default=1e-5)

    def __init__(self, **options):
        # Flag has to exist before the parent constructor runs
        self.initialized = False
        super(LVQ, self).__init__(**options)

        self.n_updates = 0

        if self.n_subclasses is None:
            self.n_subclasses = self.n_classes

        if isinstance(self.weight, init.Initializer):
            # Prototypes are stored one per row, the same way as
            # ``train`` builds them and as ``train_epoch`` indexes
            # them (``weight[winner_subclass, :]``).
            weight_shape = (self.n_subclasses, self.n_inputs)
            self.weight = self.weight.sample(weight_shape)

        if self.weight is not None:
            self.initialized = True

        if self.n_subclasses < self.n_classes:
            raise ValueError("Number of subclasses should be greater "
                             "or equal to the number of classes. Network "
                             "was defined with {} subclasses and {} classes"
                             "".format(self.n_subclasses, self.n_classes))

        if self.prototypes_per_class is None:
            whole, reminder = divmod(self.n_subclasses, self.n_classes)
            self.prototypes_per_class = [whole] * self.n_classes

            if reminder:
                # Since we have reminder left, it means that we cannot
                # have an equal number of subclasses per each class,
                # therefor we will add +1 to randomly selected class.
                class_indeces = np.random.choice(self.n_classes, reminder,
                                                 replace=False)

                for class_index in class_indeces:
                    self.prototypes_per_class[class_index] += 1

        if len(self.prototypes_per_class) != self.n_classes:
            raise ValueError("LVQ defined for classification problem that has "
                             "{} classes, but the `prototypes_per_class` "
                             "variable has defined data for {} classes."
                             "".format(self.n_classes,
                                       len(self.prototypes_per_class)))

        if sum(self.prototypes_per_class) != self.n_subclasses:
            raise ValueError("Invalid distribution of subclasses for the "
                             "`prototypes_per_class` variable. Got total "
                             "of {} subclasses ({}) instead of {} expected"
                             "".format(sum(self.prototypes_per_class),
                                       self.prototypes_per_class,
                                       self.n_subclasses))

        # Lookup table: index is a subclass id, value is its class id
        self.subclass_to_class = []
        for class_id, n_prototypes in enumerate(self.prototypes_per_class):
            self.subclass_to_class.extend([class_id] * n_prototypes)

    @property
    def training_step(self):
        """
        Step size for the next update.

        Equal to ``step`` when ``n_updates_to_stepdrop`` is ``None``.
        Otherwise it's linearly reduced from ``step`` to ``minstep``
        over the first ``n_updates_to_stepdrop`` updates and stays
        at ``minstep`` afterwards.
        """
        if self.n_updates_to_stepdrop is None:
            return self.step

        # NOTE(review): relies on true division (Python 3 or
        # ``from __future__ import division``) - confirm at file top.
        updates_ratio = (1 - self.n_updates / self.n_updates_to_stepdrop)

        # Without the lower bound the ratio becomes negative as soon
        # as number of updates exceeds ``n_updates_to_stepdrop`` and
        # step drops below ``minstep``.
        updates_ratio = max(0, updates_ratio)
        return self.minstep + (self.step - self.minstep) * updates_ratio

    def predict(self, input_data):
        """
        Predict class for each input sample.

        Raises
        ------
        NotTrained
            When network doesn't have prototypes yet.
        """
        if not self.initialized:
            raise NotTrained("LVQ network hasn't been trained yet")

        input_data = format_data(input_data)
        subclass_to_class = self.subclass_to_class
        weight = self.weight

        predictions = []
        for input_row in input_data:
            output = euclid_distance(input_row, weight)
            winner_subclass = int(output.argmin(axis=1))

            predicted_class = subclass_to_class[winner_subclass]
            predictions.append(predicted_class)

        return np.array(predictions)

    def train(self, input_train, target_train, *args, **kwargs):
        """
        Train network. When prototypes haven't been defined they
        will be randomly selected from the training samples
        (separately for each class) before the training starts.

        Raises
        ------
        ValueError
            When there are not enough training samples or target
            classes are not integers from ``0`` to ``n_classes - 1``.
        """
        input_train = format_data(input_train)
        target_train = format_data(target_train)

        n_input_samples = len(input_train)

        if n_input_samples <= self.n_subclasses:
            raise ValueError("Number of training input samples should be "
                             "greater than number of sublcasses. Training "
                             "method recived {} input samples."
                             "".format(n_input_samples))

        if not self.initialized:
            # Builtin ``int`` is used, because the ``np.int`` alias
            # doesn't exist in the recent NumPy versions.
            target_classes = sorted(np.unique(target_train).astype(int))
            expected_classes = list(range(self.n_classes))

            if target_classes != expected_classes:
                raise ValueError("All classes should be integers from the "
                                 "range [0, {}], but got the following "
                                 "classes instead {}".format(
                                     self.n_classes - 1, target_classes))

            weights = []
            iterator = zip(target_classes, self.prototypes_per_class)
            for target_class, n_prototypes in iterator:
                # Mask is converted into probabilities, so that only
                # samples from the current class can be selected.
                is_valid_class = (target_train[:, 0] == target_class)
                is_valid_class = is_valid_class.astype('float64')
                n_samples_per_class = sum(is_valid_class)
                is_valid_class /= n_samples_per_class

                if n_samples_per_class <= n_prototypes:
                    raise ValueError("Input data has {0} samples for class-{1}"
                                     ". Number of samples per specified "
                                     "class-{1} should be greater than {2}."
                                     "".format(n_samples_per_class,
                                               target_class, n_prototypes))

                class_weight_indeces = np.random.choice(
                    np.arange(n_input_samples), n_prototypes,
                    replace=False, p=is_valid_class)

                class_weight = input_train[class_weight_indeces]
                weights.extend(class_weight)

            self.weight = np.array(weights)
            self.initialized = True

        super(LVQ, self).train(input_train, target_train, *args, **kwargs)

    def train_epoch(self, input_train, target_train):
        """
        Apply one update per training sample and return ratio
        of misclassified samples.
        """
        weight = self.weight
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(input_train, target_train):
            step = self.training_step
            output = euclid_distance(input_row, weight)
            winner_subclass = int(output.argmin())
            predicted_class = subclass_to_class[winner_subclass]

            weight_update = input_row - weight[winner_subclass, :]
            is_correct_prediction = (predicted_class == target)

            # Winner moves closer to the sample when prediction is
            # correct and further from it otherwise.
            if is_correct_prediction:
                weight[winner_subclass, :] += step * weight_update
            else:
                weight[winner_subclass, :] -= step * weight_update

            n_correct_predictions += is_correct_prediction
            self.n_updates += 1

        n_samples = len(input_train)
        return 1 - n_correct_predictions / n_samples
class LVQ3(LVQ21):
    """
    Learning Vector Quantization 3 (LVQ3) algorithm.
    Improved version for the LVQ2.1 algorithm.

    Parameters
    ----------
    {LVQ.n_inputs}

    {LVQ.n_subclasses}

    {LVQ.n_classes}

    {LVQ.prototypes_per_class}

    {LVQ2.epsilon}

    slowdown_rate : float
        Parameter scales learning step in order to decrease it
        in case if the two closest subclasses predict target
        value correctly. Defaults to ``0.4``.

    step : float
        Learning rate, defaults to ``0.01``.

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Notes
    -----
    {LVQ21.Notes}
    - Decreasing step and increasing number of training epochs
      can improve the performance.
    """
    step = NumberProperty(minval=0, default=0.01)
    slowdown_rate = NumberProperty(minval=0, default=0.4)

    def train_epoch(self, input_train, target_train):
        """
        Apply one update per training sample and return ratio
        of samples misclassified by the closest subclass.
        """
        weight = self.weight
        epsilon = self.epsilon
        slowdown_rate = self.slowdown_rate
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(input_train, target_train):
            step = self.training_step
            output = euclid_distance(input_row, weight)
            winner_subclasses = n_argmin(output, n=2, axis=1)

            top1_subclass, top2_subclass = winner_subclasses
            top1_class = subclass_to_class[top1_subclass]
            top2_class = subclass_to_class[top2_subclass]

            top1_weight_update = input_row - weight[top1_subclass, :]
            is_first_correct = (top1_class == target)
            is_second_correct = (top2_class == target)

            closest_dist, runner_up_dist = output[0, winner_subclasses]

            # Exactly one of the two closest subclasses is correct and
            # their distances to the sample are nearly the same
            double_update_condition_satisfied = (
                (
                    (is_first_correct and not is_second_correct) or
                    (is_second_correct and not is_first_correct)
                ) and
                closest_dist > ((1 - epsilon) * runner_up_dist) and
                runner_up_dist < ((1 + epsilon) * closest_dist)
            )
            two_closest_correct_condition_satisfied = (
                is_first_correct and is_second_correct and
                closest_dist > ((1 - epsilon) * (1 + epsilon) * runner_up_dist)
            )

            if double_update_condition_satisfied:
                # Rows of the weight matrix are subclasses, so the
                # runner-up prototype has to be selected with subclass
                # index, not with the class index.
                top2_weight_update = input_row - weight[top2_subclass, :]

                if is_first_correct:
                    weight[top1_subclass, :] += step * top1_weight_update
                    weight[top2_subclass, :] -= step * top2_weight_update
                else:
                    weight[top1_subclass, :] -= step * top1_weight_update
                    weight[top2_subclass, :] += step * top2_weight_update

            elif two_closest_correct_condition_satisfied:
                beta = step * slowdown_rate
                top2_weight_update = input_row - weight[top2_subclass, :]

                weight[top1_subclass, :] += beta * top1_weight_update
                weight[top2_subclass, :] += beta * top2_weight_update

            else:
                # NOTE(review): this branch moves winner away from the
                # sample even when the winner predicted class correctly,
                # unlike the fallback in LVQ. Confirm that it's intended.
                weight[top1_subclass, :] -= step * top1_weight_update

            n_correct_predictions += is_first_correct
            self.n_updates += 1

        n_samples = len(input_train)
        return 1 - n_correct_predictions / n_samples
class GRU(BaseRNNLayer):
    """
    Gated Recurrent Unit (GRU) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    weights : dict or Initializer
        Weight parameters for different gates.
        Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`.

        - In case if application requires the same initialization method
          for all weights, then it's possible to specify initialization
          method that would be automatically applied to all weight
          parameters in the GRU layer.

          .. code-block:: python

              layers.GRU(2, weights=init.Normal(0.1))

        - In case if application requires different initialization
          values for different weights then it's possible to specify
          an exact weight by name.

          .. code-block:: python

              dict(
                  weight_in_to_updategate=init.XavierUniform(),
                  weight_hid_to_updategate=init.XavierUniform(),

                  weight_in_to_resetgate=init.XavierUniform(),
                  weight_hid_to_resetgate=init.XavierUniform(),

                  weight_in_to_hidden_update=init.XavierUniform(),
                  weight_hid_to_hidden_update=init.XavierUniform(),
              )

          If application requires modification to only one (or multiple)
          parameter then it's better to specify the one that you need to
          modify and ignore other parameters

          .. code-block:: python

              dict(weight_in_to_updategate=init.Normal(0.1))

          Other parameters like ``weight_in_to_resetgate`` will be
          equal to their default values.

    biases : dict or Initializer
        Bias parameters for different gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

        - In case if application requires the same initialization method
          for all biases, then it's possible to specify initialization
          method that would be automatically applied to all bias
          parameters in the GRU layer.

          .. code-block:: python

              layers.GRU(2, biases=init.Constant(1))

        - In case if application requires different initialization
          values for different biases then it's possible to specify
          an exact bias by name.

          .. code-block:: python

              dict(
                  bias_updategate=init.Constant(0),
                  bias_resetgate=init.Constant(0),
                  bias_hidden_update=init.Constant(0),
              )

          If application requires modification to only one (or multiple)
          parameter then it's better to specify the one that you need to
          modify and ignore other parameters

          .. code-block:: python

              dict(bias_resetgate=init.Constant(1))

          Other parameters like ``bias_updategate`` will be
          equal to their default values.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import theano.tensor as T
            dict(
                resetgate=T.nnet.sigmoid,
                updategate=T.nnet.sigmoid,
                hidden_update=T.tanh,
            )

        If application requires modification to only one parameter
        then it's better to specify the one that you need to modify
        and ignore other parameters

        .. code-block:: python

            dict(resetgate=T.tanh)

        Other parameters like ``updategate`` or ``hidden_update``
        will be equal to their default values.

    learn_init : bool
        If ``True``, make ``hid_init`` trainable variable.
        Defaults to ``False``.

    hid_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    {BaseRNNLayer.only_return_final}

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always
        from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    precompute_input : bool
        if ``True``, precompute ``input_to_hid`` before iterating
        through the sequence. This can result in a speed up at the
        expense of an increase in memory usage.
        Defaults to ``True``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    n_gradient_steps : int
        Value is passed to ``theano.scan`` as the ``truncate_gradient``
        argument. It's ignored when ``unroll_scan=True``.
        Defaults to ``-1``.

    gradient_clipping : float
        When value is not equal to zero, gradients that flow through
        the step input, hidden input and hidden update are clipped
        to the ``[-gradient_clipping, gradient_clipping]`` range.
        Defaults to ``0``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------

    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.GRU(20),
                layers.Sigmoid(1),
            ]
        )
    """
    weights = MultiParameterProperty(
        default=dict(
            weight_in_to_updategate=init.XavierUniform(),
            weight_hid_to_updategate=init.XavierUniform(),

            weight_in_to_resetgate=init.XavierUniform(),
            weight_hid_to_resetgate=init.XavierUniform(),

            weight_in_to_hidden_update=init.XavierUniform(),
            weight_hid_to_hidden_update=init.XavierUniform(),
        ))
    biases = MultiParameterProperty(
        default=dict(
            bias_updategate=init.Constant(0),
            bias_resetgate=init.Constant(0),
            bias_hidden_update=init.Constant(0),
        ))
    activation_functions = MultiCallableProperty(
        default=dict(
            resetgate=T.nnet.sigmoid,
            updategate=T.nnet.sigmoid,
            hidden_update=T.tanh,
        ))

    learn_init = Property(default=False, expected_type=bool)
    hid_init = ParameterProperty(default=init.Constant(0))

    backwards = Property(default=False, expected_type=bool)
    unroll_scan = Property(default=False, expected_type=bool)
    precompute_input = Property(default=True, expected_type=bool)

    n_gradient_steps = IntProperty(default=-1)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        """
        Create weights and biases for all gates and the parameter
        that stores initial hidden state.
        """
        super(GRU, self).initialize()

        n_inputs = np.prod(self.input_shape[1:])
        weights = self.weights
        biases = self.biases

        # Update gate parameters
        self.weight_in_to_updategate = self.add_parameter(
            value=weights.weight_in_to_updategate,
            name='weight_in_to_updategate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_updategate = self.add_parameter(
            value=weights.weight_hid_to_updategate,
            name='weight_hid_to_updategate',
            shape=(self.size, self.size))
        self.bias_updategate = self.add_parameter(
            value=biases.bias_updategate, name='bias_updategate',
            shape=(self.size,))

        # Reset gate parameters
        self.weight_in_to_resetgate = self.add_parameter(
            value=weights.weight_in_to_resetgate,
            name='weight_in_to_resetgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_resetgate = self.add_parameter(
            value=weights.weight_hid_to_resetgate,
            name='weight_hid_to_resetgate',
            shape=(self.size, self.size))
        # Parameter name matches the attribute and the documented
        # ``bias_resetgate`` key (GRU doesn't have forget gate).
        self.bias_resetgate = self.add_parameter(
            value=biases.bias_resetgate, name='bias_resetgate',
            shape=(self.size,))

        # Hidden update gate parameters
        self.weight_in_to_hidden_update = self.add_parameter(
            value=weights.weight_in_to_hidden_update,
            name='weight_in_to_hidden_update',
            shape=(n_inputs, self.size))
        self.weight_hid_to_hidden_update = self.add_parameter(
            value=weights.weight_hid_to_hidden_update,
            name='weight_hid_to_hidden_update',
            shape=(self.size, self.size))
        self.bias_hidden_update = self.add_parameter(
            value=biases.bias_hidden_update, name='bias_hidden_update',
            shape=(self.size,))

        self.add_parameter(value=self.hid_init, shape=(1, self.size),
                           name="hid_init", trainable=self.learn_init)

    def output(self, input_value):
        """
        Build symbolic output of the layer. Returns hidden states
        for every time step or only the last one when the
        ``only_return_final`` option is enabled.
        """
        # Treat all dimensions after the second as flattened
        # feature dimensions
        if input_value.ndim > 3:
            input_value = T.flatten(input_value, 3)

        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = input_value.dimshuffle(1, 0, 2)
        seq_len, n_batch, _ = input_value.shape

        # Stack input weight matrices into a (num_inputs, 3 * num_units)
        # matrix, which speeds up computation
        weight_in_stacked = T.concatenate([
            self.weight_in_to_updategate,
            self.weight_in_to_resetgate,
            self.weight_in_to_hidden_update], axis=1)

        # Same for hidden weight matrices
        weight_hid_stacked = T.concatenate([
            self.weight_hid_to_updategate,
            self.weight_hid_to_resetgate,
            self.weight_hid_to_hidden_update], axis=1)

        # Stack biases into a (3 * num_units) vector
        bias_stacked = T.concatenate([
            self.bias_updategate,
            self.bias_resetgate,
            self.bias_hidden_update], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute_input the inputs dot weight matrices before scanning.
            # weight_in_stacked is (n_features, 3 * num_units).
            # Input: (n_time_steps, n_batch, 3 * num_units).
            input_value = T.dot(input_value, weight_in_stacked) + bias_stacked

        # When theano.scan calls step, input_n will be
        # (n_batch, 3 * num_units). We define a slicing function
        # that extract the input to each GRU gate
        def slice_w(x, n):
            s = x[:, n * self.size:(n + 1) * self.size]
            if self.size == 1:
                s = T.addbroadcast(s, 1)  # Theano cannot infer this by itself
            return s

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def one_gru_step(input_n, hid_previous, *args):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1},
            # and W_{hc} h_{t - 1}
            hid_input = T.dot(hid_previous, weight_hid_stacked)

            if self.gradient_clipping:
                input_n = theano.gradient.grad_clip(
                    input_n, -self.gradient_clipping, self.gradient_clipping)
                hid_input = theano.gradient.grad_clip(
                    hid_input, -self.gradient_clipping,
                    self.gradient_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u,
                # and W_{xc}x_t + b_c
                input_n = T.dot(input_n, weight_in_stacked) + bias_stacked

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            resetgate = self.activation_functions.resetgate(resetgate)

            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            updategate = self.activation_functions.updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate * hidden_update_hid

            if self.gradient_clipping:
                hidden_update = theano.gradient.grad_clip(
                    hidden_update, -self.gradient_clipping,
                    self.gradient_clipping)

            hidden_update = self.activation_functions.hidden_update(
                hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous + updategate * hidden_update
            return hid

        # Repeat initial hidden state for every sample in the batch
        hid_init = T.dot(T.ones((n_batch, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_sequences = [weight_hid_stacked]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_sequences += [weight_in_stacked, bias_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            n_time_steps = self.input_shape[0]

            # Explicitly unroll the recurrence instead of using scan
            hid_out, = unroll_scan(
                fn=one_gru_step,
                sequences=[input_value],
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                n_steps=n_time_steps)

        else:
            # Scan op iterates over first dimension of input and
            # repeatedly applies the step function
            hid_out, _ = theano.scan(
                fn=one_gru_step,
                sequences=[input_value],
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                truncate_gradient=self.n_gradient_steps,
                strict=True)

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # dimshuffle back to (n_batch, n_time_steps, n_features))
        hid_out = hid_out.dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

        return hid_out
class QuasiNewton(StepSelectionBuiltIn, GradientDescent):
    """
    Quasi-Newton algorithm optimization.

    Parameters
    ----------
    update_function : {{'bfgs', 'dfp', 'psb', 'sr1'}}
        Update function. Defaults to ``bfgs``.

    h0_scale : float
        Default Hessian matrix is an identity matrix. The
        ``h0_scale`` parameter scales identity matrix.
        Defaults to ``1``.

    {GradientDescent.connection}

    {GradientDescent.error}

    {GradientDescent.show_epoch}

    {GradientDescent.shuffle_data}

    {GradientDescent.epoch_end_signal}

    {GradientDescent.train_end_signal}

    {GradientDescent.verbose}

    {GradientDescent.addons}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs'
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    update_function = ChoiceProperty(
        default='bfgs',
        choices={
            'bfgs': bfgs,
            'dfp': dfp,
            'psb': psb,
            'sr1': sr1,
        })
    h0_scale = NumberProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_variables(self):
        """
        Create shared variables that store inverse Hessian
        approximation, previous parameters and previous gradient.
        """
        super(QuasiNewton, self).init_variables()
        n_params = count_parameters(self.connection)
        self.variables.update(
            inv_hessian=theano.shared(
                name='algo:quasi-newton/matrix:inv-hessian',
                value=asfloat(self.h0_scale * np.eye(int(n_params))),
            ),
            prev_params=theano.shared(
                name='algo:quasi-newton/vector:prev-params',
                value=asfloat(np.zeros(n_params)),
            ),
            prev_full_gradient=theano.shared(
                name='algo:quasi-newton/vector:prev-full-gradient',
                value=asfloat(np.zeros(n_params)),
            ),
        )

    def init_train_updates(self):
        """
        Build list of updates for the network's parameters and for
        the variables that store state of the algorithm.
        """
        network_inputs = self.variables.network_inputs
        network_output = self.variables.network_output
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = parameter_values(self.connection)
        param_vector = T.concatenate([param.flatten() for param in params])

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        # During the first epoch there are no previous parameters and
        # gradients, so the initial inverse Hessian is used as it is.
        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1),
            inv_hessian,
            self.update_function(inv_hessian,
                                 param_vector - prev_params,
                                 full_gradient - prev_full_gradient))

        param_delta = -new_inv_hessian.dot(full_gradient)
        layers_and_parameters = list(iter_parameters(self.layers))

        def prediction(step):
            updated_params = param_vector + step * param_delta

            try:
                # This trick allow us to replace shared variables
                # with theano variables and get output from the network
                start_pos = 0
                for layer, attrname, param in layers_and_parameters:
                    end_pos = start_pos + param.size
                    updated_param_value = T.reshape(
                        updated_params[start_pos:end_pos],
                        param.shape)
                    setattr(layer, attrname, updated_param_value)
                    start_pos = end_pos

                output = self.connection.output(*network_inputs)

            finally:
                # Restore previous parameters. It has to happen even
                # when output construction fails, otherwise layers
                # will keep temporary variables instead of their
                # shared parameters.
                for layer, attrname, param in layers_and_parameters:
                    setattr(layer, attrname, param)

            return output

        def phi(step):
            return self.error(network_output, prediction(step))

        def derphi(step):
            error_func = self.error(network_output, prediction(step))
            return T.grad(error_func, wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates
class LVQ2(LVQ):
    """
    Learning Vector Quantization 2 (LVQ2) algorithm.
    Improved version for the LVQ algorithm.

    Parameters
    ----------
    epsilon : float
        Ratio between two closest subclasses that
        triggers double weight update. Defaults to ``0.1``.

    {LVQ.Parameters}

    Notes
    -----
    {LVQ.Notes}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> X = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [2, 2], [1, 2]])
    >>> y = np.array([0, 0, 0, 1, 1, 1])
    >>>
    >>> lvqnet = algorithms.LVQ2(n_inputs=2, n_classes=2)
    >>> lvqnet.train(X, y, epochs=100)
    >>> lvqnet.predict([[2, 1], [-1, -1]])
    array([1, 0])
    """
    epsilon = NumberProperty(default=0.1)

    def one_training_update(self, X_train, y_train):
        """
        Apply one update per training sample and return ratio
        of samples misclassified by the closest subclass.
        """
        weight = self.weight
        epsilon = self.epsilon
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(X_train, y_train):
            step = self.training_step
            output = euclid_distance(input_row, weight)
            winner_subclasses = n_argmin(output, n=2, axis=1)

            top1_subclass, top2_subclass = winner_subclasses
            top1_class = subclass_to_class[top1_subclass]
            top2_class = subclass_to_class[top2_subclass]

            top1_weight_update = input_row - weight[top1_subclass, :]
            is_correct_prediction = (top1_class == target).item(0)

            closest_dist, runner_up_dist = output[0, winner_subclasses]

            # Winner is wrong, runner-up is correct and their
            # distances to the sample are nearly the same
            double_update_condition_satisfied = (
                not is_correct_prediction and
                (top2_class == target) and
                closest_dist > ((1 - epsilon) * runner_up_dist) and
                runner_up_dist < ((1 + epsilon) * closest_dist))

            if double_update_condition_satisfied:
                # Rows of the weight matrix are subclasses, so the
                # runner-up prototype has to be selected with subclass
                # index, not with the class index.
                top2_weight_update = input_row - weight[top2_subclass, :]
                weight[top1_subclass, :] -= step * top1_weight_update
                weight[top2_subclass, :] += step * top2_weight_update

            elif is_correct_prediction:
                weight[top1_subclass, :] += step * top1_weight_update

            else:
                weight[top1_subclass, :] -= step * top1_weight_update

            n_correct_predictions += is_correct_prediction
            # ``training_step`` depends on the number of applied
            # updates, without the counter step would never be reduced
            self.n_updates += 1

        n_samples = len(X_train)
        return 1 - n_correct_predictions / n_samples
class GrowingNeuralGas(BaseNetwork):
    """
    Growing Neural Gas (GNG) algorithm.

    Current algorithm has two modifications that hasn't been mentioned
    in the paper, but they help to speed up training.

    - The ``n_start_nodes`` parameter provides possibility to increase
      number of nodes during initialization step. It's useful when
      algorithm takes a lot of time building up large amount of neurons.

    - The ``min_distance_for_update`` parameter allows to speed up
      training when some data samples has neurons very close to them. The
      ``min_distance_for_update`` parameter controls threshold for the
      minimum distance for which we will want to update weights.

    Parameters
    ----------
    n_inputs : int
        Number of features in each sample.

    n_start_nodes : int
        Number of nodes that algorithm generates from the data during
        the initialization step. Defaults to ``2``.

    step : float
        Step (learning rate) for the neuron winner. Defaults to ``0.2``.

    neighbour_step : float
        Step (learning rate) for the neurons that connected via edges
        with neuron winner. This value typically has to be smaller than
        ``step`` value. Defaults to ``0.05``.

    max_edge_age : int
        It means that if edge won't be updated for ``max_edge_age``
        iterations than it would be removed. The larger the value the
        more updates we allow to do before removing edge.
        Defaults to ``100``.

    n_iter_before_neuron_added : int
        Each ``n_iter_before_neuron_added`` weight update algorithm add
        new neuron. The smaller the value the more frequently algorithm
        adds new neurons to the network. Defaults to ``1000``.

    error_decay_rate : float
        This error decay rate would be applied to every neuron in the
        graph after each training iteration. It ensures that old errors
        will be reduced over time. Defaults to ``0.995``.

    after_split_error_decay_rate : float
        This decay rate reduces error for neurons with largest errors
        after algorithm added new neuron. This value typically lower
        than ``error_decay_rate``. Defaults to ``0.5``.

    max_nodes : int
        Maximum number of nodes that would be generated during the
        training. This parameter won't stop training when maximum number
        of nodes will be exceeded. Defaults to ``1000``.

    min_distance_for_update : float
        Parameter controls for which neurons we want to apply updates.
        In case if euclidean distance between data sample and closest
        neurons will be less than the ``min_distance_for_update`` value
        than update would be skipped for this data sample. Setting value
        to zero will disable effect provided by this parameter.
        Defaults to ``0``.

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.signals}

    {Verbose.verbose}

    Methods
    -------
    train(X_train, epochs=100)
        Network learns topological structure of the data. Learned
        structure will be stored in the ``graph`` attribute.

    {BaseSkeleton.fit}

    initialize_nodes(data)
        Network initializes nodes randomly sampling ``n_start_nodes``
        from the data. It would be applied automatically before
        the training in case if graph is empty.

        Note: Node re-initialization can reset network.

    Notes
    -----
    - Unlike other algorithms this network doesn't make predictions.
      Instead, it learns topological structure of the data in form of
      the graph. After that training, structure of the network can be
      extracted from the ``graph`` attribute.

    - In order to speed up training, it might be useful to increase
      the ``n_start_nodes`` parameter.

    - During the training it happens that nodes learn topological
      structure of one part of the data better than the other, mostly
      because of the different data sample density in different places.
      Increasing the ``min_distance_for_update`` can speed up training
      ignoring updates for the neurons that very close to the data
      sample. (below specified ``min_distance_for_update`` value).
      Training can be stopped in case if none of the neurons has been
      updated during the training epoch.

    Attributes
    ----------
    graph : NeuralGasGraph instance
        This attribute stores all neurons and connections between them
        in the form of undirected graph.

    {BaseNetwork.Attributes}

    Examples
    --------
    >>> from neupy import algorithms
    >>> from sklearn.datasets import make_blobs
    >>>
    >>> data, _ = make_blobs(
    ...     n_samples=1000,
    ...     n_features=2,
    ...     centers=2,
    ...     cluster_std=0.4,
    ... )
    >>>
    >>> neural_gas = algorithms.GrowingNeuralGas(
    ...     n_inputs=2,
    ...     shuffle_data=True,
    ...     verbose=True,
    ...     max_edge_age=10,
    ...     n_iter_before_neuron_added=50,
    ...     max_nodes=100,
    ... )
    >>> neural_gas.train(data, epochs=100)
    >>> neural_gas.graph.n_nodes
    100
    >>> len(neural_gas.graph.edges)
    175
    >>> edges = list(neural_gas.graph.edges.keys())
    >>> neuron_1, neuron_2 = edges[0]
    >>>
    >>> neuron_1.weight
    array([[-6.77166299, 2.4121606 ]])
    >>> neuron_2.weight
    array([[-6.829309 , 2.27839633]])

    References
    ----------
    [1] A Growing Neural Gas Network Learns Topologies, Bernd Fritzke
    """
    n_inputs = IntProperty(minval=1, required=True)
    n_start_nodes = IntProperty(minval=2, default=2)

    step = NumberProperty(default=0.2, minval=0)
    neighbour_step = NumberProperty(default=0.05, minval=0)
    max_edge_age = IntProperty(default=100, minval=1)
    max_nodes = IntProperty(default=1000, minval=1)

    n_iter_before_neuron_added = IntProperty(default=1000, minval=1)
    after_split_error_decay_rate = ProperFractionProperty(default=0.5)
    error_decay_rate = ProperFractionProperty(default=0.995)
    min_distance_for_update = NumberProperty(default=0.0, minval=0)

    def __init__(self, *args, **kwargs):
        super(GrowingNeuralGas, self).__init__(*args, **kwargs)
        self.n_updates = 0
        self.graph = NeuralGasGraph()

    def format_input_data(self, X):
        """
        Convert input into the two-dimensional array and validate
        number of features.

        Raises
        ------
        ValueError
            When formatted data is not two-dimensional or number
            of features is not equal to ``n_inputs``.
        """
        is_feature1d = self.n_inputs == 1
        X = format_data(X, is_feature1d)

        if X.ndim != 2:
            raise ValueError("Cannot make prediction, because input "
                             "data has more than 2 dimensions")

        n_samples, n_features = X.shape

        if n_features != self.n_inputs:
            raise ValueError("Input data expected to have {} features, "
                             "but got {}".format(self.n_inputs, n_features))

        return X

    def initialize_nodes(self, data):
        """
        Replace graph with a new one that has ``n_start_nodes``
        nodes sampled from the data.
        """
        self.graph = NeuralGasGraph()

        for sample in sample_data_point(data, n=self.n_start_nodes):
            self.graph.add_node(NeuronNode(sample.reshape(1, -1)))

    def train(self, X_train, epochs=100):
        """
        Learn graph structure from the data. Nodes will be
        initialized from the data when graph is empty.
        """
        X_train = self.format_input_data(X_train)

        if not self.graph.nodes:
            self.initialize_nodes(X_train)

        return super(GrowingNeuralGas, self).train(
            X_train=X_train, y_train=None,
            X_test=None, y_test=None,
            epochs=epochs)

    def one_training_update(self, X_train, y_train=None):
        """
        Propagate every sample through the network, update the graph
        and return average distance between samples and neurons
        closest to them.

        Raises
        ------
        StopTraining
            When ``min_distance_for_update`` is not zero and none
            of the samples triggered an update.
        """
        graph = self.graph
        step = self.step
        neighbour_step = self.neighbour_step

        max_nodes = self.max_nodes
        max_edge_age = self.max_edge_age

        error_decay_rate = self.error_decay_rate
        after_split_error_decay_rate = self.after_split_error_decay_rate
        n_iter_before_neuron_added = self.n_iter_before_neuron_added

        # Distances below are regular (non-squared) euclidean norms,
        # so the threshold is used as it is, without squaring.
        min_distance_for_update = self.min_distance_for_update

        n_samples = len(X_train)
        total_error = 0
        did_update = False

        for sample in X_train:
            nodes = graph.nodes
            weights = np.concatenate([node.weight for node in nodes])

            distance = np.linalg.norm(weights - sample, axis=1)
            neuron_ids = np.argsort(distance)

            closest_neuron_id, second_closest_id = neuron_ids[:2]
            closest_neuron = nodes[closest_neuron_id]
            second_closest = nodes[second_closest_id]
            total_error += distance[closest_neuron_id]

            if distance[closest_neuron_id] < min_distance_for_update:
                continue

            self.n_updates += 1
            did_update = True

            closest_neuron.error += distance[closest_neuron_id]
            closest_neuron.weight += step * (sample - closest_neuron.weight)

            graph.add_edge(closest_neuron, second_closest)

            # Copy is required, because edges can be removed
            # from the graph inside of the loop
            for to_neuron in list(graph.edges_per_node[closest_neuron]):
                edge_id = graph.find_edge_id(to_neuron, closest_neuron)
                age = graph.edges[edge_id]

                if age >= max_edge_age:
                    graph.remove_edge(to_neuron, closest_neuron)

                    # Neighbour without edges is removed from the graph
                    if not graph.edges_per_node[to_neuron]:
                        graph.remove_node(to_neuron)

                else:
                    graph.edges[edge_id] += 1
                    to_neuron.weight += neighbour_step * (
                        sample - to_neuron.weight)

            time_to_add_new_neuron = (
                self.n_updates % n_iter_before_neuron_added == 0 and
                graph.n_nodes < max_nodes)

            if time_to_add_new_neuron:
                # New neuron is added between the neuron with largest
                # error and its neighbour with largest error
                nodes = graph.nodes
                largest_error_neuron = max(nodes, key=attrgetter('error'))
                neighbour_neuron = max(
                    graph.edges_per_node[largest_error_neuron],
                    key=attrgetter('error'))

                largest_error_neuron.error *= after_split_error_decay_rate
                neighbour_neuron.error *= after_split_error_decay_rate

                new_weight = 0.5 * (
                    largest_error_neuron.weight + neighbour_neuron.weight
                )
                new_neuron = NeuronNode(weight=new_weight.reshape(1, -1))

                graph.remove_edge(neighbour_neuron, largest_error_neuron)
                graph.add_node(new_neuron)
                graph.add_edge(largest_error_neuron, new_neuron)
                graph.add_edge(neighbour_neuron, new_neuron)

            for node in graph.nodes:
                node.error *= error_decay_rate

        if not did_update and min_distance_for_update != 0 and n_samples > 1:
            raise StopTraining(
                "Distance between every data sample and neurons, closest "
                "to them, is less then {}".format(min_distance_for_update))

        return total_error / n_samples

    def predict(self, *args, **kwargs):
        """
        Not supported, network only learns structure of the data.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError(
            "Growing Neural Gas algorithm doesn't make prediction. "
            "It only learns graph structure from the data "
            "(class has `graph` attribute). ")
class LVQ3(LVQ21):
    """
    Learning Vector Quantization 3 (LVQ3) algorithm.
    Improved version for the LVQ algorithm.

    Parameters
    ----------
    slowdown_rate : float
        Parameter scales learning step in order to decrease it
        in case if the two closest subclasses predict target
        value correctly. Defaults to ``0.4``.

    {LVQ21.Parameters}

    Notes
    -----
    {LVQ21.Notes}
    """
    slowdown_rate = NumberProperty(minval=0, default=0.4)

    def train_epoch(self, input_train, target_train):
        """
        Run one pass over the training data and update subclass
        weights in place.

        Parameters
        ----------
        input_train : array-like
            Training samples. Iterated row by row.

        target_train : array-like
            Expected class per training sample.

        Returns
        -------
        float
            Fraction of samples for which the closest subclass
            predicted a wrong class.
        """
        step = self.step
        weight = self.weight
        epsilon = self.epsilon
        slowdown_rate = self.slowdown_rate
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(input_train, target_train):
            output = euclid_distance(input_row, weight)
            winner_subclasses = n_argmin(output, n=2, axis=1)

            top1_subclass, top2_subclass = winner_subclasses
            top1_class = subclass_to_class[top1_subclass]
            top2_class = subclass_to_class[top2_subclass]

            # Both deltas are computed up front, before any weight gets
            # modified. Rows of the weight matrix are indexed by subclass
            # (not by class), and the second delta is needed by more
            # than one branch below.
            top1_weight_update = input_row - weight[top1_subclass, :]
            top2_weight_update = input_row - weight[top2_subclass, :]

            is_first_correct = (top1_class == target)
            is_second_correct = (top2_class == target)

            closest_dist, runner_up_dist = output[0, winner_subclasses]

            # Exactly one of the two closest subclasses is correct and
            # both distances are inside of the ``epsilon`` window.
            double_update_condition_satisfied = (
                (
                    (is_first_correct and not is_second_correct) or
                    (is_second_correct and not is_first_correct)
                ) and
                closest_dist > ((1 - epsilon) * runner_up_dist) and
                runner_up_dist < ((1 + epsilon) * closest_dist)
            )
            two_closest_correct_condition_satisfied = (
                is_first_correct and is_second_correct and
                closest_dist > ((1 - epsilon) * (1 + epsilon) *
                                runner_up_dist)
            )

            if double_update_condition_satisfied:
                # Pull correct subclass closer to the sample and push
                # the wrong one away from it.
                if is_first_correct:
                    weight[top2_subclass, :] -= step * top2_weight_update
                    weight[top1_subclass, :] += step * top1_weight_update
                else:
                    weight[top1_subclass, :] -= step * top1_weight_update
                    weight[top2_subclass, :] += step * top2_weight_update

            elif two_closest_correct_condition_satisfied:
                # Both subclasses are correct, so they move towards
                # the sample using reduced step.
                beta = step * slowdown_rate
                weight[top1_subclass, :] += beta * top1_weight_update
                weight[top2_subclass, :] += beta * top2_weight_update

            else:
                # NOTE(review): this fallback pushes the closest subclass
                # away even when it predicted the right class; kept as
                # is - confirm that this is the intended behaviour.
                weight[top1_subclass, :] -= step * top1_weight_update

            n_correct_predictions += is_first_correct

        n_samples = len(input_train)
        # Explicit float avoids integer division under Python 2.
        return 1 - n_correct_predictions / float(n_samples)
class BaseNetwork(BaseSkeleton):
    """
    Base class for Neural Network algorithms.

    Parameters
    ----------
    step : float
        Learning rate, defaults to ``0.1``.

    show_epoch : int or str
        This property controls how often the network will display
        information about training. There are two main syntaxes
        for this property.

        - You can define it as a positive integer number. It
          defines how often would you like to see summary output
          in terminal. For instance, number `100` mean that network
          shows summary at 100th, 200th, 300th ... epochs.

        - String defines number of times you want to see output in
          terminal. For instance, value ``'2 times'`` mean that
          the network will show output twice with approximately
          equal period of epochs and one additional output would
          be after the final epoch.

        Defaults to ``1``.

    shuffle_data : bool
        If it's ``True`` class shuffles all your training data before
        training your network, defaults to ``False``.

    epoch_end_signal : function
        Calls this function when train epoch finishes.

    train_end_signal : function
        Calls this function when train process finishes.

    {Verbose.Parameters}

    Attributes
    ----------
    errors : ErrorHistoryList
        Contains list of training errors. This object has the same
        properties as list and in addition there are three additional
        useful methods: `last`, `previous` and `normalized`.

    train_errors : ErrorHistoryList
        Alias to the ``errors`` attribute.

    validation_errors : ErrorHistoryList
        The same as `errors` attribute, but it contains only validation
        errors.

    last_epoch : int
        Value equals to the last trained epoch. After initialization
        it is equal to ``0``.
    """
    step = NumberProperty(default=0.1, minval=0)
    show_epoch = ShowEpochProperty(minval=1, default=1)
    shuffle_data = Property(default=False, expected_type=bool)
    epoch_end_signal = Property(expected_type=types.FunctionType)
    train_end_signal = Property(expected_type=types.FunctionType)

    def __init__(self, *args, **options):
        self.errors = self.train_errors = ErrorHistoryList()
        self.validation_errors = ErrorHistoryList()
        self.training = AttributeKeyDict()
        self.last_epoch = 0

        super(BaseNetwork, self).__init__(*args, **options)

        if self.verbose:
            show_network_options(self, highlight_options=options)

    def predict(self, input_data):
        """
        Return prediction results for the input data.

        Parameters
        ----------
        input_data : array-like

        Returns
        -------
        array-like
        """
        raise NotImplementedError

    def on_epoch_start_update(self, epoch):
        """
        Function would be trigger before run all training procedure
        related to the current epoch.

        Parameters
        ----------
        epoch : int
            Current epoch number.
        """
        self.last_epoch = epoch

    def train_epoch(self, input_train, target_train=None):
        """
        Train network for one epoch and return training error.
        Has to be implemented by the subclass.
        """
        raise NotImplementedError()

    def prediction_error(self, input_test, target_test):
        """
        Return error for the validation data.
        Has to be implemented by the subclass.
        """
        raise NotImplementedError()

    def _report_hessian_spectrum(self, get_eigen):
        """
        Print eigenvalue statistics of the current Hessian matrix
        and store eigenvalues in the ``<self.itr>.npy`` file.

        Parameters
        ----------
        get_eigen : callable
            Function that accepts matrix and returns its
            eigenvalues and eigenvectors.
        """
        hessian = self.variables.hessian.get_value()
        ev, _ = get_eigen(hessian)

        # Every message is a single string, so that output stays the
        # same under the Python 2 print statement and the Python 3
        # print function.
        print("positive EV " + " " + str(np.sum(ev > 0)))
        print("Just zero EV" + " " + str(np.sum(ev == 0)))
        # Exact zeros plus negative eigenvalues that are greater
        # than half of the smallest eigenvalue.
        print("Zero EV " + " " + str(
            np.sum(ev == 0) +
            np.sum((ev < 0) & (ev > (np.min(ev) / 2.0)))))
        print("Neg EV " + " " + str(np.sum(ev < 0)))
        print("Max EV " + " " + str(np.max(ev)))
        print("Min EV " + " " + str(np.min(ev)))

        np.save(str(self.itr) + '.npy', ev)

    def train(self, input_train, target_train=None, input_test=None,
              target_test=None, epochs=100, epsilon=None,
              summary='table'):
        """
        Method train neural network.

        Parameters
        ----------
        input_train : array-like

        target_train : array-like or None

        input_test : array-like or None

        target_test : array-like or None

        epochs : int
            Defaults to `100`.

        epsilon : float or None
            Defaults to ``None``.

        summary : {{'table', 'inline'}}
            Format of the training summary. Defaults to ``'table'``.

        Raises
        ------
        ValueError
            In case if number of epochs is not positive, ``epsilon``
            specified with less than 3 epochs or ``summary`` type
            is unknown.
        """
        logs = self.logs
        training = self.training = AttributeKeyDict()

        if epochs <= 0:
            raise ValueError("Number of epochs needs to be greater than 0.")

        if epsilon is not None and epochs <= 2:
            raise ValueError("Network should train at teast 3 epochs before "
                             "check the difference between errors")

        logging_info_about_the_data(self, input_train, input_test)
        logging_info_about_training(self, epochs, epsilon)
        logs.newline()

        if summary == 'table':
            summary = SummaryTable(
                table_builder=table.TableBuilder(
                    table.Column(name="Epoch #"),
                    table.NumberColumn(name="Train err", places=4),
                    table.NumberColumn(name="Valid err", places=4),
                    table.TimeColumn(name="Time", width=10),
                    stdout=logs.write
                ),
                network=self,
                delay_limit=1.,
                delay_history_length=10,
            )

        elif summary == 'inline':
            summary = InlineSummary(network=self)

        else:
            raise ValueError("`{}` is unknown summary type"
                             "".format(summary))

        iterepochs = create_training_epochs_iterator(self, epochs, epsilon)
        show_epoch = parse_show_epoch_property(self, epochs, epsilon)
        training.show_epoch = show_epoch

        # Storing attributes and methods in local variables we prevent
        # useless __getattr__ call a lot of times in each loop.
        # These variables speed up loop in case of huge amount of
        # iterations.
        training_errors = self.errors
        validation_errors = self.validation_errors
        shuffle_data = self.shuffle_data

        train_epoch = self.train_epoch
        epoch_end_signal = self.epoch_end_signal
        train_end_signal = self.train_end_signal
        on_epoch_start_update = self.on_epoch_start_update

        is_first_iteration = True
        can_compute_validation_error = (input_test is not None)
        last_epoch_shown = 0

        # Eigenvalue diagnostics are produced only for the network
        # whose string representation starts with ``Hessian``, so the
        # Theano function gets compiled only when it is going to be
        # used instead of on every ``train`` call.
        track_hessian_spectrum = (str(self).split('(')[0] == 'Hessian')
        get_eigen = None

        if track_hessian_spectrum:
            sym_matrix = tt.dmatrix("symMatrix")
            sym_eigenvalues, eigenvectors = tt.nlinalg.eig(sym_matrix)
            get_eigen = theano.function(
                [sym_matrix], [sym_eigenvalues, eigenvectors])

        with logs.disable_user_input():
            for epoch in iterepochs:
                validation_error = None
                epoch_start_time = time.time()
                on_epoch_start_update(epoch)

                if shuffle_data:
                    data = shuffle(*as_tuple(input_train, target_train))
                    input_train, target_train = data[:-1], data[-1]

                try:
                    train_error = train_epoch(input_train, target_train)

                    print(epoch)
                    if track_hessian_spectrum:
                        self._report_hessian_spectrum(get_eigen)

                    if can_compute_validation_error:
                        validation_error = self.prediction_error(
                            input_test, target_test)

                    training_errors.append(train_error)
                    validation_errors.append(validation_error)

                    epoch_finish_time = time.time()
                    training.epoch_time = epoch_finish_time - epoch_start_time

                    if epoch % training.show_epoch == 0 or is_first_iteration:
                        summary.show_last()
                        last_epoch_shown = epoch

                    if epoch_end_signal is not None:
                        epoch_end_signal(self)

                    is_first_iteration = False

                except StopTraining as err:
                    # TODO: This notification breaks table view in terminal.
                    # I need to show it in a different way.
                    logs.message("TRAIN", "Epoch #{} stopped. {}"
                                          "".format(epoch, str(err)))
                    break

            # Make sure that summary for the final epoch is always shown.
            if epoch != last_epoch_shown:
                summary.show_last()

            if train_end_signal is not None:
                train_end_signal(self)

            summary.finish()
            logs.newline()
class Adam(GradientDescent):
    """
    Adam algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.9``.

    beta2 : float
        Decay rate. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.999``.

    epsilon : float
        Value needs to be greater than ``0``. Defaults to ``1e-7``.

    step : float
        Learning rate, defaults to ``0.001``.

    {GradientDescent.batch_size}

    {BaseGradientDescent.addons}

    {ConstructibleNetwork.connection}

    {ConstructibleNetwork.error}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    References
    ----------
    [1] Diederik P. Kingma, Jimmy Lei Ba
        Adam: a Method for Stochastic Optimization.
        https://arxiv.org/pdf/1412.6980.pdf

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Adam((2, 3, 1))
    >>> mnet.train(x_train, y_train)
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_variables(self):
        """
        Initialize variables of the parent class and add counter
        of the training iterations that starts from ``1``.
        """
        super(Adam, self).init_variables()

        self.variables.iteration = tf.Variable(
            asfloat(1),
            name='iteration',
            dtype=tf.float32,
        )

    def init_train_updates(self):
        """
        Build Adam updates for every trainable parameter.

        Returns
        -------
        list
            List of ``(variable, new_value)`` tuples. It includes
            updates for the first and second moments, parameters
            and the iteration counter.
        """
        updates = []

        iteration = self.variables.iteration
        step = self.variables.step

        beta1 = self.beta1
        beta2 = self.beta2
        epsilon = self.epsilon

        # Since beta1 and beta2 are typically close to 1 and initial
        # values for first and second moments are close to zero the
        # initial estimates for these moments will be biased towards zero.
        # In order to solve this problem we need to correct this bias
        # by rescaling moments with large values during first updates
        # and vanishing this scaling factor more and more after every
        # update.
        #
        # Note that bias correction factor has been changed in order
        # to improve computational speed (suggestion from the original
        # paper).
        bias_correction = (
            tf.sqrt(1. - beta2 ** iteration) /
            (1. - beta1 ** iteration)
        )

        for layer, parameter, gradient in self.iter_params_and_grads():
            # Moments from the previous update, one pair per parameter.
            prev_first_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-first-moment".format(parameter.op.name),
                dtype=tf.float32,
            )
            prev_second_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-second-moment".format(parameter.op.name),
                dtype=tf.float32,
            )

            # Exponential moving averages of the gradient
            # and of the squared gradient.
            first_moment = (
                beta1 * prev_first_moment +
                (1. - beta1) * gradient
            )
            second_moment = (
                beta2 * prev_second_moment +
                (1. - beta2) * gradient ** 2
            )

            parameter_delta = bias_correction * first_moment / (
                tf.sqrt(second_moment) + epsilon)

            updates.extend([
                (prev_first_moment, first_moment),
                (prev_second_moment, second_moment),
                (parameter, parameter - step * parameter_delta),
            ])

        updates.append((iteration, iteration + 1))
        return updates
class QuasiNewton(WolfeLineSearchForStep, BaseGradientDescent):
    """
    Quasi-Newton algorithm. Every iteration quasi-Newton method
    approximates inverse Hessian matrix with iterative updates.
    It doesn't have ``step`` parameter. Instead, algorithm applies
    line search for the step parameter that satisfies strong Wolfe
    condition. Parameters that control wolfe search start with
    the ``wolfe_`` prefix.

    Parameters
    ----------
    update_function : ``bfgs``, ``dfp``, ``sr1``
        Update function for the iterative inverse hessian matrix
        approximation. Defaults to ``bfgs``.

        - ``bfgs`` -  It's rank 2 formula update. It can suffer from
          round-off error and inaccurate line searches.

        - ``dfp`` - DFP is a method very similar to BFGS. It's rank 2
          formula update. It can suffer from round-off error and
          inaccurate line searches.

        - ``sr1`` - Symmetric rank 1 (SR1). Generates update for the
          inverse hessian matrix adding symmetric rank-1 matrix. It's
          possible that there is no rank 1 updates for the matrix and in
          this case update won't be applied and original inverse hessian
          will be returned.

    h0_scale : float
        Default Hessian matrix is an identity matrix. The
        ``h0_scale`` parameter scales identity matrix.
        Defaults to ``1``.

    epsilon : float
        Controls numerical stability for the ``update_function``
        parameter. Defaults to ``1e-7``.

    {WolfeLineSearchForStep.Parameters}

    {BaseGradientDescent.connection}

    {BaseGradientDescent.error}

    {BaseGradientDescent.show_epoch}

    {BaseGradientDescent.shuffle_data}

    {BaseGradientDescent.epoch_end_signal}

    {BaseGradientDescent.train_end_signal}

    {BaseGradientDescent.verbose}

    {BaseGradientDescent.addons}

    Notes
    -----
    - Method requires all training data during propagation, which
      means it's not allowed to use mini-batches.

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs'
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    References
    ----------
    [1] Yang Ding, Enkeleida Lushi, Qingguo Li,
        Investigation of quasi-Newton methods for unconstrained
        optimization.
        http://people.math.sfu.ca/~elushi/project_833.pdf

    [2] Jorge Nocedal, Stephen J. Wright, Numerical Optimization.
        Chapter 6, Quasi-Newton Methods, p. 135-163
    """
    update_function = ChoiceProperty(
        default='bfgs',
        choices={
            'bfgs': bfgs,
            'dfp': dfp,
            'sr1': sr1,
        },
    )
    epsilon = NumberProperty(default=1e-7, minval=0)
    h0_scale = NumberProperty(default=1, minval=0)
    step = WithdrawProperty()

    def init_variables(self):
        """
        Extend variables of the parent class with the state that
        quasi-Newton updates carry between iterations: scaled
        identity matrix as an initial inverse hessian and zero
        vectors for the previous parameters and previous gradient.
        """
        super(QuasiNewton, self).init_variables()

        n_parameters = count_parameters(self.connection)
        flat_shape = [n_parameters]

        inv_hessian_variable = tf.Variable(
            asfloat(self.h0_scale) * tf.eye(n_parameters),
            name="quasi-newton/inv-hessian",
            dtype=tf.float32,
        )
        prev_params_variable = tf.Variable(
            tf.zeros(flat_shape),
            name="quasi-newton/prev-params",
            dtype=tf.float32,
        )
        prev_gradient_variable = tf.Variable(
            tf.zeros(flat_shape),
            name="quasi-newton/prev-full-gradient",
            dtype=tf.float32,
        )

        self.variables.update(
            inv_hessian=inv_hessian_variable,
            prev_params=prev_params_variable,
            prev_full_gradient=prev_gradient_variable,
        )

    def init_train_updates(self):
        """
        Build operations for a single quasi-Newton training step.

        Returns
        -------
        Updates for the network's parameters extended with the
        assign operations that store inverse hessian, parameters
        and gradient for the next iteration.
        """
        variables = self.variables
        inv_hessian = variables.inv_hessian
        prev_params = variables.prev_params
        prev_full_gradient = variables.prev_full_gradient

        params = parameter_values(self.connection)
        param_vector = make_single_vector(params)

        gradients = tf.gradients(variables.error_func, params)
        full_gradient = make_single_vector(gradients)

        # During the first epoch stored inverse hessian is used as it
        # is. Afterwards, it's replaced with a new approximation.
        is_first_epoch = tf.equal(variables.epoch, 1)
        approximated_inv_hessian = self.update_function(
            inv_H=inv_hessian,
            delta_w=param_vector - prev_params,
            delta_grad=full_gradient - prev_full_gradient,
            epsilon=self.epsilon,
        )
        new_inv_hessian = tf.where(
            is_first_epoch, inv_hessian, approximated_inv_hessian)

        # Search direction and step size found by the line search.
        param_delta = -dot(new_inv_hessian, full_gradient)
        step = self.find_optimal_step(param_vector, param_delta)

        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        # Tensorflow tries to execute operations in parallel and
        # order of the updates can be mixed. For example, previous
        # gradient can become equal to the current gradient value.
        # In order to avoid it, values listed below have to be
        # computed before state variables get overwritten.
        required_variables = [new_inv_hessian, param_vector, full_gradient]

        with tf.control_dependencies(required_variables):
            store_inv_hessian = inv_hessian.assign(new_inv_hessian)
            store_params = prev_params.assign(param_vector)
            store_gradient = prev_full_gradient.assign(full_gradient)

            updates.extend([
                store_inv_hessian,
                store_params,
                store_gradient,
            ])

        return updates