class Relu(ActivationLayer):
    """
    The layer with the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        Alpha parameter defines the decreasing rate
        for the negative values. If ``alpha`` is a non-zero
        value then the layer behaves like a leaky ReLu.
        Defaults to ``0``.

    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.XavierNormal(gain='relu'))

    def activation_function(self, input_value):
        alpha = asfloat(self.alpha)
        return T.nnet.relu(input_value, alpha)
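
# Illustrative sketch (not part of the library): the rectifier computed by
# ``T.nnet.relu(input_value, alpha)`` above is simple enough to check in
# plain NumPy. ``alpha=0`` gives the plain ReLu; a small positive ``alpha``
# gives the leaky variant.
import numpy as np

def relu_sketch(x, alpha=0.0):
    # max(x, 0) plus a scaled copy of the negative part
    return np.maximum(x, 0) + alpha * np.minimum(x, 0)

x = np.array([-2.0, -0.5, 0.0, 1.5])
print(relu_sketch(x))       # [0.   0.   0.   1.5]
print(relu_sketch(x, 0.1))  # [-0.2  -0.05  0.   1.5]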
class PRelu(ActivationLayer):
    """
    The layer with the parametrized ReLu activation function.

    Parameters
    ----------
    alpha_axes : int or tuple
        Axes that will not include a unique alpha parameter.
        A single integer value means the same as a tuple with
        one value. Defaults to ``1``.

    alpha : array-like, Theano shared variable, scalar or Initializer
        Alpha parameter per each non-shared axis for the ReLu.
        A scalar value means that each element in the tensor will be
        equal to the specified value. Default initialization methods
        can be found :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0.25)``.

    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    References
    ----------
    .. [1] https://arxiv.org/pdf/1502.01852v1.pdf
    """
    alpha_axes = AxesProperty(default=1)
    alpha = ParameterProperty(default=init.Constant(value=0.25))

    def __init__(self, *args, **options):
        super(PRelu, self).__init__(*args, **options)

        if 0 in self.alpha_axes:
            raise ValueError("Cannot specify alpha for 0-axis")

    def validate(self, input_shape):
        if max(self.alpha_axes) > len(input_shape):
            max_axis_index = len(input_shape) - 1
            raise ValueError("Cannot specify alpha for the axis #{}. "
                             "Maximum available axis is #{} "
                             "(0-based indices)."
                             "".format(max(self.alpha_axes), max_axis_index))

    def initialize(self):
        super(PRelu, self).initialize()
        alpha_shape = [self.output_shape[axis - 1]
                       for axis in self.alpha_axes]
        self.add_parameter(value=self.alpha, name='alpha',
                           shape=alpha_shape, trainable=True)

    def activation_function(self, input_value):
        alpha = dimshuffle(self.alpha, input_value.ndim, self.alpha_axes)
        return T.nnet.relu(input_value, alpha)
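
# Illustrative sketch (not part of the library): what ``dimshuffle`` plus
# ``T.nnet.relu`` achieve above, in plain NumPy. With ``alpha_axes=1`` the
# layer keeps one alpha per channel (axis 1) and shares it across all
# remaining axes.
import numpy as np

def prelu_sketch(x, alpha, alpha_axes=(1,)):
    # Reshape alpha so it broadcasts along every axis except alpha_axes
    shape = [1] * x.ndim
    for size, axis in zip(np.shape(alpha), alpha_axes):
        shape[axis] = size
    alpha = np.reshape(alpha, shape)
    return np.maximum(x, 0) + alpha * np.minimum(x, 0)

x = np.random.randn(8, 3, 32, 32)    # (batch, channels, rows, columns)
alpha = np.full(3, 0.25)             # one trainable alpha per channel
print(prelu_sketch(x, alpha).shape)  # (8, 3, 32, 32)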
class Relu(ActivationLayer):
    """
    The layer with the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        Alpha parameter defines the decreasing rate
        for the negative values. If ``alpha`` is a non-zero
        value then the layer behaves like a leaky ReLu.
        Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        can be found :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    Examples
    --------
    Feedforward Neural Networks (FNN)

    >>> from neupy.layers import *
    >>> network = Input(10) > Relu(20) > Relu(1)

    Convolutional Neural Networks (CNN)

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((32, 32, 3)),
    ...     Convolution((3, 3, 16)) > Relu(),
    ...     Convolution((3, 3, 32)) > Relu(),
    ...     Reshape(),
    ...     Softmax(10),
    ... )
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        if self.alpha == 0:
            return tf.nn.relu(input_value)
        return tf.nn.leaky_relu(input_value, asfloat(self.alpha))
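
# Illustrative sketch (not part of the library): the ``HeNormal(gain=2)``
# default corresponds to He initialization for ReLu units, where weights are
# drawn from a zero-mean normal distribution whose variance scales as
# gain / fan_in (the exact gain handling in neupy.init is an assumption
# here).
import numpy as np

def he_normal_sketch(fan_in, fan_out, gain=2.0, seed=0):
    std = np.sqrt(gain / fan_in)
    return np.random.default_rng(seed).normal(0.0, std, (fan_in, fan_out))

w = he_normal_sketch(fan_in=10, fan_out=20)
print(w.shape, round(float(w.std()), 2))  # (10, 20), std near sqrt(0.2)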
class BaseAssociative(UnsupervisedLearningMixin, BaseNetwork):
    """
    Base class for associative learning.

    Parameters
    ----------
    n_inputs : int
        Number of input units.

    n_outputs : int
        Number of output units.

    weight : array-like, Initializer
        Neural network weights. Values defined manually should
        have shape ``(n_inputs, n_outputs)``.
        Defaults to :class:`Normal() <neupy.core.init.Normal>`.

    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    train(input_train, epochs=100):
        Train neural network.

    {BaseSkeleton.fit}
    """
    n_inputs = IntProperty(minval=1, required=True)
    n_outputs = IntProperty(minval=1, required=True)
    weight = ParameterProperty(default=Normal())

    def __init__(self, **options):
        super(BaseAssociative, self).__init__(**options)
        self.init_layers()

    def init_layers(self):
        valid_weight_shape = (self.n_inputs, self.n_outputs)

        if isinstance(self.weight, Initializer):
            self.weight = self.weight.sample(valid_weight_shape)

        if self.weight.shape != valid_weight_shape:
            raise ValueError("Weight matrix has invalid shape. Got {}, "
                             "expected {}".format(self.weight.shape,
                                                  valid_weight_shape))

        self.weight = self.weight.astype(float)

    def train(self, input_train, epochs=100):
        return super(BaseAssociative, self).train(input_train,
                                                  epochs=epochs,
                                                  epsilon=None)
class Relu(ActivationLayer):
    """
    The layer with the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        Alpha parameter defines the decreasing rate
        for the negative values. If ``alpha`` is a non-zero
        value then the layer behaves like a leaky ReLu.
        Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        can be found :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        if self.alpha == 0:
            return tf.nn.relu(input_value)
        return tf.nn.leaky_relu(input_value, asfloat(self.alpha))
class Oja(BaseNetwork):
    """
    Oja is an unsupervised technique used for
    dimensionality reduction tasks.

    Notes
    -----
    - In practice, use a very small value for the step.
      For instance, ``1e-7`` can be a good choice.

    - Normalize the input data before using the Oja algorithm.
      Input data shouldn't contain large values.

    - Set up smaller values for the weight if the error for the first
      few iterations is large compared to the scale of the input
      values. For instance, if your input data has values between
      ``0`` and ``1``, an error value of ``100`` is large.

    - During training, the network reports the mean absolute
      error (MAE).

    Parameters
    ----------
    minimized_data_size : int
        Expected number of features after minimization,
        defaults to ``1``.

    weight : array-like or ``None``
        Defines network's weights.
        Defaults to :class:`XavierNormal() <neupy.init.XavierNormal>`.

    {BaseNetwork.Parameters}

    Methods
    -------
    reconstruct(X)
        Reconstruct original dataset from the minimized input.

    train(X, epochs=100)
        Trains the network on the data ``X``. The network trains until
        the maximum number of ``epochs`` is reached.

    predict(X)
        Returns hidden representation of the input data ``X``.
        Basically, it applies dimensionality reduction.

    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> data = np.array([[2, 2], [1, 1], [4, 4], [5, 5]])
    >>>
    >>> ojanet = algorithms.Oja(
    ...     minimized_data_size=1,
    ...     step=0.01,
    ...     verbose=False
    ... )
    >>>
    >>> ojanet.train(data, epochs=100)
    >>> minimized = ojanet.predict(data)
    >>> minimized
    array([[-2.82843122],
           [-1.41421561],
           [-5.65686243],
           [-7.07107804]])
    >>> ojanet.reconstruct(minimized)
    array([[ 2.00000046,  2.00000046],
           [ 1.00000023,  1.00000023],
           [ 4.00000093,  4.00000093],
           [ 5.00000116,  5.00000116]])
    """
    minimized_data_size = IntProperty(minval=1)
    weight = ParameterProperty(default=init.XavierNormal())

    def one_training_update(self, X, y_train):
        weight = self.weight

        minimized = np.dot(X, weight)
        reconstruct = np.dot(minimized, weight.T)
        error = X - reconstruct

        weight += self.step * np.dot(error.T, minimized)
        mae = np.sum(np.abs(error)) / X.size

        # Clean objects from the memory
        del minimized
        del reconstruct
        del error

        return mae

    def train(self, X, epochs=100):
        X = format_data(X)
        n_input_features = X.shape[1]

        if isinstance(self.weight, init.Initializer):
            weight_shape = (n_input_features, self.minimized_data_size)
            self.weight = self.weight.sample(
                weight_shape, return_array=True)

        if n_input_features != self.weight.shape[0]:
            raise ValueError("Invalid number of features. Expected {}, "
                             "got {}".format(self.weight.shape[0],
                                             n_input_features))

        super(Oja, self).train(X, epochs=epochs)

    def reconstruct(self, X):
        if not isinstance(self.weight, np.ndarray):
            raise NotTrained("Network hasn't been trained yet")

        X = format_data(X)

        if X.shape[1] != self.minimized_data_size:
            raise ValueError("Invalid input data feature space, expected "
                             "{}, got {}.".format(
                                 X.shape[1], self.minimized_data_size))

        return np.dot(X, self.weight.T)

    def predict(self, X):
        if not isinstance(self.weight, np.ndarray):
            raise NotTrained("Network hasn't been trained yet")

        X = format_data(X)
        return np.dot(X, self.weight)
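
# Illustrative sketch (not part of the library): the whole learning rule
# from ``one_training_update`` above fits in a few lines of plain NumPy.
# With the docstring's data the weight vector converges toward the
# principal component, so the reconstruction error shrinks toward zero.
import numpy as np

X = np.array([[2., 2.], [1., 1.], [4., 4.], [5., 5.]])
W = np.random.default_rng(0).normal(scale=0.1, size=(2, 1))
step = 0.01

for epoch in range(100):
    minimized = X.dot(W)                # project onto the smaller space
    error = X - minimized.dot(W.T)      # reconstruction error
    W += step * error.T.dot(minimized)  # Oja's update
    mae = np.abs(error).sum() / X.size

print(round(float(mae), 6))  # close to 0 after training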
class BaseAssociative(BaseNetwork):
    """
    Base class for associative learning.

    Parameters
    ----------
    n_inputs : int
        Number of features (columns) in the input data.

    n_outputs : int
        Number of outputs in the network.

    weight : array-like, Initializer
        Neural network weights. Values defined manually should
        have shape ``(n_inputs, n_outputs)``.
        Defaults to :class:`Normal() <neupy.init.Normal>`.

    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    train(input_train, summary='table', epochs=100)
        Train neural network.

    {BaseSkeleton.fit}
    """
    n_inputs = IntProperty(minval=1, required=True)
    n_outputs = IntProperty(minval=1, required=True)
    weight = ParameterProperty(default=init.Normal())

    def __init__(self, **options):
        super(BaseAssociative, self).__init__(**options)
        self.init_layers()

    def init_layers(self):
        valid_weight_shape = (self.n_inputs, self.n_outputs)

        if isinstance(self.weight, init.Initializer):
            self.weight = self.weight.sample(
                valid_weight_shape, return_array=True)

        if self.weight.shape != valid_weight_shape:
            raise ValueError(
                "Weight matrix has invalid shape. Got {}, expected {}"
                "".format(self.weight.shape, valid_weight_shape))

        self.weight = self.weight.astype(float)

    def format_input_data(self, input_data):
        is_feature1d = self.n_inputs == 1
        input_data = format_data(input_data, is_feature1d)

        if input_data.ndim != 2:
            raise ValueError("Cannot make prediction, because input "
                             "data has more than 2 dimensions")

        n_samples, n_features = input_data.shape

        if n_features != self.n_inputs:
            raise ValueError("Input data expected to have {} features, "
                             "but got {}".format(self.n_inputs, n_features))

        return input_data

    def train(self, input_train, summary='table', epochs=100):
        input_train = self.format_input_data(input_train)
        return super(BaseAssociative, self).train(
            input_train=input_train, target_train=None,
            input_test=None, target_test=None,
            epochs=epochs, epsilon=None,
            summary=summary)
class Oja(UnsupervisedLearningMixin, BaseNetwork):
    """
    Oja is an unsupervised algorithm that minimizes the input
    data feature space.

    Notes
    -----
    * In practice, use a very small value for the step. \
    For example, ``1e-7``.
    * Normalize the input data before using the Oja algorithm. \
    Input data shouldn't contain large values.
    * Set up smaller values for the weight if the error for the first \
    few iterations is large compared to the scale of the input values. \
    For example, if your input data has values between 0 and 1, an \
    error value of 100 is large.

    Parameters
    ----------
    minimized_data_size : int
        Expected number of features after minimization,
        defaults to ``1``.

    weight : array-like or ``None``
        Defines network's weights.
        Defaults to
        :class:`XavierNormal() <neupy.core.init.XavierNormal>`.

    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    reconstruct(input_data):
        Reconstruct your minimized data.

    {BaseSkeleton.predict}
    {UnsupervisedLearningMixin.train}
    {BaseSkeleton.fit}

    Raises
    ------
    ValueError
        * Reconstruction attempted without training.
        * Invalid number of input data features for the ``train`` and \
        ``reconstruct`` methods.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> data = np.array([[2, 2], [1, 1], [4, 4], [5, 5]])
    >>>
    >>> ojanet = algorithms.Oja(
    ...     minimized_data_size=1,
    ...     step=0.01,
    ...     verbose=False
    ... )
    >>>
    >>> ojanet.train(data, epsilon=1e-5)
    >>> minimized = ojanet.predict(data)
    >>> minimized
    array([[-2.82843122],
           [-1.41421561],
           [-5.65686243],
           [-7.07107804]])
    >>> ojanet.reconstruct(minimized)
    array([[ 2.00000046,  2.00000046],
           [ 1.00000023,  1.00000023],
           [ 4.00000093,  4.00000093],
           [ 5.00000116,  5.00000116]])
    """
    minimized_data_size = IntProperty(minval=1)
    weight = ParameterProperty(default=init.XavierNormal())

    def init_properties(self):
        del self.shuffle_data
        super(Oja, self).init_properties()

    def train_epoch(self, input_data, target_train):
        weight = self.weight

        minimized = np.dot(input_data, weight)
        reconstruct = np.dot(minimized, weight.T)
        error = input_data - reconstruct

        weight += self.step * np.dot(error.T, minimized)
        mae = np.sum(np.abs(error)) / input_data.size

        # Clear memory
        del minimized
        del reconstruct
        del error

        return mae

    def train(self, input_data, epsilon=1e-2, epochs=100):
        input_data = format_data(input_data)
        n_input_features = input_data.shape[1]

        if isinstance(self.weight, init.Initializer):
            weight_shape = (n_input_features, self.minimized_data_size)
            self.weight = self.weight.sample(weight_shape)

        if n_input_features != self.weight.shape[0]:
            raise ValueError(
                "Invalid number of features. Expected {}, got {}".format(
                    self.weight.shape[0], n_input_features))

        super(Oja, self).train(input_data, epsilon=epsilon, epochs=epochs)

    def reconstruct(self, input_data):
        if not isinstance(self.weight, np.ndarray):
            raise NotTrainedException("Train network before using the "
                                      "reconstruct method.")

        input_data = format_data(input_data)

        if input_data.shape[1] != self.minimized_data_size:
            raise ValueError(
                "Invalid input data feature space, expected "
                "{}, got {}.".format(input_data.shape[1],
                                     self.minimized_data_size))

        return np.dot(input_data, self.weight.T)

    def predict(self, input_data):
        if not isinstance(self.weight, np.ndarray):
            raise NotTrainedException("Train network before using the "
                                      "prediction method.")

        input_data = format_data(input_data)
        return np.dot(input_data, self.weight)
class RBM(BaseAlgorithm, BaseNetwork, MinibatchTrainingMixin,
          DumpableObject):
    """
    Boolean/Bernoulli Restricted Boltzmann Machine (RBM).
    Algorithm assumes that inputs are either binary
    values or values between 0 and 1.

    Parameters
    ----------
    n_visible : int
        Number of visible units. Number of features (columns)
        in the input data.

    n_hidden : int
        Number of hidden units. The larger the number, the more
        information the network can capture from the data, but it
        also means that the network is more likely to overfit.

    batch_size : int
        Size of the mini-batch. Defaults to ``10``.

    weight : array-like, Tensorflow variable, Initializer or scalar
        Default initialization methods can be found
        :ref:`here <init-methods>`.
        Defaults to :class:`Normal <neupy.init.Normal>`.

    hidden_bias : array-like, Tensorflow variable, Initializer or scalar
        Default initialization methods can be found
        :ref:`here <init-methods>`.
        Defaults to :class:`Constant(value=0) <neupy.init.Constant>`.

    visible_bias : array-like, Tensorflow variable, Initializer or scalar
        Default initialization methods can be found
        :ref:`here <init-methods>`.
        Defaults to :class:`Constant(value=0) <neupy.init.Constant>`.

    {BaseNetwork.Parameters}

    Methods
    -------
    train(input_train, epochs=100)
        Trains network.

    {BaseSkeleton.fit}

    visible_to_hidden(visible_input)
        Propagates data through the network and returns output
        from the hidden layer.

    hidden_to_visible(hidden_input)
        Propagates output from the hidden layer backward
        to the visible.

    gibbs_sampling(visible_input, n_iter=1)
        Makes Gibbs sampling ``n`` times using visible input.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> data = np.array([
    ...     [1, 0, 1, 0],
    ...     [1, 0, 1, 0],
    ...     [1, 0, 0, 0],  # incomplete sample
    ...     [1, 0, 1, 0],
    ...
    ...     [0, 1, 0, 1],
    ...     [0, 0, 0, 1],  # incomplete sample
    ...     [0, 1, 0, 1],
    ...     [0, 1, 0, 1],
    ...     [0, 1, 0, 1],
    ...     [0, 1, 0, 1],
    ... ])
    >>>
    >>> rbm = algorithms.RBM(n_visible=4, n_hidden=1)
    >>> rbm.train(data, epochs=100)
    >>>
    >>> hidden_states = rbm.visible_to_hidden(data)
    >>> hidden_states.round(2)
    array([[ 0.99],
           [ 0.99],
           [ 0.95],
           [ 0.99],
           [ 0.  ],
           [ 0.01],
           [ 0.  ],
           [ 0.  ],
           [ 0.  ],
           [ 0.  ]])

    References
    ----------
    [1] G. Hinton, A Practical Guide to Training Restricted
        Boltzmann Machines, 2010.
        http://www.cs.toronto.edu/~hinton/absps/guideTR.pdf
    """
    n_visible = IntProperty(minval=1)
    n_hidden = IntProperty(minval=1)
    batch_size = IntProperty(minval=1, default=10)

    weight = ParameterProperty(default=init.Normal())
    hidden_bias = ParameterProperty(default=init.Constant(value=0))
    visible_bias = ParameterProperty(default=init.Constant(value=0))

    def __init__(self, n_visible, n_hidden, **options):
        options.update({'n_visible': n_visible, 'n_hidden': n_hidden})
        super(RBM, self).__init__(**options)

    def init_input_output_variables(self):
        with tf.variable_scope('rbm'):
            self.weight = create_shared_parameter(
                value=self.weight,
                name='weight',
                shape=(self.n_visible, self.n_hidden))
            self.hidden_bias = create_shared_parameter(
                value=self.hidden_bias,
                name='hidden-bias',
                shape=(self.n_hidden,),
            )
            self.visible_bias = create_shared_parameter(
                value=self.visible_bias,
                name='visible-bias',
                shape=(self.n_visible,),
            )

            self.variables.update(
                network_input=tf.placeholder(
                    tf.float32,
                    (None, self.n_visible),
                    name="network-input",
                ),
                network_hidden_input=tf.placeholder(
                    tf.float32,
                    (None, self.n_hidden),
                    name="network-hidden-input",
                ),
            )

    def init_variables(self):
        with tf.variable_scope('rbm'):
            self.variables.update(
                h_samples=tf.Variable(
                    tf.zeros([self.batch_size, self.n_hidden]),
                    name="hidden-samples",
                    dtype=tf.float32,
                ),
            )

    def init_methods(self):
        def free_energy(visible_sample):
            with tf.name_scope('free-energy'):
                wx = tf.matmul(visible_sample, self.weight)
                wx_b = wx + self.hidden_bias

                visible_bias_term = dot(visible_sample, self.visible_bias)

                # We can get infinity when wx_b is a relatively large
                # number (maybe 100). Taking the exponent makes it even
                # larger and with float32 it can overflow to infinity.
                # But because the number is so large, the +1 before
                # taking the logarithm doesn't change the outcome, so
                # we can use the value as it is.
                hidden_terms = tf.where(
                    # exp(30) is such a big number that +1 won't
                    # make any difference in the outcome.
                    tf.greater(wx_b, 30),
                    wx_b,
                    tf.log1p(tf.exp(wx_b)),
                )
                hidden_term = tf.reduce_sum(hidden_terms, axis=1)

                return -(visible_bias_term + hidden_term)

        def visible_to_hidden(visible_sample):
            with tf.name_scope('visible-to-hidden'):
                wx = tf.matmul(visible_sample, self.weight)
                wx_b = wx + self.hidden_bias
                return tf.nn.sigmoid(wx_b)

        def hidden_to_visible(hidden_sample):
            with tf.name_scope('hidden-to-visible'):
                wx = tf.matmul(hidden_sample, self.weight, transpose_b=True)
                wx_b = wx + self.visible_bias
                return tf.nn.sigmoid(wx_b)

        def sample_hidden_from_visible(visible_sample):
            with tf.name_scope('sample-hidden-to-visible'):
                hidden_prob = visible_to_hidden(visible_sample)
                hidden_sample = random_binomial(hidden_prob)
                return hidden_sample

        def sample_visible_from_hidden(hidden_sample):
            with tf.name_scope('sample-visible-to-hidden'):
                visible_prob = hidden_to_visible(hidden_sample)
                visible_sample = random_binomial(visible_prob)
                return visible_sample

        network_input = self.variables.network_input
        network_hidden_input = self.variables.network_hidden_input
        input_shape = tf.shape(network_input)
        n_samples = input_shape[0]

        weight = self.weight
        h_bias = self.hidden_bias
        v_bias = self.visible_bias
        h_samples = self.variables.h_samples
        step = asfloat(self.step)

        with tf.name_scope('positive-values'):
            # We have to use `cond` instead of `where`, because
            # different if-else cases might have different shapes
            # and it triggers exception in tensorflow.
            v_pos = tf.cond(
                tf.equal(n_samples, self.batch_size),
                lambda: network_input,
                lambda: random_sample(network_input, self.batch_size))
            h_pos = visible_to_hidden(v_pos)

        with tf.name_scope('negative-values'):
            v_neg = sample_visible_from_hidden(h_samples)
            h_neg = visible_to_hidden(v_neg)

        with tf.name_scope('weight-update'):
            weight_update = (
                tf.matmul(v_pos, h_pos, transpose_a=True) -
                tf.matmul(v_neg, h_neg, transpose_a=True)
            ) / asfloat(n_samples)

        with tf.name_scope('hidden-bias-update'):
            h_bias_update = tf.reduce_mean(h_pos - h_neg, axis=0)

        with tf.name_scope('visible-bias-update'):
            v_bias_update = tf.reduce_mean(v_pos - v_neg, axis=0)

        with tf.name_scope('flipped-input-features'):
            # Each row will have a random feature marked with number 1.
            # Other values will be equal to 0.
            possible_feature_corruptions = tf.eye(self.n_visible)
            corrupted_features = random_sample(
                possible_feature_corruptions, n_samples)

            rounded_input = tf.round(network_input)
            # If we scale input values from the [0, 1] range to [-1, 1]
            # then it will be easier to flip feature values with simple
            # multiplication.
            scaled_rounded_input = 2 * rounded_input - 1
            scaled_flipped_rounded_input = (
                # for corrupted_features we convert 0 to 1 and 1 to -1;
                # in this way after multiplication we will flip all
                # signs where -1 in the transformed corrupted_features
                (-2 * corrupted_features + 1) * scaled_rounded_input
            )
            # Scale it back to the [0, 1] range
            flipped_rounded_input = (scaled_flipped_rounded_input + 1) / 2

        with tf.name_scope('pseudo-likelihood-loss'):
            # Stochastic pseudo-likelihood
            error = tf.reduce_mean(
                self.n_visible * tf.log_sigmoid(
                    free_energy(flipped_rounded_input) -
                    free_energy(rounded_input)
                )
            )

        with tf.name_scope('gibbs-sampling'):
            gibbs_sampling = sample_visible_from_hidden(
                sample_hidden_from_visible(network_input))

        initialize_uninitialized_variables()
        self.methods.update(
            train_epoch=function(
                [network_input],
                error,
                name='rbm/train-epoch',
                updates=[
                    (weight, weight + step * weight_update),
                    (h_bias, h_bias + step * h_bias_update),
                    (v_bias, v_bias + step * v_bias_update),
                    (h_samples, random_binomial(p=h_neg)),
                ]),
            prediction_error=function(
                [network_input],
                error,
                name='rbm/prediction-error',
            ),
            diff1=function(
                [network_input],
                free_energy(flipped_rounded_input),
                name='rbm/diff1-error',
            ),
            diff2=function(
                [network_input],
                free_energy(rounded_input),
                name='rbm/diff2-error',
            ),
            visible_to_hidden=function(
                [network_input],
                visible_to_hidden(network_input),
                name='rbm/visible-to-hidden',
            ),
            hidden_to_visible=function(
                [network_hidden_input],
                hidden_to_visible(network_hidden_input),
                name='rbm/hidden-to-visible',
            ),
            gibbs_sampling=function(
                [network_input],
                gibbs_sampling,
                name='rbm/gibbs-sampling',
            ),
        )

    def train(self, input_train, input_test=None, epochs=100,
              summary='table'):
        """
        Train RBM.

        Parameters
        ----------
        input_train : 1D or 2D array-like

        input_test : 1D or 2D array-like or None
            Defaults to ``None``.

        epochs : int
            Number of training epochs. Defaults to ``100``.

        summary : {'table', 'inline'}
            Training summary type. Defaults to ``'table'``.
        """
        return super(RBM, self).train(
            input_train=input_train, target_train=None,
            input_test=input_test, target_test=None,
            epochs=epochs, epsilon=None, summary=summary)

    def train_epoch(self, input_train, target_train=None):
        """
        Train one epoch.

        Parameters
        ----------
        input_train : array-like (n_samples, n_features)

        Returns
        -------
        float
        """
        errors = self.apply_batches(
            function=self.methods.train_epoch,
            input_data=input_train,
            description='Training batches',
            show_error_output=True,
        )

        n_samples = len(input_train)
        return average_batch_errors(errors, n_samples, self.batch_size)

    def visible_to_hidden(self, visible_input):
        """
        Propagates data through the network and returns
        output from the hidden layer.

        Parameters
        ----------
        visible_input : array-like (n_samples, n_visible_features)

        Returns
        -------
        array-like
        """
        is_input_feature1d = (self.n_visible == 1)
        visible_input = format_data(visible_input, is_input_feature1d)

        outputs = self.apply_batches(
            function=self.methods.visible_to_hidden,
            input_data=visible_input,
            description='Hidden from visible batches',
            show_progressbar=True,
            show_error_output=False,
            scalar_output=False,
        )
        return np.concatenate(outputs, axis=0)

    def hidden_to_visible(self, hidden_input):
        """
        Propagates output from the hidden layer
        backward to the visible.

        Parameters
        ----------
        hidden_input : array-like (n_samples, n_hidden_features)

        Returns
        -------
        array-like
        """
        is_input_feature1d = (self.n_hidden == 1)
        hidden_input = format_data(hidden_input, is_input_feature1d)

        outputs = self.apply_batches(
            function=self.methods.hidden_to_visible,
            input_data=hidden_input,
            description='Visible from hidden batches',
            show_progressbar=True,
            show_error_output=False,
            scalar_output=False,
        )
        return np.concatenate(outputs, axis=0)

    def prediction_error(self, input_data, target_data=None):
        """
        Compute the pseudo-likelihood of input samples.

        Parameters
        ----------
        input_data : array-like
            Values of the visible layer

        Returns
        -------
        float
            Value of the pseudo-likelihood.
        """
        is_input_feature1d = (self.n_visible == 1)
        input_data = format_data(input_data, is_input_feature1d)

        errors = self.apply_batches(
            function=self.methods.prediction_error,
            input_data=input_data,
            description='Validation batches',
            show_error_output=True,
        )
        return average_batch_errors(
            errors,
            n_samples=len(input_data),
            batch_size=self.batch_size,
        )

    def gibbs_sampling(self, visible_input, n_iter=1):
        """
        Makes Gibbs sampling n times using visible input.

        Parameters
        ----------
        visible_input : 1d or 2d array

        n_iter : int
            Number of Gibbs sampling iterations. Defaults to ``1``.

        Returns
        -------
        array-like
            Output from the visible units after performing n
            Gibbs samples. Array will contain only binary
            units (0 and 1).
        """
        is_input_feature1d = (self.n_visible == 1)
        visible_input = format_data(visible_input, is_input_feature1d)

        gibbs_sampling = self.methods.gibbs_sampling

        input_ = visible_input
        for iteration in range(n_iter):
            input_ = gibbs_sampling(input_)
        return input_
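
# Illustrative sketch (not part of the library): a plain-NumPy version of
# the contrastive-divergence update built in ``init_methods`` above. For
# simplicity this runs CD-1 started from the data instead of the class's
# persistent chain (``h_samples``).
import numpy as np

rng = np.random.default_rng(0)
sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))

n_visible, n_hidden, step = 4, 1, 0.1
W = rng.normal(scale=0.1, size=(n_visible, n_hidden))
v_bias, h_bias = np.zeros(n_visible), np.zeros(n_hidden)

v_pos = np.array([[1., 0., 1., 0.], [0., 1., 0., 1.]])
for _ in range(200):
    h_pos = sigmoid(v_pos.dot(W) + h_bias)       # positive phase
    h_sample = rng.binomial(1, h_pos)            # sample hidden states
    v_neg = sigmoid(h_sample.dot(W.T) + v_bias)  # negative phase
    h_neg = sigmoid(v_neg.dot(W) + h_bias)

    n = len(v_pos)
    W += step * (v_pos.T.dot(h_pos) - v_neg.T.dot(h_neg)) / n
    h_bias += step * (h_pos - h_neg).mean(axis=0)
    v_bias += step * (v_pos - v_neg).mean(axis=0)

# The two opposite patterns drift toward opposite hidden activations
print(sigmoid(v_pos.dot(W) + h_bias).round(2))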
class ParameterBasedLayer(BaseLayer):
    """
    Layer that creates weight and bias parameters.

    Parameters
    ----------
    size : int
        Layer's output size.

    weight : array-like, Theano variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        can be found :ref:`here <init-methods>`.
        Defaults to :class:`XavierNormal() <neupy.init.XavierNormal>`.

    bias : 1D array-like, Theano variable, scalar, Initializer or None
        Defines layer's bias. Default initialization methods
        can be found :ref:`here <init-methods>`.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.
        The ``None`` value excludes bias from the calculations and
        does not add it to the parameters list.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    size = IntProperty(minval=1)
    weight = ParameterProperty(default=init.XavierNormal())
    bias = ParameterProperty(default=init.Constant(value=0),
                             allow_none=True)

    def __init__(self, size, **options):
        super(ParameterBasedLayer, self).__init__(size=size, **options)

    @property
    def weight_shape(self):
        return as_tuple(self.input_shape, self.output_shape)

    @property
    def bias_shape(self):
        if self.bias is not None:
            return as_tuple(self.output_shape)

    def initialize(self):
        super(ParameterBasedLayer, self).initialize()

        self.add_parameter(value=self.weight, name='weight',
                           shape=self.weight_shape, trainable=True)

        if self.bias is not None:
            self.add_parameter(value=self.bias, name='bias',
                               shape=self.bias_shape, trainable=True)

    def __repr__(self):
        classname = self.__class__.__name__
        return '{name}({size})'.format(name=classname, size=self.size)
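
# Illustrative sketch (not part of the library): the parameter shapes that
# ``weight_shape`` and ``bias_shape`` derive above support an ordinary
# dense forward pass, shown here in plain NumPy.
import numpy as np

input_size, output_size = 10, 5
weight = np.random.default_rng(0).normal(size=(input_size, output_size))
bias = np.zeros(output_size)

x = np.random.default_rng(1).normal(size=(3, input_size))  # batch of 3
print((x.dot(weight) + bias).shape)  # (3, 5)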
class GRU(BaseRNNLayer):
    """
    Gated Recurrent Unit (GRU) Layer.

    Parameters
    ----------
    {BaseRNNLayer.n_units}

    {BaseRNNLayer.only_return_final}

    input_weights : Initializer, ndarray
        Weight parameters for input connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    hidden_weights : Initializer, ndarray
        Weight parameters for hidden connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    biases : Initializer, ndarray
        Bias parameters for all gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    resetgate : function
        Activation function for the reset gate.
        Defaults to ``tf.nn.sigmoid``.

    updategate : function
        Activation function for the update gate.
        Defaults to ``tf.nn.sigmoid``.

    hidden_update : function
        Activation function for the hidden state update.
        Defaults to ``tf.tanh``.

    learn_init : bool
        If ``True``, make ``hidden_init`` trainable variable.
        Defaults to ``False``.

    hidden_init : array-like, Tensorflow variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always from
        :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    {BaseLayer.name}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------
    Sequence classification

    >>> from neupy.layers import *
    >>>
    >>> n_time_steps = 40
    >>> n_categories = 20
    >>> embedded_size = 10
    >>>
    >>> network = join(
    ...     Input(n_time_steps),
    ...     Embedding(n_categories, embedded_size),
    ...     GRU(20),
    ...     Sigmoid(1),
    ... )
    >>> network
    (?, 40) -> [... 4 layers ...] -> (?, 1)
    """
    input_weights = ParameterProperty()
    hidden_weights = ParameterProperty()
    biases = ParameterProperty()

    resetgate = Property(expected_type=types.FunctionType)
    updategate = Property(expected_type=types.FunctionType)
    hidden_update = Property(expected_type=types.FunctionType)

    hidden_init = ParameterProperty()
    learn_init = Property(default=False, expected_type=bool)

    backwards = Property(expected_type=bool)
    unroll_scan = Property(expected_type=bool)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def __init__(self, n_units, only_return_final=True,
                 # Trainable parameters
                 input_weights=init.HeNormal(),
                 hidden_weights=init.HeNormal(),
                 biases=0,
                 # Activation functions
                 resetgate=tf.nn.sigmoid,
                 updategate=tf.nn.sigmoid,
                 hidden_update=tf.tanh,
                 # Cell states
                 hidden_init=0,
                 learn_init=False,
                 # Misc
                 unroll_scan=False,
                 backwards=False,
                 gradient_clipping=0,
                 name=None):

        super(GRU, self).__init__(
            n_units=n_units,
            only_return_final=only_return_final,
            name=name,
        )

        self.input_weights = input_weights
        self.hidden_weights = hidden_weights
        self.biases = biases

        self.resetgate = resetgate
        self.updategate = updategate
        self.hidden_update = hidden_update

        self.hidden_init = hidden_init
        self.learn_init = learn_init

        self.unroll_scan = unroll_scan
        self.backwards = backwards
        self.gradient_clipping = gradient_clipping

    def create_variables(self, input_shape):
        self.input_weights = self.variable(
            value=self.input_weights,
            name='input_weights',
            shape=(input_shape[-1], 3 * self.n_units),
        )
        self.hidden_weights = self.variable(
            value=self.hidden_weights,
            name='hidden_weights',
            shape=(self.n_units, 3 * self.n_units),
        )
        self.biases = self.variable(
            value=self.biases,
            name='biases',
            shape=(3 * self.n_units,),
        )
        self.hidden_init = self.variable(
            value=self.hidden_init,
            shape=(1, self.n_units),
            name="hidden_init",
            trainable=self.learn_init)

    def output(self, input, **kwargs):
        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_samples, n_features)
        input = tf.transpose(input, [1, 0, 2])

        # Create single recurrent computation step function.
        # input_n is the n'th vector of the input
        def one_gru_step(states, input_n):
            with tf.name_scope('gru-cell'):
                hid_previous, = states
                input_n = tf.matmul(
                    input_n, self.input_weights) + self.biases

                # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1},
                # and W_{hc} h_{t - 1}
                hid_input = tf.matmul(hid_previous, self.hidden_weights)

                if self.gradient_clipping != 0:
                    input_n = clip_gradient(
                        input_n, self.gradient_clipping)
                    hid_input = clip_gradient(
                        hid_input, self.gradient_clipping)

                hid_resetgate, hid_updategate, hid_hidden = tf.split(
                    hid_input, 3, axis=1)

                in_resetgate, in_updategate, in_hidden = tf.split(
                    input_n, 3, axis=1)

                # Reset and update gates
                resetgate = self.resetgate(hid_resetgate + in_resetgate)
                updategate = self.updategate(
                    hid_updategate + in_updategate)

                # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
                hidden_update = in_hidden + resetgate * hid_hidden

                if self.gradient_clipping != 0:
                    hidden_update = clip_gradient(
                        hidden_update, self.gradient_clipping)

                hidden_update = self.hidden_update(hidden_update)

                # Compute (1 - u_t)h_{t - 1} + u_t c_t
                return [
                    hid_previous - updategate * (
                        hid_previous - hidden_update)
                ]

        input_shape = tf.shape(input)
        n_samples = input_shape[1]  # batch dim has been moved
        hidden_init = tf.tile(self.hidden_init, (n_samples, 1))

        sequence = input
        if self.backwards:
            sequence = tf.reverse(sequence, axis=[0])

        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(
                fn=one_gru_step,
                sequence=sequence,
                outputs_info=[hidden_init])
        else:
            hid_out, = tf.scan(
                fn=one_gru_step,
                elems=sequence,
                initializer=[hidden_init],
                name='gru-scan',
            )

        # When it is requested that we only return the final sequence
        # step, we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = tf.reverse(hid_out, axis=[0])

        # dimshuffle back to (n_samples, n_time_steps, n_features)
        hid_out = tf.transpose(hid_out, [1, 0, 2])

        return hid_out
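
# Illustrative sketch (not part of the library): the gate arithmetic inside
# ``one_gru_step`` is easier to check outside of a graph. A plain-NumPy
# single step, assuming the same stacked (reset, update, candidate)
# weight layout used above.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(h_prev, x_t, W_in, W_hid, b):
    # Both weight matrices stack the reset, update and candidate blocks
    # side by side, exactly like input_weights/hidden_weights above.
    in_r, in_u, in_c = np.split(x_t.dot(W_in) + b, 3, axis=-1)
    hid_r, hid_u, hid_c = np.split(h_prev.dot(W_hid), 3, axis=-1)

    r = sigmoid(in_r + hid_r)         # reset gate
    u = sigmoid(in_u + hid_u)         # update gate
    c = np.tanh(in_c + r * hid_c)     # candidate state
    return h_prev - u * (h_prev - c)  # (1 - u) * h_prev + u * c

rng = np.random.default_rng(0)
n_features, n_units = 5, 3
W_in = rng.normal(size=(n_features, 3 * n_units))
W_hid = rng.normal(size=(n_units, 3 * n_units))
b = np.zeros(3 * n_units)

h = np.zeros((1, n_units))
for x_t in rng.normal(size=(7, 1, n_features)):  # 7 time steps
    h = gru_step(h, x_t, W_in, W_hid, b)
print(h.shape)  # (1, 3)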
class GRU(BaseRNNLayer):
    """
    Gated Recurrent Unit (GRU) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    input_weights : Initializer, ndarray
        Weight parameters for input connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    hidden_weights : Initializer, ndarray
        Weight parameters for hidden connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    biases : Initializer, ndarray
        Bias parameters for all gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    activation_functions : dict, callable
        Activation functions for different gates.
        Defaults to:

        .. code-block:: python

            # import tensorflow as tf
            dict(
                resetgate=tf.nn.sigmoid,
                updategate=tf.nn.sigmoid,
                hidden_update=tf.tanh,
            )

        If application requires modification to only one parameter
        then it's better to specify the one that you need to modify
        and ignore other parameters

        .. code-block:: python

            dict(resetgate=tf.tanh)

        Other parameters like ``updategate`` or ``hidden_update``
        will be equal to their default values.

    learn_init : bool
        If ``True``, make ``hidden_init`` trainable variable.
        Defaults to ``False``.

    hidden_init : array-like, Tensorflow variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    {BaseRNNLayer.only_return_final}

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always from
        :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------
    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.GRU(20),
                layers.Sigmoid(1),
            ]
        )
    """
    input_weights = ParameterProperty(default=init.HeNormal())
    hidden_weights = ParameterProperty(default=init.HeNormal())
    biases = ParameterProperty(default=init.Constant(0))

    activation_functions = MultiCallableProperty(
        default=dict(
            resetgate=tf.nn.sigmoid,
            updategate=tf.nn.sigmoid,
            hidden_update=tf.tanh,
        ))

    learn_init = Property(default=False, expected_type=bool)
    hidden_init = ParameterProperty(default=init.Constant(0))

    backwards = Property(default=False, expected_type=bool)
    unroll_scan = Property(default=False, expected_type=bool)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(GRU, self).initialize()

        n_inputs = np.prod(self.input_shape[1:])

        self.input_weights = self.add_parameter(
            value=self.input_weights,
            name='input_weights',
            shape=(n_inputs, 3 * self.size),
        )
        self.hidden_weights = self.add_parameter(
            value=self.hidden_weights,
            name='hidden_weights',
            shape=(self.size, 3 * self.size),
        )
        self.biases = self.add_parameter(
            value=self.biases,
            name='biases',
            shape=(3 * self.size,),
        )

        self.add_parameter(value=self.hidden_init,
                           shape=(1, self.size),
                           name="hidden_init",
                           trainable=self.learn_init)

    def output(self, input_value):
        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = tf.transpose(input_value, [1, 0, 2])
        input_shape = tf.shape(input_value)
        n_batch = input_shape[1]

        # Create single recurrent computation step function.
        # input_n is the n'th vector of the input
        def one_gru_step(states, input_n):
            with tf.name_scope('gru-cell'):
                hid_previous, = states
                input_n = tf.matmul(
                    input_n, self.input_weights) + self.biases

                # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1},
                # and W_{hc} h_{t - 1}
                hid_input = tf.matmul(hid_previous, self.hidden_weights)

                if self.gradient_clipping != 0:
                    input_n = clip_gradient(
                        input_n, self.gradient_clipping)
                    hid_input = clip_gradient(
                        hid_input, self.gradient_clipping)

                hid_resetgate, hid_updategate, hid_hidden = tf.split(
                    hid_input, 3, axis=1)

                in_resetgate, in_updategate, in_hidden = tf.split(
                    input_n, 3, axis=1)

                # Reset and update gates
                resetgate = self.activation_functions.resetgate(
                    hid_resetgate + in_resetgate)
                updategate = self.activation_functions.updategate(
                    hid_updategate + in_updategate)

                # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
                hidden_update = in_hidden + resetgate * hid_hidden

                if self.gradient_clipping != 0:
                    hidden_update = clip_gradient(
                        hidden_update, self.gradient_clipping)

                hidden_update = self.activation_functions.hidden_update(
                    hidden_update)

                # Compute (1 - u_t)h_{t - 1} + u_t c_t
                return [
                    hid_previous - updategate * (
                        hid_previous - hidden_update)
                ]

        hidden_init = tf.tile(self.hidden_init, (n_batch, 1))

        sequence = input_value
        if self.backwards:
            sequence = tf.reverse(sequence, axis=[0])

        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(
                fn=one_gru_step,
                sequence=sequence,
                outputs_info=[hidden_init])
        else:
            hid_out, = tf.scan(
                fn=one_gru_step,
                elems=input_value,
                initializer=[hidden_init],
                name='gru-scan',
            )

        # When it is requested that we only return the final sequence
        # step, we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = tf.reverse(hid_out, axis=[0])

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = tf.transpose(hid_out, [1, 0, 2])

        return hid_out
class LSTM(BaseRNNLayer):
    """
    Long Short Term Memory (LSTM) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    weights : dict or Initializer
        Weight parameters for different gates.
        Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`.

        - In case if application requires the same initialization
          method for all weights, then it's possible to specify an
          initialization method that would be automatically applied
          to all weight parameters in the LSTM layer.

          .. code-block:: python

              layers.LSTM(2, weights=init.Normal(0.1))

        - In case if application requires different initialization
          values for different weights then it's possible to specify
          an exact weight by name.

          .. code-block:: python

              dict(
                  weight_in_to_ingate=init.XavierUniform(),
                  weight_hid_to_ingate=init.XavierUniform(),
                  weight_cell_to_ingate=init.XavierUniform(),

                  weight_in_to_forgetgate=init.XavierUniform(),
                  weight_hid_to_forgetgate=init.XavierUniform(),
                  weight_cell_to_forgetgate=init.XavierUniform(),

                  weight_in_to_outgate=init.XavierUniform(),
                  weight_hid_to_outgate=init.XavierUniform(),
                  weight_cell_to_outgate=init.XavierUniform(),

                  weight_in_to_cell=init.XavierUniform(),
                  weight_hid_to_cell=init.XavierUniform(),
              )

          If application requires modification to only one (or
          multiple) parameter then it's better to specify the one
          that you need to modify and ignore other parameters

          .. code-block:: python

              dict(weight_in_to_ingate=init.Normal(0.1))

          Other parameters like ``weight_cell_to_outgate`` will be
          equal to their default values.

    biases : dict or Initializer
        Bias parameters for different gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

        - In case if application requires the same initialization
          method for all biases, then it's possible to specify an
          initialization method that would be automatically applied
          to all bias parameters in the LSTM layer.

          .. code-block:: python

              layers.LSTM(2, biases=init.Constant(1))

        - In case if application requires different initialization
          values for different weights then it's possible to specify
          an exact weight by name.

          .. code-block:: python

              dict(
                  bias_ingate=init.Constant(0),
                  bias_forgetgate=init.Constant(0),
                  bias_cell=init.Constant(0),
                  bias_outgate=init.Constant(0),
              )

          If application requires modification to only one (or
          multiple) parameter then it's better to specify the one
          that you need to modify and ignore other parameters

          .. code-block:: python

              dict(bias_ingate=init.Constant(1))

          Other parameters like ``bias_cell`` will be equal to their
          default values.

    activation_functions : dict, callable
        Activation functions for different gates.
        Defaults to:

        .. code-block:: python

            # import theano.tensor as T
            dict(
                ingate=T.nnet.sigmoid,
                forgetgate=T.nnet.sigmoid,
                outgate=T.nnet.sigmoid,
                cell=T.tanh,
            )

        If application requires modification to only one parameter
        then it's better to specify the one that you need to modify
        and ignore other parameters

        .. code-block:: python

            dict(ingate=T.tanh)

        Other parameters like ``forgetgate`` or ``outgate`` will be
        equal to their default values.

    learn_init : bool
        If ``True``, make ``cell_init`` and ``hid_init`` trainable
        variables. Defaults to ``False``.

    cell_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial cell state (:math:`c_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    hid_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    backwards : bool
        If ``True``, process the sequence backwards and then reverse
        the output again such that the output from the layer is always
        from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    {BaseRNNLayer.only_return_final}

    precompute_input : bool
        If ``True``, precompute ``input_to_hid`` before iterating
        through the sequence. This can result in a speed up at the
        expense of an increase in memory usage.
        Defaults to ``True``.

    peepholes : bool
        If ``True``, the LSTM uses peephole connections.
        When ``False``, cell parameters are ignored.
        Defaults to ``False``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    gradient_clipping : float or int
        If nonzero, the gradient messages are clipped to the
        given value during the backward pass.
        Defaults to ``0``.

    n_gradient_steps : int
        Number of timesteps to include in the backpropagated gradient.
        If ``-1``, backpropagate through the entire sequence.
        Defaults to ``-1``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------
    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.LSTM(20),
                layers.Sigmoid(1),
            ]
        )
    """
    weights = MultiParameterProperty(
        default=dict(
            weight_in_to_ingate=init.XavierUniform(),
            weight_hid_to_ingate=init.XavierUniform(),
            weight_cell_to_ingate=init.XavierUniform(),

            weight_in_to_forgetgate=init.XavierUniform(),
            weight_hid_to_forgetgate=init.XavierUniform(),
            weight_cell_to_forgetgate=init.XavierUniform(),

            weight_in_to_outgate=init.XavierUniform(),
            weight_hid_to_outgate=init.XavierUniform(),
            weight_cell_to_outgate=init.XavierUniform(),

            weight_in_to_cell=init.XavierUniform(),
            weight_hid_to_cell=init.XavierUniform(),
        ))
    biases = MultiParameterProperty(
        default=dict(
            bias_ingate=init.Constant(0),
            bias_forgetgate=init.Constant(0),
            bias_cell=init.Constant(0),
            bias_outgate=init.Constant(0),
        ))
    activation_functions = MultiCallableProperty(
        default=dict(
            ingate=T.nnet.sigmoid,
            forgetgate=T.nnet.sigmoid,
            outgate=T.nnet.sigmoid,
            cell=T.tanh,
        ))

    learn_init = Property(default=False, expected_type=bool)
    cell_init = ParameterProperty(default=init.Constant(0))
    hid_init = ParameterProperty(default=init.Constant(0))

    unroll_scan = Property(default=False, expected_type=bool)
    backwards = Property(default=False, expected_type=bool)
    precompute_input = Property(default=True, expected_type=bool)
    peepholes = Property(default=False, expected_type=bool)

    n_gradient_steps = IntProperty(default=-1)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(LSTM, self).initialize()

        n_inputs = np.prod(self.input_shape[1:])
        weights = self.weights
        biases = self.biases

        # Input gate parameters
        self.weight_in_to_ingate = self.add_parameter(
            value=weights.weight_in_to_ingate,
            name='weight_in_to_ingate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_ingate = self.add_parameter(
            value=weights.weight_hid_to_ingate,
            name='weight_hid_to_ingate',
            shape=(self.size, self.size))
        self.bias_ingate = self.add_parameter(
            value=biases.bias_ingate, name='bias_ingate',
            shape=(self.size,))

        # Forget gate parameters
        self.weight_in_to_forgetgate = self.add_parameter(
            value=weights.weight_in_to_forgetgate,
            name='weight_in_to_forgetgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_forgetgate = self.add_parameter(
            value=weights.weight_hid_to_forgetgate,
            name='weight_hid_to_forgetgate',
            shape=(self.size, self.size))
        self.bias_forgetgate = self.add_parameter(
            value=biases.bias_forgetgate, name='bias_forgetgate',
            shape=(self.size,))

        # Cell parameters
        self.weight_in_to_cell = self.add_parameter(
            value=weights.weight_in_to_cell,
            name='weight_in_to_cell',
            shape=(n_inputs, self.size))
        self.weight_hid_to_cell = self.add_parameter(
            value=weights.weight_hid_to_cell,
            name='weight_hid_to_cell',
            shape=(self.size, self.size))
        self.bias_cell = self.add_parameter(
            value=biases.bias_cell, name='bias_cell',
            shape=(self.size,))

        # If peephole (cell to gate) connections were enabled,
        # initialize peephole connections. These are elementwise
        # products with the cell state, so they are represented
        # as vectors.
        if self.peepholes:
            self.weight_cell_to_ingate = self.add_parameter(
                value=weights.weight_cell_to_ingate,
                name='weight_cell_to_ingate',
                shape=(self.size,))
            self.weight_cell_to_forgetgate = self.add_parameter(
                value=weights.weight_cell_to_forgetgate,
                name='weight_cell_to_forgetgate',
                shape=(self.size,))
            self.weight_cell_to_outgate = self.add_parameter(
                value=weights.weight_cell_to_outgate,
                name='weight_cell_to_outgate',
                shape=(self.size,))

        # Output gate parameters
        self.weight_in_to_outgate = self.add_parameter(
            value=weights.weight_in_to_outgate,
            name='weight_in_to_outgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_outgate = self.add_parameter(
            value=weights.weight_hid_to_outgate,
            name='weight_hid_to_outgate',
            shape=(self.size, self.size))
        self.bias_outgate = self.add_parameter(
            value=biases.bias_outgate, name='bias_outgate',
            shape=(self.size,))

        # Initialization parameters
        self.add_parameter(value=self.cell_init,
                           shape=(1, self.size),
                           name="cell_init",
                           trainable=self.learn_init)
        self.add_parameter(value=self.hid_init,
                           shape=(1, self.size),
                           name="hid_init",
                           trainable=self.learn_init)

    def output(self, input_value):
        # Treat all dimensions after the second as flattened
        # feature dimensions
        if input_value.ndim > 3:
            input_value = T.flatten(input_value, 3)

        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = input_value.dimshuffle(1, 0, 2)
        seq_len, n_batch, _ = input_value.shape

        # Stack input weight matrices into a (num_inputs, 4 * num_units)
        # matrix, which speeds up computation
        weight_in_stacked = T.concatenate([
            self.weight_in_to_ingate,
            self.weight_in_to_forgetgate,
            self.weight_in_to_cell,
            self.weight_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        weight_hid_stacked = T.concatenate([
            self.weight_hid_to_ingate,
            self.weight_hid_to_forgetgate,
            self.weight_hid_to_cell,
            self.weight_hid_to_outgate], axis=1)

        # Stack biases into a (4 * num_units) vector
        bias_stacked = T.concatenate([
            self.bias_ingate,
            self.bias_forgetgate,
            self.bias_cell,
            self.bias_outgate], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the inputs dot weight matrices before scanning.
            # weight_in_stacked is (n_features, 4 * num_units).
            # Input: (n_time_steps, n_batch, 4 * num_units).
            input_value = T.dot(
                input_value, weight_in_stacked) + bias_stacked

        # When theano.scan calls step, input_n will be
        # (n_batch, 4 * num_units). We define a slicing function
        # that extracts the input to each LSTM gate
        def slice_w(x, n):
            return x[:, n * self.size:(n + 1) * self.size]

        def one_lstm_step(input_n, cell_previous, hid_previous, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, weight_in_stacked) + bias_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, weight_hid_stacked)

            # Clip gradients
            if self.gradient_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.gradient_clipping,
                    self.gradient_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.weight_cell_to_ingate
                forgetgate += (
                    cell_previous * self.weight_cell_to_forgetgate)

            # Apply nonlinearities
            ingate = self.activation_functions.ingate(ingate)
            forgetgate = self.activation_functions.forgetgate(forgetgate)
            cell_input = self.activation_functions.cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            if self.peepholes:
                outgate += cell * self.weight_cell_to_outgate

            outgate = self.activation_functions.outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * T.tanh(cell)
            return [cell, hid]

        ones = T.ones((n_batch, 1))
        cell_init = T.dot(ones, self.cell_init)
        hid_init = T.dot(ones, self.hid_init)

        non_sequences = [weight_hid_stacked]
        # When we aren't precomputing the input outside of scan, we
        # need to provide the input weights and biases to the step
        # function
        if not self.precompute_input:
            non_sequences += [weight_in_stacked, bias_stacked]

        # The "peephole" weight matrices are only used
        # when self.peepholes=True
        if self.peepholes:
            non_sequences += [self.weight_cell_to_ingate,
                              self.weight_cell_to_forgetgate,
                              self.weight_cell_to_outgate]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            n_time_steps = self.input_shape[0]

            # Explicitly unroll the recurrence instead of using scan
            _, hid_out = unroll_scan(
                fn=one_lstm_step,
                sequences=[input_value],
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                n_steps=n_time_steps)
        else:
            (_, hid_out), _ = theano.scan(
                fn=one_lstm_step,
                sequences=input_value,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.n_gradient_steps,
                non_sequences=non_sequences,
                strict=True)

        # When it is requested that we only return the final sequence
        # step, we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

        return hid_out
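
# Illustrative sketch (not part of the library): a plain-NumPy single LSTM
# step mirroring ``one_lstm_step`` above, with the same stacked gate order
# (ingate, forgetgate, cell, outgate) and no peephole connections.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, c_prev, h_prev, W_in, W_hid, b):
    # Gates are stacked in the order (in, forget, cell, out),
    # matching the concatenation order used in output() above.
    gates = x_t.dot(W_in) + h_prev.dot(W_hid) + b
    i, f, g, o = np.split(gates, 4, axis=-1)
    i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)
    c = f * c_prev + i * np.tanh(g)  # new cell state
    h = o * np.tanh(c)               # new hidden state
    return c, h

rng = np.random.default_rng(0)
n_features, n_units = 5, 3
W_in = rng.normal(size=(n_features, 4 * n_units))
W_hid = rng.normal(size=(n_units, 4 * n_units))
b = np.zeros(4 * n_units)

c = h = np.zeros((1, n_units))
for x_t in rng.normal(size=(7, 1, n_features)):  # 7 time steps
    c, h = lstm_step(x_t, c, h, W_in, W_hid, b)
print(h.shape)  # (1, 3)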
class BaseStepAssociative(BaseAssociative):
    """
    Base class for associative algorithms that have two layers,
    where the first one uses a step function as the activation.

    Parameters
    ----------
    {BaseAssociative.n_inputs}

    {BaseAssociative.n_outputs}

    n_unconditioned : int
        Number of unconditioned units in neural networks. These units
        are not updated during the training procedure. Unconditioned
        units should correspond to the first features in the dataset.

    weight : array-like
        Neural network weights. Values defined manually should have
        shape ``(n_inputs, n_outputs)``. Defaults to ``None`` which
        means that all unconditional weights will be equal to ``1``.
        Other weights equal to ``0``.

    bias : array-like, Initializer
        Neural network bias units.
        Defaults to :class:`Constant(-0.5) <neupy.init.Constant>`.

    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {BaseAssociative.train}
    {BaseSkeleton.fit}
    """
    n_inputs = IntProperty(minval=2, required=True)
    n_unconditioned = IntProperty(minval=1, required=True)

    weight = ArrayProperty()
    bias = ParameterProperty(default=init.Constant(-0.5))

    def init_layers(self):
        if self.n_inputs <= self.n_unconditioned:
            raise ValueError(
                "Number of unconditioned features should be less than "
                "the total number of features. `n_inputs`={} and "
                "`n_unconditioned`={}".format(self.n_inputs,
                                              self.n_unconditioned))

        valid_weight_shape = (self.n_inputs, self.n_outputs)
        valid_bias_shape = (self.n_outputs,)

        if self.weight is None:
            self.weight = np.zeros(valid_weight_shape)
            self.weight[:self.n_unconditioned, :] = 1

        if isinstance(self.bias, init.Initializer):
            self.bias = self.bias.sample(valid_bias_shape)

        super(BaseStepAssociative, self).init_layers()

        if self.bias.shape != valid_bias_shape:
            raise ValueError("Bias vector has invalid shape. Got {}, "
                             "expected {}".format(self.bias.shape,
                                                  valid_bias_shape))

        self.bias = self.bias.astype(float)

    def predict(self, input_data):
        input_data = format_data(input_data, is_feature1d=False)
        raw_output = input_data.dot(self.weight) + self.bias
        return np.where(raw_output > 0, 1, 0)

    def train(self, input_train, *args, **kwargs):
        input_train = format_data(input_train, is_feature1d=False)
        return super(BaseStepAssociative, self).train(input_train,
                                                      *args, **kwargs)

    def train_epoch(self, input_train, target_train):
        weight = self.weight
        n_unconditioned = self.n_unconditioned
        predict = self.predict
        weight_delta = self.weight_delta

        for input_row in input_train:
            input_row = np.reshape(input_row, (1, input_row.size))
            layer_output = predict(input_row)
            weight[n_unconditioned:, :] += weight_delta(input_row,
                                                        layer_output)
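
# Illustrative sketch (not part of the library): the training loop above
# only ever updates the conditioned part of the weight matrix. A toy
# plain-NumPy version of that pattern with a hypothetical Hebbian
# ``weight_delta`` (the real rule is supplied by subclasses).
import numpy as np

n_inputs, n_outputs, n_unconditioned = 3, 1, 1
weight = np.zeros((n_inputs, n_outputs))
weight[:n_unconditioned, :] = 1          # fixed unconditioned connections
bias = np.full(n_outputs, -0.5)

def predict(x):
    return np.where(x.dot(weight) + bias > 0, 1, 0)

def weight_delta(input_row, layer_output, step=0.1):
    # Hypothetical Hebbian delta: strengthen active input/output pairs
    return step * input_row.T.dot(layer_output)

data = np.array([[1, 1, 0], [1, 0, 1], [1, 1, 0]])
for input_row in data:
    input_row = input_row.reshape(1, -1)
    layer_output = predict(input_row)
    weight[n_unconditioned:, :] += weight_delta(
        input_row, layer_output)[n_unconditioned:, :]

print(weight.ravel())  # conditioned weights grew, the first stayed fixed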
class PRelu(ActivationLayer):
    """
    The layer with the parametrized ReLu activation function.

    Parameters
    ----------
    alpha_axes : int or tuple
        Axes that will not include a unique alpha parameter.
        A single integer value means the same as a tuple with
        one value. Defaults to ``1``.

    alpha : array-like, Theano shared variable, scalar or Initializer
        Alpha parameter per each non-shared axis for the ReLu.
        A scalar value means that each element in the tensor will be
        equal to the specified value. Default initialization methods
        can be found :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0.25)``.

    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    References
    ----------
    .. [1] https://arxiv.org/pdf/1502.01852v1.pdf
    """
    alpha_axes = AxesProperty(default=1)
    alpha = ParameterProperty(default=Constant(value=0.25))

    def initialize(self):
        super(PRelu, self).initialize()

        alpha = self.alpha
        alpha_axes = self.alpha_axes
        output_shape = self.output_shape

        if 0 in alpha_axes:
            raise ValueError("Cannot specify alpha per input sample.")

        if max(alpha_axes) > len(output_shape):
            raise ValueError("Cannot specify alpha for the axis #{}. "
                             "Maximum available axis is #{}"
                             "".format(max(alpha_axes),
                                       len(output_shape) - 1))

        alpha_shape = [output_shape[axis - 1] for axis in alpha_axes]

        if isinstance(alpha, Initializer):
            alpha = alpha.sample(alpha_shape)

        self.alpha = create_shared_parameter(
            value=alpha,
            name='alpha_{}'.format(self.layer_id),
            shape=alpha_shape)
        self.parameters.append(self.alpha)

    def activation_function(self, input_value):
        alpha = dimshuffle(self.alpha, input_value.ndim, self.alpha_axes)
        return T.nnet.relu(input_value, alpha)
class PRelu(ActivationLayer): """ The layer with the parametrized ReLu activation function. Parameters ---------- alpha_axes : int or tuple Axes along which the alpha parameter will not be shared; a separate alpha value is learned for every element along these axes. Single integer value defines the same as a tuple with one value. Defaults to ``-1``. alpha : array-like, Tensorflow variable, scalar or Initializer Alpha parameter per each non-shared axis for the ReLu. Scalar value means that each element in the tensor will be equal to the specified value. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=0.25)``. {ActivationLayer.Parameters} Methods ------- {ActivationLayer.Methods} Attributes ---------- {ActivationLayer.Attributes} Examples -------- Feedforward Neural Networks (FNN) >>> from neupy.layers import * >>> network = Input(10) > PRelu(20) > PRelu(1) Convolutional Neural Networks (CNN) >>> from neupy.layers import * >>> network = join( ... Input((32, 32, 3)), ... Convolution((3, 3, 16)) > PRelu(), ... Convolution((3, 3, 32)) > PRelu(), ... Reshape(), ... Softmax(10), ... ) References ---------- .. [1] https://arxiv.org/pdf/1502.01852v1.pdf """ alpha_axes = AxesProperty(default=-1) alpha = ParameterProperty(default=init.Constant(value=0.25)) def __init__(self, *args, **options): super(PRelu, self).__init__(*args, **options) if 0 in self.alpha_axes: raise ValueError("Cannot specify alpha for 0-axis") def validate(self, input_shape): if max(self.alpha_axes) > len(input_shape): max_axis_index = len(input_shape) - 1 raise ValueError("Cannot specify alpha for the axis #{}. " "Maximum available axis is {} (0-based indices)." "".format(max(self.alpha_axes), max_axis_index)) def initialize(self): super(PRelu, self).initialize() output_shape = as_tuple(None, self.output_shape) alpha_shape = [output_shape[axis] for axis in self.alpha_axes] self.add_parameter( value=self.alpha, name='alpha', shape=alpha_shape, trainable=True, ) def activation_function(self, input_value): input_value = tf.convert_to_tensor(input_value, dtype=tf.float32) ndim = len(input_value.get_shape()) dimensions = np.arange(ndim) alpha_axes = dimensions[list(self.alpha_axes)] alpha = dimshuffle(self.alpha, ndim, alpha_axes) return tf.nn.leaky_relu(tf.to_float(input_value), tf.to_float(alpha))
class Linear(BaseLayer): """ Layer with linear activation function. It applies a linear transformation when the ``n_units`` parameter is specified and acts as an identity when it's not. Parameters ---------- n_units : int or None Number of units in the layer. It also corresponds to the number of output features that will be produced per sample after passing it through this layer. The ``None`` value means that the layer will not have parameters and will only apply the activation function to the input, without a linear transformation. Defaults to ``None``. weight : array-like, Tensorflow variable, scalar or Initializer Defines layer's weights. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. bias : 1D array-like, Tensorflow variable, scalar, Initializer or None Defines layer's bias. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`Constant(0) <neupy.init.Constant>`. The ``None`` value excludes bias from the calculations and does not add it to the parameters list. {BaseLayer.name} Methods ------- {BaseLayer.Methods} activation_function(input) Applies activation function to the input. Attributes ---------- {BaseLayer.Attributes} Examples -------- Linear Regression >>> from neupy.layers import * >>> network = Input(10) >> Linear(5) """ n_units = IntProperty(minval=1, allow_none=True) weight = ParameterProperty() bias = ParameterProperty(allow_none=True) def __init__(self, n_units=None, weight=init.HeNormal(), bias=0, name=None): super(Linear, self).__init__(name=name) self.n_units = n_units self.weight = weight self.bias = bias def get_output_shape(self, input_shape): input_shape = tf.TensorShape(input_shape) if self.n_units is None: return input_shape if input_shape and input_shape.ndims != 2: raise LayerConnectionError( "Input shape expected to have 2 dimensions, got {} instead. " "Shape: {}".format(input_shape.ndims, input_shape)) n_samples = input_shape[0] return tf.TensorShape((n_samples, self.n_units)) def create_variables(self, input_shape): if self.n_units is None: return input_shape = tf.TensorShape(input_shape) self.input_shape = input_shape _, n_input_features = input_shape if n_input_features.value is None: raise WeightInitializationError( "Cannot create variables for the layer `{}`, because " "number of input features is unknown. Input shape: {}, " "Layer: {}".format(self.name, input_shape, self)) self.weight = self.variable(value=self.weight, name='weight', shape=as_tuple(n_input_features, self.n_units)) if self.bias is not None: self.bias = self.variable(value=self.bias, name='bias', shape=as_tuple(self.n_units)) def output(self, input, **kwargs): input = tf.convert_to_tensor(input, dtype=tf.float32) if self.n_units is None: return self.activation_function(input) if self.bias is None: output = tf.matmul(input, self.weight) return self.activation_function(output) output = tf.matmul(input, self.weight) + self.bias return self.activation_function(output) def activation_function(self, input_value): return input_value def __repr__(self): if self.n_units is None: return self._repr_arguments(name=self.name) return self._repr_arguments( self.n_units, name=self.name, weight=self.weight, bias=self.bias, )
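When ``n_units`` is specified, ``output`` reduces to a single affine transformation followed by the identity activation. A minimal numpy sketch of the same computation (shapes and names are illustrative):

.. code-block:: python

    import numpy as np

    x = np.random.randn(4, 10)   # batch of 4 samples with 10 features
    W = np.random.randn(10, 5)   # plays the role of ``weight``
    b = np.zeros(5)              # plays the role of ``bias``

    # what ``output`` computes for Linear(5); the activation is the identity
    y = x.dot(W) + b
    assert y.shape == (4, 5)     # (n_samples, n_units)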
class GroupNorm(Identity): """ Group Normalization layer. This layer is a simple alternative to the Batch Normalization layer for cases when the batch size is small. Parameters ---------- n_groups : int During normalization all the channels will be broken down into separate groups and the mean and variance will be estimated per group. This parameter controls the number of groups. gamma : array-like, Tensorflow variable, scalar or Initializer Scale. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=1)``. beta : array-like, Tensorflow variable, scalar or Initializer Offset. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=0)``. epsilon : float Epsilon ensures that the input rescaling procedure, which uses the estimated variance, never causes division by zero. Defaults to ``1e-5``. {Identity.name} Methods ------- {Identity.Methods} Attributes ---------- {Identity.Attributes} Examples -------- Convolutional Neural Networks (CNN) >>> from neupy.layers import * >>> network = join( ... Input((28, 28, 1)), ... Convolution((3, 3, 16)) >> GroupNorm(4) >> Relu(), ... Convolution((3, 3, 16)) >> GroupNorm(4) >> Relu(), ... Reshape(), ... Softmax(10), ... ) References ---------- .. [1] Group Normalization, Yuxin Wu, Kaiming He, https://arxiv.org/pdf/1803.08494.pdf """ n_groups = IntProperty(minval=1) beta = ParameterProperty() gamma = ParameterProperty() epsilon = NumberProperty(minval=0) def __init__(self, n_groups, beta=0, gamma=1, epsilon=1e-5, name=None): super(GroupNorm, self).__init__(name=name) self.n_groups = n_groups self.beta = beta self.gamma = gamma self.epsilon = epsilon def create_variables(self, input_shape): n_channels = input_shape[3] if n_channels.value is None: raise WeightInitializationError( "Cannot initialize variables when number of " "channels is unknown. Input shape: {}, Layer: {}" "".format(input_shape, self)) parameter_shape = (1, 1, 1, n_channels) self.gamma = self.variable(value=self.gamma, name='gamma', shape=parameter_shape) self.beta = self.variable(value=self.beta, name='beta', shape=parameter_shape) def get_output_shape(self, input_shape): input_shape = tf.TensorShape(input_shape) if input_shape and input_shape.ndims != 4: raise LayerConnectionError( "Group normalization layer expects 4 dimensional input, " "got {} instead. Input shape: {}, Layer: {}" "".format(input_shape.ndims, input_shape, self)) n_channels = input_shape[3] if n_channels.value and n_channels % self.n_groups != 0: raise LayerConnectionError( "Cannot divide {} input channels into {} groups. " "Input shape: {}, Layer: {}".format(n_channels, self.n_groups, input_shape, self)) return super(GroupNorm, self).get_output_shape(input_shape) def output(self, input): input = tf.convert_to_tensor(input, dtype=tf.float32) input_shape = tf.shape(input) n_groups = self.n_groups # We access dimensional information in the form of tensors in case # some of the dimensions are undefined. This way we make sure that # reshape will work even if part of the input shape is undefined. dims = [input_shape[i] for i in range(4)] n_samples, height, width, n_channels = dims input = tf.reshape( input, [n_samples, height, width, n_groups, n_channels // n_groups]) mean, variance = tf.nn.moments(input, [1, 2, 4], keep_dims=True) input = (input - mean) / tf.sqrt(variance + self.epsilon) input = tf.reshape(input, input_shape) return input * self.gamma + self.beta
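The ``output`` method above can be cross-checked against a plain numpy reimplementation: reshape the channels into groups, normalize with per-sample, per-group statistics, and scale and shift the result. A sketch (the function name and scalar ``gamma``/``beta`` are illustrative):

.. code-block:: python

    import numpy as np

    def group_norm_reference(x, gamma, beta, n_groups, epsilon=1e-5):
        # x uses the NHWC layout expected by the layer
        n, h, w, c = x.shape
        x = x.reshape(n, h, w, n_groups, c // n_groups)
        # statistics per sample and per group, estimated over the spatial
        # dimensions and the channels inside the group
        mean = x.mean(axis=(1, 2, 4), keepdims=True)
        variance = x.var(axis=(1, 2, 4), keepdims=True)
        x = (x - mean) / np.sqrt(variance + epsilon)
        return x.reshape(n, h, w, c) * gamma + beta

    x = np.random.randn(2, 28, 28, 16)
    out = group_norm_reference(x, gamma=1.0, beta=0.0, n_groups=4)
    assert out.shape == x.shape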
class BatchNorm(Identity): """ Batch normalization layer. Parameters ---------- axes : tuple with ints or None Axes along which normalization will be applied. The ``None`` value means that normalization will be applied over all axes except the last one. In case of 4D tensor it will be equal to ``(0, 1, 2)``. Defaults to ``None``. epsilon : float Epsilon is a positive constant that is added to the standard deviation to prevent division by zero. Defaults to ``1e-5``. alpha : float Coefficient for the exponential moving average of batch-wise means and standard deviations computed during training; the closer to one, the more it will depend on the last batches seen. Value needs to be between ``0`` and ``1``. Defaults to ``0.1``. gamma : array-like, Tensorflow variable, scalar or Initializer Scale. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=1)``. beta : array-like, Tensorflow variable, scalar or Initializer Offset. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=0)``. running_mean : array-like, Tensorflow variable, scalar or Initializer Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=0)``. running_inv_std : array-like, Tensorflow variable, scalar or Initializer Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=1)``. {Identity.name} Methods ------- {Identity.Methods} Attributes ---------- {Identity.Attributes} Examples -------- Feedforward Neural Networks (FNN) with batch normalization applied after the activation function. >>> from neupy.layers import * >>> network = join( ... Input(10), ... Relu(5) >> BatchNorm(), ... Relu(5) >> BatchNorm(), ... Sigmoid(1), ... ) Feedforward Neural Networks (FNN) with batch normalization applied before the activation function. >>> from neupy.layers import * >>> network = join( ... Input(10), ... Linear(5) >> BatchNorm() >> Relu(), ... Linear(5) >> BatchNorm() >> Relu(), ... Sigmoid(1), ... ) Convolutional Neural Networks (CNN) >>> from neupy.layers import * >>> network = join( ... Input((28, 28, 1)), ... Convolution((3, 3, 16)) >> BatchNorm() >> Relu(), ... Convolution((3, 3, 16)) >> BatchNorm() >> Relu(), ... Reshape(), ... Softmax(10), ... ) References ---------- .. [1] Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, http://arxiv.org/pdf/1502.03167v3.pdf """ axes = TypedListProperty(allow_none=True) epsilon = NumberProperty(minval=0) alpha = ProperFractionProperty() beta = ParameterProperty() gamma = ParameterProperty() running_mean = ParameterProperty() running_inv_std = ParameterProperty() def __init__(self, axes=None, alpha=0.1, beta=0, gamma=1, epsilon=1e-5, running_mean=0, running_inv_std=1, name=None): super(BatchNorm, self).__init__(name=name) self.axes = axes self.alpha = alpha self.beta = beta self.gamma = gamma self.epsilon = epsilon self.running_mean = running_mean self.running_inv_std = running_inv_std if axes is not None and len(set(axes)) != len(axes): raise ValueError( "Specified axes have to contain only unique values") def create_variables(self, input_shape): input_shape = tf.TensorShape(input_shape) if input_shape.ndims is None: raise WeightInitializationError( "Cannot initialize variables for the batch normalization " "layer, because input shape is undefined. 
Layer: {}" "".format(self)) if self.axes is None: # If ndims == 4 then axes = (0, 1, 2) # If ndims == 2 then axes = (0,) self.axes = tuple(range(input_shape.ndims - 1)) if any(axis >= input_shape.ndims for axis in self.axes): raise LayerConnectionError( "Batch normalization cannot be applied over one of " "the axis, because input has only {} dimensions. Layer: {}" "".format(input_shape.ndims, self)) parameter_shape = tuple([ input_shape[axis].value if axis not in self.axes else 1 for axis in range(input_shape.ndims) ]) if any(parameter is None for parameter in parameter_shape): unknown_dim_index = parameter_shape.index(None) raise WeightInitializationError( "Cannot create variables for batch normalization, because " "input has unknown dimension #{} (0-based indices). " "Input shape: {}, Layer: {}".format(unknown_dim_index, input_shape, self)) self.input_shape = input_shape self.running_mean = self.variable(value=self.running_mean, shape=parameter_shape, name='running_mean', trainable=False) self.running_inv_std = self.variable(value=self.running_inv_std, shape=parameter_shape, name='running_inv_std', trainable=False) self.gamma = self.variable(value=self.gamma, name='gamma', shape=parameter_shape) self.beta = self.variable(value=self.beta, name='beta', shape=parameter_shape) def output(self, input, training=False): input = tf.convert_to_tensor(input, dtype=tf.float32) if not training: mean = self.running_mean inv_std = self.running_inv_std else: alpha = asfloat(self.alpha) mean = tf.reduce_mean( input, self.axes, keepdims=True, name="mean", ) variance = tf.reduce_mean( tf.squared_difference(input, tf.stop_gradient(mean)), self.axes, keepdims=True, name="variance", ) inv_std = tf.rsqrt(variance + asfloat(self.epsilon)) tf.add_to_collection( tf.GraphKeys.UPDATE_OPS, self.running_inv_std.assign( asfloat(1 - alpha) * self.running_inv_std + alpha * inv_std)) tf.add_to_collection( tf.GraphKeys.UPDATE_OPS, self.running_mean.assign( asfloat(1 - alpha) * self.running_mean + alpha * mean)) normalized_value = (input - mean) * inv_std return self.gamma * normalized_value + self.beta
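During training the layer normalizes with the statistics of the current batch and maintains exponential moving averages of the mean and the inverse standard deviation, which are the values used at prediction time. A numpy sketch of one training step (the function name and shapes are illustrative):

.. code-block:: python

    import numpy as np

    def batch_norm_train_step(x, gamma, beta, running_mean, running_inv_std,
                              axes=(0,), alpha=0.1, epsilon=1e-5):
        # statistics of the current batch
        mean = x.mean(axis=axes, keepdims=True)
        variance = x.var(axis=axes, keepdims=True)
        inv_std = 1.0 / np.sqrt(variance + epsilon)

        # exponential moving averages, mirroring the UPDATE_OPS assignments
        running_mean = (1 - alpha) * running_mean + alpha * mean
        running_inv_std = (1 - alpha) * running_inv_std + alpha * inv_std

        output = gamma * (x - mean) * inv_std + beta
        return output, running_mean, running_inv_std

    x = np.random.randn(64, 5)
    out, new_mean, new_inv_std = batch_norm_train_step(
        x, gamma=1.0, beta=0.0,
        running_mean=np.zeros((1, 5)),
        running_inv_std=np.ones((1, 5)))
    assert out.shape == x.shape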
class PRelu(Linear): """ Layer with the parametrized ReLu used as an activation function. The layer learns an additional parameter ``alpha`` during training. It applies a linear transformation when the ``n_units`` parameter is specified and the parametrized relu function after the transformation. When ``n_units`` is not specified, only the parametrized relu function will be applied to the input. Parameters ---------- alpha_axes : int or tuple Axes along which the alpha parameter will not be shared; a separate alpha value is learned for every element along these axes. Single integer value defines the same as a tuple with one value. Defaults to ``-1``. alpha : array-like, Tensorflow variable, scalar or Initializer Separate alpha parameter per each non-shared axis for the ReLu. Scalar value means that each element in the tensor will be equal to the specified value. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=0.25)``. {Linear.Parameters} Methods ------- {Linear.Methods} Attributes ---------- {Linear.Attributes} Examples -------- Feedforward Neural Networks (FNN) >>> from neupy.layers import * >>> network = Input(10) >> PRelu(20) >> PRelu(1) Convolutional Neural Networks (CNN) >>> from neupy.layers import * >>> network = join( ... Input((32, 32, 3)), ... Convolution((3, 3, 16)) >> PRelu(), ... Convolution((3, 3, 32)) >> PRelu(), ... Reshape(), ... Softmax(10), ... ) References ---------- .. [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification. https://arxiv.org/pdf/1502.01852v1.pdf """ alpha_axes = TypedListProperty() alpha = ParameterProperty() def __init__(self, n_units=None, alpha_axes=-1, alpha=0.25, weight=init.HeNormal(gain=2), bias=0, name=None): self.alpha = alpha self.alpha_axes = as_tuple(alpha_axes) if 0 in self.alpha_axes: raise ValueError("Cannot specify alpha for 0-axis") super(PRelu, self).__init__(n_units=n_units, weight=weight, bias=bias, name=name) def get_output_shape(self, input_shape): input_shape = tf.TensorShape(input_shape) if input_shape and max(self.alpha_axes) >= input_shape.ndims: max_axis_index = input_shape.ndims - 1 raise LayerConnectionError( "Cannot specify alpha for the axis #{}. Maximum " "available axis is {} (0-based indices)." "".format(max(self.alpha_axes), max_axis_index)) return super(PRelu, self).get_output_shape(input_shape) def create_variables(self, input_shape): super(PRelu, self).create_variables(input_shape) output_shape = self.get_output_shape(input_shape) self.alpha = self.variable( value=self.alpha, name='alpha', shape=[output_shape[axis] for axis in self.alpha_axes]) def activation_function(self, input): input = tf.convert_to_tensor(input, dtype=tf.float32) ndim = input.shape.ndims dimensions = np.arange(ndim) alpha_axes = dimensions[list(self.alpha_axes)] alpha = tf_utils.dimshuffle(self.alpha, ndim, alpha_axes) return tf.maximum(0.0, input) + alpha * tf.minimum(0.0, input) def __repr__(self): if self.n_units is None: return self._repr_arguments(name=self.name, alpha_axes=self.alpha_axes, alpha=self.alpha) return self._repr_arguments(self.n_units, name=self.name, alpha_axes=self.alpha_axes, alpha=self.alpha, weight=self.weight, bias=self.bias)
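The activation itself is the leaky-ReLu formula with a learnable slope, ``f(x) = max(0, x) + alpha * min(0, x)``, where ``alpha`` broadcasts over the shared axes. A numpy sketch with a per-feature alpha (the values are illustrative):

.. code-block:: python

    import numpy as np

    def prelu_reference(x, alpha):
        return np.maximum(0.0, x) + alpha * np.minimum(0.0, x)

    x = np.array([[-2.0, -1.0, 3.0]])
    alpha = np.array([0.25, 0.5, 0.25])   # alpha_axes=-1: one alpha per feature
    print(prelu_reference(x, alpha))      # [[-0.5 -0.5  3. ]]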
class Convolution(ParameterBasedLayer): """ Convolutional layer. Parameters ---------- size : tuple of int Filter shape. It should be defined as a tuple with three integers ``(filter rows, filter columns, output channels)``. padding : {{``same``, ``valid``}}, int, tuple Zero padding for the input tensor. - ``valid`` - Padding won't be added to the tensor. Result will be the same as for ``padding=0`` - ``same`` - Padding will depend on the number of rows and columns in the filter. This padding makes sure that image with the ``stride=1`` won't change its width and height. It's the same as ``padding=(filter rows // 2, filter columns // 2)``. - Custom value for the padding can be specified as an integer, like ``padding=1`` or it can be specified as a tuple when different dimensions have different padding values, for example ``padding=(2, 3)``. Defaults to ``valid``. stride : tuple with ints, int Stride size. Defaults to ``(1, 1)`` dilation : int, tuple Rate for the filter upsampling. When ``dilation > 1`` the layer becomes a dilated convolution (or atrous convolution). Defaults to ``1``. weight : array-like, Tensorflow variable, scalar or Initializer Defines layer's weights. Shape of the weight will be equal to ``(filter rows, filter columns, input channels, output channels)``. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`. {ParameterBasedLayer.bias} {BaseLayer.Parameters} Examples -------- 2D Convolution >>> from neupy import layers >>> >>> layers.join( ... layers.Input((28, 28, 3)), ... layers.Convolution((3, 3, 16)), ... ) 1D Convolution >>> from neupy import layers >>> >>> layers.join( ... layers.Input((30, 10)), ... layers.Reshape((30, 1, 10)), ... layers.Convolution((3, 1, 16)), ... ) Methods ------- {ParameterBasedLayer.Methods} Attributes ---------- {ParameterBasedLayer.Attributes} """ # We use gain=2 because it's a suitable choice for relu non-linearity # and relu is the most common non-linearity used for CNN. 
weight = ParameterProperty(default=init.HeNormal(gain=2)) size = TypedListProperty(required=True, element_type=int) padding = PaddingProperty(default='valid') stride = Spatial2DProperty(default=(1, 1)) dilation = Spatial2DProperty(default=1) def validate(self, input_shape): if input_shape and len(input_shape) != 3: raise LayerConnectionError( "Convolutional layer expects an input with 3 " "dimensions, got {} with shape {}" "".format(len(input_shape), input_shape)) def output_shape_per_dim(self, *args, **kwargs): return conv_output_shape(*args, **kwargs) def find_output_from_input_shape(self, input_shape): padding = self.padding rows, cols, _ = input_shape row_filter_size, col_filter_size, n_kernels = self.size row_stride, col_stride = self.stride row_dilation, col_dilation = self.dilation or (1, 1) if isinstance(padding, (list, tuple)): row_padding, col_padding = padding else: row_padding, col_padding = padding, padding output_rows = self.output_shape_per_dim( rows, row_filter_size, row_padding, row_stride, row_dilation, ) output_cols = self.output_shape_per_dim( cols, col_filter_size, col_padding, col_stride, col_dilation, ) return (output_rows, output_cols, n_kernels) @property def output_shape(self): if self.input_shape is not None: return self.find_output_from_input_shape(self.input_shape) @property def weight_shape(self): n_channels = self.input_shape[-1] n_rows, n_cols, n_filters = self.size return (n_rows, n_cols, n_channels, n_filters) @property def bias_shape(self): return as_tuple(self.size[-1]) def output(self, input_value): padding = self.padding if not isinstance(padding, six.string_types): height_pad, width_pad = padding input_value = tf.pad(input_value, [ [0, 0], [height_pad, height_pad], [width_pad, width_pad], [0, 0], ]) # VALID option will make sure that # convolution won't use any padding. padding = 'VALID' output = tf.nn.convolution( input_value, self.weight, padding=padding, strides=self.stride, dilation_rate=self.dilation, data_format="NHWC" ) if self.bias is not None: bias = tf.reshape(self.bias, (1, 1, 1, -1)) output += bias return output
class Convolution(BaseLayer): """ Convolutional layer. Parameters ---------- size : tuple of int Filter shape. It should be defined as a tuple with three integers ``(filter rows, filter columns, output channels)``. padding : {{``same``, ``valid``}}, int, tuple Zero padding for the input tensor. - ``valid`` - Padding won't be added to the tensor. Result will be the same as for ``padding=0`` - ``same`` - Padding will depend on the number of rows and columns in the filter. This padding makes sure that image with the ``stride=1`` won't change its width and height. It's the same as ``padding=(filter rows // 2, filter columns // 2)``. - Custom value for the padding can be specified as an integer, like ``padding=1`` or it can be specified as a tuple when different dimensions have different padding values, for example ``padding=(2, 3)``. Defaults to ``valid``. stride : tuple with ints, int Stride size. Defaults to ``(1, 1)`` dilation : int, tuple Rate for the filter upsampling. When ``dilation > 1`` the layer becomes a dilated convolution (or atrous convolution). Defaults to ``1``. weight : array-like, Tensorflow variable, scalar or Initializer Defines layer's weights. Shape of the weight will be equal to ``(filter rows, filter columns, input channels, output channels)``. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`. bias : 1D array-like, Tensorflow variable, scalar, Initializer or None Defines layer's bias. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`Constant(0) <neupy.init.Constant>`. The ``None`` value excludes bias from the calculations and does not add it to the parameters list. {BaseLayer.name} Examples -------- 2D Convolution >>> from neupy import layers >>> >>> layers.join( ... layers.Input((28, 28, 3)), ... layers.Convolution((3, 3, 16)), ... ) 1D Convolution >>> from neupy.layers import * >>> network = join( ... Input((30, 10)), ... Reshape((30, 1, 10)), # convert 3D to 4D ... Convolution((3, 1, 16)), ... Reshape((-1, 16)) # convert 4D back to 3D ... ) >>> network (?, 30, 10) -> [... 4 layers ...] -> (?, 28, 16) Methods ------- {BaseLayer.Methods} Attributes ---------- {BaseLayer.Attributes} """ size = TypedListProperty(element_type=int, n_elements=3) weight = ParameterProperty() bias = ParameterProperty(allow_none=True) padding = PaddingProperty() stride = Spatial2DProperty() dilation = Spatial2DProperty() # We use gain=2 because it's a suitable choice for relu non-linearity # and relu is the most common non-linearity used for CNN. 
def __init__(self, size, padding='valid', stride=1, dilation=1, weight=init.HeNormal(gain=2), bias=0, name=None): super(Convolution, self).__init__(name=name) self.size = size self.padding = padding self.stride = stride self.dilation = dilation self.weight = weight self.bias = bias def fail_if_shape_invalid(self, input_shape): if input_shape and input_shape.ndims != 4: raise LayerConnectionError( "Convolutional layer expects an input with 4 " "dimensions, got {} with shape {}" "".format(len(input_shape), input_shape)) def output_shape_per_dim(self, *args, **kwargs): return conv_output_shape(*args, **kwargs) def expected_output_shape(self, input_shape): n_samples = input_shape[0] row_filter_size, col_filter_size, n_kernels = self.size row_stride, col_stride = self.stride row_dilation, col_dilation = self.dilation if isinstance(self.padding, (list, tuple)): row_padding, col_padding = self.padding else: row_padding, col_padding = self.padding, self.padding return ( n_samples, self.output_shape_per_dim( input_shape[1], row_filter_size, row_padding, row_stride, row_dilation ), self.output_shape_per_dim( input_shape[2], col_filter_size, col_padding, col_stride, col_dilation ), n_kernels, ) def get_output_shape(self, input_shape): input_shape = tf.TensorShape(input_shape) self.fail_if_shape_invalid(input_shape) if input_shape.ndims is None: n_samples = input_shape[0] n_kernels = self.size[-1] return tf.TensorShape((n_samples, None, None, n_kernels)) return tf.TensorShape(self.expected_output_shape(input_shape)) def create_variables(self, input_shape): self.input_shape = input_shape n_channels = input_shape[-1] n_rows, n_cols, n_filters = self.size # Weight shape: (filter rows, filter columns, # input channels, output channels) self.weight = self.variable( value=self.weight, name='weight', shape=(n_rows, n_cols, n_channels, n_filters)) if self.bias is not None: self.bias = self.variable( value=self.bias, name='bias', shape=as_tuple(n_filters)) def output(self, input, **kwargs): input = tf.convert_to_tensor(input, tf.float32) self.fail_if_shape_invalid(input.shape) padding = self.padding if not isinstance(padding, six.string_types): height_pad, width_pad = padding input = tf.pad(input, [ [0, 0], [height_pad, height_pad], [width_pad, width_pad], [0, 0], ]) # VALID option will make sure that # convolution won't use any padding. padding = 'VALID' output = tf.nn.convolution( input, self.weight, padding=padding, strides=self.stride, dilation_rate=self.dilation, data_format="NHWC", ) if self.bias is not None: bias = tf.reshape(self.bias, (1, 1, 1, -1)) output += bias return output def __repr__(self): return self._repr_arguments( self.size, padding=self.padding, stride=self.stride, dilation=self.dilation, weight=self.weight, bias=self.bias, name=self.name, )
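For integer paddings the output spatial size follows standard convolution arithmetic, which is presumably what the ``conv_output_shape`` helper computes per dimension. A sketch of the formula (the function below is hypothetical, written only to illustrate the arithmetic):

.. code-block:: python

    def conv_output_size(input_size, filter_size, padding, stride, dilation=1):
        # standard convolution arithmetic for one spatial dimension
        dilated_filter_size = dilation * (filter_size - 1) + 1
        return (input_size + 2 * padding - dilated_filter_size) // stride + 1

    # 28x28 input, 3x3 filter, no padding, stride 1 -> 26x26 output
    assert conv_output_size(28, 3, padding=0, stride=1) == 26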
class LSTM(BaseRNNLayer): """ Long Short Term Memory (LSTM) Layer. Parameters ---------- {BaseRNNLayer.size} input_weights : Initializer, ndarray Weight parameters for input connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. hidden_weights : Initializer, ndarray Weight parameters for hidden connection. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. cell_weights : Initializer, ndarray Weight parameters for cell connection. Required only when ``peepholes=True``, otherwise it will be ignored. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. biases : Initializer, ndarray Bias parameters for all gates. Defaults to :class:`Constant(0) <neupy.init.Constant>`. activation_functions : dict, callable Activation functions for different gates. Defaults to: .. code-block:: python # import tensorflow as tf dict( ingate=tf.nn.sigmoid, forgetgate=tf.nn.sigmoid, outgate=tf.nn.sigmoid, cell=tf.tanh, ) If you need to modify only one parameter, it's enough to specify just that one .. code-block:: python dict(ingate=tf.tanh) Other parameters like ``forgetgate`` or ``outgate`` will keep their default values. learn_init : bool If ``True``, make ``cell_init`` and ``hidden_init`` trainable variables. Defaults to ``False``. cell_init : array-like, Tensorflow variable, scalar or Initializer Initializer for initial cell state (:math:`c_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. hidden_init : array-like, Tensorflow variable, scalar or Initializer Initializer for initial hidden state (:math:`h_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. backwards : bool If ``True``, process the sequence backwards and then reverse the output again such that the output from the layer is always from :math:`x_1` to :math:`x_n`. Defaults to ``False``. {BaseRNNLayer.only_return_final} peepholes : bool If ``True``, the LSTM uses peephole connections. When ``False``, cell parameters are ignored. Defaults to ``False``. unroll_scan : bool If ``True`` the recursion is unrolled instead of using scan. For some graphs this gives a significant speed up but it might also consume more memory. When ``unroll_scan=True``, backpropagation always includes the full sequence, so ``n_gradient_steps`` must be set to ``-1`` and the input sequence length must be known at compile time (i.e., cannot be given as ``None``). Defaults to ``False``. gradient_clipping : float or int If nonzero, the gradient messages are clipped to the given value during the backward pass. Defaults to ``0``. {BaseLayer.Parameters} Notes ----- Code was adapted from the `Lasagne <https://github.com/Lasagne/Lasagne>`_ library. Examples -------- Sequence classification .. 
code-block:: python from neupy import layers, algorithms n_time_steps = 40 n_categories = 20 embedded_size = 10 network = algorithms.RMSProp( [ layers.Input(n_time_steps), layers.Embedding(n_categories, embedded_size), layers.LSTM(20), layers.Sigmoid(1), ] ) """ input_weights = ParameterProperty(default=init.HeNormal()) hidden_weights = ParameterProperty(default=init.HeNormal()) cell_weights = ParameterProperty(default=init.HeNormal()) biases = ParameterProperty(default=init.Constant(0)) activation_functions = MultiCallableProperty(default=dict( ingate=tf.nn.sigmoid, forgetgate=tf.nn.sigmoid, outgate=tf.nn.sigmoid, cell=tf.tanh, )) learn_init = Property(default=False, expected_type=bool) cell_init = ParameterProperty(default=init.Constant(0)) hidden_init = ParameterProperty(default=init.Constant(0)) unroll_scan = Property(default=False, expected_type=bool) backwards = Property(default=False, expected_type=bool) peepholes = Property(default=False, expected_type=bool) gradient_clipping = NumberProperty(default=0, minval=0) def initialize(self): super(LSTM, self).initialize() n_inputs = np.prod(self.input_shape[1:]) # If peephole (cell to gate) connections were enabled, initialize # peephole connections. These are elementwise products with the cell # state, so they are represented as vectors. if self.peepholes: self.weight_cell_to_ingate = self.add_parameter( value=self.cell_weights, name='weight_cell_to_ingate', shape=(self.size, )) self.weight_cell_to_forgetgate = self.add_parameter( value=self.cell_weights, name='weight_cell_to_forgetgate', shape=(self.size, )) self.weight_cell_to_outgate = self.add_parameter( value=self.cell_weights, name='weight_cell_to_outgate', shape=(self.size, )) self.input_weights = self.add_parameter( value=self.input_weights, name='input_weights', shape=(n_inputs, 4 * self.size), ) self.hidden_weights = self.add_parameter( value=self.hidden_weights, name='hidden_weights', shape=(self.size, 4 * self.size), ) self.biases = self.add_parameter( value=self.biases, name='biases', shape=(4 * self.size, ), ) # Initialization parameters self.add_parameter( value=self.cell_init, shape=(1, self.size), name="cell_init", trainable=self.learn_init, ) self.add_parameter( value=self.hidden_init, shape=(1, self.size), name="hidden_init", trainable=self.learn_init, ) def output(self, input_value): # Because scan iterates over the first dimension we # dimshuffle to (n_time_steps, n_batch, n_features) input_value = tf.transpose(input_value, [1, 0, 2]) input_shape = tf.shape(input_value) n_batch = input_shape[1] def one_lstm_step(states, input_n): with tf.name_scope('lstm-cell'): cell_previous, hid_previous = states input_n = tf.matmul(input_n, self.input_weights) + self.biases # Calculate gates pre-activations and slice gates = input_n + tf.matmul(hid_previous, self.hidden_weights) # Clip gradients if self.gradient_clipping != 0: gates = clip_gradient(gates, self.gradient_clipping) # Extract the pre-activation gate values ingate, forgetgate, cell_input, outgate = tf.split(gates, 4, axis=1) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.weight_cell_to_ingate forgetgate += (cell_previous * self.weight_cell_to_forgetgate) # Apply nonlinearities ingate = self.activation_functions.ingate(ingate) forgetgate = self.activation_functions.forgetgate(forgetgate) cell_input = self.activation_functions.cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.weight_cell_to_outgate 
outgate = self.activation_functions.outgate(outgate) # Compute new hidden unit activation hid = outgate * tf.tanh(cell) return [cell, hid] cell_init = tf.tile(self.cell_init, (n_batch, 1)) hidden_init = tf.tile(self.hidden_init, (n_batch, 1)) sequence = input_value if self.backwards: sequence = tf.reverse(sequence, axis=[0]) if self.unroll_scan: # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan( fn=one_lstm_step, sequence=sequence, outputs_info=[cell_init, hidden_init], ) else: _, hid_out = tf.scan( fn=one_lstm_step, elems=sequence, initializer=[cell_init, hidden_init], name='lstm-scan', ) # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: return hid_out[-1] # if scan is backward reverse the output if self.backwards: hid_out = tf.reverse(hid_out, axis=[0]) # dimshuffle back to (n_batch, n_time_steps, n_features) hid_out = tf.transpose(hid_out, [1, 0, 2]) return hid_out
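Stripped of peepholes, gradient clipping and the Tensorflow plumbing, one LSTM step reduces to the usual gate equations. A numpy sketch under those assumptions (all names are illustrative):

.. code-block:: python

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step_reference(x_t, h_prev, c_prev, W_x, W_h, b):
        # all four gate pre-activations are computed jointly and then
        # split, mirroring the tf.split call in one_lstm_step
        gates = x_t.dot(W_x) + h_prev.dot(W_h) + b
        ingate, forgetgate, cell_input, outgate = np.split(gates, 4, axis=1)

        cell = (sigmoid(forgetgate) * c_prev
                + sigmoid(ingate) * np.tanh(cell_input))
        hidden = sigmoid(outgate) * np.tanh(cell)
        return cell, hidden

    n_batch, n_inputs, n_units = 2, 5, 3
    x_t = np.random.randn(n_batch, n_inputs)
    h0 = c0 = np.zeros((n_batch, n_units))
    W_x = np.random.randn(n_inputs, 4 * n_units)
    W_h = np.random.randn(n_units, 4 * n_units)
    b = np.zeros(4 * n_units)

    c1, h1 = lstm_step_reference(x_t, h0, c0, W_x, W_h, b)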
class BaseAssociative(BaseNetwork): """ Base class for associative learning. Parameters ---------- n_inputs : int Number of features (columns) in the input data. n_outputs : int Number of outputs in the network. weight : array-like, Initializer Neural network weights. Value defined manually should have shape ``(n_inputs, n_outputs)``. Defaults to :class:`Normal() <neupy.init.Normal>`. {BaseNetwork.Parameters} Methods ------- {BaseSkeleton.predict} train(X_train, epochs=100) Train neural network. {BaseSkeleton.fit} """ n_inputs = IntProperty(minval=1, required=True) n_outputs = IntProperty(minval=1, required=True) weight = ParameterProperty(default=init.Normal()) def __init__(self, **options): super(BaseAssociative, self).__init__(**options) self.init_weights() def init_weights(self): valid_weight_shape = (self.n_inputs, self.n_outputs) if isinstance(self.weight, init.Initializer): self.weight = self.weight.sample(valid_weight_shape, return_array=True) if self.weight.shape != valid_weight_shape: raise ValueError( "Weight matrix has invalid shape. Got {}, expected {}" "".format(self.weight.shape, valid_weight_shape)) self.weight = self.weight.astype(float) def format_input_data(self, X): X = format_data(X, is_feature1d=(self.n_inputs == 1)) if X.ndim != 2: raise ValueError("Cannot make prediction, because input " "data has more than 2 dimensions") if X.shape[1] != self.n_inputs: raise ValueError("Input data expected to have {} features, " "but got {}".format(self.n_inputs, X.shape[1])) return X def train(self, X_train, epochs=100): X_train = self.format_input_data(X_train) return super(BaseAssociative, self).train(X_train=X_train, epochs=epochs)
class BatchNorm(BaseLayer): """ Batch-normalization layer. Parameters ---------- axes : int, tuple with int or None The axis or axes along which normalization is applied. ``None`` means that normalization will be applied over all axes except axis #1 (in case of a 4D tensor the axes will be equal to ``(0, 2, 3)``). Defaults to ``None``. epsilon : float Epsilon is a positive constant that is added to the standard deviation to prevent division by zero. Defaults to ``1e-5``. alpha : float Coefficient for the exponential moving average of batch-wise means and standard deviations computed during training; the closer to one, the more it will depend on the last batches seen. Value needs to be between ``0`` and ``1``. Defaults to ``0.1``. gamma : array-like, Theano variable, scalar or Initializer Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=1)``. beta : array-like, Theano variable, scalar or Initializer Default initialization methods you can find :ref:`here <init-methods>`. Defaults to ``Constant(value=0)``. Methods ------- {BaseLayer.Methods} Attributes ---------- {BaseLayer.Attributes} References ---------- .. [1] Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, http://arxiv.org/pdf/1502.03167v3.pdf """ axes = AxesProperty(default=None) alpha = ProperFractionProperty(default=0.1) epsilon = NumberProperty(default=1e-5, minval=0) gamma = ParameterProperty(default=Constant(value=1)) beta = ParameterProperty(default=Constant(value=0)) def initialize(self): super(BatchNorm, self).initialize() input_shape = as_tuple(None, self.input_shape) ndim = len(input_shape) if self.axes is None: # If ndim == 4 then axes = (0, 2, 3) # If ndim == 2 then axes = (0,) self.axes = tuple(axis for axis in range(ndim) if axis != 1) if any(axis >= ndim for axis in self.axes): raise ValueError("Cannot apply batch normalization on the axis " "that doesn't exist.") opposite_axes = find_opposite_axes(self.axes, ndim) parameter_shape = [input_shape[axis] for axis in opposite_axes] if any(parameter is None for parameter in parameter_shape): unknown_dim_index = parameter_shape.index(None) raise ValueError("Cannot apply batch normalization on the axis " "with unknown size over the dimension #{} " "(0-based indices).".format(unknown_dim_index)) self.running_mean = theano.shared( name='running_mean_{}'.format(self.layer_id), value=asfloat(np.zeros(parameter_shape))) self.running_inv_std = theano.shared( name='running_inv_std_{}'.format(self.layer_id), value=asfloat(np.ones(parameter_shape))) if isinstance(self.gamma, Initializer): self.gamma = self.gamma.sample(parameter_shape) if isinstance(self.beta, Initializer): self.beta = self.beta.sample(parameter_shape) self.gamma = theano.shared( name='gamma_{}'.format(self.layer_id), value=asfloat(self.gamma), ) self.beta = theano.shared( name='beta_{}'.format(self.layer_id), value=asfloat(self.beta), ) self.parameters = [self.gamma, self.beta] def output(self, input_value): epsilon = asfloat(self.epsilon) alpha = asfloat(self.alpha) gamma, beta = self.gamma, self.beta ndim = input_value.ndim axes = self.axes running_mean = self.running_mean running_inv_std = self.running_inv_std input_mean = input_value.mean(axes) input_var = input_value.var(axes) input_inv_std = T.inv(T.sqrt(input_var + epsilon)) self.updates = [ (running_inv_std, asfloat(1 - alpha) * running_inv_std + alpha * input_inv_std), (running_mean, asfloat(1 - alpha) * running_mean + alpha * input_mean) ] if not self.training_state: 
mean = running_mean inv_std = running_inv_std else: mean = input_mean inv_std = input_inv_std opposite_axes = find_opposite_axes(axes, ndim) beta = dimshuffle(beta, ndim, opposite_axes) gamma = dimshuffle(gamma, ndim, opposite_axes) mean = dimshuffle(mean, ndim, opposite_axes) inv_std = dimshuffle(inv_std, ndim, opposite_axes) normalized_value = (input_value - mean) * inv_std return gamma * normalized_value + beta
class BaseStepAssociative(BaseAssociative): """ Base class for associative algorithms which have 2 layers and the first one has a step function as activation. Parameters ---------- {BaseAssociative.n_inputs} {BaseAssociative.n_outputs} n_unconditioned : int Number of unconditioned units in the neural network. These units won't be updated during the training procedure. Unconditioned units should correspond to the first features in the dataset. weight : array-like Neural network weights. Value defined manually should have shape ``(n_inputs, n_outputs)``. Defaults to ``None`` which means that all unconditioned weights will be equal to ``1`` and all other weights will be equal to ``0``. bias : array-like, Initializer Neural network bias units. Defaults to :class:`Constant(-0.5) <neupy.init.Constant>`. {BaseNetwork.Parameters} Methods ------- {BaseAssociative.Methods} """ n_inputs = IntProperty(minval=2, required=True) n_unconditioned = IntProperty(minval=1, required=True) weight = ArrayProperty() bias = ParameterProperty(default=init.Constant(-0.5)) def init_weights(self): if self.n_inputs <= self.n_unconditioned: raise ValueError( "Number of unconditioned features should be less than the " "total number of features. `n_inputs`={} and `n_unconditioned`={}" "".format(self.n_inputs, self.n_unconditioned)) valid_weight_shape = (self.n_inputs, self.n_outputs) valid_bias_shape = (self.n_outputs, ) if self.weight is None: self.weight = np.zeros(valid_weight_shape) self.weight[:self.n_unconditioned, :] = 1 if isinstance(self.bias, init.Initializer): self.bias = self.bias.sample(valid_bias_shape, return_array=True) super(BaseStepAssociative, self).init_weights() if self.bias.shape != valid_bias_shape: raise ValueError( "Bias vector has invalid shape. Got {}, expected {}" "".format(self.bias.shape, valid_bias_shape)) self.bias = self.bias.astype(float) def predict(self, X): X = format_data(X, is_feature1d=False) raw_output = X.dot(self.weight) + self.bias return np.where(raw_output > 0, 1, 0) def train(self, X_train, *args, **kwargs): X_train = format_data(X_train, is_feature1d=False) return super(BaseStepAssociative, self).train(X_train, *args, **kwargs) def one_training_update(self, X_train, y_train): weight = self.weight n_unconditioned = self.n_unconditioned predict = self.predict weight_delta = self.weight_delta error = 0 for x_row in X_train: x_row = np.expand_dims(x_row, axis=0) layer_output = predict(x_row) delta = weight_delta(x_row, layer_output) weight[n_unconditioned:, :] += delta # This error can tell us whether the network has converged # to some set of weights. Low errors mean that the weights # haven't been updated much during the training epoch. error += np.linalg.norm(delta) return error
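``one_training_update`` only adjusts the conditioned part of the weight matrix; the update rule itself comes from subclasses. A numpy sketch with a hypothetical Hebbian-style ``weight_delta`` (the real rule is defined by the concrete algorithms):

.. code-block:: python

    import numpy as np

    n_inputs, n_outputs, n_unconditioned = 4, 1, 1

    def weight_delta(x_row, layer_output):
        # hypothetical Hebbian-style rule that returns an update
        # for the conditioned rows only
        return x_row[:, n_unconditioned:].T.dot(layer_output)

    weight = np.zeros((n_inputs, n_outputs))
    weight[:n_unconditioned, :] = 1   # unconditioned connections stay fixed

    x_row = np.array([[1.0, 0.0, 1.0, 0.0]])
    layer_output = np.array([[1.0]])

    # only the conditioned rows of the weight matrix are updated
    weight[n_unconditioned:, :] += weight_delta(x_row, layer_output)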
class GRU(BaseRNNLayer): """ Gated Recurrent Unit (GRU) Layer. Parameters ---------- {BaseRNNLayer.size} weights : dict or Initializer Weight parameters for different gates. Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`. - In case the application requires the same initialization method for all weights, it's possible to specify an initialization method that would be automatically applied to all weight parameters in the GRU layer. .. code-block:: python layers.GRU(2, weights=init.Normal(0.1)) - In case the application requires different initialization values for different weights, it's possible to specify an exact weight by name. .. code-block:: python dict( weight_in_to_updategate=init.XavierUniform(), weight_hid_to_updategate=init.XavierUniform(), weight_in_to_resetgate=init.XavierUniform(), weight_hid_to_resetgate=init.XavierUniform(), weight_in_to_hidden_update=init.XavierUniform(), weight_hid_to_hidden_update=init.XavierUniform(), ) If you need to modify only one (or a few) of the parameters, it's enough to specify just those .. code-block:: python dict(weight_in_to_updategate=init.Normal(0.1)) Other parameters like ``weight_in_to_resetgate`` will keep their default values. biases : dict or Initializer Bias parameters for different gates. Defaults to :class:`Constant(0) <neupy.init.Constant>`. - In case the application requires the same initialization method for all biases, it's possible to specify an initialization method that would be automatically applied to all bias parameters in the GRU layer. .. code-block:: python layers.GRU(2, biases=init.Constant(1)) - In case the application requires different initialization values for different biases, it's possible to specify an exact bias by name. .. code-block:: python dict( bias_updategate=init.Constant(0), bias_resetgate=init.Constant(0), bias_hidden_update=init.Constant(0), ) If you need to modify only one (or a few) of the parameters, it's enough to specify just those .. code-block:: python dict(bias_resetgate=init.Constant(1)) Other parameters like ``bias_updategate`` will keep their default values. activation_functions : dict, callable Activation functions for different gates. Defaults to: .. code-block:: python # import theano.tensor as T dict( resetgate=T.nnet.sigmoid, updategate=T.nnet.sigmoid, hidden_update=T.tanh, ) If you need to modify only one parameter, it's enough to specify just that one .. code-block:: python dict(resetgate=T.tanh) Other parameters like ``updategate`` or ``hidden_update`` will keep their default values. learn_init : bool If ``True``, make ``hid_init`` a trainable variable. Defaults to ``False``. hid_init : array-like, Theano variable, scalar or Initializer Initializer for initial hidden state (:math:`h_0`). Defaults to :class:`Constant(0) <neupy.init.Constant>`. {BaseRNNLayer.only_return_final} backwards : bool If ``True``, process the sequence backwards and then reverse the output again such that the output from the layer is always from :math:`x_1` to :math:`x_n`. Defaults to ``False``. precompute_input : bool If ``True``, precompute ``input_to_hid`` before iterating through the sequence. This can result in a speed up at the expense of an increase in memory usage. Defaults to ``True``. 
unroll_scan : bool If ``True`` the recursion is unrolled instead of using scan. For some graphs this gives a significant speed up but it might also consume more memory. When ``unroll_scan=True``, backpropagation always includes the full sequence, so ``n_gradient_steps`` must be set to ``-1`` and the input sequence length must be known at compile time (i.e., cannot be given as ``None``). Defaults to ``False``. {BaseLayer.Parameters} Notes ----- Code was adapted from the `Lasagne <https://github.com/Lasagne/Lasagne>`_ library. Examples -------- Sequence classification .. code-block:: python from neupy import layers, algorithms n_time_steps = 40 n_categories = 20 embedded_size = 10 network = algorithms.RMSProp( [ layers.Input(n_time_steps), layers.Embedding(n_categories, embedded_size), layers.GRU(20), layers.Sigmoid(1), ] ) """ weights = MultiParameterProperty( default=dict( weight_in_to_updategate=init.XavierUniform(), weight_hid_to_updategate=init.XavierUniform(), weight_in_to_resetgate=init.XavierUniform(), weight_hid_to_resetgate=init.XavierUniform(), weight_in_to_hidden_update=init.XavierUniform(), weight_hid_to_hidden_update=init.XavierUniform(), )) biases = MultiParameterProperty( default=dict( bias_updategate=init.Constant(0), bias_resetgate=init.Constant(0), bias_hidden_update=init.Constant(0), )) activation_functions = MultiCallableProperty( default=dict( resetgate=T.nnet.sigmoid, updategate=T.nnet.sigmoid, hidden_update=T.tanh, )) learn_init = Property(default=False, expected_type=bool) hid_init = ParameterProperty(default=init.Constant(0)) backwards = Property(default=False, expected_type=bool) unroll_scan = Property(default=False, expected_type=bool) precompute_input = Property(default=True, expected_type=bool) n_gradient_steps = IntProperty(default=-1) gradient_clipping = NumberProperty(default=0, minval=0) def initialize(self): super(GRU, self).initialize() n_inputs = np.prod(self.input_shape[1:]) weights = self.weights biases = self.biases # Update gate parameters self.weight_in_to_updategate = self.add_parameter( value=weights.weight_in_to_updategate, name='weight_in_to_updategate', shape=(n_inputs, self.size)) self.weight_hid_to_updategate = self.add_parameter( value=weights.weight_hid_to_updategate, name='weight_hid_to_updategate', shape=(self.size, self.size)) self.bias_updategate = self.add_parameter( value=biases.bias_updategate, name='bias_updategate', shape=(self.size,)) # Reset gate parameters self.weight_in_to_resetgate = self.add_parameter( value=weights.weight_in_to_resetgate, name='weight_in_to_resetgate', shape=(n_inputs, self.size)) self.weight_hid_to_resetgate = self.add_parameter( value=weights.weight_hid_to_resetgate, name='weight_hid_to_resetgate', shape=(self.size, self.size)) self.bias_resetgate = self.add_parameter( value=biases.bias_resetgate, name='bias_resetgate', shape=(self.size,)) # Hidden update gate parameters self.weight_in_to_hidden_update = self.add_parameter( value=weights.weight_in_to_hidden_update, name='weight_in_to_hidden_update', shape=(n_inputs, self.size)) self.weight_hid_to_hidden_update = self.add_parameter( value=weights.weight_hid_to_hidden_update, name='weight_hid_to_hidden_update', shape=(self.size, self.size)) self.bias_hidden_update = self.add_parameter( value=biases.bias_hidden_update, name='bias_hidden_update', shape=(self.size,)) self.add_parameter(value=self.hid_init, shape=(1, self.size), name="hid_init", trainable=self.learn_init) def output(self, input_value): # Treat all dimensions after the second as flattened # 
feature dimensions if input_value.ndim > 3: input_value = T.flatten(input_value, 3) # Because scan iterates over the first dimension we # dimshuffle to (n_time_steps, n_batch, n_features) input_value = input_value.dimshuffle(1, 0, 2) seq_len, n_batch, _ = input_value.shape # Stack input weight matrices into a (num_inputs, 3 * num_units) # matrix, which speeds up computation weight_in_stacked = T.concatenate([ self.weight_in_to_resetgate, self.weight_in_to_updategate, self.weight_in_to_hidden_update], axis=1) # Same for hidden weight matrices weight_hid_stacked = T.concatenate([ self.weight_hid_to_resetgate, self.weight_hid_to_updategate, self.weight_hid_to_hidden_update], axis=1) # Stack biases into a (3 * num_units) vector bias_stacked = T.concatenate([ self.bias_resetgate, self.bias_updategate, self.bias_hidden_update], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute the input dot weight matrices before scanning. # weight_in_stacked is (n_features, 3 * num_units). # Input: (n_time_steps, n_batch, 3 * num_units). input_value = T.dot(input_value, weight_in_stacked) + bias_stacked # When theano.scan calls step, input_n will be # (n_batch, 3 * num_units). We define a slicing function # that extracts the input to each GRU gate def slice_w(x, n): s = x[:, n * self.size:(n + 1) * self.size] if self.size == 1: s = T.addbroadcast(s, 1) # Theano cannot infer this by itself return s # Create single recurrent computation step function # input_n is the n'th vector of the input def one_gru_step(input_n, hid_previous, *args): # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, # and W_{hc} h_{t - 1} hid_input = T.dot(hid_previous, weight_hid_stacked) if self.gradient_clipping: input_n = theano.gradient.grad_clip( input_n, -self.gradient_clipping, self.gradient_clipping) hid_input = theano.gradient.grad_clip( hid_input, -self.gradient_clipping, self.gradient_clipping) if not self.precompute_input: # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, # and W_{xc}x_t + b_c input_n = T.dot(input_n, weight_in_stacked) + bias_stacked # Reset and update gates resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0) resetgate = self.activation_functions.resetgate(resetgate) updategate = slice_w(hid_input, 1) + slice_w(input_n, 1) updategate = self.activation_functions.updategate(updategate) # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}) hidden_update_in = slice_w(input_n, 2) hidden_update_hid = slice_w(hid_input, 2) hidden_update = hidden_update_in + resetgate * hidden_update_hid if self.gradient_clipping: hidden_update = theano.gradient.grad_clip( hidden_update, -self.gradient_clipping, self.gradient_clipping) hidden_update = self.activation_functions.hidden_update( hidden_update) # Compute (1 - u_t)h_{t - 1} + u_t c_t hid = (1 - updategate) * hid_previous + updategate * hidden_update return hid hid_init = T.dot(T.ones((n_batch, 1)), self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_sequences = [weight_hid_stacked] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_sequences += [weight_in_stacked, bias_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer n_time_steps = self.input_shape[0] # Explicitly unroll the recurrence instead of using scan hid_out, = unroll_scan( fn=one_gru_step, sequences=[input_value], outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_sequences, 
n_steps=n_time_steps) else: # Scan op iterates over first dimension of input and # repeatedly applies the step function hid_out, _ = theano.scan( fn=one_gru_step, sequences=[input_value], outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_sequences, truncate_gradient=self.n_gradient_steps, strict=True) # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: return hid_out[-1] # dimshuffle back to (n_batch, n_time_steps, n_features) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
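One GRU step, with the precomputation and gradient clipping stripped away, reduces to the gate equations referenced in the comments above. A numpy sketch (the names are illustrative; parameters are stacked in the same resetgate, updategate, hidden-update order):

.. code-block:: python

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_step_reference(x_t, h_prev, W_x, W_h, b, size):
        x_proj = x_t.dot(W_x) + b
        h_proj = h_prev.dot(W_h)

        def slice_w(m, n):
            return m[:, n * size:(n + 1) * size]

        resetgate = sigmoid(slice_w(h_proj, 0) + slice_w(x_proj, 0))
        updategate = sigmoid(slice_w(h_proj, 1) + slice_w(x_proj, 1))
        hidden_update = np.tanh(
            slice_w(x_proj, 2) + resetgate * slice_w(h_proj, 2))

        # h_t = (1 - u_t) * h_{t-1} + u_t * c_t
        return (1 - updategate) * h_prev + updategate * hidden_update

    n_batch, n_inputs, n_units = 2, 5, 3
    x_t = np.random.randn(n_batch, n_inputs)
    h0 = np.zeros((n_batch, n_units))
    W_x = np.random.randn(n_inputs, 3 * n_units)
    W_h = np.random.randn(n_units, 3 * n_units)
    b = np.zeros(3 * n_units)

    h1 = gru_step_reference(x_t, h0, W_x, W_h, b, n_units)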
class Embedding(BaseLayer): """ Embedding layer accepts indices as an input and returns rows from the weight matrix associated with these indices. It's useful when inputs are categorical features or for word embedding tasks. Parameters ---------- input_size : int Layer's input vector dimension. It's typically associated with the number of categories or the number of unique words in the input. output_size : int Layer's output vector dimension. weight : array-like, Tensorflow variable, scalar or Initializer Defines layer's weights. Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`HeNormal() <neupy.init.HeNormal>`. {BaseLayer.name} Methods ------- {BaseLayer.Methods} Attributes ---------- {BaseLayer.Attributes} Examples -------- This example converts a dataset that has only categorical variables into a format suitable for the Embedding layer. >>> import numpy as np >>> from neupy.layers import * >>> >>> dataset = np.array([ ... ['cold', 'high'], ... ['hot', 'low'], ... ['cold', 'low'], ... ['hot', 'low'], ... ]) >>> >>> unique_value, dataset_indices = np.unique( ... dataset, return_inverse=True ... ) >>> dataset_indices = dataset_indices.reshape((4, 2)) >>> dataset_indices array([[0, 1], [2, 3], [0, 3], [2, 3]]) >>> >>> n_features = dataset.shape[1] >>> n_unique_categories = len(unique_value) >>> embedded_size = 1 >>> >>> network = join( ... Input(n_features), ... Embedding(n_unique_categories, embedded_size), ... # Output from the embedding layer is 3D ... # To make output 2D we need to reshape dimensions ... Reshape(), ... ) """ input_size = IntProperty(minval=1) output_size = IntProperty(minval=1) weight = ParameterProperty() def __init__(self, input_size, output_size, weight=init.HeNormal(), name=None): super(Embedding, self).__init__(name=name) self.input_size = input_size self.output_size = output_size self.weight = weight def get_output_shape(self, input_shape): input_shape = tf.TensorShape(input_shape) return input_shape.concatenate(self.output_size) def create_variables(self, input_shape): self.input_shape = input_shape self.weight = self.variable( value=self.weight, name='weight', shape=as_tuple(self.input_size, self.output_size)) def output(self, input_value, **kwargs): input_value = tf.cast(input_value, tf.int32) return tf.gather(self.weight, input_value) def __repr__(self): return self._repr_arguments( self.input_size, self.output_size, name=self.name, weight=self.weight, )
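``tf.gather`` in the ``output`` method simply selects rows of the weight matrix by index, so an integer input of any shape maps to an output with one extra trailing dimension of size ``output_size``. A numpy equivalent (the values are illustrative):

.. code-block:: python

    import numpy as np

    weight = np.random.randn(5, 3)         # 5 categories, output_size=3
    indices = np.array([[0, 1], [2, 3]])   # batch of index vectors

    embedded = weight[indices]             # numpy analogue of tf.gather
    assert embedded.shape == (2, 2, 3)     # output is one dimension deeper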
class RBM(BaseAlgorithm, BaseNetwork, MinibatchTrainingMixin): """ Boolean/Bernoulli Restricted Boltzmann Machine (RBM). Algorithm assumes that inputs are either binary values or values between 0 and 1. Parameters ---------- n_visible : int Number of visible units. n_hidden : int Number of hidden units. {MinibatchTrainingMixin.batch_size} weight : array-like, Theano variable, Initializer or scalar Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`XavierNormal <neupy.init.XavierNormal>`. hidden_bias : array-like, Theano variable, Initializer or scalar Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`Constant(value=0) <neupy.init.Constant>`. visible_bias : array-like, Theano variable, Initializer or scalar Default initialization methods you can find :ref:`here <init-methods>`. Defaults to :class:`Constant(value=0) <neupy.init.Constant>`. {BaseNetwork.Parameters} Methods ------- train(input_train, epochs=100) Trains network. {BaseSkeleton.fit} visible_to_hidden(visible_input) Propagates data through the network and returns output from the hidden layer. hidden_to_visible(hidden_input) Propagates output from the hidden layer backward to the visible. gibbs_sampling(visible_input, n_iter=1) Runs Gibbs sampling ``n`` times using the visible input. Examples -------- >>> import numpy as np >>> from neupy import algorithms >>> >>> data = np.array([ ... [1, 0, 1, 0], ... [1, 0, 1, 0], ... [1, 0, 0, 0], # incomplete sample ... [1, 0, 1, 0], ... ... [0, 1, 0, 1], ... [0, 0, 0, 1], # incomplete sample ... [0, 1, 0, 1], ... [0, 1, 0, 1], ... [0, 1, 0, 1], ... [0, 1, 0, 1], ... ]) >>> >>> rbm = algorithms.RBM(n_visible=4, n_hidden=1) >>> rbm.train(data, epochs=100) >>> >>> hidden_states = rbm.visible_to_hidden(data) >>> hidden_states.round(2) array([[ 0.99], [ 0.99], [ 0.95], [ 0.99], [ 0. ], [ 0.01], [ 0. ], [ 0. ], [ 0. ], [ 0. ]]) References ---------- [1] G. Hinton, A Practical Guide to Training Restricted Boltzmann Machines, 2010. 
           http://www.cs.toronto.edu/~hinton/absps/guideTR.pdf
    """
    n_visible = IntProperty(minval=1)
    n_hidden = IntProperty(minval=1)

    weight = ParameterProperty(default=init.XavierNormal())
    hidden_bias = ParameterProperty(default=init.Constant(value=0))
    visible_bias = ParameterProperty(default=init.Constant(value=0))

    def __init__(self, n_visible, n_hidden, **options):
        self.theano_random = theano_random_stream()

        # Initialize properties first, so that the shared parameters
        # below can be created from the validated values
        super(ConfigurableABC, self).__init__(
            n_hidden=n_hidden, n_visible=n_visible, **options)

        self.weight = create_shared_parameter(
            value=self.weight,
            name='algo:rbm/matrix:weight',
            shape=(n_visible, n_hidden))

        self.hidden_bias = create_shared_parameter(
            value=self.hidden_bias,
            name='algo:rbm/vector:hidden-bias',
            shape=(n_hidden,),
        )
        self.visible_bias = create_shared_parameter(
            value=self.visible_bias,
            name='algo:rbm/vector:visible-bias',
            shape=(n_visible,),
        )

        super(RBM, self).__init__(**options)

    def init_input_output_variables(self):
        self.variables.update(
            network_input=T.matrix(name='algo:rbm/var:network-input'),
        )

    def init_variables(self):
        self.variables.update(
            h_samples=theano.shared(
                name='algo:rbm/matrix:hidden-samples',
                value=asint(np.zeros((self.batch_size, self.n_hidden))),
            ),
        )

    def init_methods(self):
        def free_energy(visible_sample):
            wx_b = T.dot(visible_sample, self.weight) + self.hidden_bias
            visible_bias_term = T.dot(visible_sample, self.visible_bias)
            hidden_term = T.log(asfloat(1) + T.exp(wx_b)).sum(axis=1)
            return -visible_bias_term - hidden_term

        def visible_to_hidden(visible_sample):
            wx_b = T.dot(visible_sample, self.weight) + self.hidden_bias
            return T.nnet.sigmoid(wx_b)

        def hidden_to_visible(hidden_sample):
            wx_b = T.dot(hidden_sample, self.weight.T) + self.visible_bias
            return T.nnet.sigmoid(wx_b)

        def sample_hidden_from_visible(visible_sample):
            theano_random = self.theano_random
            hidden_prob = visible_to_hidden(visible_sample)
            hidden_sample = theano_random.binomial(
                n=1, p=hidden_prob, dtype=theano.config.floatX)
            return hidden_sample

        def sample_visible_from_hidden(hidden_sample):
            theano_random = self.theano_random
            visible_prob = hidden_to_visible(hidden_sample)
            visible_sample = theano_random.binomial(
                n=1, p=visible_prob, dtype=theano.config.floatX)
            return visible_sample

        network_input = self.variables.network_input
        n_samples = asfloat(network_input.shape[0])

        theano_random = self.theano_random

        weight = self.weight
        h_bias = self.hidden_bias
        v_bias = self.visible_bias
        h_samples = self.variables.h_samples
        step = asfloat(self.step)

        sample_indices = theano_random.random_integers(
            low=0, high=n_samples - 1,
            size=(self.batch_size,))

        v_pos = ifelse(
            T.eq(n_samples, self.batch_size),
            network_input,
            # In case the final batch has fewer samples
            # than expected
            network_input[sample_indices])
        h_pos = visible_to_hidden(v_pos)

        v_neg = sample_visible_from_hidden(h_samples)
        h_neg = visible_to_hidden(v_neg)

        weight_update = v_pos.T.dot(h_pos) - v_neg.T.dot(h_neg)
        h_bias_update = (h_pos - h_neg).mean(axis=0)
        v_bias_update = (v_pos - v_neg).mean(axis=0)

        # Stochastic pseudo-likelihood: flip one randomly chosen
        # visible feature and compare free energies
        feature_index_to_flip = theano_random.random_integers(
            low=0,
            high=self.n_visible - 1,
        )

        rounded_input = T.round(network_input)
        rounded_input_flip = T.set_subtensor(
            rounded_input[:, feature_index_to_flip],
            1 - rounded_input[:, feature_index_to_flip])

        error = T.mean(
            self.n_visible * T.log(
                T.nnet.sigmoid(
                    free_energy(rounded_input_flip)
                    - free_energy(rounded_input))))

        self.methods.update(
            train_epoch=theano.function(
                [network_input],
                error,
                name='algo:rbm/func:train-epoch',
                updates=[
                    (weight, weight + step * weight_update / n_samples),
                    (h_bias, h_bias + step * h_bias_update),
                    (v_bias, v_bias + step * v_bias_update),
                    (h_samples, asint(theano_random.binomial(n=1, p=h_neg))),
                ]),
            prediction_error=theano.function(
                [network_input],
                error,
                name='algo:rbm/func:prediction-error',
            ),
            visible_to_hidden=theano.function(
                [network_input],
                visible_to_hidden(network_input),
                name='algo:rbm/func:visible-to-hidden',
            ),
            hidden_to_visible=theano.function(
                [network_input],
                hidden_to_visible(network_input),
                name='algo:rbm/func:hidden-to-visible',
            ),
            gibbs_sampling=theano.function(
                [network_input],
                sample_visible_from_hidden(
                    sample_hidden_from_visible(network_input)),
                name='algo:rbm/func:gibbs-sampling',
            ))

    def train(self, input_train, input_test=None, epochs=100,
              summary='table'):
        """
        Train RBM.

        Parameters
        ----------
        input_train : 1D or 2D array-like

        input_test : 1D or 2D array-like or None
            Defaults to ``None``.

        epochs : int
            Number of training epochs. Defaults to ``100``.

        summary : {'table', 'inline'}
            Training summary type. Defaults to ``'table'``.
        """
        return super(RBM, self).train(
            input_train=input_train, target_train=None,
            input_test=input_test, target_test=None,
            epochs=epochs, epsilon=None, summary=summary)

    def train_epoch(self, input_train, target_train=None):
        """
        Train one epoch.

        Parameters
        ----------
        input_train : array-like (n_samples, n_features)

        Returns
        -------
        float
        """
        errors = self.apply_batches(
            function=self.methods.train_epoch,
            input_data=input_train,
            description='Training batches',
            show_error_output=True,
        )

        n_samples = len(input_train)
        return average_batch_errors(errors, n_samples, self.batch_size)

    def visible_to_hidden(self, visible_input):
        """
        Propagates data through the network and returns
        output from the hidden layer.

        Parameters
        ----------
        visible_input : array-like (n_samples, n_visible_features)

        Returns
        -------
        array-like
        """
        is_input_feature1d = (self.n_visible == 1)
        visible_input = format_data(visible_input, is_input_feature1d)

        outputs = self.apply_batches(
            function=self.methods.visible_to_hidden,
            input_data=visible_input,
            description='Hidden from visible batches',
            show_progressbar=True,
            show_error_output=False,
        )
        return np.concatenate(outputs, axis=0)

    def hidden_to_visible(self, hidden_input):
        """
        Propagates output from the hidden layer backward
        to the visible.

        Parameters
        ----------
        hidden_input : array-like (n_samples, n_hidden_features)

        Returns
        -------
        array-like
        """
        is_input_feature1d = (self.n_hidden == 1)
        hidden_input = format_data(hidden_input, is_input_feature1d)

        outputs = self.apply_batches(
            function=self.methods.hidden_to_visible,
            input_data=hidden_input,
            description='Visible from hidden batches',
            show_progressbar=True,
            show_error_output=False,
        )
        return np.concatenate(outputs, axis=0)

    def prediction_error(self, input_data, target_data=None):
        """
        Compute the pseudo-likelihood of input samples.

        Parameters
        ----------
        input_data : array-like
            Values of the visible layer

        Returns
        -------
        float
            Value of the pseudo-likelihood.
        """
        is_input_feature1d = (self.n_visible == 1)
        input_data = format_data(input_data, is_input_feature1d)

        errors = self.apply_batches(
            function=self.methods.prediction_error,
            input_data=input_data,
            description='Validation batches',
            show_error_output=True,
        )
        return average_batch_errors(
            errors,
            n_samples=len(input_data),
            batch_size=self.batch_size,
        )

    def gibbs_sampling(self, visible_input, n_iter=1):
        """
        Makes Gibbs sampling ``n_iter`` times using visible input.

        Parameters
        ----------
        visible_input : 1d or 2d array

        n_iter : int
            Number of Gibbs sampling iterations.
            Defaults to ``1``.

        Returns
        -------
        array-like
            Output from the visible units after performing
            ``n_iter`` Gibbs samples. Array will contain only
            binary units (0 and 1).
        """
        is_input_feature1d = (self.n_visible == 1)
        visible_input = format_data(visible_input, is_input_feature1d)

        gibbs_sampling = self.methods.gibbs_sampling

        input_ = visible_input
        for iteration in range(n_iter):
            input_ = gibbs_sampling(input_)
        return input_
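# Illustrative sketch (not part of the library): one persistent
# contrastive divergence step in plain NumPy, mirroring the update
# rules compiled in ``init_methods`` above. Function name, argument
# names and the default ``step`` are assumptions for this example;
# ``v_pos`` and ``h_samples`` are assumed to have the same number
# of rows, just as the compiled version guarantees via resampling.
def _cd1_step_sketch(v_pos, h_samples, weight, h_bias, v_bias, step=0.1):
    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    n_samples = len(v_pos)

    # Positive phase: hidden probabilities computed from the data
    h_pos = sigmoid(v_pos.dot(weight) + h_bias)

    # Negative phase: reconstruct visible units from the persistent
    # hidden samples, then recompute the hidden probabilities
    v_neg = np.random.binomial(1, sigmoid(h_samples.dot(weight.T) + v_bias))
    h_neg = sigmoid(v_neg.dot(weight) + h_bias)

    # Same parameter updates as in the compiled ``train_epoch``
    weight += step * (v_pos.T.dot(h_pos) - v_neg.T.dot(h_neg)) / n_samples
    h_bias += step * (h_pos - h_neg).mean(axis=0)
    v_bias += step * (v_pos - v_neg).mean(axis=0)

    # New persistent hidden samples for the next step
    return np.random.binomial(1, h_neg)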
class BatchNorm(BaseLayer):
    """
    Batch-normalization layer.

    Parameters
    ----------
    axes : int, tuple with int or None
        The axis or axes along which normalization is applied.
        ``None`` means that normalization will be applied over
        all axes except the first one. In case of 4D tensor it
        will be equal to ``(0, 1, 2)``. Defaults to ``None``.

    epsilon : float
        Epsilon is a positive constant that is added to the
        variance to prevent division by zero.
        Defaults to ``1e-5``.

    alpha : float
        Coefficient for the exponential moving average of
        batch-wise means and standard deviations computed during
        training; the closer to one, the more it depends on the
        most recent batches. Value needs to be between ``0``
        and ``1``. Defaults to ``0.1``.

    gamma : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can find
        :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    beta : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can find
        :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_mean : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can find
        :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_inv_std : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can find
        :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}

    References
    ----------
    .. [1] Batch Normalization: Accelerating Deep Network Training
           by Reducing Internal Covariate Shift,
           http://arxiv.org/pdf/1502.03167v3.pdf
    """
    axes = AxesProperty(default=None)
    epsilon = NumberProperty(default=1e-5, minval=0)
    alpha = ProperFractionProperty(default=0.1)
    beta = ParameterProperty(default=init.Constant(value=0))
    gamma = ParameterProperty(default=init.Constant(value=1))

    running_mean = ParameterProperty(default=init.Constant(value=0))
    running_inv_std = ParameterProperty(default=init.Constant(value=1))

    def initialize(self):
        super(BatchNorm, self).initialize()

        input_shape = as_tuple(None, self.input_shape)
        ndim = len(input_shape)

        if self.axes is None:
            # If ndim == 4 then axes = (0, 1, 2)
            # If ndim == 2 then axes = (0,)
            self.axes = tuple(range(ndim - 1))

        if any(axis >= ndim for axis in self.axes):
            raise ValueError("Cannot apply batch normalization "
                             "on the axis that doesn't exist.")

        opposite_axes = find_opposite_axes(self.axes, ndim)
        parameter_shape = [
            input_shape[axis] if axis in opposite_axes else 1
            for axis in range(ndim)
        ]

        if any(parameter is None for parameter in parameter_shape):
            unknown_dim_index = parameter_shape.index(None)
            raise ValueError("Cannot apply batch normalization on the "
                             "axis with unknown size over the dimension "
                             "#{} (0-based indices)."
                             "".format(unknown_dim_index))

        self.add_parameter(value=self.running_mean, shape=parameter_shape,
                           name='running_mean', trainable=False)
        self.add_parameter(value=self.running_inv_std, shape=parameter_shape,
                           name='running_inv_std', trainable=False)

        self.add_parameter(value=self.gamma, name='gamma',
                           shape=parameter_shape, trainable=True)
        self.add_parameter(value=self.beta, name='beta',
                           shape=parameter_shape, trainable=True)

    def output(self, input_value):
        alpha = asfloat(self.alpha)
        running_mean = self.running_mean
        running_inv_std = self.running_inv_std

        if not self.training_state:
            mean, inv_std = running_mean, running_inv_std
        else:
            mean = tf.reduce_mean(
                input_value, self.axes,
                keepdims=True, name="mean",
            )
            variance = tf.reduce_mean(
                tf.squared_difference(input_value, tf.stop_gradient(mean)),
                self.axes,
                keepdims=True, name="variance",
            )
            inv_std = tf.rsqrt(variance + asfloat(self.epsilon))

            self.updates = [(
                running_inv_std,
                asfloat(1 - alpha) * running_inv_std + alpha * inv_std,
            ), (
                running_mean,
                asfloat(1 - alpha) * running_mean + alpha * mean,
            )]

        normalized_value = (input_value - mean) * inv_std
        return self.gamma * normalized_value + self.beta
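# Illustrative sketch (not part of the library): the training-time
# computation of ``BatchNorm.output`` in plain NumPy, including the
# exponential moving average described in the docstring. Function
# name and defaults are assumptions; parameter names mirror the
# layer's attributes.
def _batch_norm_sketch(x, gamma, beta, running_mean, running_inv_std,
                       axes=(0,), alpha=0.1, epsilon=1e-5):
    import numpy as np

    # Batch statistics over the normalization axes
    mean = x.mean(axis=axes, keepdims=True)
    variance = ((x - mean) ** 2).mean(axis=axes, keepdims=True)
    inv_std = 1.0 / np.sqrt(variance + epsilon)

    # Moving averages that would be used instead of the batch
    # statistics during inference (``training_state == False``)
    new_running_inv_std = (1 - alpha) * running_inv_std + alpha * inv_std
    new_running_mean = (1 - alpha) * running_mean + alpha * mean

    # Normalize, then scale and shift with the trainable parameters
    normalized = (x - mean) * inv_std
    output = gamma * normalized + beta
    return output, new_running_mean, new_running_inv_std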