def test_gain_relu_he_normal_scale(self):
    he_initializer = init.HeNormal(gain=1, seed=0)
    sample_1 = self.eval(he_initializer.sample((4, 4)))

    he_initializer = init.HeNormal(gain=2, seed=0)
    sample_2 = self.eval(he_initializer.sample((4, 4)))

    self.assertAlmostEqual(
        np.mean(sample_2 / sample_1), math.sqrt(2), places=5)
def test_gain_relu_he_normal_scale(self):
    environment.reproducible()
    he_initializer = init.HeNormal(gain=1)
    sample_1 = he_initializer.sample((3, 2))

    environment.reproducible()
    he_initializer = init.HeNormal(gain='relu')
    sample_2 = he_initializer.sample((3, 2))

    self.assertAlmostEqual(np.mean(sample_2 / sample_1), math.sqrt(2))
def __init__(
        self, n_units, only_return_final=True,
        # Trainable parameters
        input_weights=init.HeNormal(),
        hidden_weights=init.HeNormal(),
        cell_weights=init.HeNormal(),
        biases=0,
        # Activation functions
        ingate=tf.nn.sigmoid,
        forgetgate=tf.nn.sigmoid,
        outgate=tf.nn.sigmoid,
        cell=tf.tanh,
        # Cell states
        cell_init=0,
        hidden_init=0,
        learn_init=False,
        # Misc
        unroll_scan=False,
        backwards=False,
        peepholes=False,
        gradient_clipping=0,
        name=None):

    super(LSTM, self).__init__(
        n_units=n_units,
        only_return_final=only_return_final,
        name=name,
    )

    self.input_weights = input_weights
    self.hidden_weights = hidden_weights
    self.cell_weights = cell_weights
    self.biases = biases

    self.ingate = ingate
    self.forgetgate = forgetgate
    self.outgate = outgate
    self.cell = cell

    self.learn_init = learn_init
    self.cell_init = cell_init
    self.hidden_init = hidden_init

    self.unroll_scan = unroll_scan
    self.backwards = backwards
    self.peepholes = peepholes
    self.gradient_clipping = gradient_clipping
def test_he_normal(self):
    he_normal = init.HeNormal()
    weight = self.eval(he_normal.sample((40, 40)))

    self.assertNormalyDistributed(weight)
    self.assertAlmostEqual(weight.mean(), 0, places=1)
    self.assertAlmostEqual(weight.std(), math.sqrt(1. / 40), places=2)
def test_he_normal(self):
    he_normal = init.HeNormal()
    weight = he_normal.sample((10, 30))

    self.assertNormalyDistributed(weight)
    self.assertAlmostEqual(weight.mean(), 0, places=1)
    self.assertAlmostEqual(weight.std(), math.sqrt(2. / 10), places=2)
def __init__(self, input_size, output_size, weight=init.HeNormal(), name=None):
    super(Embedding, self).__init__(name=name)

    self.input_size = input_size
    self.output_size = output_size
    self.weight = weight
class Relu(ActivationLayer):
    """
    The layer with the rectifier (ReLU) activation function.

    Parameters
    ----------
    alpha : float
        Alpha parameter defines the decreasing rate for the
        negative values. If ``alpha`` is a non-zero value,
        the layer behaves like a leaky ReLU. Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    Examples
    --------
    Feedforward Neural Networks (FNN)

    >>> from neupy.layers import *
    >>> network = Input(10) > Relu(20) > Relu(1)

    Convolutional Neural Networks (CNN)

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((32, 32, 3)),
    ...     Convolution((3, 3, 16)) > Relu(),
    ...     Convolution((3, 3, 32)) > Relu(),
    ...     Reshape(),
    ...     Softmax(10),
    ... )
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        if self.alpha == 0:
            return tf.nn.relu(input_value)
        return tf.nn.leaky_relu(input_value, asfloat(self.alpha))
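# --- Hedged usage sketch (not part of the NeuPy sources) ---
# Illustrates the ``alpha`` parameter documented above: with a non-zero
# ``alpha`` the layer behaves like a leaky ReLU. It reuses the ``>``
# composition style from the docstring examples; the layer sizes and the
# alpha value are arbitrary.
from neupy.layers import Input, Relu

network = Input(10) > Relu(20, alpha=0.1) > Relu(1)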
def __init__(self, n_units=None, weight=init.HeNormal(), bias=0, name=None):
    super(Linear, self).__init__(name=name)

    self.n_units = n_units
    self.weight = weight
    self.bias = bias
def __init__(self, size, padding='valid', stride=1, dilation=1,
             weight=init.HeNormal(gain=2), bias=0, name=None):

    super(Convolution, self).__init__(name=name)

    self.size = size
    self.padding = padding
    self.stride = stride
    self.dilation = dilation
    self.weight = weight
    self.bias = bias
def __init__(
        self, n_units, only_return_final=True,
        # Trainable parameters
        input_weights=init.HeNormal(),
        hidden_weights=init.HeNormal(),
        biases=0,
        # Activation functions
        resetgate=tf.nn.sigmoid,
        updategate=tf.nn.sigmoid,
        hidden_update=tf.tanh,
        # Cell states
        hidden_init=0,
        learn_init=False,
        # Misc
        unroll_scan=False,
        backwards=False,
        gradient_clipping=0,
        name=None):

    super(GRU, self).__init__(
        n_units=n_units,
        only_return_final=only_return_final,
        name=name,
    )

    self.input_weights = input_weights
    self.hidden_weights = hidden_weights
    self.biases = biases

    self.resetgate = resetgate
    self.updategate = updategate
    self.hidden_update = hidden_update

    self.hidden_init = hidden_init
    self.learn_init = learn_init

    self.unroll_scan = unroll_scan
    self.backwards = backwards
    self.gradient_clipping = gradient_clipping
def __init__(self, n_units=None, alpha=0, weight=init.HeNormal(gain=2),
             bias=init.Constant(value=0), name=None):

    self.alpha = alpha
    super(Relu, self).__init__(
        n_units=n_units, weight=weight, bias=bias, name=name)
def __init__(self, size, padding='valid', stride=1,
             weight=init.HeNormal(gain=2), bias=0, name=None):

    super(Deconvolution, self).__init__(
        size=size, padding=padding, stride=stride,
        dilation=1, weight=weight, bias=bias, name=name)
def __init__(self, n_units=None, alpha_axes=-1, alpha=0.25,
             weight=init.HeNormal(gain=2), bias=0, name=None):

    self.alpha = alpha
    self.alpha_axes = as_tuple(alpha_axes)

    if 0 in self.alpha_axes:
        raise ValueError("Cannot specify alpha for 0-axis")

    super(PRelu, self).__init__(
        n_units=n_units, weight=weight, bias=bias, name=name)
class Relu(ActivationLayer):
    """
    The layer with the rectifier (ReLU) activation function.

    Parameters
    ----------
    alpha : float
        Alpha parameter defines the decreasing rate for the
        negative values. If ``alpha`` is a non-zero value,
        the layer behaves like a leaky ReLU. Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        if self.alpha == 0:
            return tf.nn.relu(input_value)
        return tf.nn.leaky_relu(input_value, asfloat(self.alpha))
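# --- Hedged numeric illustration (assumed alpha value) ---
# Shows what the leaky branch of activation_function above computes:
# positive inputs pass through unchanged while negative inputs are scaled
# by alpha. Plain numpy is used here instead of tf.nn.leaky_relu so the
# values can be printed directly.
import numpy as np

def leaky_relu(x, alpha=0.2):
    return np.where(x >= 0, x, alpha * x)

print(leaky_relu(np.array([3.0, -2.0])))  # [ 3.  -0.4]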
def test_he_initializer_repr(self):
    he_initializer = init.HeNormal()
    self.assertEqual("HeNormal(gain=1.0)", str(he_initializer))
class Embedding(BaseLayer):
    """
    Embedding layer accepts indices as an input and returns rows
    from the weight matrix associated with these indices. Useful
    in case of categorical features or for the word embedding tasks.

    Parameters
    ----------
    input_size : int
        Layer's input vector dimension. Usually associated with the
        number of categories or the number of unique words that the
        input vector has.

    output_size : int
        Layer's output vector dimension.

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}

    Examples
    --------
    This example converts a dataset that has only categorical
    variables into a format that is suitable for the Embedding layer.

    >>> import numpy as np
    >>> from neupy import layers
    >>>
    >>> dataset = np.array([
    ...     ['cold', 'high'],
    ...     ['hot', 'low'],
    ...     ['cold', 'low'],
    ...     ['hot', 'low'],
    ... ])
    >>>
    >>> unique_value, dataset_indices = np.unique(
    ...     dataset, return_inverse=True
    ... )
    >>> dataset_indices = dataset_indices.reshape((4, 2))
    >>> dataset_indices
    array([[0, 1],
           [2, 3],
           [0, 3],
           [2, 3]])
    >>>
    >>> n_features = dataset.shape[1]
    >>> n_unique_categories = len(unique_value)
    >>> embedded_size = 1
    >>>
    >>> connection = layers.join(
    ...     layers.Input(n_features),
    ...     layers.Embedding(n_unique_categories, embedded_size),
    ...     # Output from the embedding layer is 3D
    ...     # To make output 2D we need to reshape dimensions
    ...     layers.Reshape(),
    ... )
    """
    input_size = IntProperty(minval=1)
    output_size = IntProperty(minval=1)
    weight = ParameterProperty(default=init.HeNormal())

    def __init__(self, input_size, output_size, **options):
        super(Embedding, self).__init__(
            input_size=input_size, output_size=output_size, **options)

    @property
    def output_shape(self):
        if self.input_shape is not None:
            return as_tuple(self.input_shape, self.output_size)

    def initialize(self):
        super(Embedding, self).initialize()
        self.add_parameter(
            value=self.weight, name='weight',
            shape=as_tuple(self.input_size, self.output_size),
            trainable=True)

    def output(self, input_value):
        input_value = tf.cast(input_value, tf.int32)
        return tf.gather(self.weight, input_value)

    def __repr__(self):
        classname = self.__class__.__name__
        return '{name}({input_size}, {output_size})'.format(
            name=classname,
            input_size=self.input_size,
            output_size=self.output_size,
        )
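# --- Hedged numpy sketch (not part of the NeuPy sources) ---
# Mirrors what Embedding.output does with tf.gather: every index selects a
# row from the (input_size, output_size) weight matrix. Using the docstring's
# example (4 unique categories, embedded_size=1), a (4, 2) index matrix
# becomes a (4, 2, 1) tensor, which is why Reshape() follows the layer.
import numpy as np

weight = np.random.randn(4, 1)                  # (input_size, output_size)
dataset_indices = np.array([[0, 1], [2, 3], [0, 3], [2, 3]])

embedded = weight[dataset_indices]              # numpy analogue of tf.gather
print(embedded.shape)                           # (4, 2, 1)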
class Convolution(ParameterBasedLayer):
    """
    Convolutional layer.

    Parameters
    ----------
    size : tuple of int
        Filter shape. It should be defined as a tuple with three
        integers ``(filter rows, filter columns, output channels)``.

    padding : {{``same``, ``valid``}}, int, tuple
        Zero padding for the input tensor.

        - ``valid`` - Padding won't be added to the tensor. Result will
          be the same as for ``padding=0``

        - ``same`` - Padding will depend on the number of rows and
          columns in the filter. This padding makes sure that image with
          the ``stride=1`` won't change its width and height. It's the
          same as ``padding=(filter rows // 2, filter columns // 2)``.

        - Custom value for the padding can be specified as an integer,
          like ``padding=1`` or it can be specified as a tuple when
          different dimensions have different padding values, for
          example ``padding=(2, 3)``.

        Defaults to ``valid``.

    stride : tuple with ints, int.
        Stride size. Defaults to ``(1, 1)``

    dilation : int, tuple
        Rate for the filter upsampling. When ``dilation > 1`` the layer
        becomes a dilated convolution (or atrous convolution).
        Defaults to ``1``.

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Shape of the weight will be equal to
        ``(filter rows, filter columns, input channels, output channels)``.
        Default initialization methods you can find
        :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Examples
    --------
    2D Convolution

    >>> from neupy import layers
    >>>
    >>> layers.join(
    ...     layers.Input((28, 28, 3)),
    ...     layers.Convolution((3, 3, 16)),
    ... )

    1D Convolution

    >>> from neupy import layers
    >>>
    >>> layers.join(
    ...     layers.Input((30, 10)),
    ...     layers.Reshape((30, 1, 10)),
    ...     layers.Convolution((3, 1, 16)),
    ... )

    Methods
    -------
    {ParameterBasedLayer.Methods}

    Attributes
    ----------
    {ParameterBasedLayer.Attributes}
    """
    # We use gain=2 because it's a suitable choice for the relu
    # non-linearity and relu is the most common non-linearity used for CNN.
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    size = TypedListProperty(required=True, element_type=int)
    padding = PaddingProperty(default='valid')
    stride = Spatial2DProperty(default=(1, 1))
    dilation = Spatial2DProperty(default=1)

    def validate(self, input_shape):
        if input_shape and len(input_shape) != 3:
            raise LayerConnectionError(
                "Convolutional layer expects an input with 3 "
                "dimensions, got {} with shape {}"
                "".format(len(input_shape), input_shape))

    def output_shape_per_dim(self, *args, **kwargs):
        return conv_output_shape(*args, **kwargs)

    def find_output_from_input_shape(self, input_shape):
        padding = self.padding

        rows, cols, _ = input_shape
        row_filter_size, col_filter_size, n_kernels = self.size
        row_stride, col_stride = self.stride
        row_dilation, col_dilation = self.dilation or (1, 1)

        if isinstance(padding, (list, tuple)):
            row_padding, col_padding = padding
        else:
            row_padding, col_padding = padding, padding

        output_rows = self.output_shape_per_dim(
            rows, row_filter_size,
            row_padding, row_stride, row_dilation,
        )
        output_cols = self.output_shape_per_dim(
            cols, col_filter_size,
            col_padding, col_stride, col_dilation,
        )
        return (output_rows, output_cols, n_kernels)

    @property
    def output_shape(self):
        if self.input_shape is not None:
            return self.find_output_from_input_shape(self.input_shape)

    @property
    def weight_shape(self):
        n_channels = self.input_shape[-1]
        n_rows, n_cols, n_filters = self.size
        return (n_rows, n_cols, n_channels, n_filters)

    @property
    def bias_shape(self):
        return as_tuple(self.size[-1])

    def output(self, input_value):
        padding = self.padding

        if not isinstance(padding, six.string_types):
            height_pad, weight_pad = padding
            input_value = tf.pad(input_value, [
                [0, 0],
                [height_pad, height_pad],
                [weight_pad, weight_pad],
                [0, 0],
            ])
            # VALID option will make sure that
            # convolution won't use any padding.
            padding = 'VALID'

        output = tf.nn.convolution(
            input_value,
            self.weight,
            padding=padding,
            strides=self.stride,
            dilation_rate=self.dilation,
            data_format="NHWC"
        )

        if self.bias is not None:
            bias = tf.reshape(self.bias, (1, 1, 1, -1))
            output += bias

        return output
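# --- Hedged sketch of the output-size arithmetic (not the library function) ---
# find_output_from_input_shape above delegates to conv_output_shape; the
# standard convolution formula below reproduces the common case and is only
# an approximation of what the library computes.
def conv_output_size(in_size, filter_size, padding, stride, dilation=1):
    effective_filter = dilation * (filter_size - 1) + 1
    return (in_size + 2 * padding - effective_filter) // stride + 1

# Input((28, 28, 3)) -> Convolution((3, 3, 16)) with padding='valid' (i.e. 0):
print(conv_output_size(28, 3, padding=0, stride=1))  # 26, so output is (26, 26, 16)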
class ParameterBasedLayer(BaseLayer):
    """
    Layer that creates weight and bias parameters.

    Parameters
    ----------
    size : int
        Layer's output size.

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    bias : 1D array-like, Tensorflow variable, scalar, Initializer or None
        Defines layer's bias. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.
        The ``None`` value excludes bias from the calculations and
        does not add it to the list of parameters.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    size = IntProperty(minval=1)
    weight = ParameterProperty(default=init.HeNormal())
    bias = ParameterProperty(default=init.Constant(value=0), allow_none=True)

    def __init__(self, size, **options):
        super(ParameterBasedLayer, self).__init__(size=size, **options)

    @property
    def weight_shape(self):
        return as_tuple(self.input_shape, self.output_shape)

    @property
    def bias_shape(self):
        if self.bias is not None:
            return as_tuple(self.output_shape)

    def initialize(self):
        super(ParameterBasedLayer, self).initialize()

        self.add_parameter(
            value=self.weight, name='weight',
            shape=self.weight_shape, trainable=True)

        if self.bias is not None:
            self.add_parameter(
                value=self.bias, name='bias',
                shape=self.bias_shape, trainable=True)

    def __repr__(self):
        classname = self.__class__.__name__
        return '{name}({size})'.format(name=classname, size=self.size)
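# --- Hedged illustration (assumed layer sizes) ---
# For a layer that maps a 10-dimensional input to 20 outputs, weight_shape
# and bias_shape above resolve to (10, 20) and (20,). Sampling the default
# initializers directly shows the parameter shapes that would be registered;
# the exact return type of sample() may differ between NeuPy versions.
from neupy import init

weight = init.HeNormal().sample((10, 20))       # matches weight_shape
bias = init.Constant(value=0).sample((20,))     # matches bias_shape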
class LSTM(BaseRNNLayer):
    """
    Long Short Term Memory (LSTM) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    input_weights : Initializer, ndarray
        Weight parameters for input connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    hidden_weights : Initializer, ndarray
        Weight parameters for hidden connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    cell_weights : Initializer, ndarray
        Weight parameters for cell connection. Required only when
        ``peepholes=True``, otherwise it will be ignored.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    biases : Initializer, ndarray
        Bias parameters for all gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import tensorflow as tf
            dict(
                ingate=tf.nn.sigmoid,
                forgetgate=tf.nn.sigmoid,
                outgate=tf.nn.sigmoid,
                cell=tf.tanh,
            )

        If the application requires modification of only one parameter,
        it's better to specify only the one that you need to modify and
        ignore the other parameters.

        .. code-block:: python

            dict(ingate=tf.tanh)

        Other parameters like ``forgetgate`` or ``outgate`` will be
        equal to their default values.

    learn_init : bool
        If ``True``, make ``cell_init`` and ``hidden_init`` trainable
        variables. Defaults to ``False``.

    cell_init : array-like, Tensorflow variable, scalar or Initializer
        Initializer for initial cell state (:math:`c_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    hidden_init : array-like, Tensorflow variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always from
        :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    {BaseRNNLayer.only_return_final}

    peepholes : bool
        If ``True``, the LSTM uses peephole connections.
        When ``False``, cell parameters are ignored.
        Defaults to ``False``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it might
        also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input sequence
        length must be known at compile time (i.e., cannot be given as
        ``None``). Defaults to ``False``.

    gradient_clipping : float or int
        If nonzero, the gradient messages are clipped to the given value
        during the backward pass. Defaults to ``0``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------
    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.LSTM(20),
                layers.Sigmoid(1),
            ]
        )
    """
    input_weights = ParameterProperty(default=init.HeNormal())
    hidden_weights = ParameterProperty(default=init.HeNormal())
    cell_weights = ParameterProperty(default=init.HeNormal())
    biases = ParameterProperty(default=init.Constant(0))

    activation_functions = MultiCallableProperty(default=dict(
        ingate=tf.nn.sigmoid,
        forgetgate=tf.nn.sigmoid,
        outgate=tf.nn.sigmoid,
        cell=tf.tanh,
    ))

    learn_init = Property(default=False, expected_type=bool)
    cell_init = ParameterProperty(default=init.Constant(0))
    hidden_init = ParameterProperty(default=init.Constant(0))

    unroll_scan = Property(default=False, expected_type=bool)
    backwards = Property(default=False, expected_type=bool)
    peepholes = Property(default=False, expected_type=bool)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(LSTM, self).initialize()
        n_inputs = np.prod(self.input_shape[1:])

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections. These are elementwise products with the
        # cell state, so they are represented as vectors.
        if self.peepholes:
            self.weight_cell_to_ingate = self.add_parameter(
                value=self.cell_weights,
                name='weight_cell_to_ingate',
                shape=(self.size,))

            self.weight_cell_to_forgetgate = self.add_parameter(
                value=self.cell_weights,
                name='weight_cell_to_forgetgate',
                shape=(self.size,))

            self.weight_cell_to_outgate = self.add_parameter(
                value=self.cell_weights,
                name='weight_cell_to_outgate',
                shape=(self.size,))

        self.input_weights = self.add_parameter(
            value=self.input_weights,
            name='input_weights',
            shape=(n_inputs, 4 * self.size),
        )
        self.hidden_weights = self.add_parameter(
            value=self.hidden_weights,
            name='hidden_weights',
            shape=(self.size, 4 * self.size),
        )
        self.biases = self.add_parameter(
            value=self.biases, name='biases',
            shape=(4 * self.size,),
        )

        # Initialization parameters
        self.add_parameter(
            value=self.cell_init,
            shape=(1, self.size),
            name="cell_init",
            trainable=self.learn_init,
        )
        self.add_parameter(
            value=self.hidden_init,
            shape=(1, self.size),
            name="hidden_init",
            trainable=self.learn_init,
        )

    def output(self, input_value):
        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = tf.transpose(input_value, [1, 0, 2])

        input_shape = tf.shape(input_value)
        n_batch = input_shape[1]

        def one_lstm_step(states, input_n):
            with tf.name_scope('lstm-cell'):
                cell_previous, hid_previous = states
                input_n = tf.matmul(input_n, self.input_weights) + self.biases

                # Calculate gates pre-activations and slice
                gates = input_n + tf.matmul(hid_previous, self.hidden_weights)

                # Clip gradients
                if self.gradient_clipping != 0:
                    gates = clip_gradient(gates, self.gradient_clipping)

                # Extract the pre-activation gate values
                ingate, forgetgate, cell_input, outgate = tf.split(
                    gates, 4, axis=1)

                if self.peepholes:
                    # Compute peephole connections
                    ingate += cell_previous * self.weight_cell_to_ingate
                    forgetgate += (
                        cell_previous * self.weight_cell_to_forgetgate)

                # Apply nonlinearities
                ingate = self.activation_functions.ingate(ingate)
                forgetgate = self.activation_functions.forgetgate(forgetgate)
                cell_input = self.activation_functions.cell(cell_input)

                # Compute new cell value
                cell = forgetgate * cell_previous + ingate * cell_input

                if self.peepholes:
                    outgate += cell * self.weight_cell_to_outgate

                outgate = self.activation_functions.outgate(outgate)

                # Compute new hidden unit activation
                hid = outgate * tf.tanh(cell)
                return [cell, hid]

        cell_init = tf.tile(self.cell_init, (n_batch, 1))
        hidden_init = tf.tile(self.hidden_init, (n_batch, 1))

        sequence = input_value
        if self.backwards:
            sequence = tf.reverse(sequence, axis=[0])

        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(
                fn=one_lstm_step,
                sequence=sequence,
                outputs_info=[cell_init, hidden_init],
            )
        else:
            _, hid_out = tf.scan(
                fn=one_lstm_step,
                elems=input_value,
                initializer=[cell_init, hidden_init],
                name='lstm-scan',
            )

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = tf.reverse(hid_out, axis=[0])

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = tf.transpose(hid_out, [1, 0, 2])

        return hid_out
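# --- Hedged usage sketch based on the docstring above ---
# Overrides only the input gate's activation through activation_functions;
# the remaining gates keep their default sigmoid/tanh values. The layer
# size is arbitrary.
import tensorflow as tf
from neupy import layers

lstm_layer = layers.LSTM(20, activation_functions=dict(ingate=tf.tanh))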
class GRU(BaseRNNLayer):
    """
    Gated Recurrent Unit (GRU) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    input_weights : Initializer, ndarray
        Weight parameters for input connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    hidden_weights : Initializer, ndarray
        Weight parameters for hidden connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    biases : Initializer, ndarray
        Bias parameters for all gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import tensorflow as tf
            dict(
                resetgate=tf.nn.sigmoid,
                updategate=tf.nn.sigmoid,
                hidden_update=tf.tanh,
            )

        If the application requires modification of only one parameter,
        it's better to specify only the one that you need to modify and
        ignore the other parameters.

        .. code-block:: python

            dict(resetgate=tf.tanh)

        Other parameters like ``updategate`` or ``hidden_update`` will
        be equal to their default values.

    learn_init : bool
        If ``True``, make ``hidden_init`` a trainable variable.
        Defaults to ``False``.

    hidden_init : array-like, Tensorflow variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    {BaseRNNLayer.only_return_final}

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always from
        :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it might
        also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input sequence
        length must be known at compile time (i.e., cannot be given as
        ``None``). Defaults to ``False``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------
    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.GRU(20),
                layers.Sigmoid(1),
            ]
        )
    """
    input_weights = ParameterProperty(default=init.HeNormal())
    hidden_weights = ParameterProperty(default=init.HeNormal())
    biases = ParameterProperty(default=init.Constant(0))

    activation_functions = MultiCallableProperty(default=dict(
        resetgate=tf.nn.sigmoid,
        updategate=tf.nn.sigmoid,
        hidden_update=tf.tanh,
    ))

    learn_init = Property(default=False, expected_type=bool)
    hidden_init = ParameterProperty(default=init.Constant(0))

    backwards = Property(default=False, expected_type=bool)
    unroll_scan = Property(default=False, expected_type=bool)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(GRU, self).initialize()
        n_inputs = np.prod(self.input_shape[1:])

        self.input_weights = self.add_parameter(
            value=self.input_weights,
            name='input_weights',
            shape=(n_inputs, 3 * self.size),
        )
        self.hidden_weights = self.add_parameter(
            value=self.hidden_weights,
            name='hidden_weights',
            shape=(self.size, 3 * self.size),
        )
        self.biases = self.add_parameter(
            value=self.biases, name='biases',
            shape=(3 * self.size,),
        )

        self.add_parameter(
            value=self.hidden_init,
            shape=(1, self.size),
            name="hidden_init",
            trainable=self.learn_init)

    def output(self, input_value):
        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = tf.transpose(input_value, [1, 0, 2])

        input_shape = tf.shape(input_value)
        n_batch = input_shape[1]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def one_gru_step(states, input_n):
            with tf.name_scope('gru-cell'):
                hid_previous, = states
                input_n = tf.matmul(input_n, self.input_weights) + self.biases

                # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1},
                # and W_{hc} h_{t - 1}
                hid_input = tf.matmul(hid_previous, self.hidden_weights)

                if self.gradient_clipping != 0:
                    input_n = clip_gradient(
                        input_n, self.gradient_clipping)
                    hid_input = clip_gradient(
                        hid_input, self.gradient_clipping)

                hid_resetgate, hid_updategate, hid_hidden = tf.split(
                    hid_input, 3, axis=1)

                in_resetgate, in_updategate, in_hidden = tf.split(
                    input_n, 3, axis=1)

                # Reset and update gates
                resetgate = self.activation_functions.resetgate(
                    hid_resetgate + in_resetgate)

                updategate = self.activation_functions.updategate(
                    hid_updategate + in_updategate)

                # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
                hidden_update = in_hidden + resetgate * hid_hidden

                if self.gradient_clipping != 0:
                    hidden_update = clip_gradient(
                        hidden_update, self.gradient_clipping)

                hidden_update = self.activation_functions.hidden_update(
                    hidden_update)

                # Compute (1 - u_t)h_{t - 1} + u_t c_t
                return [
                    hid_previous - updategate * (hid_previous - hidden_update)
                ]

        hidden_init = tf.tile(self.hidden_init, (n_batch, 1))

        sequence = input_value
        if self.backwards:
            sequence = tf.reverse(sequence, axis=[0])

        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(
                fn=one_gru_step,
                sequence=sequence,
                outputs_info=[hidden_init])
        else:
            hid_out, = tf.scan(
                fn=one_gru_step,
                elems=input_value,
                initializer=[hidden_init],
                name='gru-scan',
            )

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = tf.reverse(hid_out, axis=[0])

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = tf.transpose(hid_out, [1, 0, 2])

        return hid_out
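# --- Hedged numeric check (assumed toy values) ---
# Verifies that the expression returned by one_gru_step,
# hid_previous - updategate * (hid_previous - hidden_update),
# equals the textbook update (1 - u_t) * h_{t-1} + u_t * c_t noted in the
# comment above.
import numpy as np

h_prev = np.array([0.5, -1.0])
u = np.array([0.2, 0.9])     # update gate
c = np.array([1.0, 0.0])     # candidate hidden update

assert np.allclose(h_prev - u * (h_prev - c), (1 - u) * h_prev + u * c)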
def test_gain_relu(self):
    he_initializer = init.HeNormal(gain='relu')
    self.assertEqual(he_initializer.gain, math.sqrt(2))