Example no. 1
class Relu(ActivationLayer):
    """
    The layer with the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        The alpha parameter defines the slope for negative
        input values. If ``alpha`` is a non-zero value, then
        the layer behaves like a leaky ReLu. Defaults to ``0``.

    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.XavierNormal(gain='relu'))

    def activation_function(self, input_value):
        alpha = asfloat(self.alpha)
        return T.nnet.relu(input_value, alpha)
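
As a quick sanity check, the activation above matches the usual piecewise definition: identity for positive inputs and ``alpha * x`` for negative inputs. A minimal NumPy sketch of the same computation (illustrative only, independent of Theano):

import numpy as np

def leaky_relu(x, alpha=0.0):
    # Identity for positive inputs, slope `alpha` for negative inputs;
    # with alpha == 0 this is the plain ReLu used by default above.
    return np.where(x > 0, x, alpha * x)

x = np.array([-2.0, -0.5, 0.0, 1.5])
print(leaky_relu(x))             # negative values clipped to zero
print(leaky_relu(x, alpha=0.1))  # negative values scaled by 0.1
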
Example no. 2
class PRelu(ActivationLayer):
    """
    The layer with the parametrized ReLu activation
    function.

    Parameters
    ----------
    alpha_axes : int or tuple
        Axes along which the layer keeps a separate alpha parameter;
        alpha is shared across all the other axes.
        A single integer value is equivalent to a tuple with one value.
        Defaults to ``1``.

    alpha : array-like, Theano shared variable, scalar or Initializer
        Alpha parameter for each non-shared axis of the ReLu.
        A scalar value means that every element in the tensor will be
        equal to the specified value.
        The default initialization methods are described
        :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0.25)``.

    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    References
    ----------
    .. [1] https://arxiv.org/pdf/1502.01852v1.pdf
    """
    alpha_axes = AxesProperty(default=1)
    alpha = ParameterProperty(default=init.Constant(value=0.25))

    def __init__(self, *args, **options):
        super(PRelu, self).__init__(*args, **options)

        if 0 in self.alpha_axes:
            raise ValueError("Cannot specify alpha for 0-axis")

    def validate(self, input_shape):
        if max(self.alpha_axes) > len(input_shape):
            max_axis_index = len(input_shape) - 1
            raise ValueError("Cannot specify alpha for the axis #{}. "
                             "Maximum available axis is #{} (0-based indeces)."
                             "".format(max(self.alpha_axes), max_axis_index))

    def initialize(self):
        super(PRelu, self).initialize()

        alpha_shape = [self.output_shape[axis - 1] for axis in self.alpha_axes]
        self.add_parameter(value=self.alpha, name='alpha',
                           shape=alpha_shape, trainable=True)

    def activation_function(self, input_value):
        alpha = dimshuffle(self.alpha, input_value.ndim, self.alpha_axes)
        return T.nnet.relu(input_value, alpha)
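
The ``dimshuffle`` call broadcasts one alpha value per index of each axis listed in ``alpha_axes`` and shares it across the remaining axes. A hedged NumPy sketch of that broadcasting for a 4D tensor with ``alpha_axes=(1,)`` (shapes and names are illustrative):

import numpy as np

x = np.random.randn(2, 3, 5, 5)   # (batch, channels, rows, cols)
alpha = np.full(3, 0.25)          # one alpha per channel, i.e. alpha_axes=(1,)

# Reshape alpha to (1, 3, 1, 1) so it broadcasts over the batch and spatial
# axes, which mirrors what dimshuffle prepares for T.nnet.relu(x, alpha)
alpha_broadcast = alpha.reshape(1, -1, 1, 1)
output = np.where(x > 0, x, alpha_broadcast * x)
print(output.shape)               # (2, 3, 5, 5)
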
Example no. 3
class Relu(ActivationLayer):
    """
    The layer with the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        The alpha parameter defines the slope for negative
        input values. If ``alpha`` is a non-zero value, then
        the layer behaves like a leaky ReLu. Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines the layer's weights. The default initialization
        methods are described :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    Examples
    --------
    Feedforward Neural Networks (FNN)

    >>> from neupy.layers import *
    >>> network = Input(10) > Relu(20) > Relu(1)

    Convolutional Neural Networks (CNN)

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((32, 32, 3)),
    ...     Convolution((3, 3, 16)) > Relu(),
    ...     Convolution((3, 3, 32)) > Relu(),
    ...     Reshape(),
    ...     Softmax(10),
    ... )
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        if self.alpha == 0:
            return tf.nn.relu(input_value)
        return tf.nn.leaky_relu(input_value, asfloat(self.alpha))
Example no. 4
class BaseAssociative(UnsupervisedLearningMixin, BaseNetwork):
    """
    Base class for associative learning.

    Parameters
    ----------
    n_inputs : int
        Number of input units.
    n_outputs : int
        Number of output units.
    weight : array-like, Initializer
        Neural network weights.
        A manually defined value should have shape ``(n_inputs, n_outputs)``.
        Defaults to :class:`Normal() <neupy.core.init.Normal>`.
    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    train(input_train, epochs=100):
        Train neural network.
    {BaseSkeleton.fit}
    """

    n_inputs = IntProperty(minval=1, required=True)
    n_outputs = IntProperty(minval=1, required=True)
    weight = ParameterProperty(default=Normal())

    def __init__(self, **options):
        super(BaseAssociative, self).__init__(**options)
        self.init_layers()

    def init_layers(self):
        valid_weight_shape = (self.n_inputs, self.n_outputs)

        if isinstance(self.weight, Initializer):
            self.weight = self.weight.sample(valid_weight_shape)

        if self.weight.shape != valid_weight_shape:
            raise ValueError("Weight matrix has invalid shape. Got {}, "
                             "expected {}".format(self.weight.shape,
                                                  valid_weight_shape))

        self.weight = self.weight.astype(float)

    def train(self, input_train, epochs=100):
        return super(BaseAssociative, self).train(input_train,
                                                  epochs=epochs,
                                                  epsilon=None)
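
``init_layers`` either samples a weight matrix from the initializer or validates a manually supplied array against ``(n_inputs, n_outputs)``. A small NumPy-only sketch of the same check, with a callable standing in for an ``Initializer`` (names are illustrative, not the neupy API):

import numpy as np

def prepare_weight(weight, n_inputs, n_outputs):
    expected_shape = (n_inputs, n_outputs)

    if callable(weight):  # stand-in for an Initializer with a sample() method
        weight = weight(expected_shape)

    weight = np.asarray(weight, dtype=float)
    if weight.shape != expected_shape:
        raise ValueError("Weight matrix has invalid shape. Got {}, "
                         "expected {}".format(weight.shape, expected_shape))
    return weight

weight = prepare_weight(lambda shape: np.random.normal(size=shape), 3, 2)
print(weight.shape)  # (3, 2)
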
Example no. 5
class Relu(ActivationLayer):
    """
    The layer with the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        The alpha parameter defines the slope for negative
        input values. If ``alpha`` is a non-zero value, then
        the layer behaves like a leaky ReLu. Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines the layer's weights. The default initialization
        methods are described :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        if self.alpha == 0:
            return tf.nn.relu(input_value)
        return tf.nn.leaky_relu(input_value, asfloat(self.alpha))
Example no. 6
class Oja(BaseNetwork):
    """
    Oja is an unsupervised technique used for
    dimensionality reduction tasks.

    Notes
    -----
    - In practice, use a very small value for the step.
      For instance, ``1e-7`` can be a good choice.

    - Normalize the input data before using the Oja algorithm.
      The input data shouldn't contain large values.

    - Use smaller weight values if the error for the first few
      iterations is large compared to the scale of the input values.
      For instance, if your input data has values between
      ``0`` and ``1``, an error value of ``100`` is large.

    - During training, the network reports the mean absolute error (MAE).

    Parameters
    ----------
    minimized_data_size : int
        Expected number of features after minimization,
        defaults to ``1``.

    weight : array-like or ``None``
        Defines the network's weights.
        Defaults to :class:`XavierNormal() <neupy.init.XavierNormal>`.

    {BaseNetwork.Parameters}

    Methods
    -------
    reconstruct(X)
        Reconstruct original dataset from the minimized input.

    train(X, epochs=100)
        Trains the network on the data ``X``. The network trains until
        the maximum number of ``epochs`` is reached.

    predict(X)
        Returns the hidden representation of the input data ``X``. Basically,
        it applies dimensionality reduction.

    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> data = np.array([[2, 2], [1, 1], [4, 4], [5, 5]])
    >>>
    >>> ojanet = algorithms.Oja(
    ...     minimized_data_size=1,
    ...     step=0.01,
    ...     verbose=False
    ... )
    >>>
    >>> ojanet.train(data, epochs=100)
    >>> minimized = ojanet.predict(data)
    >>> minimized
    array([[-2.82843122],
           [-1.41421561],
           [-5.65686243],
           [-7.07107804]])
    >>> ojanet.reconstruct(minimized)
    array([[ 2.00000046,  2.00000046],
           [ 1.00000023,  1.00000023],
           [ 4.00000093,  4.00000093],
           [ 5.00000116,  5.00000116]])
    """
    minimized_data_size = IntProperty(minval=1)
    weight = ParameterProperty(default=init.XavierNormal())

    def one_training_update(self, X, y_train):
        weight = self.weight

        minimized = np.dot(X, weight)
        reconstruct = np.dot(minimized, weight.T)
        error = X - reconstruct

        weight += self.step * np.dot(error.T, minimized)
        mae = np.sum(np.abs(error)) / X.size

        # Clean objects from the memory
        del minimized
        del reconstruct
        del error

        return mae

    def train(self, X, epochs=100):
        X = format_data(X)
        n_input_features = X.shape[1]

        if isinstance(self.weight, init.Initializer):
            weight_shape = (n_input_features, self.minimized_data_size)
            self.weight = self.weight.sample(weight_shape, return_array=True)

        if n_input_features != self.weight.shape[0]:
            raise ValueError("Invalid number of features. Expected {}, got {}"
                             "".format(self.weight.shape[0], n_input_features))

        super(Oja, self).train(X, epochs=epochs)

    def reconstruct(self, X):
        if not isinstance(self.weight, np.ndarray):
            raise NotTrained("Network hasn't been trained yet")

        X = format_data(X)
        if X.shape[1] != self.minimized_data_size:
            raise ValueError("Invalid input data feature space, expected "
                             "{}, got {}.".format(X.shape[1],
                                                  self.minimized_data_size))

        return np.dot(X, self.weight.T)

    def predict(self, X):
        if not isinstance(self.weight, np.ndarray):
            raise NotTrained("Network hasn't been trained yet")

        X = format_data(X)
        return np.dot(X, self.weight)
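
The update inside ``one_training_update`` is Oja's subspace rule: project, reconstruct, and move the weights along ``error.T.dot(minimized)``. A compact standalone sketch that runs the same loop on the toy data from the docstring (NumPy only, not the neupy API):

import numpy as np

X = np.array([[2., 2.], [1., 1.], [4., 4.], [5., 5.]])
step = 0.01
rng = np.random.RandomState(0)
W = rng.normal(scale=0.1, size=(2, 1))  # (n_features, minimized_data_size)

for epoch in range(100):
    minimized = X.dot(W)                # project into the reduced space
    reconstructed = minimized.dot(W.T)  # map back to the original space
    error = X - reconstructed
    W += step * error.T.dot(minimized)  # Oja's learning rule

print(X.dot(W))           # reduced representation, one value per sample
print(X.dot(W).dot(W.T))  # reconstruction, close to the original X
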
Example no. 7
class BaseAssociative(BaseNetwork):
    """
    Base class for associative learning.

    Parameters
    ----------
    n_inputs : int
        Number of features (columns) in the input data.

    n_outputs : int
        Number of outputs in the network.

    weight : array-like, Initializer
        Neural network weights.
        A manually defined value should have shape ``(n_inputs, n_outputs)``.
        Defaults to :class:`Normal() <neupy.init.Normal>`.

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    train(input_train, summary='table', epochs=100)
        Train neural network.

    {BaseSkeleton.fit}
    """
    n_inputs = IntProperty(minval=1, required=True)
    n_outputs = IntProperty(minval=1, required=True)
    weight = ParameterProperty(default=init.Normal())

    def __init__(self, **options):
        super(BaseAssociative, self).__init__(**options)
        self.init_layers()

    def init_layers(self):
        valid_weight_shape = (self.n_inputs, self.n_outputs)

        if isinstance(self.weight, init.Initializer):
            self.weight = self.weight.sample(
                valid_weight_shape, return_array=True)

        if self.weight.shape != valid_weight_shape:
            raise ValueError(
                "Weight matrix has invalid shape. Got {}, expected {}"
                "".format(self.weight.shape, valid_weight_shape))

        self.weight = self.weight.astype(float)

    def format_input_data(self, input_data):
        is_feature1d = self.n_inputs == 1
        input_data = format_data(input_data, is_feature1d)

        if input_data.ndim != 2:
            raise ValueError("Cannot make prediction, because input "
                             "data has more than 2 dimensions")

        n_samples, n_features = input_data.shape

        if n_features != self.n_inputs:
            raise ValueError("Input data expected to have {} features, "
                             "but got {}".format(self.n_inputs, n_features))

        return input_data

    def train(self, input_train, summary='table', epochs=100):
        input_train = self.format_input_data(input_train)

        return super(BaseAssociative, self).train(
            input_train=input_train, target_train=None,
            input_test=None, target_test=None,
            epochs=epochs, epsilon=None,
            summary=summary)
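
``format_input_data`` only verifies that the input is a 2D array whose number of columns matches ``n_inputs``. A hedged NumPy-only equivalent of that validation (the helper name is made up for illustration):

import numpy as np

def check_input(input_data, n_inputs):
    input_data = np.atleast_2d(np.asarray(input_data, dtype=float))

    if input_data.ndim != 2:
        raise ValueError("Cannot make prediction, because input "
                         "data has more than 2 dimensions")

    if input_data.shape[1] != n_inputs:
        raise ValueError("Input data expected to have {} features, "
                         "but got {}".format(n_inputs, input_data.shape[1]))

    return input_data

print(check_input([0.1, 0.2, 0.3], n_inputs=3).shape)  # (1, 3)
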
Example no. 8
class Oja(UnsupervisedLearningMixin, BaseNetwork):
    """
    Oja is an unsupervised algorithm that minimizes the
    input data feature space.

    Notes
    -----
    * In practice, use a very small value for the step. For example ``1e-7``.
    * Normalize the input data before using the Oja algorithm. Input data \
    shouldn't contain large values.
    * Use smaller weight values if the error for the first few iterations \
    is large compared to the scale of the input values. For example, if your \
    input data has values between 0 and 1, an error value of 100 is large.

    Parameters
    ----------
    minimized_data_size : int
        Expected number of features after minimization, defaults to ``1``.
    weight : array-like or ``None``
        Defines the network's weights.
        Defaults to :class:`XavierNormal() <neupy.core.init.XavierNormal>`.
    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    reconstruct(input_data):
        Reconstruct your minimized data.
    {BaseSkeleton.predict}
    {UnsupervisedLearningMixin.train}
    {BaseSkeleton.fit}

    Raises
    ------
    ValueError
        * Trying to reconstruct before the network was trained.
        * Invalid number of input data features for the ``train`` and \
        ``reconstruct`` methods.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> data = np.array([[2, 2], [1, 1], [4, 4], [5, 5]])
    >>>
    >>> ojanet = algorithms.Oja(
    ...     minimized_data_size=1,
    ...     step=0.01,
    ...     verbose=False
    ... )
    >>>
    >>> ojanet.train(data, epsilon=1e-5)
    >>> minimized = ojanet.predict(data)
    >>> minimized
    array([[-2.82843122],
           [-1.41421561],
           [-5.65686243],
           [-7.07107804]])
    >>> ojanet.reconstruct(minimized)
    array([[ 2.00000046,  2.00000046],
           [ 1.00000023,  1.00000023],
           [ 4.00000093,  4.00000093],
           [ 5.00000116,  5.00000116]])
    """
    minimized_data_size = IntProperty(minval=1)
    weight = ParameterProperty(default=init.XavierNormal())

    def init_properties(self):
        del self.shuffle_data
        super(Oja, self).init_properties()

    def train_epoch(self, input_data, target_train):
        weight = self.weight

        minimized = np.dot(input_data, weight)
        reconstruct = np.dot(minimized, weight.T)
        error = input_data - reconstruct

        weight += self.step * np.dot(error.T, minimized)

        mae = np.sum(np.abs(error)) / input_data.size

        # Clear memory
        del minimized
        del reconstruct
        del error

        return mae

    def train(self, input_data, epsilon=1e-2, epochs=100):
        input_data = format_data(input_data)
        n_input_features = input_data.shape[1]

        if isinstance(self.weight, init.Initializer):
            weight_shape = (n_input_features, self.minimized_data_size)
            self.weight = self.weight.sample(weight_shape)

        if n_input_features != self.weight.shape[0]:
            raise ValueError(
                "Invalid number of features. Expected {}, got {}".format(
                    self.weight.shape[0],
                    n_input_features
                )
            )

        super(Oja, self).train(input_data, epsilon=epsilon, epochs=epochs)

    def reconstruct(self, input_data):
        if not isinstance(self.weight, np.ndarray):
            raise NotTrainedException("Train network before use "
                                      "reconstruct method.")

        input_data = format_data(input_data)
        if input_data.shape[1] != self.minimized_data_size:
            raise ValueError(
                "Invalid input data feature space, expected "
                "{}, got {}.".format(
                    self.minimized_data_size,
                    input_data.shape[1]
                )
            )

        return np.dot(input_data, self.weight.T)

    def predict(self, input_data):
        if not isinstance(self.weight, np.ndarray):
            raise NotTrainedException("Train network before use "
                                      "prediction method.")

        input_data = format_data(input_data)
        return np.dot(input_data, self.weight)
Example no. 9
class RBM(BaseAlgorithm, BaseNetwork, MinibatchTrainingMixin, DumpableObject):
    """
    Boolean/Bernoulli Restricted Boltzmann Machine (RBM).
    Algorithm assumes that inputs are either binary
    values or values between 0 and 1.

    Parameters
    ----------
    n_visible : int
        Number of visible units. Number of features (columns)
        in the input data.

    n_hidden : int
        Number of hidden units. The larger the number, the more
        information the network can capture from the data, but it
        also means that the network is more likely to overfit.

    batch_size : int
        Size of the mini-batch. Defaults to ``10``.

    weight : array-like, Tensorflow variable, Initializer or scalar
        The default initialization methods are described
        :ref:`here <init-methods>`.
        Defaults to :class:`Normal <neupy.init.Normal>`.

    hidden_bias : array-like, Tensorflow variable, Initializer or scalar
        The default initialization methods are described
        :ref:`here <init-methods>`.
        Defaults to :class:`Constant(value=0) <neupy.init.Constant>`.

    visible_bias : array-like, Tensorflow variable, Initializer or scalar
        The default initialization methods are described
        :ref:`here <init-methods>`.
        Defaults to :class:`Constant(value=0) <neupy.init.Constant>`.

    {BaseNetwork.Parameters}

    Methods
    -------
    train(input_train, epochs=100)
        Trains network.

    {BaseSkeleton.fit}

    visible_to_hidden(visible_input)
        Propagates data through the network and returns output
        from the hidden layer.

    hidden_to_visible(hidden_input)
        Propagates output from the hidden layer backward
        to the visible.

    gibbs_sampling(visible_input, n_iter=1)
        Performs Gibbs sampling ``n_iter`` times using the visible input.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> data = np.array([
    ...     [1, 0, 1, 0],
    ...     [1, 0, 1, 0],
    ...     [1, 0, 0, 0],  # incomplete sample
    ...     [1, 0, 1, 0],
    ...
    ...     [0, 1, 0, 1],
    ...     [0, 0, 0, 1],  # incomplete sample
    ...     [0, 1, 0, 1],
    ...     [0, 1, 0, 1],
    ...     [0, 1, 0, 1],
    ...     [0, 1, 0, 1],
    ... ])
    >>>
    >>> rbm = algorithms.RBM(n_visible=4, n_hidden=1)
    >>> rbm.train(data, epochs=100)
    >>>
    >>> hidden_states = rbm.visible_to_hidden(data)
    >>> hidden_states.round(2)
    array([[ 0.99],
           [ 0.99],
           [ 0.95],
           [ 0.99],
           [ 0.  ],
           [ 0.01],
           [ 0.  ],
           [ 0.  ],
           [ 0.  ],
           [ 0.  ]])

    References
    ----------
    .. [1] G. Hinton, A Practical Guide to Training Restricted
       Boltzmann Machines, 2010.
       http://www.cs.toronto.edu/~hinton/absps/guideTR.pdf
    """
    n_visible = IntProperty(minval=1)
    n_hidden = IntProperty(minval=1)
    batch_size = IntProperty(minval=1, default=10)

    weight = ParameterProperty(default=init.Normal())
    hidden_bias = ParameterProperty(default=init.Constant(value=0))
    visible_bias = ParameterProperty(default=init.Constant(value=0))

    def __init__(self, n_visible, n_hidden, **options):
        options.update({'n_visible': n_visible, 'n_hidden': n_hidden})
        super(RBM, self).__init__(**options)

    def init_input_output_variables(self):
        with tf.variable_scope('rbm'):
            self.weight = create_shared_parameter(value=self.weight,
                                                  name='weight',
                                                  shape=(self.n_visible,
                                                         self.n_hidden))
            self.hidden_bias = create_shared_parameter(
                value=self.hidden_bias,
                name='hidden-bias',
                shape=(self.n_hidden, ),
            )
            self.visible_bias = create_shared_parameter(
                value=self.visible_bias,
                name='visible-bias',
                shape=(self.n_visible, ),
            )

            self.variables.update(
                network_input=tf.placeholder(
                    tf.float32,
                    (None, self.n_visible),
                    name="network-input",
                ),
                network_hidden_input=tf.placeholder(
                    tf.float32,
                    (None, self.n_hidden),
                    name="network-hidden-input",
                ),
            )

    def init_variables(self):
        with tf.variable_scope('rbm'):
            self.variables.update(
                h_samples=tf.Variable(
                    tf.zeros([self.batch_size, self.n_hidden]),
                    name="hidden-samples",
                    dtype=tf.float32,
                ),
            )

    def init_methods(self):
        def free_energy(visible_sample):
            with tf.name_scope('free-energy'):
                wx = tf.matmul(visible_sample, self.weight)
                wx_b = wx + self.hidden_bias

                visible_bias_term = dot(visible_sample, self.visible_bias)

                # We can get infinity when wx_b is a relatively large number
                # (around 100). Taking the exponent makes it even larger and
                # with float32 it overflows to infinity. But because the
                # number is so large, the +1 inside the logarithm is
                # negligible, so we can return wx_b as is: softplus(x) is
                # approximately x for large x.
                hidden_terms = tf.where(
                    # exp(30) is such a big number that +1 won't
                    # make any difference in the outcome.
                    tf.greater(wx_b, 30),
                    wx_b,
                    tf.log1p(tf.exp(wx_b)),
                )

                hidden_term = tf.reduce_sum(hidden_terms, axis=1)
                return -(visible_bias_term + hidden_term)

        def visible_to_hidden(visible_sample):
            with tf.name_scope('visible-to-hidden'):
                wx = tf.matmul(visible_sample, self.weight)
                wx_b = wx + self.hidden_bias
                return tf.nn.sigmoid(wx_b)

        def hidden_to_visible(hidden_sample):
            with tf.name_scope('hidden-to-visible'):
                wx = tf.matmul(hidden_sample, self.weight, transpose_b=True)
                wx_b = wx + self.visible_bias
                return tf.nn.sigmoid(wx_b)

        def sample_hidden_from_visible(visible_sample):
            with tf.name_scope('sample-hidden-to-visible'):
                hidden_prob = visible_to_hidden(visible_sample)
                hidden_sample = random_binomial(hidden_prob)
                return hidden_sample

        def sample_visible_from_hidden(hidden_sample):
            with tf.name_scope('sample-visible-to-hidden'):
                visible_prob = hidden_to_visible(hidden_sample)
                visible_sample = random_binomial(visible_prob)
                return visible_sample

        network_input = self.variables.network_input
        network_hidden_input = self.variables.network_hidden_input
        input_shape = tf.shape(network_input)
        n_samples = input_shape[0]

        weight = self.weight
        h_bias = self.hidden_bias
        v_bias = self.visible_bias
        h_samples = self.variables.h_samples
        step = asfloat(self.step)

        with tf.name_scope('positive-values'):
            # We have to use `cond` instead of `where`, because
            # different if-else cases might have different shapes
            # and it triggers exception in tensorflow.
            v_pos = tf.cond(
                tf.equal(n_samples, self.batch_size), lambda: network_input,
                lambda: random_sample(network_input, self.batch_size))
            h_pos = visible_to_hidden(v_pos)

        with tf.name_scope('negative-values'):
            v_neg = sample_visible_from_hidden(h_samples)
            h_neg = visible_to_hidden(v_neg)

        with tf.name_scope('weight-update'):
            weight_update = (
                tf.matmul(v_pos, h_pos, transpose_a=True) -
                tf.matmul(v_neg, h_neg, transpose_a=True)) / asfloat(n_samples)

        with tf.name_scope('hidden-bias-update'):
            h_bias_update = tf.reduce_mean(h_pos - h_neg, axis=0)

        with tf.name_scope('visible-bias-update'):
            v_bias_update = tf.reduce_mean(v_pos - v_neg, axis=0)

        with tf.name_scope('flipped-input-features'):
            # Each row will have random feature marked with number 1
            # Other values will be equal to 0
            possible_feature_corruptions = tf.eye(self.n_visible)
            corrupted_features = random_sample(possible_feature_corruptions,
                                               n_samples)

            rounded_input = tf.round(network_input)
            # If we scale input values from the [0, 1] range to [-1, 1],
            # then it becomes easier to flip feature values with a simple
            # multiplication.
            scaled_rounded_input = 2 * rounded_input - 1
            scaled_flipped_rounded_input = (
                # For corrupted_features we convert 0 to 1 and 1 to -1, so
                # that after multiplication the sign is flipped exactly
                # where the transformed corrupted_features equals -1.
                (-2 * corrupted_features + 1) * scaled_rounded_input)
            # Scale it back to the [0, 1] range
            flipped_rounded_input = (scaled_flipped_rounded_input + 1) / 2

        with tf.name_scope('pseudo-likelihood-loss'):
            # Stochastic pseudo-likelihood
            error = tf.reduce_mean(self.n_visible * tf.log_sigmoid(
                free_energy(flipped_rounded_input) -
                free_energy(rounded_input)))

        with tf.name_scope('gibbs-sampling'):
            gibbs_sampling = sample_visible_from_hidden(
                sample_hidden_from_visible(network_input))

        initialize_uninitialized_variables()
        self.methods.update(
            train_epoch=function(
                [network_input],
                error,
                name='rbm/train-epoch',
                updates=[
                    (weight, weight + step * weight_update),
                    (h_bias, h_bias + step * h_bias_update),
                    (v_bias, v_bias + step * v_bias_update),
                    (h_samples, random_binomial(p=h_neg)),
                ],
            ),
            prediction_error=function(
                [network_input],
                error,
                name='rbm/prediction-error',
            ),
            diff1=function(
                [network_input],
                free_energy(flipped_rounded_input),
                name='rbm/diff1-error',
            ),
            diff2=function(
                [network_input],
                free_energy(rounded_input),
                name='rbm/diff2-error',
            ),
            visible_to_hidden=function(
                [network_input],
                visible_to_hidden(network_input),
                name='rbm/visible-to-hidden',
            ),
            hidden_to_visible=function(
                [network_hidden_input],
                hidden_to_visible(network_hidden_input),
                name='rbm/hidden-to-visible',
            ),
            gibbs_sampling=function(
                [network_input],
                gibbs_sampling,
                name='rbm/gibbs-sampling',
            ),
        )

    def train(self, input_train, input_test=None, epochs=100, summary='table'):
        """
        Train RBM.

        Parameters
        ----------
        input_train : 1D or 2D array-like
        input_test : 1D or 2D array-like or None
            Defaults to ``None``.
        epochs : int
            Number of training epochs. Defaults to ``100``.
        summary : {'table', 'inline'}
            Training summary type. Defaults to ``'table'``.
        """
        return super(RBM, self).train(input_train=input_train,
                                      target_train=None,
                                      input_test=input_test,
                                      target_test=None,
                                      epochs=epochs,
                                      epsilon=None,
                                      summary=summary)

    def train_epoch(self, input_train, target_train=None):
        """
        Train one epoch.

        Parameters
        ----------
        input_train : array-like (n_samples, n_features)

        Returns
        -------
        float
        """
        errors = self.apply_batches(
            function=self.methods.train_epoch,
            input_data=input_train,
            description='Training batches',
            show_error_output=True,
        )

        n_samples = len(input_train)
        return average_batch_errors(errors, n_samples, self.batch_size)

    def visible_to_hidden(self, visible_input):
        """
        Propagates data through the network and returns output
        from the hidden layer.

        Parameters
        ----------
        visible_input : array-like (n_samples, n_visible_features)

        Returns
        -------
        array-like
        """
        is_input_feature1d = (self.n_visible == 1)
        visible_input = format_data(visible_input, is_input_feature1d)

        outputs = self.apply_batches(
            function=self.methods.visible_to_hidden,
            input_data=visible_input,
            description='Hidden from visible batches',
            show_progressbar=True,
            show_error_output=False,
            scalar_output=False,
        )
        return np.concatenate(outputs, axis=0)

    def hidden_to_visible(self, hidden_input):
        """
        Propagates output from the hidden layer backward
        to the visible.

        Parameters
        ----------
        hidden_input : array-like (n_samples, n_hidden_features)

        Returns
        -------
        array-like
        """
        is_input_feature1d = (self.n_hidden == 1)
        hidden_input = format_data(hidden_input, is_input_feature1d)

        outputs = self.apply_batches(
            function=self.methods.hidden_to_visible,
            input_data=hidden_input,
            description='Visible from hidden batches',
            show_progressbar=True,
            show_error_output=False,
            scalar_output=False,
        )
        return np.concatenate(outputs, axis=0)

    def prediction_error(self, input_data, target_data=None):
        """
        Compute the pseudo-likelihood of input samples.

        Parameters
        ----------
        input_data : array-like
            Values of the visible layer

        Returns
        -------
        float
            Value of the pseudo-likelihood.
        """
        is_input_feature1d = (self.n_visible == 1)
        input_data = format_data(input_data, is_input_feature1d)

        errors = self.apply_batches(
            function=self.methods.prediction_error,
            input_data=input_data,
            description='Validation batches',
            show_error_output=True,
        )
        return average_batch_errors(
            errors,
            n_samples=len(input_data),
            batch_size=self.batch_size,
        )

    def gibbs_sampling(self, visible_input, n_iter=1):
        """
        Makes Gibbs sampling n times using visible input.

        Parameters
        ----------
        visible_input : 1d or 2d array
        n_iter : int
            Number of Gibbs sampling iterations. Defaults to ``1``.

        Returns
        -------
        array-like
            Output from the visible units after performing n
            Gibbs sampling iterations. The array will contain
            only binary values (0 and 1).
        """
        is_input_feature1d = (self.n_visible == 1)
        visible_input = format_data(visible_input, is_input_feature1d)

        gibbs_sampling = self.methods.gibbs_sampling

        input_ = visible_input
        for iteration in range(n_iter):
            input_ = gibbs_sampling(input_)

        return input_
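
The ``tf.where`` branch inside ``free_energy`` is the usual numerically stable softplus: ``log(1 + exp(x))`` overflows in float32 for large ``x``, but in that regime it is approximately ``x`` itself. A small NumPy illustration of the same idea (not the RBM code path):

import numpy as np

def stable_softplus(x, threshold=30.0):
    x = np.asarray(x, dtype=np.float32)
    # Above the threshold exp(x) would overflow, but log1p(exp(x)) ~= x,
    # so we return x directly; the inner minimum just avoids the overflow
    # warning in the branch that is not selected.
    return np.where(x > threshold, x,
                    np.log1p(np.exp(np.minimum(x, threshold))))

print(stable_softplus([-5.0, 0.0, 5.0]))  # ~[0.0067, 0.693, 5.0067]
print(stable_softplus([100.0]))           # [100.] instead of inf
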
Example no. 10
class ParameterBasedLayer(BaseLayer):
    """
    Layer that creates weight and bias parameters.

    Parameters
    ----------
    size : int
        Layer's output size.

    weight : array-like, Theano variable, scalar or Initializer
        Defines the layer's weights. The default initialization
        methods are described :ref:`here <init-methods>`.
        Defaults to :class:`XavierNormal() <neupy.init.XavierNormal>`.

    bias : 1D array-like, Theano variable, scalar, Initializer or None
        Defines the layer's bias.
        The default initialization methods are described
        :ref:`here <init-methods>`. Defaults to
        :class:`Constant(0) <neupy.init.Constant>`.
        A ``None`` value excludes the bias from the calculations
        and does not add it to the parameters list.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    size = IntProperty(minval=1)
    weight = ParameterProperty(default=init.XavierNormal())
    bias = ParameterProperty(default=init.Constant(value=0), allow_none=True)

    def __init__(self, size, **options):
        super(ParameterBasedLayer, self).__init__(size=size, **options)

    @property
    def weight_shape(self):
        return as_tuple(self.input_shape, self.output_shape)

    @property
    def bias_shape(self):
        if self.bias is not None:
            return as_tuple(self.output_shape)

    def initialize(self):
        super(ParameterBasedLayer, self).initialize()

        self.add_parameter(value=self.weight, name='weight',
                           shape=self.weight_shape, trainable=True)

        if self.bias is not None:
            self.add_parameter(value=self.bias, name='bias',
                               shape=self.bias_shape, trainable=True)

    def __repr__(self):
        classname = self.__class__.__name__
        return '{name}({size})'.format(name=classname, size=self.size)
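
A dense layer built from these parameters computes ``x.dot(weight) + bias``, which is why ``weight_shape`` is ``(input_size, output_size)`` and ``bias_shape`` is ``(output_size,)``. A tiny NumPy illustration with assumed sizes:

import numpy as np

input_size, output_size, n_samples = 4, 3, 5
x = np.random.randn(n_samples, input_size)
weight = np.random.randn(input_size, output_size)  # weight_shape
bias = np.zeros(output_size)                       # bias_shape

print((x.dot(weight) + bias).shape)  # (5, 3)
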
Example no. 11
class GRU(BaseRNNLayer):
    """
    Gated Recurrent Unit (GRU) Layer.

    Parameters
    ----------
    {BaseRNNLayer.n_units}

    {BaseRNNLayer.only_return_final}

    input_weights : Initializer, ndarray
        Weight parameters for input connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    hidden_weights : Initializer, ndarray
        Weight parameters for hidden connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    biases : Initializer, ndarray
        Bias parameters for all gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    resetgate : function
        Activation function for the reset gate.
        Defaults to ``tf.nn.sigmoid``.

    updategate : function
        Activation function for the update gate.
        Defaults to ``tf.nn.sigmoid``.

    hidden_update : function
        Activation function for the hidden state update.
        Defaults to ``tf.tanh``.

    learn_init : bool
        If ``True``, make ``hidden_init`` trainable variable.
        Defaults to ``False``.

    hidden_init : array-like, Tensorflow variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always
        from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    {BaseLayer.name}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------
    Sequence classification

    >>> from neupy.layers import *
    >>>
    >>> n_time_steps = 40
    >>> n_categories = 20
    >>> embedded_size = 10
    >>>
    >>> network = join(
    ...     Input(n_time_steps),
    ...     Embedding(n_categories, embedded_size),
    ...     GRU(20),
    ...     Sigmoid(1),
    ... )
    >>> network
    (?, 40) -> [... 4 layers ...] -> (?, 1)
    """
    input_weights = ParameterProperty()
    hidden_weights = ParameterProperty()
    biases = ParameterProperty()

    resetgate = Property(expected_type=types.FunctionType)
    updategate = Property(expected_type=types.FunctionType)
    hidden_update = Property(expected_type=types.FunctionType)

    hidden_init = ParameterProperty()
    learn_init = Property(default=False, expected_type=bool)

    backwards = Property(expected_type=bool)
    unroll_scan = Property(expected_type=bool)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def __init__(
            self,
            n_units,
            only_return_final=True,
            # Trainable parameters
            input_weights=init.HeNormal(),
            hidden_weights=init.HeNormal(),
            biases=0,
            # Activation functions
            resetgate=tf.nn.sigmoid,
            updategate=tf.nn.sigmoid,
            hidden_update=tf.tanh,
            # Cell states
            hidden_init=0,
            learn_init=False,
            # Misc
            unroll_scan=False,
            backwards=False,
            gradient_clipping=0,
            name=None):

        super(GRU, self).__init__(
            n_units=n_units,
            only_return_final=only_return_final,
            name=name,
        )

        self.input_weights = input_weights
        self.hidden_weights = hidden_weights
        self.biases = biases

        self.resetgate = resetgate
        self.updategate = updategate
        self.hidden_update = hidden_update

        self.hidden_init = hidden_init
        self.learn_init = learn_init

        self.unroll_scan = unroll_scan
        self.backwards = backwards
        self.gradient_clipping = gradient_clipping

    def create_variables(self, input_shape):
        self.input_weights = self.variable(
            value=self.input_weights,
            name='input_weights',
            shape=(input_shape[-1], 3 * self.n_units),
        )
        self.hidden_weights = self.variable(
            value=self.hidden_weights,
            name='hidden_weights',
            shape=(self.n_units, 3 * self.n_units),
        )
        self.biases = self.variable(
            value=self.biases,
            name='biases',
            shape=(3 * self.n_units, ),
        )
        self.hidden_init = self.variable(value=self.hidden_init,
                                         shape=(1, self.n_units),
                                         name="hidden_init",
                                         trainable=self.learn_init)

    def output(self, input, **kwargs):
        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_samples, n_features)
        input = tf.transpose(input, [1, 0, 2])

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def one_gru_step(states, input_n):
            with tf.name_scope('gru-cell'):
                hid_previous, = states
                input_n = tf.matmul(input_n, self.input_weights) + self.biases

                # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1},
                # and W_{hc} h_{t - 1}
                hid_input = tf.matmul(hid_previous, self.hidden_weights)

                if self.gradient_clipping != 0:
                    input_n = clip_gradient(input_n, self.gradient_clipping)
                    hid_input = clip_gradient(hid_input,
                                              self.gradient_clipping)

                hid_resetgate, hid_updategate, hid_hidden = tf.split(hid_input,
                                                                     3,
                                                                     axis=1)

                in_resetgate, in_updategate, in_hidden = tf.split(input_n,
                                                                  3,
                                                                  axis=1)

                # Reset and update gates
                resetgate = self.resetgate(hid_resetgate + in_resetgate)
                updategate = self.updategate(hid_updategate + in_updategate)

                # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
                hidden_update = in_hidden + resetgate * hid_hidden

                if self.gradient_clipping != 0:
                    hidden_update = clip_gradient(hidden_update,
                                                  self.gradient_clipping)

                hidden_update = self.hidden_update(hidden_update)

                # Compute (1 - u_t)h_{t - 1} + u_t c_t
                return [
                    hid_previous - updategate * (hid_previous - hidden_update)
                ]

        input_shape = tf.shape(input)
        n_samples = input_shape[1]  # batch dim has been moved
        hidden_init = tf.tile(self.hidden_init, (n_samples, 1))
        sequence = input

        if self.backwards:
            sequence = tf.reverse(sequence, axis=[0])

        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(fn=one_gru_step,
                                  sequence=sequence,
                                  outputs_info=[hidden_init])
        else:
            hid_out, = tf.scan(
                fn=one_gru_step,
                elems=sequence,
                initializer=[hidden_init],
                name='gru-scan',
            )

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = tf.reverse(hid_out, axis=[0])

        # dimshuffle back to (n_samples, n_time_steps, n_features))
        hid_out = tf.transpose(hid_out, [1, 0, 2])
        return hid_out
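
``one_gru_step`` implements the standard GRU equations with a single 3-way split of the projected input and hidden state: reset gate, update gate, and the candidate hidden update, combined as ``h_t = (1 - u_t) * h_{t-1} + u_t * c_t``. A minimal NumPy sketch of one step with the same layout (illustrative shapes, not the layer's TensorFlow graph):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x_t, h_prev, input_weights, hidden_weights, biases):
    # Project and split into reset-gate, update-gate and candidate blocks
    in_reset, in_update, in_cand = np.split(
        x_t.dot(input_weights) + biases, 3, axis=1)
    hid_reset, hid_update, hid_cand = np.split(
        h_prev.dot(hidden_weights), 3, axis=1)

    resetgate = sigmoid(in_reset + hid_reset)
    updategate = sigmoid(in_update + hid_update)
    candidate = np.tanh(in_cand + resetgate * hid_cand)

    # h_t = (1 - u_t) * h_{t-1} + u_t * c_t
    return h_prev - updategate * (h_prev - candidate)

n_features, n_units, n_samples = 4, 3, 2
rng = np.random.RandomState(0)
h_t = gru_step(
    rng.randn(n_samples, n_features),
    np.zeros((n_samples, n_units)),
    rng.randn(n_features, 3 * n_units),
    rng.randn(n_units, 3 * n_units),
    np.zeros(3 * n_units),
)
print(h_t.shape)  # (2, 3)
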
Example no. 12
class GRU(BaseRNNLayer):
    """
    Gated Recurrent Unit (GRU) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    input_weights : Initializer, ndarray
        Weight parameters for input connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    hidden_weights : Initializer, ndarray
        Weight parameters for hidden connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    biases : Initializer, ndarray
        Bias parameters for all gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import tensorflow as tf
            dict(
                resetgate=tf.nn.sigmoid,
                updategate=tf.nn.sigmoid,
                hidden_update=tf.tanh,
            )

        If the application requires modifying only one parameter,
        then it's better to specify just the one that you need to
        change and omit the other parameters

        .. code-block:: python

            dict(resetgate=tf.tanh)

        Other parameters like ``updategate`` or ``hidden_update``
        will be equal to their default values.

    learn_init : bool
        If ``True``, make ``hidden_init`` trainable variable.
        Defaults to ``False``.

    hidden_init : array-like, Tensorflow variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    {BaseRNNLayer.only_return_final}

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always
        from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------

    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.GRU(20),
                layers.Sigmoid(1),
            ]
        )
    """
    input_weights = ParameterProperty(default=init.HeNormal())
    hidden_weights = ParameterProperty(default=init.HeNormal())
    biases = ParameterProperty(default=init.Constant(0))

    activation_functions = MultiCallableProperty(default=dict(
        resetgate=tf.nn.sigmoid,
        updategate=tf.nn.sigmoid,
        hidden_update=tf.tanh,
    ))

    learn_init = Property(default=False, expected_type=bool)
    hidden_init = ParameterProperty(default=init.Constant(0))

    backwards = Property(default=False, expected_type=bool)
    unroll_scan = Property(default=False, expected_type=bool)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(GRU, self).initialize()
        n_inputs = np.prod(self.input_shape[1:])

        self.input_weights = self.add_parameter(
            value=self.input_weights,
            name='input_weights',
            shape=(n_inputs, 3 * self.size),
        )
        self.hidden_weights = self.add_parameter(
            value=self.hidden_weights,
            name='hidden_weights',
            shape=(self.size, 3 * self.size),
        )
        self.biases = self.add_parameter(
            value=self.biases,
            name='biases',
            shape=(3 * self.size, ),
        )

        self.add_parameter(value=self.hidden_init,
                           shape=(1, self.size),
                           name="hidden_init",
                           trainable=self.learn_init)

    def output(self, input_value):
        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = tf.transpose(input_value, [1, 0, 2])
        input_shape = tf.shape(input_value)
        n_batch = input_shape[1]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def one_gru_step(states, input_n):
            with tf.name_scope('gru-cell'):
                hid_previous, = states
                input_n = tf.matmul(input_n, self.input_weights) + self.biases

                # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1},
                # and W_{hc} h_{t - 1}
                hid_input = tf.matmul(hid_previous, self.hidden_weights)

                if self.gradient_clipping != 0:
                    input_n = clip_gradient(input_n, self.gradient_clipping)
                    hid_input = clip_gradient(hid_input,
                                              self.gradient_clipping)

                hid_resetgate, hid_updategate, hid_hidden = tf.split(hid_input,
                                                                     3,
                                                                     axis=1)

                in_resetgate, in_updategate, in_hidden = tf.split(input_n,
                                                                  3,
                                                                  axis=1)

                # Reset and update gates
                resetgate = self.activation_functions.resetgate(hid_resetgate +
                                                                in_resetgate)

                updategate = self.activation_functions.updategate(
                    hid_updategate + in_updategate)

                # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
                hidden_update = in_hidden + resetgate * hid_hidden

                if self.gradient_clipping != 0:
                    hidden_update = clip_gradient(hidden_update,
                                                  self.gradient_clipping)

                hidden_update = self.activation_functions.hidden_update(
                    hidden_update)

                # Compute (1 - u_t)h_{t - 1} + u_t c_t
                return [
                    hid_previous - updategate * (hid_previous - hidden_update)
                ]

        hidden_init = tf.tile(self.hidden_init, (n_batch, 1))
        sequence = input_value

        if self.backwards:
            sequence = tf.reverse(sequence, axis=[0])

        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(fn=one_gru_step,
                                  sequence=sequence,
                                  outputs_info=[hidden_init])
        else:
            hid_out, = tf.scan(
                fn=one_gru_step,
                elems=sequence,
                initializer=[hidden_init],
                name='gru-scan',
            )

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = tf.reverse(hid_out, axis=[0])

        # dimshuffle back to (n_batch, n_time_steps, n_features))
        hid_out = tf.transpose(hid_out, [1, 0, 2])
        return hid_out
Example no. 13
class LSTM(BaseRNNLayer):
    """
    Long Short Term Memory (LSTM) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    weights : dict or Initializer
        Weight parameters for different gates.
        Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`.

        - If the application requires the same initialization method
          for all weights, then it's possible to specify an initialization
          method that will be automatically applied to all weight
          parameters in the LSTM layer.

          .. code-block:: python

              layers.LSTM(2, weights=init.Normal(0.1))

        - If the application requires different initialization
          values for different weights, then it's possible to specify
          an exact weight by name.

          .. code-block:: python

              dict(
                  weight_in_to_ingate=init.XavierUniform(),
                  weight_hid_to_ingate=init.XavierUniform(),
                  weight_cell_to_ingate=init.XavierUniform(),

                  weight_in_to_forgetgate=init.XavierUniform(),
                  weight_hid_to_forgetgate=init.XavierUniform(),
                  weight_cell_to_forgetgate=init.XavierUniform(),

                  weight_in_to_outgate=init.XavierUniform(),
                  weight_hid_to_outgate=init.XavierUniform(),
                  weight_cell_to_outgate=init.XavierUniform(),

                  weight_in_to_cell=init.XavierUniform(),
                  weight_hid_to_cell=init.XavierUniform(),
              )

          If the application requires modifying only one (or a few)
          parameters, then it's better to specify just the ones that
          you need to change and omit the other parameters

          .. code-block:: python

              dict(weight_in_to_ingate=init.Normal(0.1))

          Other parameters like ``weight_cell_to_outgate`` will be
          equal to their default values.

    biases : dict or Initializer
        Bias parameters for different gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

        - If the application requires the same initialization method
          for all biases, then it's possible to specify an initialization
          method that will be automatically applied to all bias parameters
          in the LSTM layer.

          .. code-block:: python

              layers.LSTM(2, biases=init.Constant(1))

        - If the application requires different initialization
          values for different biases, then it's possible to specify
          an exact bias by name.

          .. code-block:: python

              dict(
                  bias_ingate=init.Constant(0),
                  bias_forgetgate=init.Constant(0),
                  bias_cell=init.Constant(0),
                  bias_outgate=init.Constant(0),
              )

          If the application requires modifying only one (or a few)
          parameters, then it's better to specify just the ones that
          you need to change and omit the other parameters

          .. code-block:: python

              dict(bias_ingate=init.Constant(1))

          Other parameters like ``bias_cell`` will be
          equal to their default values.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import theano.tensor as T
            dict(
                ingate=T.nnet.sigmoid,
                forgetgate=T.nnet.sigmoid,
                outgate=T.nnet.sigmoid,
                cell=T.tanh,
            )

        If the application requires modifying only one parameter,
        then it's better to specify just the one that you need to
        change and omit the other parameters

        .. code-block:: python

            dict(ingate=T.tanh)

        Other parameters like ``forgetgate`` or ``outgate`` will be
        equal to their default values.

    learn_init : bool
        If ``True``, make ``cell_init`` and ``hid_init`` trainable
        variables. Defaults to ``False``.

    cell_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial cell state (:math:`c_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    hid_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always
        from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    {BaseRNNLayer.only_return_final}

    precompute_input : bool
        If ``True``, precompute ``input_to_hid`` before iterating
        through the sequence. This can result in a speed up at the
        expense of an increase in memory usage.
        Defaults to ``True``.

    peepholes : bool
        If ``True``, the LSTM uses peephole connections.
        When ``False``, cell parameters are ignored.
        Defaults to ``False``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    gradient_clipping : float or int
        If nonzero, the gradient messages are clipped to the
        given value during the backward pass. Defaults to ``0``.

    n_gradient_steps : int
        Number of timesteps to include in the backpropagated gradient.
        If ``-1``, backpropagate through the entire sequence.
        Defaults to ``-1``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------

    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.LSTM(20),
                layers.Sigmoid(1),
            ]
        )
    """
    weights = MultiParameterProperty(
        default=dict(
            weight_in_to_ingate=init.XavierUniform(),
            weight_hid_to_ingate=init.XavierUniform(),
            weight_cell_to_ingate=init.XavierUniform(),

            weight_in_to_forgetgate=init.XavierUniform(),
            weight_hid_to_forgetgate=init.XavierUniform(),
            weight_cell_to_forgetgate=init.XavierUniform(),

            weight_in_to_outgate=init.XavierUniform(),
            weight_hid_to_outgate=init.XavierUniform(),
            weight_cell_to_outgate=init.XavierUniform(),

            weight_in_to_cell=init.XavierUniform(),
            weight_hid_to_cell=init.XavierUniform(),
        ))
    biases = MultiParameterProperty(
        default=dict(
            bias_ingate=init.Constant(0),
            bias_forgetgate=init.Constant(0),
            bias_cell=init.Constant(0),
            bias_outgate=init.Constant(0),
        ))
    activation_functions = MultiCallableProperty(
        default=dict(
            ingate=T.nnet.sigmoid,
            forgetgate=T.nnet.sigmoid,
            outgate=T.nnet.sigmoid,
            cell=T.tanh,
        ))

    learn_init = Property(default=False, expected_type=bool)
    cell_init = ParameterProperty(default=init.Constant(0))
    hid_init = ParameterProperty(default=init.Constant(0))

    unroll_scan = Property(default=False, expected_type=bool)
    backwards = Property(default=False, expected_type=bool)
    precompute_input = Property(default=True, expected_type=bool)
    peepholes = Property(default=False, expected_type=bool)

    n_gradient_steps = IntProperty(default=-1)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(LSTM, self).initialize()

        n_inputs = np.prod(self.input_shape[1:])
        weights = self.weights
        biases = self.biases

        # Input gate parameters
        self.weight_in_to_ingate = self.add_parameter(
            value=weights.weight_in_to_ingate,
            name='weight_in_to_ingate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_ingate = self.add_parameter(
            value=weights.weight_hid_to_ingate,
            name='weight_hid_to_ingate',
            shape=(self.size, self.size))
        self.bias_ingate = self.add_parameter(
            value=biases.bias_ingate, name='bias_ingate',
            shape=(self.size,))

        # Forget gate parameters
        self.weight_in_to_forgetgate = self.add_parameter(
            value=weights.weight_in_to_forgetgate,
            name='weight_in_to_forgetgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_forgetgate = self.add_parameter(
            value=weights.weight_hid_to_forgetgate,
            name='weight_hid_to_forgetgate',
            shape=(self.size, self.size))
        self.bias_forgetgate = self.add_parameter(
            value=biases.bias_forgetgate, name='bias_forgetgate',
            shape=(self.size,))

        # Cell parameters
        self.weight_in_to_cell = self.add_parameter(
            value=weights.weight_in_to_cell,
            name='weight_in_to_cell',
            shape=(n_inputs, self.size))
        self.weight_hid_to_cell = self.add_parameter(
            value=weights.weight_hid_to_cell,
            name='weight_hid_to_cell',
            shape=(self.size, self.size))
        self.bias_cell = self.add_parameter(
            value=biases.bias_cell, name='bias_cell',
            shape=(self.size,))

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections.  These are elementwise products with the cell
        # state, so they are represented as vectors.
        if self.peepholes:
            self.weight_cell_to_ingate = self.add_parameter(
                value=weights.weight_cell_to_ingate,
                name='weight_cell_to_ingate',
                shape=(self.size,))
            self.weight_cell_to_forgetgate = self.add_parameter(
                value=weights.weight_cell_to_forgetgate,
                name='weight_cell_to_forgetgate',
                shape=(self.size,))
            self.weight_cell_to_outgate = self.add_parameter(
                value=weights.weight_cell_to_outgate,
                name='weight_cell_to_outgate',
                shape=(self.size,))

        # Output gate parameters
        self.weight_in_to_outgate = self.add_parameter(
            value=weights.weight_in_to_outgate,
            name='weight_in_to_outgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_outgate = self.add_parameter(
            value=weights.weight_hid_to_outgate,
            name='weight_hid_to_outgate',
            shape=(self.size, self.size))
        self.bias_outgate = self.add_parameter(
            value=biases.bias_outgate, name='bias_outgate',
            shape=(self.size,))

        # Initialization parameters
        self.add_parameter(value=self.cell_init, shape=(1, self.size),
                           name="cell_init", trainable=self.learn_init)
        self.add_parameter(value=self.hid_init, shape=(1, self.size),
                           name="hid_init", trainable=self.learn_init)

    def output(self, input_value):
        # Treat all dimensions after the second as flattened
        # feature dimensions
        if input_value.ndim > 3:
            input_value = T.flatten(input_value, 3)

        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = input_value.dimshuffle(1, 0, 2)
        seq_len, n_batch, _ = input_value.shape

        # Stack input weight matrices into a (num_inputs, 4 * num_units)
        # matrix, which speeds up computation
        weight_in_stacked = T.concatenate([
            self.weight_in_to_ingate,
            self.weight_in_to_forgetgate,
            self.weight_in_to_cell,
            self.weight_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        weight_hid_stacked = T.concatenate([
            self.weight_hid_to_ingate,
            self.weight_hid_to_forgetgate,
            self.weight_hid_to_cell,
            self.weight_hid_to_outgate], axis=1)

        # Stack biases into a (4 * num_units) vector
        bias_stacked = T.concatenate([
            self.bias_ingate,
            self.bias_forgetgate,
            self.bias_cell,
            self.bias_outgate], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the product of the inputs and the weight
            # matrices before scanning. weight_in_stacked has shape
            # (n_features, 4 * num_units), so the result has shape
            # (n_time_steps, n_batch, 4 * num_units).
            input_value = T.dot(input_value, weight_in_stacked) + bias_stacked

        # When theano.scan calls the step function, input_n will have
        # shape (n_batch, 4 * num_units). We define a slicing function
        # that extracts the input to each LSTM gate
        def slice_w(x, n):
            return x[:, n * self.size:(n + 1) * self.size]

        def one_lstm_step(input_n, cell_previous, hid_previous, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, weight_in_stacked) + bias_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, weight_hid_stacked)

            # Clip gradients
            if self.gradient_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.gradient_clipping, self.gradient_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.weight_cell_to_ingate
                forgetgate += cell_previous * self.weight_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.activation_functions.ingate(ingate)
            forgetgate = self.activation_functions.forgetgate(forgetgate)
            cell_input = self.activation_functions.cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            if self.peepholes:
                outgate += cell * self.weight_cell_to_outgate

            outgate = self.activation_functions.outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * T.tanh(cell)
            return [cell, hid]

        ones = T.ones((n_batch, 1))
        cell_init = T.dot(ones, self.cell_init)
        hid_init = T.dot(ones, self.hid_init)

        non_sequences = [weight_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_sequences += [weight_in_stacked, bias_stacked]

        # The "peephole" weight matrices are only used
        # when self.peepholes=True
        if self.peepholes:
            non_sequences += [self.weight_cell_to_ingate,
                              self.weight_cell_to_forgetgate,
                              self.weight_cell_to_outgate]

        if self.unroll_scan:
            # Retrieve the number of time steps from the input shape
            n_time_steps = self.input_shape[0]

            # Explicitly unroll the recurrence instead of using scan
            _, hid_out = unroll_scan(
                fn=one_lstm_step,
                sequences=[input_value],
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                n_steps=n_time_steps)

        else:
            (_, hid_out), _ = theano.scan(
                fn=one_lstm_step,
                sequences=input_value,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.n_gradient_steps,
                non_sequences=non_sequences,
                strict=True)

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, 2)

        # If scan went backwards, reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

        return hid_out
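
The ``output`` method above stacks the four per-gate weight matrices into a single ``(n_features, 4 * num_units)`` matrix so that one dot product computes all gate pre-activations, which ``slice_w`` then splits apart. A minimal NumPy sketch of the same trick (all names and sizes below are illustrative, not part of NeuPy):

import numpy as np

n_batch, n_features, n_units = 2, 5, 3
rng = np.random.RandomState(0)

# Four separate per-gate input weight matrices...
w_ingate, w_forgetgate, w_cell, w_outgate = (
    rng.randn(n_features, n_units) for _ in range(4))

# ...stacked into a single (n_features, 4 * n_units) matrix
w_stacked = np.concatenate(
    [w_ingate, w_forgetgate, w_cell, w_outgate], axis=1)

x = rng.randn(n_batch, n_features)
gates = x.dot(w_stacked)  # one matmul instead of four

def slice_w(value, n):
    # same slicing rule as in the one_lstm_step function above
    return value[:, n * n_units:(n + 1) * n_units]

assert np.allclose(slice_w(gates, 0), x.dot(w_ingate))
assert np.allclose(slice_w(gates, 3), x.dot(w_outgate))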
Example n. 14
class BaseStepAssociative(BaseAssociative):
    """
    Base class for associative algorithms that have two layers, where
    the first one has a step function as activation.

    Parameters
    ----------
    {BaseAssociative.n_inputs}

    {BaseAssociative.n_outputs}

    n_unconditioned : int
        Number of unconditioned units in the neural network. These
        units won't be updated during the training procedure.
        Unconditioned units should correspond to the first features
        in the dataset.

    weight : array-like
        Neural network weights.
        Value defined manually should have shape ``(n_inputs, n_outputs)``.
        Defaults to ``None`` which means that all unconditioned
        weights will be equal to ``1`` and all other weights to ``0``.

    bias : array-like, Initializer
        Neural network bias units.
        Defaults to :class:`Constant(-0.5) <neupy.init.Constant>`.

    {BaseNetwork.step}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {BaseAssociative.train}

    {BaseSkeleton.fit}
    """
    n_inputs = IntProperty(minval=2, required=True)
    n_unconditioned = IntProperty(minval=1, required=True)

    weight = ArrayProperty()
    bias = ParameterProperty(default=init.Constant(-0.5))

    def init_layers(self):
        if self.n_inputs <= self.n_unconditioned:
            raise ValueError(
                "Number of uncondition features should be less than total "
                "number of features. `n_inputs`={} and "
                "`n_unconditioned`={}".format(self.n_inputs,
                                              self.n_unconditioned))

        valid_weight_shape = (self.n_inputs, self.n_outputs)
        valid_bias_shape = (self.n_outputs, )

        if self.weight is None:
            self.weight = np.zeros(valid_weight_shape)
            self.weight[:self.n_unconditioned, :] = 1

        if isinstance(self.bias, init.Initializer):
            self.bias = self.bias.sample(valid_bias_shape)

        super(BaseStepAssociative, self).init_layers()

        if self.bias.shape != valid_bias_shape:
            raise ValueError("Bias vector has invalid shape. Got {}, "
                             "expected {}".format(self.bias.shape,
                                                  valid_bias_shape))

        self.bias = self.bias.astype(float)

    def predict(self, input_data):
        input_data = format_data(input_data, is_feature1d=False)
        raw_output = input_data.dot(self.weight) + self.bias
        return np.where(raw_output > 0, 1, 0)

    def train(self, input_train, *args, **kwargs):
        input_train = format_data(input_train, is_feature1d=False)
        return super(BaseStepAssociative, self).train(input_train, *args,
                                                      **kwargs)

    def train_epoch(self, input_train, target_train):
        weight = self.weight
        n_unconditioned = self.n_unconditioned
        predict = self.predict
        weight_delta = self.weight_delta

        for input_row in input_train:
            input_row = np.reshape(input_row, (1, input_row.size))
            layer_output = predict(input_row)
            weight[n_unconditioned:, :] += weight_delta(
                input_row, layer_output)
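
The prediction above is a thresholded linear response, and ``train_epoch`` only touches the conditioned rows of the weight matrix. A small NumPy sketch of that behaviour, using made-up sizes and the default initialization described in the docstring:

import numpy as np

n_inputs, n_outputs, n_unconditioned = 3, 1, 1

# Default initialization: unconditioned weights are 1, the rest are 0,
# bias is -0.5 (Constant(-0.5))
weight = np.zeros((n_inputs, n_outputs))
weight[:n_unconditioned, :] = 1
bias = np.full((n_outputs,), -0.5)

def predict(x):
    # step activation: output 1 whenever the weighted sum is positive
    return np.where(x.dot(weight) + bias > 0, 1, 0)

print(predict(np.array([[1, 0, 1]])))  # [[1]] - unconditioned stimulus fires
print(predict(np.array([[0, 1, 1]])))  # [[0]] - conditioned weights still 0

# Training would only update the conditioned rows, e.g.
# weight[n_unconditioned:, :] += weight_delta(input_row, layer_output)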
Example n. 15
class PRelu(ActivationLayer):
    """
    The layer with the parametrized ReLu activation
    function.

    Parameters
    ----------
    alpha_axes : int or tuple
        Axes along which the alpha parameter won't be shared; the layer
        learns a separate alpha value for each element along these axes.
        Single integer value defines the same as a tuple with one value.
        Defaults to ``1``.
    alpha : array-like, Theano shared variable, scalar or Initializer
        Alpha parameter per each non-shared axis for the ReLu.
        Scalar value means that each element in the tensor will be
        equal to the specified value.
        Default initialization methods you can find
        :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0.25)``.
    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    References
    ----------
    .. [1] https://arxiv.org/pdf/1502.01852v1.pdf
    """
    alpha_axes = AxesProperty(default=1)
    alpha = ParameterProperty(default=Constant(value=0.25))

    def initialize(self):
        super(PRelu, self).initialize()

        alpha = self.alpha
        alpha_axes = self.alpha_axes
        output_shape = self.output_shape

        if 0 in alpha_axes:
            raise ValueError("Cannot specify alpha per input sample.")

        if max(alpha_axes) > len(output_shape):
            raise ValueError("Cannot specify alpha for the axis #{}. "
                             "Maximum available axis is #{}"
                             "".format(max(alpha_axes),
                                       len(output_shape) - 1))

        alpha_shape = [output_shape[axis - 1] for axis in alpha_axes]

        if isinstance(alpha, Initializer):
            alpha = alpha.sample(alpha_shape)

        self.alpha = create_shared_parameter(value=alpha,
                                             name='alpha_{}'.format(
                                                 self.layer_id),
                                             shape=alpha_shape)
        self.parameters.append(self.alpha)

    def activation_function(self, input_value):
        alpha = dimshuffle(self.alpha, input_value.ndim, self.alpha_axes)
        return T.nnet.relu(input_value, alpha)
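
``T.nnet.relu(x, alpha)`` with a tensor-valued ``alpha`` computes ``max(x, 0) + alpha * min(x, 0)``, and the ``dimshuffle`` call reshapes ``alpha`` so it broadcasts along every axis not listed in ``alpha_axes``. A NumPy sketch of the same computation, assuming ``alpha_axes=(1,)`` and made-up values:

import numpy as np

x = np.array([[-2.0, 3.0],
              [1.0, -4.0]])       # shape: (n_samples, n_features)
alpha = np.array([0.25, 0.5])     # one learned alpha per feature (axis 1)

# Equivalent of dimshuffle(alpha, ndim=2, axes=(1,)): add a broadcastable
# axis for every dimension that is not in alpha_axes
alpha_b = alpha.reshape(1, -1)

prelu = np.maximum(x, 0) + alpha_b * np.minimum(x, 0)
print(prelu)
# [[-0.5  3. ]
#  [ 1.  -2. ]]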
Example n. 16
class PRelu(ActivationLayer):
    """
    The layer with the parametrized ReLu activation
    function.

    Parameters
    ----------
    alpha_axes : int or tuple
        Axes along which the alpha parameter won't be shared; the layer
        learns a separate alpha value for each element along these axes.
        Single integer value defines the same as a tuple with one value.
        Defaults to ``-1``.

    alpha : array-like, TensorFlow variable, scalar or Initializer
        Alpha parameter per each non-shared axis for the ReLu.
        Scalar value means that each element in the tensor will be
        equal to the specified value.
        Default initialization methods you can find
        :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0.25)``.

    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    Examples
    --------
    Feedforward Neural Networks (FNN)

    >>> from neupy.layers import *
    >>> network = Input(10) > PRelu(20) > PRelu(1)

    Convolutional Neural Networks (CNN)

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((32, 32, 3)),
    ...     Convolution((3, 3, 16)) > PRelu(),
    ...     Convolution((3, 3, 32)) > PRelu(),
    ...     Reshape(),
    ...     Softmax(10),
    ... )

    References
    ----------
    .. [1] https://arxiv.org/pdf/1502.01852v1.pdf
    """
    alpha_axes = AxesProperty(default=-1)
    alpha = ParameterProperty(default=init.Constant(value=0.25))

    def __init__(self, *args, **options):
        super(PRelu, self).__init__(*args, **options)

        if 0 in self.alpha_axes:
            raise ValueError("Cannot specify alpha for 0-axis")

    def validate(self, input_shape):
        if max(self.alpha_axes) > len(input_shape):
            max_axis_index = len(input_shape) - 1
            raise ValueError("Cannot specify alpha for the axis #{}. "
                             "Maximum available axis is {} (0-based indeces)."
                             "".format(max(self.alpha_axes), max_axis_index))

    def initialize(self):
        super(PRelu, self).initialize()
        output_shape = as_tuple(None, self.output_shape)

        alpha_shape = [output_shape[axis] for axis in self.alpha_axes]
        self.add_parameter(
            value=self.alpha,
            name='alpha',
            shape=alpha_shape,
            trainable=True,
        )

    def activation_function(self, input_value):
        input_value = tf.convert_to_tensor(input_value, dtype=tf.float32)
        ndim = len(input_value.get_shape())

        dimensions = np.arange(ndim)
        alpha_axes = dimensions[list(self.alpha_axes)]

        alpha = dimshuffle(self.alpha, ndim, alpha_axes)
        return tf.nn.leaky_relu(tf.to_float(input_value), tf.to_float(alpha))
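
The ``dimensions[list(self.alpha_axes)]`` trick above resolves possibly negative axes (the default is ``-1``) into their positive positions for the current tensor rank. A tiny NumPy illustration with assumed values:

import numpy as np

ndim = 4                      # e.g. (batch, height, width, channels)
alpha_axes = (-1,)            # default: one alpha per channel

dimensions = np.arange(ndim)  # [0, 1, 2, 3]
resolved = dimensions[list(alpha_axes)]
print(resolved)               # [3] -> alpha varies along the channel axis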
Example n. 17
class Linear(BaseLayer):
    """
    Layer with linear activation function. It applies linear transformation
    when the ``n_units`` parameter specified and acts as an identity
    when it's not specified.

    Parameters
    ----------
    n_units : int or None
        Number of units in the layer. It also corresponds to the number of
        output features that will be produced per sample after passing it
        through this layer. The ``None`` value means that the layer won't
        have parameters and will only apply the activation function to the
        input, without any linear transformation.
        Defaults to ``None``.

    weight : array-like, TensorFlow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    bias : 1D array-like, TensorFlow variable, scalar, Initializer or None
        Defines layer's bias. Default initialization methods you can find
        :ref:`here <init-methods>`. Defaults to
        :class:`Constant(0) <neupy.init.Constant>`.
        The ``None`` value excludes bias from the calculations and
        does not add it to the parameters list.

    {BaseLayer.name}

    Methods
    -------
    {BaseLayer.Methods}

    activation_function(input)
        Applies activation function to the input.

    Attributes
    ----------
    {BaseLayer.Attributes}

    Examples
    --------
    Linear Regression

    >>> from neupy.layers import *
    >>> network = Input(10) >> Linear(5)
    """
    n_units = IntProperty(minval=1, allow_none=True)
    weight = ParameterProperty()
    bias = ParameterProperty(allow_none=True)

    def __init__(self,
                 n_units=None,
                 weight=init.HeNormal(),
                 bias=0,
                 name=None):

        super(Linear, self).__init__(name=name)

        self.n_units = n_units
        self.weight = weight
        self.bias = bias

    def get_output_shape(self, input_shape):
        input_shape = tf.TensorShape(input_shape)

        if self.n_units is None:
            return input_shape

        if input_shape and input_shape.ndims != 2:
            raise LayerConnectionError(
                "Input shape expected to have 2 dimensions, got {} instead. "
                "Shape: {}".format(input_shape.ndims, input_shape))

        n_samples = input_shape[0]
        return tf.TensorShape((n_samples, self.n_units))

    def create_variables(self, input_shape):
        if self.n_units is None:
            return

        input_shape = tf.TensorShape(input_shape)
        self.input_shape = input_shape
        _, n_input_features = input_shape

        if n_input_features.value is None:
            raise WeightInitializationError(
                "Cannot create variables for the layer `{}`, because "
                "number of input features is unknown. Input shape: {}"
                "Layer: {}".format(self.name, input_shape, self))

        self.weight = self.variable(value=self.weight,
                                    name='weight',
                                    shape=as_tuple(n_input_features,
                                                   self.n_units))

        if self.bias is not None:
            self.bias = self.variable(value=self.bias,
                                      name='bias',
                                      shape=as_tuple(self.n_units))

    def output(self, input, **kwargs):
        input = tf.convert_to_tensor(input, dtype=tf.float32)

        if self.n_units is None:
            return self.activation_function(input)

        if self.bias is None:
            output = tf.matmul(input, self.weight)
            return self.activation_function(output)

        output = tf.matmul(input, self.weight) + self.bias
        return self.activation_function(output)

    def activation_function(self, input_value):
        return input_value

    def __repr__(self):
        if self.n_units is None:
            return self._repr_arguments(name=self.name)

        return self._repr_arguments(
            self.n_units,
            name=self.name,
            weight=self.weight,
            bias=self.bias,
        )
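
``Linear`` therefore behaves either as an identity (``n_units=None``) or as an affine transformation ``x @ weight + bias``, where ``bias=None`` drops the bias term. A short NumPy sketch of those paths (purely illustrative, not NeuPy code):

import numpy as np

def linear_output(x, weight=None, bias=None):
    # n_units is None -> no parameters, identity behaviour
    if weight is None:
        return x
    output = x.dot(weight)
    # bias=None excludes the bias term, mirroring the docstring
    if bias is not None:
        output = output + bias
    return output

x = np.random.randn(4, 10)
weight = np.random.randn(10, 5)

print(linear_output(x).shape)                       # (4, 10) - identity
print(linear_output(x, weight).shape)               # (4, 5)  - no bias
print(linear_output(x, weight, np.zeros(5)).shape)  # (4, 5)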
Example n. 18
class GroupNorm(Identity):
    """
    Group Normalization layer. This layer is a simple alternative to the
    Batch Normalization layer for cases when batch size is small.

    Parameters
    ----------
    n_groups : int
        During normalization all the channels will be broken down into
        separate groups and the mean and variance will be estimated per
        group. This parameter controls the number of groups.

    gamma : array-like, TensorFlow variable, scalar or Initializer
        Scale. Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    beta : array-like, TensorFlow variable, scalar or Initializer
        Offset. Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    epsilon : float
        Epsilon ensures that the rescaling by the estimated variance
        never causes division by zero. Defaults to ``1e-5``.

    {Identity.name}

    Methods
    -------
    {Identity.Methods}

    Attributes
    ----------
    {Identity.Attributes}

    Examples
    --------
    Convolutional Neural Networks (CNN)

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((28, 28, 1)),
    ...     Convolution((3, 3, 16)) >> GroupNorm(4) >> Relu(),
    ...     Convolution((3, 3, 16)) >> GroupNorm(4) >> Relu(),
    ...     Reshape(),
    ...     Softmax(10),
    ... )

    References
    ----------
    .. [1] Group Normalization, Yuxin Wu, Kaiming He,
           https://arxiv.org/pdf/1803.08494.pdf
    """
    n_groups = IntProperty(minval=1)
    beta = ParameterProperty()
    gamma = ParameterProperty()
    epsilon = NumberProperty(minval=0)

    def __init__(self, n_groups, beta=0, gamma=1, epsilon=1e-5, name=None):
        super(GroupNorm, self).__init__(name=name)

        self.n_groups = n_groups
        self.beta = beta
        self.gamma = gamma
        self.epsilon = epsilon

    def create_variables(self, input_shape):
        n_channels = input_shape[3]

        if n_channels.value is None:
            raise WeightInitializationError(
                "Cannot initialize variables when number of "
                "channels is unknown. Input shape: {}, Layer: {}"
                "".format(input_shape, self))

        parameter_shape = (1, 1, 1, n_channels)

        self.gamma = self.variable(value=self.gamma,
                                   name='gamma',
                                   shape=parameter_shape)

        self.beta = self.variable(value=self.beta,
                                  name='beta',
                                  shape=parameter_shape)

    def get_output_shape(self, input_shape):
        input_shape = tf.TensorShape(input_shape)

        if input_shape and input_shape.ndims != 4:
            raise LayerConnectionError(
                "Group normalization layer expects 4 dimensional input, "
                "got {} instead. Input shape: {}, Layer: {}"
                "".format(input_shape.ndims, input_shape, self))

        n_channels = input_shape[3]

        if n_channels.value and n_channels % self.n_groups != 0:
            raise LayerConnectionError(
                "Cannot divide {} input channels into {} groups. "
                "Input shape: {}, Layer: {}".format(n_channels, self.n_groups,
                                                    input_shape, self))

        return super(GroupNorm, self).get_output_shape(input_shape)

    def output(self, input):
        input = tf.convert_to_tensor(input, dtype=tf.float32)
        input_shape = tf.shape(input)
        n_groups = self.n_groups

        # We access dimensional information in form of tensors in case
        # if some of the dimensions are undefined. In this way we make
        # sure that reshape will work even if part of the input shape
        # is undefined.
        dims = [input_shape[i] for i in range(4)]
        n_samples, height, width, n_channels = dims

        input = tf.reshape(
            input,
            [n_samples, height, width, n_groups, n_channels // n_groups])

        mean, variance = tf.nn.moments(input, [1, 2, 4], keep_dims=True)
        input = (input - mean) / tf.sqrt(variance + self.epsilon)
        input = tf.reshape(input, input_shape)

        return input * self.gamma + self.beta
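
The ``output`` method above reshapes the channel axis into ``(n_groups, n_channels // n_groups)`` and normalizes over height, width and the within-group channels. The same computation in plain NumPy, with hypothetical sizes:

import numpy as np

n_samples, height, width, n_channels = 2, 4, 4, 8
n_groups, epsilon = 4, 1e-5
x = np.random.randn(n_samples, height, width, n_channels)

# Split channels into groups: (N, H, W, G, C // G)
grouped = x.reshape(n_samples, height, width,
                    n_groups, n_channels // n_groups)

# Mean and variance per sample and per group, estimated over
# height, width and the channels that belong to the group
mean = grouped.mean(axis=(1, 2, 4), keepdims=True)
variance = grouped.var(axis=(1, 2, 4), keepdims=True)

normalized = (grouped - mean) / np.sqrt(variance + epsilon)
normalized = normalized.reshape(n_samples, height, width, n_channels)

# gamma and beta would then rescale and shift each channel
print(normalized.shape)  # (2, 4, 4, 8)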
Example n. 19
class BatchNorm(Identity):
    """
    Batch normalization layer.

    Parameters
    ----------
    axes : tuple with ints or None
        Axes along which normalization will be applied. The ``None``
        value means that normalization will be applied over all axes
        except the last one. In case of 4D tensor it will
        be equal to ``(0, 1, 2)``. Defaults to ``None``.

    epsilon : float
        Epsilon is a positive constant that is added to the variance
        estimate to prevent division by zero.
        Defaults to ``1e-5``.

    alpha : float
        Coefficient for the exponential moving average of
        batch-wise means and standard deviations computed during
        training; the closer to one, the more it will depend on
        the last batches seen. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.1``.

    gamma : array-like, TensorFlow variable, scalar or Initializer
        Scale. Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    beta : array-like, TensorFlow variable, scalar or Initializer
        Offset. Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_mean : array-like, TensorFlow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_inv_std : array-like, TensorFlow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    {Identity.name}

    Methods
    -------
    {Identity.Methods}

    Attributes
    ----------
    {Identity.Attributes}

    Examples
    --------

    Feedforward Neural Networks (FNN) with batch normalization applied
    after the activation function.

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input(10),
    ...     Relu(5) >> BatchNorm(),
    ...     Relu(5) >> BatchNorm(),
    ...     Sigmoid(1),
    ... )

    Feedforward Neural Networks (FNN) with batch normalization applied
    before the activation function.

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input(10),
    ...     Linear(5) >> BatchNorm() >> Relu(),
    ...     Linear(5) >> BatchNorm() >> Relu(),
    ...     Sigmoid(1),
    ... )

    Convolutional Neural Networks (CNN)

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((28, 28, 1)),
    ...     Convolution((3, 3, 16)) >> BatchNorm() >> Relu(),
    ...     Convolution((3, 3, 16)) >> BatchNorm() >> Relu(),
    ...     Reshape(),
    ...     Softmax(10),
    ... )

    References
    ----------
    .. [1] Batch Normalization: Accelerating Deep Network Training
           by Reducing Internal Covariate Shift,
           http://arxiv.org/pdf/1502.03167v3.pdf
    """
    axes = TypedListProperty(allow_none=True)
    epsilon = NumberProperty(minval=0)
    alpha = ProperFractionProperty()
    beta = ParameterProperty()
    gamma = ParameterProperty()

    running_mean = ParameterProperty()
    running_inv_std = ParameterProperty()

    def __init__(self,
                 axes=None,
                 alpha=0.1,
                 beta=0,
                 gamma=1,
                 epsilon=1e-5,
                 running_mean=0,
                 running_inv_std=1,
                 name=None):

        super(BatchNorm, self).__init__(name=name)

        self.axes = axes
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.epsilon = epsilon
        self.running_mean = running_mean
        self.running_inv_std = running_inv_std

        if axes is not None and len(set(axes)) != len(axes):
            raise ValueError(
                "Specified axes have to contain only unique values")

    def create_variables(self, input_shape):
        input_shape = tf.TensorShape(input_shape)

        if input_shape.ndims is None:
            raise WeightInitializationError(
                "Cannot initialize variables for the batch normalization "
                "layer, because input shape is undefined. Layer: {}"
                "".format(self))

        if self.axes is None:
            # If ndims == 4 then axes = (0, 1, 2)
            # If ndims == 2 then axes = (0,)
            self.axes = tuple(range(input_shape.ndims - 1))

        if any(axis >= input_shape.ndims for axis in self.axes):
            raise LayerConnectionError(
                "Batch normalization cannot be applied over one of "
                "the axis, because input has only {} dimensions. Layer: {}"
                "".format(input_shape.ndims, self))

        parameter_shape = tuple([
            input_shape[axis].value if axis not in self.axes else 1
            for axis in range(input_shape.ndims)
        ])

        if any(parameter is None for parameter in parameter_shape):
            unknown_dim_index = parameter_shape.index(None)

            raise WeightInitializationError(
                "Cannot create variables for batch normalization, because "
                "input has unknown dimension #{} (0-based indices). "
                "Input shape: {}, Layer: {}".format(unknown_dim_index,
                                                    input_shape, self))

        self.input_shape = input_shape
        self.running_mean = self.variable(value=self.running_mean,
                                          shape=parameter_shape,
                                          name='running_mean',
                                          trainable=False)

        self.running_inv_std = self.variable(value=self.running_inv_std,
                                             shape=parameter_shape,
                                             name='running_inv_std',
                                             trainable=False)

        self.gamma = self.variable(value=self.gamma,
                                   name='gamma',
                                   shape=parameter_shape)

        self.beta = self.variable(value=self.beta,
                                  name='beta',
                                  shape=parameter_shape)

    def output(self, input, training=False):
        input = tf.convert_to_tensor(input, dtype=tf.float32)

        if not training:
            mean = self.running_mean
            inv_std = self.running_inv_std
        else:
            alpha = asfloat(self.alpha)
            mean = tf.reduce_mean(
                input,
                self.axes,
                keepdims=True,
                name="mean",
            )
            variance = tf.reduce_mean(
                tf.squared_difference(input, tf.stop_gradient(mean)),
                self.axes,
                keepdims=True,
                name="variance",
            )
            inv_std = tf.rsqrt(variance + asfloat(self.epsilon))

            tf.add_to_collection(
                tf.GraphKeys.UPDATE_OPS,
                self.running_inv_std.assign(
                    asfloat(1 - alpha) * self.running_inv_std +
                    alpha * inv_std))
            tf.add_to_collection(
                tf.GraphKeys.UPDATE_OPS,
                self.running_mean.assign(
                    asfloat(1 - alpha) * self.running_mean + alpha * mean))

        normalized_value = (input - mean) * inv_std
        return self.gamma * normalized_value + self.beta
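
During training the layer mixes each batch statistic into exponential moving averages with weight ``alpha``, and those running values are what the inference path uses. A NumPy sketch of that update rule (the data and sizes are made up):

import numpy as np

alpha, epsilon = 0.1, 1e-5
running_mean, running_inv_std = 0.0, 1.0   # Constant(0) / Constant(1)

rng = np.random.RandomState(0)
for _ in range(100):
    batch = rng.randn(32) * 2.0 + 5.0      # data with mean ~5, std ~2

    batch_mean = batch.mean()
    batch_inv_std = 1.0 / np.sqrt(batch.var() + epsilon)

    # same update as the two tf.assign calls above
    running_mean = (1 - alpha) * running_mean + alpha * batch_mean
    running_inv_std = (1 - alpha) * running_inv_std + alpha * batch_inv_std

print(round(running_mean, 1), round(1 / running_inv_std, 1))  # roughly 5.0 2.0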
Example n. 20
class PRelu(Linear):
    """
    Layer with the parametrized ReLu used as an activation function.
    Layer learns additional parameter ``alpha`` during the training.

    It applies linear transformation when the ``n_units`` parameter
    specified and parametrized relu function after the transformation.
    When ``n_units`` is not specified, only parametrized relu function
    will be applied to the input.

    Parameters
    ----------
    alpha_axes : int or tuple
        Axes along which the alpha parameter won't be shared; the layer
        learns a separate alpha value for each element along these axes.
        Single integer value defines the same as a tuple with one value.
        Defaults to ``-1``.

    alpha : array-like, TensorFlow variable, scalar or Initializer
        Separate alpha parameter per each non-shared axis for the ReLu.
        Scalar value means that each element in the tensor will be
        equal to the specified value. Default initialization methods you
        can find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0.25)``.

    {Linear.Parameters}

    Methods
    -------
    {Linear.Methods}

    Attributes
    ----------
    {Linear.Attributes}

    Examples
    --------
    Feedforward Neural Networks (FNN)

    >>> from neupy.layers import *
    >>> network = Input(10) >> PRelu(20) >> PRelu(1)

    Convolutional Neural Networks (CNN)

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((32, 32, 3)),
    ...     Convolution((3, 3, 16)) >> PRelu(),
    ...     Convolution((3, 3, 32)) >> PRelu(),
    ...     Reshape(),
    ...     Softmax(10),
    ... )

    References
    ----------
    .. [1] Delving Deep into Rectifiers: Surpassing Human-Level
           Performance on ImageNet Classification.
           https://arxiv.org/pdf/1502.01852v1.pdf
    """
    alpha_axes = TypedListProperty()
    alpha = ParameterProperty()

    def __init__(self,
                 n_units=None,
                 alpha_axes=-1,
                 alpha=0.25,
                 weight=init.HeNormal(gain=2),
                 bias=0,
                 name=None):

        self.alpha = alpha
        self.alpha_axes = as_tuple(alpha_axes)

        if 0 in self.alpha_axes:
            raise ValueError("Cannot specify alpha for 0-axis")

        super(PRelu, self).__init__(n_units=n_units,
                                    weight=weight,
                                    bias=bias,
                                    name=name)

    def get_output_shape(self, input_shape):
        input_shape = tf.TensorShape(input_shape)

        if input_shape and max(self.alpha_axes) >= input_shape.ndims:
            max_axis_index = input_shape.ndims - 1

            raise LayerConnectionError(
                "Cannot specify alpha for the axis #{}. Maximum "
                "available axis is {} (0-based indices)."
                "".format(max(self.alpha_axes), max_axis_index))

        return super(PRelu, self).get_output_shape(input_shape)

    def create_variables(self, input_shape):
        super(PRelu, self).create_variables(input_shape)
        output_shape = self.get_output_shape(input_shape)

        self.alpha = self.variable(
            value=self.alpha,
            name='alpha',
            shape=[output_shape[axis] for axis in self.alpha_axes])

    def activation_function(self, input):
        input = tf.convert_to_tensor(input, dtype=tf.float32)
        ndim = input.shape.ndims

        dimensions = np.arange(ndim)
        alpha_axes = dimensions[list(self.alpha_axes)]

        alpha = tf_utils.dimshuffle(self.alpha, ndim, alpha_axes)
        return tf.maximum(0.0, input) + alpha * tf.minimum(0.0, input)

    def __repr__(self):
        if self.n_units is None:
            return self._repr_arguments(name=self.name,
                                        alpha_axes=self.alpha_axes,
                                        alpha=self.alpha)

        return self._repr_arguments(self.n_units,
                                    name=self.name,
                                    alpha_axes=self.alpha_axes,
                                    alpha=self.alpha,
                                    weight=self.weight,
                                    bias=self.bias)
Example n. 21
class Convolution(ParameterBasedLayer):
    """
    Convolutional layer.

    Parameters
    ----------
    size : tuple of int
        Filter shape. It should be defined as a tuple with three
        integers ``(filter rows, filter columns, output channels)``.

    padding : {{``same``, ``valid``}}, int, tuple
        Zero padding for the input tensor.

        - ``valid`` - Padding won't be added to the tensor. Result will be
          the same as for ``padding=0``.

        - ``same`` - Padding will depend on the number of rows and columns
          in the filter. This padding makes sure that an image with
          ``stride=1`` won't change its width and height. It's the same as
          ``padding=(filter rows // 2, filter columns // 2)``.

        - Custom value for the padding can be specified as an integer, like
          ``padding=1`` or it can be specified as a tuple when different
          dimensions have different padding values, for example
          ``padding=(2, 3)``.

        Defaults to ``valid``.

    stride : tuple with ints, int
        Stride size. Defaults to ``(1, 1)``.

    dilation : int, tuple
        Rate for the filter upsampling. When ``dilation > 1`` the layer
        becomes a dilated convolution (or atrous convolution). Defaults to ``1``.

    weight : array-like, TensorFlow variable, scalar or Initializer
        Defines layer's weights. Shape of the weight will be equal to
        ``(filter rows, filter columns, input channels, output channels)``.
        Default initialization methods you can find
        :ref:`here <init-methods>`. Defaults to
        :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Examples
    --------
    2D Convolution

    >>> from neupy import layers
    >>>
    >>> layers.join(
    ...     layers.Input((28, 28, 3)),
    ...     layers.Convolution((3, 3, 16)),
    ... )

    1D Convolution

    >>> from neupy import layers
    >>>
    >>> layers.join(
    ...     layers.Input((30, 10)),
    ...     layers.Reshape((30, 1, 10)),
    ...     layers.Convolution((3, 1, 16)),
    ... )

    Methods
    -------
    {ParameterBasedLayer.Methods}

    Attributes
    ----------
    {ParameterBasedLayer.Attributes}
    """
    # We use gain=2 because it's suitable choice for relu non-linearity
    # and relu is the most common non-linearity used for CNN.
    weight = ParameterProperty(default=init.HeNormal(gain=2))
    size = TypedListProperty(required=True, element_type=int)
    padding = PaddingProperty(default='valid')
    stride = Spatial2DProperty(default=(1, 1))
    dilation = Spatial2DProperty(default=1)

    def validate(self, input_shape):
        if input_shape and len(input_shape) != 3:
            raise LayerConnectionError(
                "Convolutional layer expects an input with 3 "
                "dimensions, got {} with shape {}"
                "".format(len(input_shape), input_shape))

    def output_shape_per_dim(self, *args, **kwargs):
        return conv_output_shape(*args, **kwargs)

    def find_output_from_input_shape(self, input_shape):
        padding = self.padding
        rows, cols, _ = input_shape

        row_filter_size, col_filter_size, n_kernels = self.size
        row_stride, col_stride = self.stride
        row_dilation, col_dilation = self.dilation or (1, 1)

        if isinstance(padding, (list, tuple)):
            row_padding, col_padding = padding
        else:
            row_padding, col_padding = padding, padding

        output_rows = self.output_shape_per_dim(
            rows, row_filter_size,
            row_padding, row_stride, row_dilation,
        )
        output_cols = self.output_shape_per_dim(
            cols, col_filter_size,
            col_padding, col_stride, col_dilation,
        )

        return (output_rows, output_cols, n_kernels)

    @property
    def output_shape(self):
        if self.input_shape is not None:
            return self.find_output_from_input_shape(self.input_shape)

    @property
    def weight_shape(self):
        n_channels = self.input_shape[-1]
        n_rows, n_cols, n_filters = self.size
        return (n_rows, n_cols, n_channels, n_filters)

    @property
    def bias_shape(self):
        return as_tuple(self.size[-1])

    def output(self, input_value):
        padding = self.padding

        if not isinstance(padding, six.string_types):
            height_pad, width_pad = padding
            input_value = tf.pad(input_value, [
                [0, 0],
                [height_pad, height_pad],
                [width_pad, width_pad],
                [0, 0],
            ])
            # VALID option will make sure that
            # convolution won't use any padding.
            padding = 'VALID'

        output = tf.nn.convolution(
            input_value,
            self.weight,
            padding=padding,
            strides=self.stride,
            dilation_rate=self.dilation,
            data_format="NHWC"
        )

        if self.bias is not None:
            bias = tf.reshape(self.bias, (1, 1, 1, -1))
            output += bias

        return output
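
``output_shape_per_dim`` delegates to ``conv_output_shape``; for an integer padding the usual formula is ``out = (in + 2 * pad - dilation * (kernel - 1) - 1) // stride + 1``. A small standalone sketch of that formula (not NeuPy's own helper):

def conv_output_size(in_size, kernel, padding, stride, dilation=1):
    # effective kernel size grows when the filter is dilated
    effective_kernel = dilation * (kernel - 1) + 1
    return (in_size + 2 * padding - effective_kernel) // stride + 1

# 28x28 input, 3x3 filter, no padding, stride 1 -> 26x26 output
print(conv_output_size(28, kernel=3, padding=0, stride=1))  # 26

# 'same'-style padding (kernel // 2) keeps the size when stride=1
print(conv_output_size(28, kernel=3, padding=1, stride=1))  # 28

# stride 2 halves the spatial size (rounding down)
print(conv_output_size(28, kernel=3, padding=1, stride=2))  # 14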
Example n. 22
class Convolution(BaseLayer):
    """
    Convolutional layer.

    Parameters
    ----------
    size : tuple of int
        Filter shape. It should be defined as a tuple with three
        integers ``(filter rows, filter columns, output channels)``.

    padding : {{``same``, ``valid``}}, int, tuple
        Zero padding for the input tensor.

        - ``valid`` - Padding won't be added to the tensor. Result will be
          the same as for ``padding=0``.

        - ``same`` - Padding will depend on the number of rows and columns
          in the filter. This padding makes sure that an image with
          ``stride=1`` won't change its width and height. It's the same as
          ``padding=(filter rows // 2, filter columns // 2)``.

        - Custom value for the padding can be specified as an integer, like
          ``padding=1`` or it can be specified as a tuple when different
          dimensions have different padding values, for example
          ``padding=(2, 3)``.

        Defaults to ``valid``.

    stride : tuple with ints, int
        Stride size. Defaults to ``(1, 1)``.

    dilation : int, tuple
        Rate for the filter upsampling. When ``dilation > 1`` the layer
        becomes a dilated convolution (or atrous convolution). Defaults to ``1``.

    weight : array-like, TensorFlow variable, scalar or Initializer
        Defines layer's weights. Shape of the weight will be equal to
        ``(filter rows, filter columns, input channels, output channels)``.
        Default initialization methods you can find
        :ref:`here <init-methods>`. Defaults to
        :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    bias : 1D array-like, TensorFlow variable, scalar, Initializer or None
        Defines layer's bias. Default initialization methods you can find
        :ref:`here <init-methods>`. Defaults to
        :class:`Constant(0) <neupy.init.Constant>`.
        The ``None`` value excludes bias from the calculations and
        does not add it to the parameters list.

    {BaseLayer.name}

    Examples
    --------
    2D Convolution

    >>> from neupy import layers
    >>>
    >>> layers.join(
    ...     layers.Input((28, 28, 3)),
    ...     layers.Convolution((3, 3, 16)),
    ... )

    1D Convolution

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((30, 10)),
    ...     Reshape((30, 1, 10)),  # convert 3D to 4D
    ...     Convolution((3, 1, 16)),
    ...     Reshape((-1, 16))  # convert 4D back to 3D
    ... )
    >>> network
    (?, 30, 10) -> [... 4 layers ...] -> (?, 28, 16)

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    size = TypedListProperty(element_type=int, n_elements=3)
    weight = ParameterProperty()
    bias = ParameterProperty(allow_none=True)

    padding = PaddingProperty()
    stride = Spatial2DProperty()
    dilation = Spatial2DProperty()

    # We use gain=2 because it's suitable choice for relu non-linearity
    # and relu is the most common non-linearity used for CNN.
    def __init__(self, size, padding='valid', stride=1, dilation=1,
                 weight=init.HeNormal(gain=2), bias=0, name=None):

        super(Convolution, self).__init__(name=name)

        self.size = size
        self.padding = padding
        self.stride = stride
        self.dilation = dilation
        self.weight = weight
        self.bias = bias

    def fail_if_shape_invalid(self, input_shape):
        if input_shape and input_shape.ndims != 4:
            raise LayerConnectionError(
                "Convolutional layer expects an input with 4 "
                "dimensions, got {} with shape {}"
                "".format(len(input_shape), input_shape))

    def output_shape_per_dim(self, *args, **kwargs):
        return conv_output_shape(*args, **kwargs)

    def expected_output_shape(self, input_shape):
        n_samples = input_shape[0]
        row_filter_size, col_filter_size, n_kernels = self.size
        row_stride, col_stride = self.stride
        row_dilation, col_dilation = self.dilation

        if isinstance(self.padding, (list, tuple)):
            row_padding, col_padding = self.padding
        else:
            row_padding, col_padding = self.padding, self.padding

        return (
            n_samples,
            self.output_shape_per_dim(
                input_shape[1], row_filter_size,
                row_padding, row_stride, row_dilation
            ),
            self.output_shape_per_dim(
                input_shape[2], col_filter_size,
                col_padding, col_stride, col_dilation
            ),
            n_kernels,
        )

    def get_output_shape(self, input_shape):
        input_shape = tf.TensorShape(input_shape)
        self.fail_if_shape_invalid(input_shape)

        if input_shape.ndims is None:
            n_samples = input_shape[0]
            n_kernels = self.size[-1]
            return tf.TensorShape((n_samples, None, None, n_kernels))

        return tf.TensorShape(self.expected_output_shape(input_shape))

    def create_variables(self, input_shape):
        self.input_shape = input_shape
        n_channels = input_shape[-1]
        n_rows, n_cols, n_filters = self.size

        # Weight tensor has shape:
        # (filter rows, filter columns, input channels, output channels)
        self.weight = self.variable(
            value=self.weight, name='weight',
            shape=(n_rows, n_cols, n_channels, n_filters))

        if self.bias is not None:
            self.bias = self.variable(
                value=self.bias, name='bias',
                shape=as_tuple(n_filters))

    def output(self, input, **kwargs):
        input = tf.convert_to_tensor(input, tf.float32)
        self.fail_if_shape_invalid(input.shape)
        padding = self.padding

        if not isinstance(padding, six.string_types):
            height_pad, width_pad = padding
            input = tf.pad(input, [
                [0, 0],
                [height_pad, height_pad],
                [width_pad, width_pad],
                [0, 0],
            ])
            # VALID option will make sure that
            # convolution won't use any padding.
            padding = 'VALID'

        output = tf.nn.convolution(
            input,
            self.weight,
            padding=padding,
            strides=self.stride,
            dilation_rate=self.dilation,
            data_format="NHWC",
        )

        if self.bias is not None:
            bias = tf.reshape(self.bias, (1, 1, 1, -1))
            output += bias

        return output

    def __repr__(self):
        return self._repr_arguments(
            self.size,
            padding=self.padding,
            stride=self.stride,
            dilation=self.dilation,
            weight=self.weight,
            bias=self.bias,
            name=self.name,
        )
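
When ``padding`` is given as integers, the ``output`` method pads the tensor explicitly and then runs the convolution with ``'VALID'`` padding, which is equivalent. A quick shape check of that conversion in NumPy (sizes are made up):

import numpy as np

height, width = 28, 28
row_pad, col_pad = 2, 3

image = np.zeros((1, height, width, 3))          # NHWC input
padded = np.pad(image, [(0, 0), (row_pad, row_pad),
                        (col_pad, col_pad), (0, 0)],
                mode='constant')

# After explicit padding, a 'VALID' 3x3 convolution with stride 1 sees
# the enlarged image and produces the padded-convolution output size
print(padded.shape)                              # (1, 32, 34, 3)
kernel = 3
print((padded.shape[1] - kernel + 1,
       padded.shape[2] - kernel + 1))            # (30, 32)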
Example n. 23
class LSTM(BaseRNNLayer):
    """
    Long Short Term Memory (LSTM) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    input_weights : Initializer, ndarray
        Weight parameters for input connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    hidden_weights : Initializer, ndarray
        Weight parameters for hidden connection.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    cell_weights : Initializer, ndarray
        Weight parameters for cell connection. Required only when
        ``peepholes=True``, otherwise it will be ignored.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    bias : Initializer, ndarray
        Bias parameters for all gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import tensorflow as tf
            dict(
                ingate=tf.nn.sigmoid,
                forgetgate=tf.nn.sigmoid,
                outgate=tf.nn.sigmoid,
                cell=tf.tanh,
            )

        If application requires modification to only one parameter
        then it's better to specify the one that you need to modify
        and ignore other parameters

        .. code-block:: python

            dict(ingate=tf.tanh)

        Other parameters like ``forgetgate`` or ``outgate`` will be
        equal to their default values.

    learn_init : bool
        If ``True``, make ``cell_init`` and ``hidden_init`` trainable
        variables. Defaults to ``False``.

    cell_init : array-like, TensorFlow variable, scalar or Initializer
        Initializer for initial cell state (:math:`c_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    hidden_init : array-like, TensorFlow variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always
        from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    {BaseRNNLayer.only_return_final}

    peepholes : bool
        If ``True``, the LSTM uses peephole connections.
        When ``False``, cell parameters are ignored.
        Defaults to ``False``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    gradient_clipping : float or int
        If nonzero, the gradient messages are clipped to the
        given value during the backward pass. Defaults to ``0``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------

    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.LSTM(20),
                layers.Sigmoid(1),
            ]
        )
    """
    input_weights = ParameterProperty(default=init.HeNormal())
    hidden_weights = ParameterProperty(default=init.HeNormal())
    cell_weights = ParameterProperty(default=init.HeNormal())
    biases = ParameterProperty(default=init.Constant(0))

    activation_functions = MultiCallableProperty(default=dict(
        ingate=tf.nn.sigmoid,
        forgetgate=tf.nn.sigmoid,
        outgate=tf.nn.sigmoid,
        cell=tf.tanh,
    ))

    learn_init = Property(default=False, expected_type=bool)
    cell_init = ParameterProperty(default=init.Constant(0))
    hidden_init = ParameterProperty(default=init.Constant(0))

    unroll_scan = Property(default=False, expected_type=bool)
    backwards = Property(default=False, expected_type=bool)
    peepholes = Property(default=False, expected_type=bool)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(LSTM, self).initialize()
        n_inputs = np.prod(self.input_shape[1:])

        # If peephole (cell-to-gate) connections are enabled, initialize
        # the peephole weights. These are elementwise products with the
        # cell state, so they are represented as vectors.
        if self.peepholes:
            self.weight_cell_to_ingate = self.add_parameter(
                value=self.cell_weights,
                name='weight_cell_to_ingate',
                shape=(self.size, ))
            self.weight_cell_to_forgetgate = self.add_parameter(
                value=self.cell_weights,
                name='weight_cell_to_forgetgate',
                shape=(self.size, ))
            self.weight_cell_to_outgate = self.add_parameter(
                value=self.cell_weights,
                name='weight_cell_to_outgate',
                shape=(self.size, ))

        self.input_weights = self.add_parameter(
            value=self.input_weights,
            name='input_weights',
            shape=(n_inputs, 4 * self.size),
        )
        self.hidden_weights = self.add_parameter(
            value=self.hidden_weights,
            name='hidden_weights',
            shape=(self.size, 4 * self.size),
        )
        self.biases = self.add_parameter(
            value=self.biases,
            name='biases',
            shape=(4 * self.size, ),
        )

        # Initialization parameters
        self.add_parameter(
            value=self.cell_init,
            shape=(1, self.size),
            name="cell_init",
            trainable=self.learn_init,
        )
        self.add_parameter(
            value=self.hidden_init,
            shape=(1, self.size),
            name="hidden_init",
            trainable=self.learn_init,
        )

    def output(self, input_value):
        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = tf.transpose(input_value, [1, 0, 2])
        input_shape = tf.shape(input_value)
        n_batch = input_shape[1]

        def one_lstm_step(states, input_n):
            with tf.name_scope('lstm-cell'):
                cell_previous, hid_previous = states
                input_n = tf.matmul(input_n, self.input_weights) + self.biases

                # Calculate gates pre-activations and slice
                gates = input_n + tf.matmul(hid_previous, self.hidden_weights)

                # Clip gradients
                if self.gradient_clipping != 0:
                    gates = clip_gradient(gates, self.gradient_clipping)

                # Extract the pre-activation gate values
                ingate, forgetgate, cell_input, outgate = tf.split(gates,
                                                                   4,
                                                                   axis=1)

                if self.peepholes:
                    # Compute peephole connections
                    ingate += cell_previous * self.weight_cell_to_ingate
                    forgetgate += (cell_previous *
                                   self.weight_cell_to_forgetgate)

                # Apply nonlinearities
                ingate = self.activation_functions.ingate(ingate)
                forgetgate = self.activation_functions.forgetgate(forgetgate)
                cell_input = self.activation_functions.cell(cell_input)

                # Compute new cell value
                cell = forgetgate * cell_previous + ingate * cell_input

                if self.peepholes:
                    outgate += cell * self.weight_cell_to_outgate

                outgate = self.activation_functions.outgate(outgate)

                # Compute new hidden unit activation
                hid = outgate * tf.tanh(cell)
                return [cell, hid]

        cell_init = tf.tile(self.cell_init, (n_batch, 1))
        hidden_init = tf.tile(self.hidden_init, (n_batch, 1))
        sequence = input_value

        if self.backwards:
            sequence = tf.reverse(sequence, axis=[0])

        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(
                fn=one_lstm_step,
                sequence=sequence,
                outputs_info=[cell_init, hidden_init],
            )
        else:
            _, hid_out = tf.scan(
                fn=one_lstm_step,
                elems=sequence,
                initializer=[cell_init, hidden_init],
                name='lstm-scan',
            )

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # If the sequence was processed backwards, reverse the output
        if self.backwards:
            hid_out = tf.reverse(hid_out, axis=[0])

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = tf.transpose(hid_out, [1, 0, 2])

        return hid_out
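
Below is a minimal NumPy sketch of a single step of the recurrence implemented by one_lstm_step above (peephole connections and gradient clipping omitted; all names are illustrative, not part of the layer's API):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, h_prev, c_prev, W_in, W_hid, b):
    # Gate pre-activations, stacked as [ingate, forgetgate, cell, outgate]
    gates = x.dot(W_in) + b + h_prev.dot(W_hid)
    ingate, forgetgate, cell_input, outgate = np.split(gates, 4, axis=1)

    ingate = sigmoid(ingate)
    forgetgate = sigmoid(forgetgate)
    cell_input = np.tanh(cell_input)
    outgate = sigmoid(outgate)

    c = forgetgate * c_prev + ingate * cell_input
    h = outgate * np.tanh(c)
    return c, h

# Toy usage: batch of 2 samples, 3 input features, 4 hidden units
rng = np.random.RandomState(0)
size, n_in, batch = 4, 3, 2
c, h = lstm_step(
    rng.randn(batch, n_in),
    np.zeros((batch, size)), np.zeros((batch, size)),
    rng.randn(n_in, 4 * size), rng.randn(size, 4 * size), np.zeros(4 * size))
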
Example n. 24
class BaseAssociative(BaseNetwork):
    """
    Base class for associative learning.

    Parameters
    ----------
    n_inputs : int
        Number of features (columns) in the input data.

    n_outputs : int
        Number of outputs in the network.

    weight : array-like, Initializer
        Neural network weights.
        A manually defined value should have shape ``(n_inputs, n_outputs)``.
        Defaults to :class:`Normal() <neupy.init.Normal>`.

    {BaseNetwork.Parameters}

    Methods
    -------
    {BaseSkeleton.predict}

    train(X_train, epochs=100)
        Train neural network.

    {BaseSkeleton.fit}
    """
    n_inputs = IntProperty(minval=1, required=True)
    n_outputs = IntProperty(minval=1, required=True)
    weight = ParameterProperty(default=init.Normal())

    def __init__(self, **options):
        super(BaseAssociative, self).__init__(**options)
        self.init_weights()

    def init_weights(self):
        valid_weight_shape = (self.n_inputs, self.n_outputs)

        if isinstance(self.weight, init.Initializer):
            self.weight = self.weight.sample(valid_weight_shape,
                                             return_array=True)

        if self.weight.shape != valid_weight_shape:
            raise ValueError(
                "Weight matrix has invalid shape. Got {}, expected {}"
                "".format(self.weight.shape, valid_weight_shape))

        self.weight = self.weight.astype(float)

    def format_input_data(self, X):
        X = format_data(X, is_feature1d=(self.n_inputs == 1))

        if X.ndim != 2:
            raise ValueError("Cannot make prediction, because input "
                             "data has more than 2 dimensions")

        if X.shape[1] != self.n_inputs:
            raise ValueError("Input data expected to have {} features, "
                             "but got {}".format(self.n_inputs, X.shape[1]))

        return X

    def train(self, X_train, epochs=100):
        X_train = self.format_input_data(X_train)
        return super(BaseAssociative, self).train(X_train=X_train,
                                                  epochs=epochs)
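
As a rough illustration of the weight handling above: the ``weight`` option accepts either an initializer that gets sampled during init_weights or an explicit array whose shape must match ``(n_inputs, n_outputs)``. A small sketch, assuming ``neupy.init`` is importable in this context:

import numpy as np
from neupy import init

n_inputs, n_outputs = 3, 2
valid_weight_shape = (n_inputs, n_outputs)

# Case 1: an Initializer instance is sampled into a NumPy array
weight = init.Normal().sample(valid_weight_shape, return_array=True)
assert weight.shape == valid_weight_shape

# Case 2: a manually defined matrix must already have the expected shape
manual_weight = np.zeros(valid_weight_shape)
assert manual_weight.shape == valid_weight_shape
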
Example n. 25
class BatchNorm(BaseLayer):
    """
    Batch-normalization layer.

    Parameters
    ----------
    axes : int, tuple with int or None
        The axis or axes along which normalization is applied.
        ``None`` means that normalization will be applied over
        all axes except the second one (the channel/feature axis).
        In case of a 4D tensor it will be equal to ``(0, 2, 3)``.
        Defaults to ``None``.
    epsilon : float
        Epsilon is a positive constant that adds to the standard
        deviation to prevent the division by zero.
        Defaults to ``1e-5``.
    alpha : float
        Coefficient for the exponential moving average of
        batch-wise means and standard deviations computed during
        training; the closer to one, the more it will depend on
        the last batches seen. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.1``.
    gamma : array-like, Theano variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.
    beta : array-like, Theano variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}

    References
    ----------
    .. [1] Batch Normalization: Accelerating Deep Network Training
           by Reducing Internal Covariate Shift,
           http://arxiv.org/pdf/1502.03167v3.pdf
    """
    axes = AxesProperty(default=None)
    alpha = ProperFractionProperty(default=0.1)
    epsilon = NumberProperty(default=1e-5, minval=0)
    gamma = ParameterProperty(default=Constant(value=1))
    beta = ParameterProperty(default=Constant(value=0))

    def initialize(self):
        super(BatchNorm, self).initialize()

        input_shape = as_tuple(None, self.input_shape)
        ndim = len(input_shape)

        if self.axes is None:
            # If ndim == 4 then axes = (0, 2, 3)
            # If ndim == 2 then axes = (0,)
            self.axes = tuple(axis for axis in range(ndim) if axis != 1)

        if any(axis >= ndim for axis in self.axes):
            raise ValueError("Cannot apply batch normalization on the axis "
                             "that doesn't exist.")

        opposite_axes = find_opposite_axes(self.axes, ndim)
        parameter_shape = [input_shape[axis] for axis in opposite_axes]

        if any(parameter is None for parameter in parameter_shape):
            unknown_dim_index = parameter_shape.index(None)
            raise ValueError("Cannot apply batch normalization on the axis "
                             "with unknown size over the dimension #{} "
                             "(0-based indeces).".format(unknown_dim_index))

        self.running_mean = theano.shared(
            name='running_mean_{}'.format(self.layer_id),
            value=asfloat(np.zeros(parameter_shape)))
        self.running_inv_std = theano.shared(
            name='running_inv_std_{}'.format(self.layer_id),
            value=asfloat(np.ones(parameter_shape)))

        if isinstance(self.gamma, Initializer):
            self.gamma = self.gamma.sample(parameter_shape)

        if isinstance(self.beta, Initializer):
            self.beta = self.beta.sample(parameter_shape)

        self.gamma = theano.shared(
            name='gamma_{}'.format(self.layer_id),
            value=asfloat(self.gamma),
        )
        self.beta = theano.shared(
            name='beta_{}'.format(self.layer_id),
            value=asfloat(self.beta),
        )
        self.parameters = [self.gamma, self.beta]

    def output(self, input_value):
        epsilon = asfloat(self.epsilon)
        alpha = asfloat(self.alpha)
        gamma, beta = self.gamma, self.beta

        ndim = input_value.ndim
        axes = self.axes

        running_mean = self.running_mean
        running_inv_std = self.running_inv_std

        input_mean = input_value.mean(axes)
        input_var = input_value.var(axes)
        input_inv_std = T.inv(T.sqrt(input_var + epsilon))

        self.updates = [
            (running_inv_std,
             asfloat(1 - alpha) * running_inv_std + alpha * input_inv_std),
            (running_mean,
             asfloat(1 - alpha) * running_mean + alpha * input_mean)
        ]

        if not self.training_state:
            mean = running_mean
            inv_std = running_inv_std

        else:
            mean = input_mean
            inv_std = input_inv_std

        opposite_axes = find_opposite_axes(axes, ndim)

        beta = dimshuffle(beta, ndim, opposite_axes)
        gamma = dimshuffle(gamma, ndim, opposite_axes)
        mean = dimshuffle(mean, ndim, opposite_axes)
        inv_std = dimshuffle(inv_std, ndim, opposite_axes)

        normalized_value = (input_value - mean) * inv_std
        return gamma * normalized_value + beta
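
The training-time computation above reduces to the following NumPy sketch (illustrative only): normalize with the batch mean and inverse standard deviation, then fold both statistics into exponential moving averages controlled by ``alpha`` for later use at prediction time.

import numpy as np

def batch_norm_train(x, gamma, beta, running_mean, running_inv_std,
                     axes=(0,), alpha=0.1, epsilon=1e-5):
    mean = x.mean(axis=axes, keepdims=True)
    inv_std = 1.0 / np.sqrt(x.var(axis=axes, keepdims=True) + epsilon)

    # Exponential moving averages replace the batch statistics
    # when the layer runs in prediction mode
    new_running_mean = (1 - alpha) * running_mean + alpha * mean
    new_running_inv_std = (1 - alpha) * running_inv_std + alpha * inv_std

    output = gamma * (x - mean) * inv_std + beta
    return output, new_running_mean, new_running_inv_std

# (batch, features) input normalized over the batch axis
x = np.random.randn(16, 8)
out, new_mean, new_inv_std = batch_norm_train(
    x, gamma=np.ones((1, 8)), beta=np.zeros((1, 8)),
    running_mean=np.zeros((1, 8)), running_inv_std=np.ones((1, 8)))
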
Example n. 26
class BaseStepAssociative(BaseAssociative):
    """
    Base class for associative algorithms that have two layers, where the
    first one uses a step function as its activation.

    Parameters
    ----------
    {BaseAssociative.n_inputs}

    {BaseAssociative.n_outputs}

    n_unconditioned : int
        Number of unconditioned units in the neural network. These
        units are not updated during the training procedure.
        Unconditioned units should correspond to the first features
        in the dataset.

    weight : array-like
        Neural network weights.
        A manually defined value should have shape ``(n_inputs, n_outputs)``.
        Defaults to ``None`` which means that all unconditional
        weights will be equal to ``1``. Other weights equal to ``0``.

    bias : array-like, Initializer
        Neural network bias units.
        Defaults to :class:`Constant(-0.5) <neupy.init.Constant>`.

    {BaseNetwork.Parameters}

    Methods
    -------
    {BaseAssociative.Methods}
    """
    n_inputs = IntProperty(minval=2, required=True)
    n_unconditioned = IntProperty(minval=1, required=True)

    weight = ArrayProperty()
    bias = ParameterProperty(default=init.Constant(-0.5))

    def init_weights(self):
        if self.n_inputs <= self.n_unconditioned:
            raise ValueError(
                "Number of uncondition features should be less than total "
                "number of features. `n_inputs`={} and `n_unconditioned`={}"
                "".format(self.n_inputs, self.n_unconditioned))

        valid_weight_shape = (self.n_inputs, self.n_outputs)
        valid_bias_shape = (self.n_outputs, )

        if self.weight is None:
            self.weight = np.zeros(valid_weight_shape)
            self.weight[:self.n_unconditioned, :] = 1

        if isinstance(self.bias, init.Initializer):
            self.bias = self.bias.sample(valid_bias_shape, return_array=True)

        super(BaseStepAssociative, self).init_weights()

        if self.bias.shape != valid_bias_shape:
            raise ValueError(
                "Bias vector has invalid shape. Got {}, expected {}"
                "".format(self.bias.shape, valid_bias_shape))

        self.bias = self.bias.astype(float)

    def predict(self, X):
        X = format_data(X, is_feature1d=False)
        raw_output = X.dot(self.weight) + self.bias
        return np.where(raw_output > 0, 1, 0)

    def train(self, X_train, *args, **kwargs):
        X_train = format_data(X_train, is_feature1d=False)
        return super(BaseStepAssociative, self).train(X_train, *args, **kwargs)

    def one_training_update(self, X_train, y_train):
        weight = self.weight
        n_unconditioned = self.n_unconditioned
        predict = self.predict
        weight_delta = self.weight_delta

        error = 0

        for x_row in X_train:
            x_row = np.expand_dims(x_row, axis=0)
            layer_output = predict(x_row)

            delta = weight_delta(x_row, layer_output)
            weight[n_unconditioned:, :] += delta

            # This error tells us whether the network has converged to
            # some set of weights. A low error means that the weights
            # haven't been updated much during the training epoch.
            error += np.linalg.norm(delta)

        return error
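
The update loop above only touches the conditioned part of the weight matrix and uses the norm of each update as a convergence signal. A minimal NumPy sketch of one such epoch, with a hypothetical Hebbian-style weight_delta (this class leaves weight_delta to subclasses):

import numpy as np

n_inputs, n_outputs, n_unconditioned = 3, 1, 1
weight = np.zeros((n_inputs, n_outputs))
weight[:n_unconditioned, :] = 1          # unconditioned weights stay fixed
bias = np.full(n_outputs, -0.5)

def predict(x):
    return np.where(x.dot(weight) + bias > 0, 1, 0)

def weight_delta(x_row, layer_output, step=0.1):
    # Hypothetical rule: reinforce conditioned inputs that fire
    # together with active outputs
    return step * x_row[:, n_unconditioned:].T.dot(layer_output)

X_train = np.array([[1, 1, 0], [1, 0, 1], [0, 1, 0]])
error = 0
for x_row in X_train:
    x_row = np.expand_dims(x_row, axis=0)
    delta = weight_delta(x_row, predict(x_row))
    weight[n_unconditioned:, :] += delta
    error += np.linalg.norm(delta)   # small error => weights barely changed
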
Example n. 27
class GRU(BaseRNNLayer):
    """
    Gated Recurrent Unit (GRU) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    weights : dict or Initializer
        Weight parameters for different gates.
        Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`.

        - If the application requires the same initialization method
          for all weights, it's possible to specify a single initialization
          method that will be automatically applied to all weight
          parameters in the GRU layer.

          .. code-block:: python

              layers.GRU(2, weights=init.Normal(0.1))

        - If the application requires different initialization
          values for different weights, it's possible to specify
          each weight by name.

          .. code-block:: python

              dict(
                  weight_in_to_updategate=init.XavierUniform(),
                  weight_hid_to_updategate=init.XavierUniform(),

                  weight_in_to_resetgate=init.XavierUniform(),
                  weight_hid_to_resetgate=init.XavierUniform(),

                  weight_in_to_hidden_update=init.XavierUniform(),
                  weight_hid_to_hidden_update=init.XavierUniform(),
              )

          If only one (or a few) of these parameters need to be modified,
          it's better to specify just those and leave the others out:

          .. code-block:: python

              dict(weight_in_to_updategate=init.Normal(0.1))

          Other parameters like ``weight_in_to_resetgate`` will be
          equal to their default values.

    biases : dict or Initializer
        Bias parameters for different gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

        - If the application requires the same initialization method
          for all biases, it's possible to specify a single initialization
          method that will be automatically applied to all bias parameters
          in the GRU layer.

          .. code-block:: python

              layers.GRU(2, biases=init.Constant(1))

        - If the application requires different initialization
          values for different biases, it's possible to specify
          each bias by name.

          .. code-block:: python

              dict(
                  bias_updategate=init.Constant(0),
                  bias_resetgate=init.Constant(0),
                  bias_hidden_update=init.Constant(0),
              )

          If only one (or a few) of these parameters need to be modified,
          it's better to specify just those and leave the others out:

          .. code-block:: python

              dict(bias_resetgate=init.Constant(1))

          Other parameters like ``bias_updategate`` will be
          equal to their default values.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import theano.tensor as T
            dict(
                resetgate=T.nnet.sigmoid,
                updategate=T.nnet.sigmoid,
                hidden_update=T.tanh,
            )

        If the application requires modifying only one of these functions,
        it's better to specify just that one and leave the others out:

        .. code-block:: python

            dict(resetgate=T.tanh)

        Other parameters like ``updategate`` or ``hidden_update``
        will be equal to their default values.

    learn_init : bool
        If ``True``, make ``hid_init`` a trainable variable.
        Defaults to ``False``.

    hid_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    {BaseRNNLayer.only_return_final}

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always
        from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    precompute_input : bool
        If ``True``, precompute ``input_to_hid`` before iterating
        through the sequence. This can result in a speed up at the
        expense of an increase in memory usage.
        Defaults to ``True``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------

    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.GRU(20),
                layers.Sigmoid(1),
            ]
        )
    """
    weights = MultiParameterProperty(
        default=dict(
            weight_in_to_updategate=init.XavierUniform(),
            weight_hid_to_updategate=init.XavierUniform(),

            weight_in_to_resetgate=init.XavierUniform(),
            weight_hid_to_resetgate=init.XavierUniform(),

            weight_in_to_hidden_update=init.XavierUniform(),
            weight_hid_to_hidden_update=init.XavierUniform(),
        ))
    biases = MultiParameterProperty(
        default=dict(
            bias_updategate=init.Constant(0),
            bias_resetgate=init.Constant(0),
            bias_hidden_update=init.Constant(0),
        ))
    activation_functions = MultiCallableProperty(
        default=dict(
            resetgate=T.nnet.sigmoid,
            updategate=T.nnet.sigmoid,
            hidden_update=T.tanh,
        ))

    learn_init = Property(default=False, expected_type=bool)
    hid_init = ParameterProperty(default=init.Constant(0))

    backwards = Property(default=False, expected_type=bool)
    unroll_scan = Property(default=False, expected_type=bool)
    precompute_input = Property(default=True, expected_type=bool)

    n_gradient_steps = IntProperty(default=-1)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        super(GRU, self).initialize()

        n_inputs = np.prod(self.input_shape[1:])
        weights = self.weights
        biases = self.biases

        # Update gate parameters
        self.weight_in_to_updategate = self.add_parameter(
            value=weights.weight_in_to_updategate,
            name='weight_in_to_updategate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_updategate = self.add_parameter(
            value=weights.weight_hid_to_updategate,
            name='weight_hid_to_updategate',
            shape=(self.size, self.size))
        self.bias_updategate = self.add_parameter(
            value=biases.bias_updategate, name='bias_updategate',
            shape=(self.size,))

        # Reset gate parameters
        self.weight_in_to_resetgate = self.add_parameter(
            value=weights.weight_in_to_resetgate,
            name='weight_in_to_resetgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_resetgate = self.add_parameter(
            value=weights.weight_hid_to_resetgate,
            name='weight_hid_to_resetgate',
            shape=(self.size, self.size))
        self.bias_resetgate = self.add_parameter(
            value=biases.bias_resetgate, name='bias_resetgate',
            shape=(self.size,))

        # Hidden update gate parameters
        self.weight_in_to_hidden_update = self.add_parameter(
            value=weights.weight_in_to_hidden_update,
            name='weight_in_to_hidden_update',
            shape=(n_inputs, self.size))
        self.weight_hid_to_hidden_update = self.add_parameter(
            value=weights.weight_hid_to_hidden_update,
            name='weight_hid_to_hidden_update',
            shape=(self.size, self.size))
        self.bias_hidden_update = self.add_parameter(
            value=biases.bias_hidden_update, name='bias_hidden_update',
            shape=(self.size,))

        self.add_parameter(value=self.hid_init, shape=(1, self.size),
                           name="hid_init", trainable=self.learn_init)

    def output(self, input_value):
        # Treat all dimensions after the second as flattened
        # feature dimensions
        if input_value.ndim > 3:
            input_value = T.flatten(input_value, 3)

        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = input_value.dimshuffle(1, 0, 2)
        seq_len, n_batch, _ = input_value.shape

        # Stack input weight matrices into a (num_inputs, 3 * num_units)
        # matrix, which speeds up computation
        weight_in_stacked = T.concatenate([
            self.weight_in_to_updategate,
            self.weight_in_to_resetgate,
            self.weight_in_to_hidden_update], axis=1)

        # Same for hidden weight matrices
        weight_hid_stacked = T.concatenate([
            self.weight_hid_to_updategate,
            self.weight_hid_to_resetgate,
            self.weight_hid_to_hidden_update], axis=1)

        # Stack biases into a (3 * num_units) vector
        bias_stacked = T.concatenate([
            self.bias_updategate,
            self.bias_resetgate,
            self.bias_hidden_update], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute_input the inputs dot weight matrices before scanning.
            # weight_in_stacked is (n_features, 3 * num_units).
            # Input: (n_time_steps, n_batch, 3 * num_units).
            input_value = T.dot(input_value, weight_in_stacked) + bias_stacked

        # When theano.scan calls step, input_n will be
        # (n_batch, 3 * num_units). We define a slicing function
        # that extracts the input to each GRU gate
        def slice_w(x, n):
            s = x[:, n * self.size:(n + 1) * self.size]
            if self.size == 1:
                s = T.addbroadcast(s, 1)  # Theano cannot infer this by itself
            return s

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def one_gru_step(input_n, hid_previous, *args):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1},
            # and W_{hc} h_{t - 1}
            hid_input = T.dot(hid_previous, weight_hid_stacked)

            if self.gradient_clipping:
                input_n = theano.gradient.grad_clip(
                    input_n,
                    -self.gradient_clipping,
                    self.gradient_clipping)

                hid_input = theano.gradient.grad_clip(
                    hid_input,
                    -self.gradient_clipping,
                    self.gradient_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u,
                # and W_{xc}x_t + b_c
                input_n = T.dot(input_n, weight_in_stacked) + bias_stacked

            # Update and reset gates. Slices follow the stacking
            # order above: [updategate, resetgate, hidden_update]
            updategate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = self.activation_functions.updategate(updategate)

            resetgate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.activation_functions.resetgate(resetgate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate * hidden_update_hid

            if self.gradient_clipping:
                hidden_update = theano.gradient.grad_clip(
                    hidden_update,
                    -self.gradient_clipping,
                    self.gradient_clipping)

            hidden_update = self.activation_functions.hidden_update(
                hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous + updategate * hidden_update
            return hid

        hid_init = T.dot(T.ones((n_batch, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_sequences = [weight_hid_stacked]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_sequences += [weight_in_stacked, bias_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            n_time_steps = self.input_shape[0]

            # Explicitly unroll the recurrence instead of using scan
            hid_out, = unroll_scan(
                fn=one_gru_step,
                sequences=[input_value],
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                n_steps=n_time_steps)

        else:
            # Scan op iterates over first dimension of input and
            # repeatedly applies the step function
            hid_out, _ = theano.scan(
                fn=one_gru_step,
                sequences=[input_value],
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                truncate_gradient=self.n_gradient_steps,
                strict=True)

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, 2)

        # If the sequence was processed backwards, reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

        return hid_out
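
For comparison with the code above, a minimal NumPy sketch of a single GRU step (input precomputation, broadcasting details, and gradient clipping omitted; names are illustrative):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x, h_prev, W_in, W_hid, b, size):
    # Weights are stacked along the last axis in the order
    # [updategate, resetgate, hidden_update]
    x_in = x.dot(W_in) + b        # (batch, 3 * size)
    h_in = h_prev.dot(W_hid)      # (batch, 3 * size)

    def slice_w(v, n):
        return v[:, n * size:(n + 1) * size]

    updategate = sigmoid(slice_w(x_in, 0) + slice_w(h_in, 0))
    resetgate = sigmoid(slice_w(x_in, 1) + slice_w(h_in, 1))
    hidden_update = np.tanh(slice_w(x_in, 2) + resetgate * slice_w(h_in, 2))

    return (1 - updategate) * h_prev + updategate * hidden_update

# Toy usage: batch of 2 samples, 3 input features, 4 hidden units
rng = np.random.RandomState(0)
size, n_in, batch = 4, 3, 2
h = gru_step(
    rng.randn(batch, n_in), np.zeros((batch, size)),
    rng.randn(n_in, 3 * size), rng.randn(size, 3 * size),
    np.zeros(3 * size), size)
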
Example n. 28
class Embedding(BaseLayer):
    """
    Embedding layer accepts indices as an input and returns
    rows from the weight matrix associated with these indices.
    It's useful when inputs are categorical features or for the
    word embedding tasks.

    Parameters
    ----------
    input_size : int
        Layer's input vector dimension. It is typically associated with the
        number of categories or the number of unique words in the input.

    output_size : int
        Layer's output vector dimension.

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal() <neupy.init.HeNormal>`.

    {BaseLayer.name}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}

    Examples
    --------

    This example converts a dataset that has only categorical
    variables into a format suitable for the Embedding layer.

    >>> import numpy as np
    >>> from neupy.layers import *
    >>>
    >>> dataset = np.array([
    ...     ['cold', 'high'],
    ...     ['hot',  'low'],
    ...     ['cold', 'low'],
    ...     ['hot',  'low'],
    ... ])
    >>>
    >>> unique_value, dataset_indices = np.unique(
    ...     dataset, return_inverse=True
    ... )
    >>> dataset_indices = dataset_indices.reshape((4, 2))
    >>> dataset_indices
    array([[0, 1],
           [2, 3],
           [0, 3],
           [2, 3]])
    >>>
    >>> n_features = dataset.shape[1]
    >>> n_unique_categories = len(unique_value)
    >>> embedded_size = 1
    >>>
    >>> network = join(
    ...     Input(n_features),
    ...     Embedding(n_unique_categories, embedded_size),
    ...     # Output from the embedding layer is 3D
    ...     # To make output 2D we need to reshape dimensions
    ...     Reshape(),
    ... )
    """
    input_size = IntProperty(minval=1)
    output_size = IntProperty(minval=1)
    weight = ParameterProperty()

    def __init__(self, input_size, output_size,
                 weight=init.HeNormal(), name=None):

        super(Embedding, self).__init__(name=name)

        self.input_size = input_size
        self.output_size = output_size
        self.weight = weight

    def get_output_shape(self, input_shape):
        input_shape = tf.TensorShape(input_shape)
        return input_shape.concatenate(self.output_size)

    def create_variables(self, input_shape):
        self.input_shape = input_shape
        self.weight = self.variable(
            value=self.weight, name='weight',
            shape=as_tuple(self.input_size, self.output_size))

    def output(self, input_value, **kwargs):
        input_value = tf.cast(input_value, tf.int32)
        return tf.gather(self.weight, input_value)

    def __repr__(self):
        return self._repr_arguments(
            self.input_size,
            self.output_size,
            name=self.name,
            weight=self.weight,
        )
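
The lookup performed in output is plain row selection from the weight matrix; a NumPy equivalent (illustrative only):

import numpy as np

n_categories, embedded_size = 4, 3
weight = np.arange(n_categories * embedded_size).reshape(n_categories, embedded_size)

# Indices of shape (batch, n_features) select rows and
# produce a 3D output of shape (batch, n_features, embedded_size)
indices = np.array([[0, 1],
                    [2, 3]])
embedded = weight[indices]          # mirrors tf.gather(weight, indices)
assert embedded.shape == (2, 2, embedded_size)
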
Example n. 29
class RBM(BaseAlgorithm, BaseNetwork, MinibatchTrainingMixin):
    """
    Boolean/Bernoulli Restricted Boltzmann Machine (RBM).
    Algorithm assumes that inputs are either binary
    values or values between 0 and 1.

    Parameters
    ----------
    n_visible : int
        Number of visible units.

    n_hidden : int
        Number of hidden units.

    {MinibatchTrainingMixin.batch_size}

    weight : array-like, Theano variable, Initializer or scalar
        Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`XavierNormal <neupy.init.XavierNormal>`.

    hidden_bias : array-like, Theano variable, Initializer or scalar
        Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`Constant(value=0) <neupy.init.Constant>`.

    visible_bias : array-like, Theano variable, Initializer or scalar
        Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`Constant(value=0) <neupy.init.Constant>`.

    {BaseNetwork.Parameters}

    Methods
    -------
    train(input_train, epochs=100)
        Trains network.

    {BaseSkeleton.fit}

    visible_to_hidden(visible_input)
        Propagates data through the network and returns the output
        from the hidden layer.

    hidden_to_visible(hidden_input)
        Propagates output from the hidden layer backward
        to the visible.

    gibbs_sampling(visible_input, n_iter=1)
        Makes Gibbs sampling ``n`` times using visible input.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> data = np.array([
    ...     [1, 0, 1, 0],
    ...     [1, 0, 1, 0],
    ...     [1, 0, 0, 0],  # incomplete sample
    ...     [1, 0, 1, 0],
    ...
    ...     [0, 1, 0, 1],
    ...     [0, 0, 0, 1],  # incomplete sample
    ...     [0, 1, 0, 1],
    ...     [0, 1, 0, 1],
    ...     [0, 1, 0, 1],
    ...     [0, 1, 0, 1],
    ... ])
    >>>
    >>> rbm = algorithms.RBM(n_visible=4, n_hidden=1)
    >>> rbm.train(data, epochs=100)
    >>>
    >>> hidden_states = rbm.visible_to_hidden(data)
    >>> hidden_states.round(2)
    array([[ 0.99],
           [ 0.99],
           [ 0.95],
           [ 0.99],
           [ 0.  ],
           [ 0.01],
           [ 0.  ],
           [ 0.  ],
           [ 0.  ],
           [ 0.  ]])

    References
    ----------
    .. [1] G. Hinton, A Practical Guide to Training Restricted
           Boltzmann Machines, 2010.
           http://www.cs.toronto.edu/~hinton/absps/guideTR.pdf
    """
    n_visible = IntProperty(minval=1)
    n_hidden = IntProperty(minval=1)

    weight = ParameterProperty(default=init.XavierNormal())
    hidden_bias = ParameterProperty(default=init.Constant(value=0))
    visible_bias = ParameterProperty(default=init.Constant(value=0))

    def __init__(self, n_visible, n_hidden, **options):
        self.theano_random = theano_random_stream()

        super(ConfigurableABC, self).__init__(n_hidden=n_hidden,
                                              n_visible=n_visible,
                                              **options)

        self.weight = create_shared_parameter(value=self.weight,
                                              name='algo:rbm/matrix:weight',
                                              shape=(n_visible, n_hidden))
        self.hidden_bias = create_shared_parameter(
            value=self.hidden_bias,
            name='algo:rbm/vector:hidden-bias',
            shape=(n_hidden, ),
        )
        self.visible_bias = create_shared_parameter(
            value=self.visible_bias,
            name='algo:rbm/vector:visible-bias',
            shape=(n_visible, ),
        )

        super(RBM, self).__init__(**options)

    def init_input_output_variables(self):
        self.variables.update(
            network_input=T.matrix(name='algo:rbm/var:network-input'),
        )

    def init_variables(self):
        self.variables.update(
            h_samples=theano.shared(
                name='algo:rbm/matrix:hidden-samples',
                value=asint(np.zeros((self.batch_size, self.n_hidden))),
            ),
        )

    def init_methods(self):
        def free_energy(visible_sample):
            wx_b = T.dot(visible_sample, self.weight) + self.hidden_bias
            visible_bias_term = T.dot(visible_sample, self.visible_bias)
            hidden_term = T.log(asfloat(1) + T.exp(wx_b)).sum(axis=1)
            return -visible_bias_term - hidden_term

        def visible_to_hidden(visible_sample):
            wx_b = T.dot(visible_sample, self.weight) + self.hidden_bias
            return T.nnet.sigmoid(wx_b)

        def hidden_to_visible(hidden_sample):
            wx_b = T.dot(hidden_sample, self.weight.T) + self.visible_bias
            return T.nnet.sigmoid(wx_b)

        def sample_hidden_from_visible(visible_sample):
            theano_random = self.theano_random
            hidden_prob = visible_to_hidden(visible_sample)
            hidden_sample = theano_random.binomial(n=1,
                                                   p=hidden_prob,
                                                   dtype=theano.config.floatX)
            return hidden_sample

        def sample_visible_from_hidden(hidden_sample):
            theano_random = self.theano_random
            visible_prob = hidden_to_visible(hidden_sample)
            visible_sample = theano_random.binomial(n=1,
                                                    p=visible_prob,
                                                    dtype=theano.config.floatX)
            return visible_sample

        network_input = self.variables.network_input
        n_samples = asfloat(network_input.shape[0])
        theano_random = self.theano_random

        weight = self.weight
        h_bias = self.hidden_bias
        v_bias = self.visible_bias
        h_samples = self.variables.h_samples
        step = asfloat(self.step)

        sample_indices = theano_random.random_integers(
            low=0, high=n_samples - 1, size=(self.batch_size, ))
        v_pos = ifelse(
            T.eq(n_samples, self.batch_size),
            network_input,
            # In case the final batch has fewer
            # samples than expected
            network_input[sample_indices])
        h_pos = visible_to_hidden(v_pos)

        v_neg = sample_visible_from_hidden(h_samples)
        h_neg = visible_to_hidden(v_neg)

        weight_update = v_pos.T.dot(h_pos) - v_neg.T.dot(h_neg)
        h_bias_update = (h_pos - h_neg).mean(axis=0)
        v_bias_update = (v_pos - v_neg).mean(axis=0)

        # Stochastic pseudo-likelihood
        feature_index_to_flip = theano_random.random_integers(
            low=0,
            high=self.n_visible - 1,
        )
        # Note: the rounding step is effectively skipped and the
        # input is used as-is
        rounded_input = network_input
        rounded_input_flip = T.set_subtensor(
            rounded_input[:, feature_index_to_flip],
            1 - rounded_input[:, feature_index_to_flip])
        error = T.mean(self.n_visible * T.log(
            T.nnet.sigmoid(
                free_energy(rounded_input_flip) - free_energy(rounded_input))))

        self.methods.update(
            train_epoch=theano.function(
                [network_input],
                error,
                name='algo:rbm/func:train-epoch',
                updates=[
                    (weight, weight + step * weight_update / n_samples),
                    (h_bias, h_bias + step * h_bias_update),
                    (v_bias, v_bias + step * v_bias_update),
                    (h_samples, asint(theano_random.binomial(n=1, p=h_neg))),
                ],
            ),
            prediction_error=theano.function(
                [network_input],
                error,
                name='algo:rbm/func:prediction-error',
            ),
            visible_to_hidden=theano.function(
                [network_input],
                visible_to_hidden(network_input),
                name='algo:rbm/func:visible-to-hidden',
            ),
            hidden_to_visible=theano.function(
                [network_input],
                hidden_to_visible(network_input),
                name='algo:rbm/func:hidden-to-visible',
            ),
            gibbs_sampling=theano.function(
                [network_input],
                sample_visible_from_hidden(
                    sample_hidden_from_visible(network_input)),
                name='algo:rbm/func:gibbs-sampling',
            ),
        )

    def train(self, input_train, input_test=None, epochs=100, summary='table'):
        """
        Train RBM.

        Parameters
        ----------
        input_train : 1D or 2D array-like
        input_test : 1D or 2D array-like or None
            Defaults to ``None``.
        epochs : int
            Number of training epochs. Defaults to ``100``.
        summary : {'table', 'inline'}
            Training summary type. Defaults to ``'table'``.
        """
        return super(RBM, self).train(input_train=input_train,
                                      target_train=None,
                                      input_test=input_test,
                                      target_test=None,
                                      epochs=epochs,
                                      epsilon=None,
                                      summary=summary)

    def train_epoch(self, input_train, target_train=None):
        """
        Train one epoch.

        Parameters
        ----------
        input_train : array-like (n_samples, n_features)

        Returns
        -------
        float
        """
        errors = self.apply_batches(
            function=self.methods.train_epoch,
            input_data=input_train,
            description='Training batches',
            show_error_output=True,
        )

        n_samples = len(input_train)
        return average_batch_errors(errors, n_samples, self.batch_size)

    def visible_to_hidden(self, visible_input):
        """
        Propagates data through the network and returns the output
        from the hidden layer.

        Parameters
        ----------
        visible_input : array-like (n_samples, n_visible_features)

        Returns
        -------
        array-like
        """
        is_input_feature1d = (self.n_visible == 1)
        visible_input = format_data(visible_input, is_input_feature1d)

        outputs = self.apply_batches(function=self.methods.visible_to_hidden,
                                     input_data=visible_input,
                                     description='Hidden from visible batches',
                                     show_progressbar=True,
                                     show_error_output=False)

        return np.concatenate(outputs, axis=0)

    def hidden_to_visible(self, hidden_input):
        """
        Propagates output from the hidden layer backward
        to the visible.

        Parameters
        ----------
        hidden_input : array-like (n_samples, n_hidden_features)

        Returns
        -------
        array-like
        """
        is_input_feature1d = (self.n_hidden == 1)
        hidden_input = format_data(hidden_input, is_input_feature1d)

        outputs = self.apply_batches(function=self.methods.hidden_to_visible,
                                     input_data=hidden_input,
                                     description='Visible from hidden batches',
                                     show_progressbar=True,
                                     show_error_output=False)

        return np.concatenate(outputs, axis=0)

    def prediction_error(self, input_data, target_data=None):
        """
        Compute the pseudo-likelihood of input samples.

        Parameters
        ----------
        input_data : array-like
            Values of the visible layer

        Returns
        -------
        float
            Value of the pseudo-likelihood.
        """
        is_input_feature1d = (self.n_visible == 1)
        input_data = format_data(input_data, is_input_feature1d)

        errors = self.apply_batches(
            function=self.methods.prediction_error,
            input_data=input_data,
            description='Validation batches',
            show_error_output=True,
        )
        return average_batch_errors(errors,
                                    n_samples=len(input_data),
                                    batch_size=self.batch_size)

    def gibbs_sampling(self, visible_input, n_iter=1):
        """
        Makes Gibbs sampling n times using visible input.

        Parameters
        ----------
        visible_input : 1d or 2d array
        n_iter : int
            Number of Gibbs sampling iterations. Defaults to ``1``.

        Returns
        -------
        array-like
            Output from the visible units after performing n
            Gibbs samples. Array will contain only binary
            units (0 and 1).
        """
        is_input_feature1d = (self.n_visible == 1)
        visible_input = format_data(visible_input, is_input_feature1d)

        gibbs_sampling = self.methods.gibbs_sampling

        input_ = visible_input
        for iteration in range(n_iter):
            input_ = gibbs_sampling(input_)

        return input_
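
A minimal NumPy sketch of the propagation functions defined in init_methods (visible-to-hidden, hidden-to-visible, and one Gibbs sampling round); it is illustrative only and omits the persistent contrastive divergence updates used during training:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def visible_to_hidden(v, weight, hidden_bias):
    return sigmoid(v.dot(weight) + hidden_bias)

def hidden_to_visible(h, weight, visible_bias):
    return sigmoid(h.dot(weight.T) + visible_bias)

def gibbs_sampling(v, weight, hidden_bias, visible_bias, rng):
    # Sample binary hidden units, then reconstruct binary visible units
    hidden_prob = visible_to_hidden(v, weight, hidden_bias)
    hidden_sample = rng.binomial(n=1, p=hidden_prob)
    visible_prob = hidden_to_visible(hidden_sample, weight, visible_bias)
    return rng.binomial(n=1, p=visible_prob)

rng = np.random.RandomState(0)
n_visible, n_hidden = 4, 1
weight = rng.normal(scale=0.1, size=(n_visible, n_hidden))
v = np.array([[1, 0, 1, 0]])
v_next = gibbs_sampling(v, weight, np.zeros(n_hidden), np.zeros(n_visible), rng)
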
Example n. 30
class BatchNorm(BaseLayer):
    """
    Batch-normalization layer.

    Parameters
    ----------
    axes : int, tuple with int or None
        The axis or axes along which normalization is applied.
        ``None`` means that normalization will be applied over
        all axes except the last one. In case of 4D tensor it will
        be equal to ``(0, 1, 2)``. Defaults to ``None``.

    epsilon : float
        Epsilon is a positive constant that adds to the standard
        deviation to prevent the division by zero.
        Defaults to ``1e-5``.

    alpha : float
        Coefficient for the exponential moving average of
        batch-wise means and standard deviations computed during
        training; the closer to one, the more it will depend on
        the last batches seen. Value needs to be between ``0`` and ``1``.
        Defaults to ``0.1``.

    gamma : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    beta : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_mean : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=0)``.

    running_inv_std : array-like, Tensorflow variable, scalar or Initializer
        Default initialization methods you can
        find :ref:`here <init-methods>`.
        Defaults to ``Constant(value=1)``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}

    References
    ----------
    .. [1] Batch Normalization: Accelerating Deep Network Training
           by Reducing Internal Covariate Shift,
           http://arxiv.org/pdf/1502.03167v3.pdf
    """
    axes = AxesProperty(default=None)
    epsilon = NumberProperty(default=1e-5, minval=0)
    alpha = ProperFractionProperty(default=0.1)
    beta = ParameterProperty(default=init.Constant(value=0))
    gamma = ParameterProperty(default=init.Constant(value=1))

    running_mean = ParameterProperty(default=init.Constant(value=0))
    running_inv_std = ParameterProperty(default=init.Constant(value=1))

    def initialize(self):
        super(BatchNorm, self).initialize()

        input_shape = as_tuple(None, self.input_shape)
        ndim = len(input_shape)

        if self.axes is None:
            # If ndim == 4 then axes = (0, 1, 2)
            # If ndim == 2 then axes = (0,)
            self.axes = tuple(range(ndim - 1))

        if any(axis >= ndim for axis in self.axes):
            raise ValueError("Cannot apply batch normalization on the axis "
                             "that doesn't exist.")

        opposite_axes = find_opposite_axes(self.axes, ndim)
        parameter_shape = [
            input_shape[axis] if axis in opposite_axes else 1
            for axis in range(ndim)
        ]

        if any(parameter is None for parameter in parameter_shape):
            unknown_dim_index = parameter_shape.index(None)
            raise ValueError("Cannot apply batch normalization on the axis "
                             "with unknown size over the dimension #{} "
                             "(0-based indeces).".format(unknown_dim_index))

        self.add_parameter(value=self.running_mean,
                           shape=parameter_shape,
                           name='running_mean',
                           trainable=False)
        self.add_parameter(value=self.running_inv_std,
                           shape=parameter_shape,
                           name='running_inv_std',
                           trainable=False)

        self.add_parameter(value=self.gamma,
                           name='gamma',
                           shape=parameter_shape,
                           trainable=True)
        self.add_parameter(value=self.beta,
                           name='beta',
                           shape=parameter_shape,
                           trainable=True)

    def output(self, input_value):
        alpha = asfloat(self.alpha)
        running_mean = self.running_mean
        running_inv_std = self.running_inv_std

        if not self.training_state:
            mean, inv_std = running_mean, running_inv_std
        else:
            mean = tf.reduce_mean(
                input_value,
                self.axes,
                keepdims=True,
                name="mean",
            )
            variance = tf.reduce_mean(
                tf.squared_difference(input_value, tf.stop_gradient(mean)),
                self.axes,
                keepdims=True,
                name="variance",
            )
            inv_std = tf.rsqrt(variance + asfloat(self.epsilon))

            self.updates = [
                (running_inv_std,
                 asfloat(1 - alpha) * running_inv_std + alpha * inv_std),
                (running_mean,
                 asfloat(1 - alpha) * running_mean + alpha * mean)
            ]

        normalized_value = (input_value - mean) * inv_std
        return self.gamma * normalized_value + self.beta