Beispiel #1
0
class GaussianNoise(BaseLayer):
    """ Add gaussian noise to the input value. Mean and standard
    deviation are layer's parameters.

    Noise is added only while the layer is in the training state,
    otherwise the input is returned unchanged.

    Parameters
    ----------
    std : float
        Standard deviation of the gaussian noise. Value needs to
        be greater than zero. Defaults to ``1``.
    mean : float
        Mean of the gaussian noise. Defaults to ``0``.
    """
    std = NumberProperty(default=1, minval=0)
    mean = NumberProperty(default=0)

    def __init__(self, std=1, **options):
        # ``std`` stays the first positional argument; the default
        # mirrors the documented (and property-level) default so that
        # ``GaussianNoise()`` no longer fails with a ``TypeError``.
        options['std'] = std
        super(GaussianNoise, self).__init__(**options)

    def output(self, input_value):
        """ Return the input with gaussian noise added during
        training and the untouched input otherwise.
        """
        if not self.training_state:
            return input_value

        theano_random = theano_random_stream()
        # Noise has the same shape as the input
        noise = theano_random.normal(size=input_value.shape,
                                     avg=self.mean, std=self.std)
        return input_value + noise

    def __repr__(self):
        classname = self.__class__.__name__
        return "{}(mean={}, std={})".format(classname, self.mean, self.std)
Beispiel #2
0
class GaussianNoise(Identity):
    """
    Add gaussian noise to the input value. Mean and standard deviation
    of the noise can be controlled from the layers parameters.

    It's important to note that output from the layer is controlled by
    the ``training`` parameter in the ``output`` method. Layer
    will be applied only in cases when ``training=True`` propagated
    through the network, otherwise it will act as an identity.

    Parameters
    ----------
    std : float
        Standard deviation of the gaussian noise. Values needs to
        be greater than zero. Defaults to ``1``.

    mean : float
        Mean of the gaussian noise. Defaults to ``0``.

    {Identity.name}

    Methods
    -------
    {Identity.Methods}

    Attributes
    ----------
    {Identity.Attributes}

    Examples
    --------
    >>> from neupy.layers import *
    >>> network = join(
    ...     Input(10),
    ...     Relu(5) >> GaussianNoise(std=0.1),
    ...     Relu(5) >> GaussianNoise(std=0.1),
    ...     Sigmoid(1),
    ... )
    >>> network
    (?, 10) -> [... 6 layers ...] -> (?, 1)
    """
    mean = NumberProperty()
    std = NumberProperty(minval=0)

    def __init__(self, mean=0, std=1, name=None):
        # Defaults follow the documented contract (zero-mean noise with
        # unit standard deviation). They used to be swapped, which made
        # the default layer add a constant ``1`` instead of noise.
        super(GaussianNoise, self).__init__(name=name)
        self.mean = mean
        self.std = std

    def output(self, input_value, training=False):
        """
        Return ``input_value`` with gaussian noise added when
        ``training`` is true; otherwise return it unchanged.
        """
        if not training:
            return input_value

        # The shape is taken dynamically from the input tensor
        noise = tf.random_normal(shape=tf.shape(input_value),
                                 mean=self.mean,
                                 stddev=self.std)

        return input_value + noise
Beispiel #3
0
class Relu(ActivationLayer):
    """
    Layer that applies the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        Slope that is used for the negative values. Non-zero
        ``alpha`` makes the layer behave like a leaky ReLu.
        Defaults to ``0``.
    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}
    """
    alpha = NumberProperty(default=0, minval=0)

    def activation_function(self, input_value):
        # ``alpha`` is passed through ``asfloat`` before it is
        # handed over to the Theano op.
        return T.nnet.relu(input_value, asfloat(self.alpha))
Beispiel #4
0
class Elu(ActivationLayer):
    """
    Layer that applies the exponential linear unit (ELU)
    activation function.

    Parameters
    ----------
    alpha : float
        Controls the exponential decrease rate for the
        negative values. Defaults to ``1``.
    {ActivationLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    References
    ----------
    .. [1] http://arxiv.org/pdf/1511.07289v3.pdf
    """
    alpha = NumberProperty(default=1, minval=0)

    def activation_function(self, input_value):
        # ``alpha`` is passed through ``asfloat`` before it is
        # handed over to the Theano op.
        return T.nnet.elu(input_value, asfloat(self.alpha))
Beispiel #5
0
class LVQ2(LVQ):
    """
    Learning Vector Quantization 2 (LVQ2) algorithm.
    Improved version for the LVQ algorithm.

    Parameters
    ----------
    epsilon : float
        Ratio between two closest subclasses that
        triggers double weight update. Defaults to ``0.1``.

    {LVQ.Parameters}

    Notes
    -----
    {LVQ.Notes}
    """
    epsilon = NumberProperty(default=0.1)

    def train_epoch(self, input_train, target_train):
        """
        Run one pass over the training samples, updating the
        prototypes in place, and return the misclassification rate
        (``1 - accuracy``) measured during the pass.
        """
        weight = self.weight
        epsilon = self.epsilon
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(input_train, target_train):
            step = self.training_step
            output = euclid_distance(input_row, weight)
            # Two closest prototypes (subclasses) for the sample
            winner_subclasses = n_argmin(output, n=2, axis=1)

            top1_subclass, top2_subclass = winner_subclasses
            top1_class = subclass_to_class[top1_subclass]
            top2_class = subclass_to_class[top2_subclass]

            top1_weight_update = input_row - weight[top1_subclass, :]
            is_correct_prediction = (top1_class == target)

            closest_dist, runner_up_dist = output[0, winner_subclasses]
            # Double update happens only when the winner is wrong, the
            # runner-up is right and both distances are within the
            # ``epsilon`` window of each other.
            double_update_condition_satisfied = (
                not is_correct_prediction and
                (top2_class == target) and
                closest_dist > ((1 - epsilon) * runner_up_dist) and
                runner_up_dist < ((1 + epsilon) * closest_dist)
            )

            if double_update_condition_satisfied:
                # Rows of ``weight`` are indexed by subclass, so the
                # runner-up update has to be computed from the
                # runner-up subclass row and not from its class label.
                top2_weight_update = input_row - weight[top2_subclass, :]
                weight[top1_subclass, :] -= step * top1_weight_update
                weight[top2_subclass, :] += step * top2_weight_update

            elif is_correct_prediction:
                # Pull the winning prototype towards the sample
                weight[top1_subclass, :] += step * top1_weight_update

            else:
                # Push the wrong winning prototype away from the sample
                weight[top1_subclass, :] -= step * top1_weight_update

            n_correct_predictions += is_correct_prediction

        n_samples = len(input_train)
        return 1 - n_correct_predictions / n_samples
Beispiel #6
0
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.95``.
    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-5``.
    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        """
        Attach to every layer parameter a zero-initialized shared
        variable that stores the running mean of squared gradients.
        """
        super(RMSProp, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_mean_squred_grad = theano.shared(
                    name="prev_mean_squred_grad_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        """
        Build the RMSProp updates for a single parameter.

        Returns a list with two ``(variable, new_value)`` pairs: the
        running mean of squared gradients and the parameter itself.
        """
        # NOTE: this method used to allocate an
        # (n_parameters x n_parameters) shared matrix and build a full
        # Hessian graph on every call. None of it was used by the
        # RMSProp update, so it has been removed.
        prev_mean_squred_grad = parameter.prev_mean_squred_grad
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        # Exponentially decayed average of the squared gradient
        mean_squred_grad = (self.decay * prev_mean_squred_grad +
                            (1 - self.decay) * gradient**2)
        # ``epsilon`` keeps the denominator away from zero
        parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

        return [
            (prev_mean_squred_grad, mean_squred_grad),
            (parameter, parameter - step * parameter_delta),
        ]
Beispiel #7
0
class GaussianNoise(BaseLayer):
    """
    Add gaussian noise to the input value. Mean and standard
    deviation are layer's parameters.

    Noise is added only while the layer is in the training state,
    otherwise the input is returned unchanged.

    Parameters
    ----------
    std : float
        Standard deviation of the gaussian noise. Values needs to
        be greater than zero. Defaults to ``1``.

    mean : float
        Mean of the gaussian noise. Defaults to ``0``.

    {BaseLayer.Parameters}

    Methods
    -------
    {BaseLayer.Methods}

    Attributes
    ----------
    {BaseLayer.Attributes}
    """
    std = NumberProperty(default=1, minval=0)
    mean = NumberProperty(default=0)

    def __init__(self, mean=0, std=1, **options):
        # Defaults match the property defaults and the documentation.
        # They used to be swapped (``mean=1, std=0``), which turned the
        # default layer into a constant shift without any noise.
        super(GaussianNoise, self).__init__(mean=mean, std=std, **options)

    def output(self, input_value):
        """
        Return the input with gaussian noise added during training
        and the untouched input otherwise.
        """
        if not self.training_state:
            return input_value

        # The shape is taken dynamically from the input tensor
        noise = tf.random_normal(
            shape=tf.shape(input_value),
            mean=self.mean,
            stddev=self.std)

        return input_value + noise

    def __repr__(self):
        classname = self.__class__.__name__
        return "{}(mean={}, std={})".format(classname, self.mean, self.std)
Beispiel #8
0
class Relu(ActivationLayer):
    """
    Layer that applies the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        Slope that is used for the negative values. Non-zero
        ``alpha`` makes the layer behave like a leaky ReLu.
        Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorfow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}

    Examples
    --------
    Feedforward Neural Networks (FNN)

    >>> from neupy.layers import *
    >>> network = Input(10) > Relu(20) > Relu(1)

    Convolutional Neural Networks (CNN)

    >>> from neupy.layers import *
    >>> network = join(
    ...     Input((32, 32, 3)),
    ...     Convolution((3, 3, 16)) > Relu(),
    ...     Convolution((3, 3, 32)) > Relu(),
    ...     Reshape(),
    ...     Softmax(10),
    ... )
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        # Leaky variant is used only for the non-zero slope
        if self.alpha != 0:
            return tf.nn.leaky_relu(input_value, asfloat(self.alpha))
        return tf.nn.relu(input_value)
class RMSProp(MinibatchGradientDescent):
    """
    RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.95``.

    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-5``.

    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.RMSProp((2, 3, 1))
    >>> mnet.train(x_train, y_train)
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_param_updates(self, layer, parameter):
        """
        Build the RMSProp updates for a single parameter: one pair
        for the running mean of squared gradients and one pair for
        the parameter itself.
        """
        learning_rate = self.variables.step

        # Zero-initialized accumulator with the parameter's shape
        shape = T.shape(parameter).eval()
        running_average = theano.shared(
            name="{}/prev-mean-squared-grad".format(parameter.name),
            value=asfloat(np.zeros(shape)),
        )

        grad = T.grad(self.variables.error_func, wrt=parameter)

        decayed_history = self.decay * running_average
        fresh_contribution = (1 - self.decay) * grad**2
        new_running_average = decayed_history + fresh_contribution

        scaled_grad = grad / T.sqrt(new_running_average + self.epsilon)
        new_parameter = parameter - learning_rate * scaled_grad

        return [
            (running_average, new_running_average),
            (parameter, new_parameter),
        ]
Beispiel #10
0
class SearchThenConverge(SingleStepConfigurable):
    """
    Algorithm that decreases learning step after each epoch.

    Parameters
    ----------
    reduction_freq : int
        The parameter controls the frequency reduction step
        with respect to epochs. Defaults to ``100`` epochs.
        Can't be less than ``1``. Less value mean that step
        decrease faster.

    rate_coefitient : float
        Second important parameter to control the rate of
        error reduction. Defaults to ``0.2``

    Warns
    -----
    {SingleStepConfigurable.Warns}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     verbose=False,
    ...     addons=[algorithms.SearchThenConverge]
    ... )
    >>>

    See Also
    --------
    :network:`StepDecay`
    """
    reduction_freq = IntProperty(minval=1, default=100)
    rate_coefitient = NumberProperty(default=0.2)

    def init_train_updates(self):
        """
        Extend the parent's training updates with the update
        that recomputes the step from the current epoch.
        """
        train_updates = super(SearchThenConverge, self).init_train_updates()

        initial_step = asfloat(self.step)
        freq = asfloat(self.reduction_freq)

        step_variable = self.variables.step
        epoch_ratio = self.variables.epoch / freq

        rated = 1 + (self.rate_coefitient / initial_step) * epoch_ratio
        numerator = initial_step * rated
        denominator = rated + freq * epoch_ratio**2

        train_updates.append((step_variable, numerator / denominator))
        return train_updates
Beispiel #11
0
class ZCA(BaseSkeleton):
    """ ZCA (zero-phase component analysis) whitening.

    Parameters
    ----------
    regularization : float
        Regularization parameter. Defaults to ``1e-5``.

    Attributes
    ----------
    mean : 1D array
        Mean for each feature.
    components : array-like
        ZCA components.

    Methods
    -------
    train(data)
        Train ZCA.
    transform(data)
        Transform input data.
    """
    regularization = NumberProperty(default=1e-5, minval=0)

    def __init__(self, regularization=1e-5, **options):
        self.regularization = regularization
        # Both attributes stay ``None`` until ``train`` is called
        self.mean = None
        self.components = None
        super(ZCA, self).__init__(**options)

    def fit(self, X, *args, **kwargs):
        """ Train on ``X`` and return the instance itself.
        """
        self.train(X, *args, **kwargs)
        return self

    def train(self, data):
        """ Estimate per-feature mean and the whitening components.
        """
        samples = as_array2d(data)
        self.mean = samples.mean(axis=0)
        centered = samples - self.mean

        # NOTE(review): normalizes by the number of features and not
        # by the number of samples - confirm that this is intended.
        n_features = centered.shape[1]
        sigma = np.dot(centered.T, centered) / n_features
        U, S, _ = np.linalg.svd(sigma)

        scaled_U = U / np.sqrt(S + self.regularization)
        self.components = scaled_U.dot(U.T)

    def transform(self, data):
        """ Apply the trained whitening and return the result in
        the same shape as the input.
        """
        not_trained = self.mean is None or self.components is None
        if not_trained:
            raise NotTrainedException("Train ZCA before use it.")

        original_shape = data.shape
        centered = as_array2d(data) - self.mean
        whitened = np.dot(centered, self.components.T)
        return whitened.reshape(original_shape)
Beispiel #12
0
class RMSProp(MinibatchGradientDescent):
    """ RMSProp algorithm.

    Parameters
    ----------
    decay : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.95``.
    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-5``.
    {MinibatchGradientDescent.batch_size}
    {GradientDescent.addons}
    {ConstructableNetwork.connection}
    {ConstructableNetwork.error}
    {BaseNetwork.step}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {SupervisedLearning.train}
    {BaseSkeleton.fit}
    {BaseNetwork.plot_errors}
    """
    decay = ProperFractionProperty(default=0.95)
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        """ Attach a zero-initialized squared-gradient accumulator
        to every parameter of every layer.
        """
        super(RMSProp, self).init_layers()
        all_parameters = (
            parameter
            for layer in self.layers
            for parameter in layer.parameters
        )
        for parameter in all_parameters:
            shape = T.shape(parameter).eval()
            parameter.prev_mean_squred_grad = theano.shared(
                name="prev_mean_squred_grad_" + parameter.name,
                value=asfloat(np.zeros(shape)),
            )

    def init_param_updates(self, layer, parameter):
        """ Build the RMSProp updates for a single parameter: one
        pair for the accumulator and one pair for the parameter.
        """
        running_average = parameter.prev_mean_squred_grad
        learning_rate = self.variables.step
        grad = T.grad(self.variables.error_func, wrt=parameter)

        decayed_history = self.decay * running_average
        fresh_contribution = (1 - self.decay) * grad**2
        new_running_average = decayed_history + fresh_contribution

        scaled_grad = grad / T.sqrt(new_running_average + self.epsilon)
        new_parameter = parameter - learning_rate * scaled_grad

        return [
            (running_average, new_running_average),
            (parameter, new_parameter),
        ]
Beispiel #13
0
class SearchThenConverge(SingleStep):
    """ Algorithm that minimizes learning step. Similar to
    :network:`SimpleStepMinimization`, but with more complicated
    step update rule.

    Parameters
    ----------
    epochs_step_minimizator : int
        The parameter controls the frequency reduction step with respect
        to epochs. Defaults to ``100`` epochs. Can't be less than ``1``.
        Less value mean that step decrease faster.
    rate_coefitient : float
        Second important parameter to control the rate of error reduction.
        Defaults to ``0.2``

    Attributes
    ----------
    {first_step}

    Warns
    -----
    {bp_depending}

    Examples
    --------
    >>> from neupy import algorithms
    >>>
    >>> bpnet = algorithms.Backpropagation(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     verbose=False,
    ...     optimizations=[algorithms.SearchThenConverge]
    ... )
    >>>

    See Also
    --------
    :network:`SimpleStepMinimization`
    """
    epochs_step_minimizator = NonNegativeIntProperty(min_size=1, default=100)
    rate_coefitient = NumberProperty(default=0.2)

    def after_weight_update(self, input_train, target_train):
        """ Recompute ``self.step`` from the current epoch after
        the parent's hook has been executed.
        """
        parent = super(SearchThenConverge, self)
        parent.after_weight_update(input_train, target_train)

        initial_step = self.first_step
        minimizator = self.epochs_step_minimizator

        epoch_ratio = self.epoch / minimizator
        rated = (self.rate_coefitient / initial_step) * epoch_ratio

        numerator = initial_step * (1 + rated)
        denominator = 1 + rated + minimizator * epoch_ratio**2
        self.step = numerator / denominator
Beispiel #14
0
class MaxNormRegularization(WeightUpdateConfigurable):
    """
    Max-norm regularization algorithm clips norm of the
    parameter when it exceeds the maximum limit.

    .. code-block:: python

        if norm(weight) > max_norm:
            weight = max_norm * weight / norm(weight)

    .. raw:: html

        <br>

    Warns
    -----
    {WeightUpdateConfigurable.Warns}

    Parameters
    ----------
    max_norm : int, float
        Any parameter that has norm greater than this value
        will be clipped. Defaults to ``10``.

    Examples
    --------
    >>> from neupy import algorithms
    >>> bpnet = algorithms.GradientDescent(
    ...     (2, 4, 1),
    ...     step=0.1,
    ...     max_norm=4,
    ...     addons=[algorithms.MaxNormRegularization]
    ... )

    References
    ----------
    [1] N. Srivastava, G. Hinton, A. Krizhevsky, I. Sutskever,
      R. Salakhutdinov. Dropout: A Simple Way to Prevent
      Neural Networks from Overfitting.
      http://jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf
    """
    max_norm = NumberProperty(default=10, minval=0)

    def init_param_updates(self, layer, parameter):
        """
        Take the parent's updates and replace the new value of
        ``parameter`` with its max-norm clipped version.
        """
        parent = super(MaxNormRegularization, self)
        new_values = dict(parent.init_param_updates(layer, parameter))

        # Only the update of the parameter itself gets clipped
        new_values[parameter] = max_norm_clip(
            new_values[parameter], self.max_norm)

        return list(new_values.items())
Beispiel #15
0
class Adagrad(MinibatchGradientDescent):
    """
    Adagrad algorithm.

    Parameters
    ----------
    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-5``.
    {MinibatchGradientDescent.Parameters}

    Attributes
    ----------
    {MinibatchGradientDescent.Attributes}

    Methods
    -------
    {MinibatchGradientDescent.Methods}
    """
    epsilon = NumberProperty(default=1e-5, minval=0)

    def init_layers(self):
        """
        Attach to every layer parameter a zero-initialized shared
        variable that accumulates squared gradients.
        """
        super(Adagrad, self).init_layers()
        for layer in self.layers:
            for parameter in layer.parameters:
                parameter_shape = T.shape(parameter).eval()
                parameter.prev_mean_squred_grad = theano.shared(
                    name="prev_mean_squred_grad_" + parameter.name,
                    value=asfloat(np.zeros(parameter_shape)),
                )

    def init_param_updates(self, layer, parameter):
        """
        Build the Adagrad updates for a single parameter.

        Returns a list with two ``(variable, new_value)`` pairs: the
        accumulated squared gradients and the parameter itself.
        """
        prev_mean_squred_grad = parameter.prev_mean_squred_grad
        step = self.variables.step

        gradient = T.grad(self.variables.error_func, wrt=parameter)

        # Sum of squared gradients over all updates made so far
        mean_squred_grad = prev_mean_squred_grad + gradient**2
        # Adagrad divides the gradient by the root of the accumulated
        # squared gradients. Multiplying (as it was before) makes the
        # effective step grow over time instead of shrinking.
        parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

        return [
            (prev_mean_squred_grad, mean_squred_grad),
            (parameter, parameter - step * parameter_delta),
        ]
Beispiel #16
0
class Relu(ActivationLayer):
    """
    Layer that applies the rectifier (ReLu) activation function.

    Parameters
    ----------
    alpha : float
        Slope that is used for the negative values. Non-zero
        ``alpha`` makes the layer behave like a leaky ReLu.
        Defaults to ``0``.

    {ActivationLayer.size}

    weight : array-like, Tensorfow variable, scalar or Initializer
        Defines layer's weights. Default initialization methods
        you can find :ref:`here <init-methods>`.
        Defaults to :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ActivationLayer.Methods}

    Attributes
    ----------
    {ActivationLayer.Attributes}
    """
    alpha = NumberProperty(default=0, minval=0)
    weight = ParameterProperty(default=init.HeNormal(gain=2))

    def activation_function(self, input_value):
        # Leaky variant is used only for the non-zero slope
        if self.alpha != 0:
            return tf.nn.leaky_relu(input_value, asfloat(self.alpha))
        return tf.nn.relu(input_value)
Beispiel #17
0
class WolfeLineSearchForStep(StepSelectionBuiltIn, Configurable):
    """
    Class that has all functions required in order to apply line search over
    step parameter that used during the network training.

    Parameters
    ----------
    wolfe_maxiter : int
        Controls maximum number of iteration during the line search that
        identifies optimal step size during the weight update stage.
        Defaults to ``20``.

    wolfe_c1 : float
        Parameter for Armijo condition rule. It's used during the line search
        that identifies optimal step size during the weight update stage.
        Defaults ``1e-4``.

    wolfe_c2 : float
        Parameter for curvature condition rule. It's used during the line
        search that identifies optimal step size during the weight update
        stage. Defaults ``0.9``.
    """
    wolfe_maxiter = IntProperty(default=20, minval=0)
    wolfe_c1 = NumberProperty(default=1e-4, minval=0)
    wolfe_c2 = NumberProperty(default=0.9, minval=0)

    def find_optimal_step(self, parameter_vector, parameter_update):
        """
        Run the line search along ``parameter_update`` starting from
        ``parameter_vector`` and return whatever ``line_search``
        produces for the configured Wolfe parameters.
        """
        network_inputs = self.variables.network_inputs
        network_output = self.variables.network_output
        layers_and_parameters = list(iter_parameters(self.layers))

        def prediction(step):
            step = asfloat(step)
            updated_params = parameter_vector + step * parameter_update

            try:
                # This trick allow us to replace shared variables
                # with tensorflow variables and get output from the
                # network
                start_pos = 0
                for layer, attrname, param in layers_and_parameters:
                    end_pos = start_pos + get_variable_size(param)
                    updated_param_value = tf.reshape(
                        updated_params[start_pos:end_pos], param.shape)
                    setattr(layer, attrname, updated_param_value)
                    start_pos = end_pos

                output = self.connection.output(*network_inputs)

            finally:
                # Restore previous parameters even when building the
                # output fails, otherwise layers would be left with
                # temporary tensors instead of their own variables.
                for layer, attrname, param in layers_and_parameters:
                    setattr(layer, attrname, param)

            return output

        def phi(step):
            # Error as a function of the step size
            return self.error(network_output, prediction(step))

        def derphi(step):
            # Derivative of the error with respect to the step size
            error_func = self.error(network_output, prediction(step))
            gradient, = tf.gradients(error_func, step)
            return gradient

        return line_search(phi, derphi, self.wolfe_maxiter, self.wolfe_c1,
                           self.wolfe_c2)
Beispiel #18
0
class BaseNetwork(BaseSkeleton):
    """
    Base class for Neural Network algorithms.

    Parameters
    ----------
    step : float
        Learning rate, defaults to ``0.1``.

    show_epoch : int
        This property controls how often the network will display
        information about training. It has to be defined as positive
        integer. For instance, number ``100`` mean that network shows
        summary at 1st, 100th, 200th, 300th ... and last epochs.

        Defaults to ``1``.

    shuffle_data : bool
        If it's ``True`` than training data will be shuffled before
        the training. Defaults to ``False``.

    signals : dict, list or function
        Function that will be triggered after certain events during
        the training.

    {Verbose.Parameters}

    Methods
    -------
    {BaseSkeleton.fit}

    predict(X)
        Propagates input ``X`` through the network and
        returns produced output.

    plot_errors(logx=False, show=True, **figkwargs)
        Using errors collected during the training this method
        generates plot that can give additional insight into the
        performance reached during the training.

    Attributes
    ----------
    errors : list
        Information about errors. It has two main attributes, namely
        ``train`` and ``valid``. These attributes provide access to
        the training and validation errors respectively.

    last_epoch : int
        Value equals to the last trained epoch. After initialization
        it is equal to ``0``.

    n_updates_made : int
        Number of training updates applied to the network.
    """
    step = NumberProperty(default=0.1, minval=0)
    show_epoch = IntProperty(minval=1, default=1)
    shuffle_data = Property(default=False, expected_type=bool)
    signals = Property(expected_type=object)

    def __init__(self, *args, **options):
        super(BaseNetwork, self).__init__(*args, **options)

        self.last_epoch = 0
        self.n_updates_made = 0
        self.errors = base_signals.ErrorCollector()

        # Built-in signals come first, user-defined ones are appended
        signals = list(
            as_tuple(
                base_signals.ProgressbarSignal(),
                base_signals.PrintLastErrorSignal(),
                self.errors,
                self.signals,
            ))

        # Plain functions are wrapped into an epoch-end signal and
        # signal classes are instantiated without arguments.
        for i, signal in enumerate(signals):
            if inspect.isfunction(signal):
                signals[i] = base_signals.EpochEndSignal(signal)

            elif inspect.isclass(signal):
                signals[i] = signal()

        self.events = Events(network=self, signals=signals)

    def one_training_update(self, X_train, y_train=None):
        """
        Apply a single training update using one batch of data.
        Has to be implemented by a subclass; ``train`` reports the
        returned value as the training error of the batch.

        Parameters
        ----------
        X_train : array-like
            Input batch.

        y_train : array-like or None
            Target batch. Defaults to ``None``.

        Raises
        ------
        NotImplementedError
            Always, unless the method is overridden.
        """
        raise NotImplementedError()

    def score(self, X, y):
        """
        Evaluate the network on ``X`` and ``y``. Has to be implemented
        by a subclass; ``train`` reports the returned value as the
        validation error.
        """
        raise NotImplementedError()

    def plot_errors(self, logx=False, show=True, **figkwargs):
        """
        Delegate to ``plot_optimizer_errors`` for this optimizer and
        return its result.
        """
        return plot_optimizer_errors(optimizer=self,
                                     logx=logx,
                                     show=show,
                                     **figkwargs)

    def train(self,
              X_train,
              y_train=None,
              X_test=None,
              y_test=None,
              epochs=100,
              batch_size=None):
        """
        Method train neural network.

        Parameters
        ----------
        X_train : array-like
        y_train : array-like or None
        X_test : array-like or None
        y_test : array-like or None

        epochs : int
            Defaults to ``100``.

        batch_size : int or None
            When it's not specified the ``batch_size`` attribute of
            the instance is used, if there is one.
            Defaults to ``None``.

        Raises
        ------
        ValueError
            When ``epochs`` is not a positive number.
        """
        if epochs <= 0:
            raise ValueError("Number of epochs needs to be a positive number")

        epochs = int(epochs)
        # Training continues from the epoch where previous call stopped
        first_epoch = self.last_epoch + 1
        batch_size = batch_size or getattr(self, 'batch_size', None)

        self.events.trigger(
            name='train_start',
            X_train=X_train,
            y_train=y_train,
            epochs=epochs,
            batch_size=batch_size,
            store_data=False,
        )

        try:
            for epoch in range(first_epoch, first_epoch + epochs):
                self.events.trigger('epoch_start')

                self.last_epoch = epoch
                iterator = iters.minibatches(
                    (X_train, y_train),
                    batch_size,
                    self.shuffle_data,
                )

                for X_batch, y_batch in iterator:
                    self.events.trigger('update_start')
                    update_start_time = time.time()

                    train_error = self.one_training_update(X_batch, y_batch)
                    self.n_updates_made += 1

                    self.events.trigger(
                        name='train_error',
                        value=train_error,
                        eta=time.time() - update_start_time,
                        epoch=epoch,
                        n_updates=self.n_updates_made,
                        n_samples=iters.count_samples(X_batch),
                        store_data=True,
                    )
                    self.events.trigger('update_end')

                # Validation runs once per epoch and only when test
                # inputs were provided
                if X_test is not None:
                    test_start_time = time.time()
                    validation_error = self.score(X_test, y_test)
                    self.events.trigger(
                        name='valid_error',
                        value=validation_error,
                        eta=time.time() - test_start_time,
                        epoch=epoch,
                        n_updates=self.n_updates_made,
                        n_samples=iters.count_samples(X_test),
                        store_data=True,
                    )

                self.events.trigger('epoch_end')

        except StopTraining as err:
            # ``StopTraining`` ends the training early: it is logged
            # and not propagated to the caller.
            self.logs.message(
                "TRAIN",
                "Epoch #{} was stopped. Message: {}".format(epoch, str(err)))

        self.events.trigger('train_end')
Beispiel #19
0
class ConjugateGradient(WolfeLineSearchForStep, BaseOptimizer):
    """
    Conjugate Gradient algorithm.

    On every update the new search direction is built from the current
    negative gradient and the previous direction scaled by a coefficient
    that ``update_function`` returns. Step size along that direction
    is obtained from the ``find_optimal_step`` method, which is why
    the ``step`` parameter is withdrawn from this class.

    Parameters
    ----------
    update_function : ``fletcher_reeves``, ``polak_ribiere``,\
    ``hentenes_stiefel``, ``dai_yuan``, ``liu_storey``
        Function that computes coefficient for the previous
        direction. Defaults to ``fletcher_reeves``.

    epsilon : float
        Small constant passed to the ``update_function``. It keeps
        the division stable when denominator is a very small number.
        Defaults to ``1e-7``.

    {WolfeLineSearchForStep.Parameters}

    {BaseOptimizer.network}

    {BaseOptimizer.loss}

    {BaseOptimizer.show_epoch}

    {BaseOptimizer.shuffle_data}

    {BaseOptimizer.signals}

    {BaseOptimizer.verbose}

    {BaseOptimizer.regularizer}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Examples
    --------
    >>> from sklearn import datasets, preprocessing
    >>> from sklearn.model_selection import train_test_split
    >>> from neupy import algorithms, layers
    >>>
    >>> dataset = datasets.load_boston()
    >>> data, target = dataset.data, dataset.target
    >>>
    >>> data_scaler = preprocessing.MinMaxScaler()
    >>> target_scaler = preprocessing.MinMaxScaler()
    >>>
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     data_scaler.fit_transform(data),
    ...     target_scaler.fit_transform(target),
    ...     test_size=0.15
    ... )
    >>>
    >>> cgnet = algorithms.ConjugateGradient(
    ...     network=[
    ...         layers.Input(13),
    ...         layers.Sigmoid(50),
    ...         layers.Sigmoid(1),
    ...     ],
    ...     update_function='fletcher_reeves',
    ...     verbose=False
    ... )
    >>>
    >>> cgnet.train(x_train, y_train, epochs=100)
    >>> y_predict = cgnet.predict(x_test).round(1)
    >>>
    >>> real = target_scaler.inverse_transform(y_test)
    >>> predicted = target_scaler.inverse_transform(y_predict)

    References
    ----------
    [1] Jorge Nocedal, Stephen J. Wright, Numerical Optimization.
        Chapter 5, Conjugate Gradient Methods, p. 101-133
    """
    epsilon = NumberProperty(default=1e-7, minval=0)
    update_function = ChoiceProperty(
        default='fletcher_reeves',
        choices={
            'fletcher_reeves': fletcher_reeves,
            'polak_ribiere': polak_ribiere,
            'hentenes_stiefel': hentenes_stiefel,
            'liu_storey': liu_storey,
            'dai_yuan': dai_yuan,
        }
    )
    step = WithdrawProperty()

    def init_functions(self):
        """
        Register the state that has to survive between updates
        (previous direction, previous gradient and update counter)
        and then delegate the rest of initialization to the parent.
        """
        vector_size = [self.network.n_parameters]

        prev_delta = tf.Variable(
            tf.zeros(vector_size),
            name="conj-grad/prev-delta",
            dtype=tf.float32,
        )
        prev_gradient = tf.Variable(
            tf.zeros(vector_size),
            name="conj-grad/prev-gradient",
            dtype=tf.float32,
        )
        iteration = tf.Variable(
            asfloat(self.last_epoch),
            name='conj-grad/current-iteration',
            dtype=tf.float32
        )

        self.variables.update(
            prev_delta=prev_delta,
            prev_gradient=prev_gradient,
            iteration=iteration,
        )
        super(ConjugateGradient, self).init_functions()

    def init_train_updates(self):
        """
        Build the list of update operations for one training step.

        Returns
        -------
        list
            Parameter updates followed by the assignments that
            store current gradient, current direction and the
            incremented update counter.
        """
        state = self.variables
        step_counter = state.iteration
        last_delta = state.prev_delta
        last_gradient = state.prev_gradient

        trainable = [
            variable for variable in self.network.variables.values()
            if variable.trainable
        ]
        flat_parameters = make_single_vector(trainable)
        flat_gradient = make_single_vector(
            tf.gradients(state.loss, trainable))

        beta = self.update_function(
            last_gradient, flat_gradient, last_delta, self.epsilon)

        # Every ``n_parameters`` updates the direction is reset to
        # the plain steepest descent direction.
        steepest_descent = -flat_gradient
        is_restart = tf.equal(
            tf.mod(step_counter, self.network.n_parameters), 0)
        direction = tf.where(
            is_restart,
            steepest_descent,
            steepest_descent + beta * last_delta,
        )

        step = self.find_optimal_step(flat_parameters, direction)
        updates = setup_parameter_updates(
            trainable, flat_parameters + step * direction)

        # Gradient and direction have to be evaluated before the
        # state gets overwritten. Tensorflow executes operations in
        # parallel and without explicit dependency the stored
        # "previous" gradient could end up equal to the current one.
        with tf.control_dependencies([flat_gradient, direction]):
            updates.extend([
                last_gradient.assign(flat_gradient),
                last_delta.assign(direction),
                step_counter.assign(step_counter + 1),
            ])

        return updates
Beispiel #20
0
class Adamax(MinibatchGradientDescent):
    """ AdaMax algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate for the first moment estimate. Value need to be
        between ``0`` and ``1``. Defaults to ``0.9``.
    beta2 : float
        Decay rate for the weighted infinity norm. Value need to be
        between ``0`` and ``1``. Defaults to ``0.999``.
    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-8``.
    step : float
        Learning rate, defaults to ``0.001``.
    {MinibatchGradientDescent.batch_size}
    {GradientDescent.addons}
    {ConstructableNetwork.connection}
    {ConstructableNetwork.error}
    {BaseNetwork.show_epoch}
    {BaseNetwork.shuffle_data}
    {BaseNetwork.epoch_end_signal}
    {BaseNetwork.train_end_signal}
    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}
    {SupervisedLearning.train}
    {BaseSkeleton.fit}
    {BaseNetwork.plot_errors}
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-8, minval=0)

    def init_layers(self):
        """
        Attach two zero-filled shared variables to every layer
        parameter. They store the first moment and the weighted
        infinity norm from the previous update.
        """
        super(Adamax, self).init_layers()

        state_names = ("prev_first_moment", "prev_weighted_inf_norm")

        for layer in self.layers:
            for param in layer.parameters:
                shape = T.shape(param).eval()

                for state_name in state_names:
                    shared_state = theano.shared(
                        name=state_name + "_" + param.name,
                        value=asfloat(np.zeros(shape)),
                    )
                    setattr(param, state_name, shared_state)

    def init_param_updates(self, layer, parameter):
        """
        Build update rules for a single parameter.

        Returns
        -------
        list of tuples
            Pairs ``(shared variable, new value)`` for the first
            moment, the weighted infinity norm and the parameter.
        """
        old_moment = parameter.prev_first_moment
        old_inf_norm = parameter.prev_weighted_inf_norm

        grad = T.grad(self.variables.error_func, wrt=parameter)

        new_moment = self.beta1 * old_moment + (1 - self.beta1) * grad
        new_inf_norm = T.maximum(self.beta2 * old_inf_norm, T.abs_(grad))

        # Scaling factor that depends on the current epoch
        correction = 1 / (1 - self.beta1**self.variables.epoch)
        delta = correction * (new_moment / (new_inf_norm + self.epsilon))

        new_parameter = parameter - self.variables.step * delta

        return [
            (old_moment, new_moment),
            (old_inf_norm, new_inf_norm),
            (parameter, new_parameter),
        ]
Beispiel #21
0
class LVQ(BaseNetwork):
    """
    Learning Vector Quantization (LVQ) algorithm.

    Notes
    -----
    - Input data needs to be normalized, because LVQ uses
      Euclidean distance to find clusters.

    - Training error is just a ratio of misclassified
      samples

    Parameters
    ----------
    n_inputs : int
        Number of input units. It should be equal to the
        number of features in the input data set.

    n_subclasses : int, None
        Defines total number of subclasses. Values should be greater
        or equal to the number of classes. ``None`` will set up number
        of subclasses equal to the number of classes. Defaults to ``None``
        (or the same as ``n_classes``).

    n_classes : int
        Number of classes in the data set.

    prototypes_per_class : list, None
        Defines number of prototypes per each class. For instance,
        if ``n_classes=3`` and ``n_subclasses=8`` then there
        can be 3 subclasses for the first class, 3 for the second one
        and 2 for the third one (3 + 3 + 2 == 8). The following example
        can be specified as ``prototypes_per_class=[3, 3, 2]``.

        There are two rules that apply to this parameter:

        1. ``sum(prototypes_per_class) == n_subclasses``

        2. ``len(prototypes_per_class) == n_classes``

        The ``None`` value will distribute approximately equal
        number of subclasses per each class. It's approximately,
        because in cases when ``n_subclasses % n_classes != 0``
        there is no way to distribute equal number of subclasses
        per each class.

        Defaults to ``None``.

    {BaseNetwork.step}

    n_updates_to_stepdrop : int or None
        If this option is not equal to ``None`` then after every
        update LVQ reduces step size and does it until number of
        applied updates reaches the ``n_updates_to_stepdrop``
        value. The minimum possible step size is defined in the
        ``minstep`` parameter.

        Be aware that number of updates is not the same as number
        of epochs. LVQ applies update after each propagated sample
        through the network. Relations between this parameter and
        maximum number of epochs is following

        .. code-block:: python

            n_updates_to_stepdrop = n_samples * n_max_epochs

        If parameter equal to ``None`` then step size wouldn't be
        reduced after each update.

        Defaults to ``None``.

    minstep : float
        Step size would never be lower than this value. This
        property useful only in case if ``n_updates_to_stepdrop``
        is not ``None``. Defaults to ``1e-5``.

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Methods
    -------
    {BaseSkeleton.predict}

    {BaseSkeleton.fit}
    """
    n_inputs = IntProperty(minval=1)
    n_subclasses = IntProperty(minval=2, default=None, allow_none=True)
    n_classes = IntProperty(minval=2)

    prototypes_per_class = TypedListProperty(allow_none=True, default=None)
    weight = Property(expected_type=(np.ndarray, init.Initializer),
                      allow_none=True, default=None)

    n_updates_to_stepdrop = IntProperty(default=None, allow_none=True,
                                        minval=1)
    minstep = NumberProperty(minval=0, default=1e-5)

    def __init__(self, **options):
        """
        Validate configuration and build the mapping from
        subclass (prototype) index to class index.

        Raises
        ------
        ValueError
            When there are fewer subclasses than classes or
            when ``prototypes_per_class`` doesn't agree with
            ``n_classes`` / ``n_subclasses``.
        """
        self.initialized = False
        super(LVQ, self).__init__(**options)

        # Number of single-sample updates applied so far
        self.n_updates = 0

        if self.n_subclasses is None:
            self.n_subclasses = self.n_classes

        if isinstance(self.weight, init.Initializer):
            # One row per prototype. The same layout is produced by
            # the ``train`` method and it's expected by the
            # ``weight[winner_subclass, :]`` indexing in ``train_epoch``.
            weight_shape = (self.n_subclasses, self.n_inputs)
            self.weight = self.weight.sample(weight_shape)

        if self.weight is not None:
            self.initialized = True

        if self.n_subclasses < self.n_classes:
            raise ValueError("Number of subclasses should be greater "
                             "or equal to the number of classes. Network "
                             "was defined with {} subclasses and {} classes"
                             "".format(self.n_subclasses, self.n_classes))

        if self.prototypes_per_class is None:
            whole, reminder = divmod(self.n_subclasses, self.n_classes)
            self.prototypes_per_class = [whole] * self.n_classes

            if reminder:
                # Since we have reminder left, it means that we cannot
                # have an equal number of subclasses per each class,
                # therefore we will add +1 to randomly selected classes.
                class_indices = np.random.choice(self.n_classes, reminder,
                                                 replace=False)

                for class_index in class_indices:
                    self.prototypes_per_class[class_index] += 1

        if len(self.prototypes_per_class) != self.n_classes:
            raise ValueError("LVQ defined for classification problem that has "
                             "{} classes, but the `prototypes_per_class` "
                             "variable has defined data for {} classes."
                             "".format(self.n_classes,
                                       len(self.prototypes_per_class)))

        if sum(self.prototypes_per_class) != self.n_subclasses:
            raise ValueError("Invalid distribution of subclasses for the "
                             "`prototypes_per_class` variable. Got total "
                             "of {} subclasses ({}) instead of {} expected"
                             "".format(sum(self.prototypes_per_class),
                                       self.prototypes_per_class,
                                       self.n_subclasses))

        # subclass_to_class[i] is the class of the i-th prototype
        self.subclass_to_class = []
        for class_id, n_prototypes in enumerate(self.prototypes_per_class):
            self.subclass_to_class.extend([class_id] * n_prototypes)

    @property
    def training_step(self):
        """
        Step size for the next update.

        Equal to ``step`` when ``n_updates_to_stepdrop`` is ``None``.
        Otherwise it decreases linearly from ``step`` to ``minstep``
        during the first ``n_updates_to_stepdrop`` updates and
        stays at ``minstep`` afterwards.
        """
        if self.n_updates_to_stepdrop is None:
            return self.step

        # Explicit float division, so the result doesn't depend on
        # the division semantics of the interpreter
        progress = self.n_updates / float(self.n_updates_to_stepdrop)

        # Without the lower bound the ratio becomes negative as soon as
        # number of updates exceeds ``n_updates_to_stepdrop`` and the
        # step drops below ``minstep`` (eventually below zero).
        updates_ratio = max(0., 1 - progress)
        return self.minstep + (self.step - self.minstep) * updates_ratio

    def predict(self, input_data):
        """
        Predict class for every sample in the input data.

        Returns
        -------
        numpy.ndarray
            Class of the closest prototype per each sample.

        Raises
        ------
        NotTrained
            When network has no weights yet.
        """
        if not self.initialized:
            raise NotTrained("LVQ network hasn't been trained yet")

        input_data = format_data(input_data)
        subclass_to_class = self.subclass_to_class
        weight = self.weight

        predictions = []
        for input_row in input_data:
            output = euclid_distance(input_row, weight)
            winner_subclass = int(output.argmin(axis=1))

            predicted_class = subclass_to_class[winner_subclass]
            predictions.append(predicted_class)

        return np.array(predictions)

    def train(self, input_train, target_train, *args, **kwargs):
        """
        Train network. When weights are not defined yet, prototypes
        are initialized with randomly selected training samples
        from the matching class.

        Raises
        ------
        ValueError
            When there are not enough training samples or target
            classes are not integers from ``0`` to ``n_classes - 1``.
        """
        input_train = format_data(input_train)
        target_train = format_data(target_train)

        n_input_samples = len(input_train)

        if n_input_samples <= self.n_subclasses:
            raise ValueError("Number of training input samples should be "
                             "greater than number of sublcasses. Training "
                             "method recived {} input samples."
                             "".format(n_input_samples))

        if not self.initialized:
            # The ``np.int`` alias has been removed from numpy,
            # builtin ``int`` is an exact replacement for it.
            target_classes = sorted(np.unique(target_train).astype(int))
            expected_classes = list(range(self.n_classes))

            if target_classes != expected_classes:
                raise ValueError("All classes should be integers from the "
                                 "range [0, {}], but got the following "
                                 "classes instead {}".format(
                                    self.n_classes - 1, target_classes))

            weights = []
            iterator = zip(target_classes, self.prototypes_per_class)
            for target_class, n_prototypes in iterator:
                is_valid_class = (target_train[:, 0] == target_class)
                is_valid_class = is_valid_class.astype('float64')
                n_samples_per_class = is_valid_class.sum()

                if n_samples_per_class <= n_prototypes:
                    raise ValueError("Input data has {0} samples for class-{1}"
                                     ". Number of samples per specified "
                                     "class-{1} should be greater than {2}."
                                     "".format(n_samples_per_class,
                                               target_class, n_prototypes))

                # Uniform distribution over samples of this class and
                # zero probability for samples from other classes
                sample_probability = is_valid_class / n_samples_per_class

                class_weight_indices = np.random.choice(
                    np.arange(n_input_samples), n_prototypes,
                    replace=False, p=sample_probability)

                class_weight = input_train[class_weight_indices]
                weights.extend(class_weight)

            self.weight = np.array(weights)
            self.initialized = True

        super(LVQ, self).train(input_train, target_train, *args, **kwargs)

    def train_epoch(self, input_train, target_train):
        """
        Run one pass over the training data, updating the closest
        prototype after every sample.

        Returns
        -------
        Ratio of samples for which the closest prototype, before
        its update, belonged to a wrong class.
        """
        weight = self.weight
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(input_train, target_train):
            step = self.training_step
            output = euclid_distance(input_row, weight)
            winner_subclass = int(output.argmin())
            predicted_class = subclass_to_class[winner_subclass]

            weight_update = input_row - weight[winner_subclass, :]
            is_correct_prediction = (predicted_class == target)

            # Prototype moves towards the sample when its class is
            # correct and away from the sample otherwise
            if is_correct_prediction:
                weight[winner_subclass, :] += step * weight_update
            else:
                weight[winner_subclass, :] -= step * weight_update

            n_correct_predictions += is_correct_prediction
            self.n_updates += 1

        n_samples = len(input_train)
        return 1 - n_correct_predictions / n_samples
Beispiel #22
0
class LVQ3(LVQ21):
    """
    Learning Vector Quantization 3 (LVQ3) algorithm.
    Improved version for the LVQ2.1 algorithm.

    Parameters
    ----------
    {LVQ.n_inputs}

    {LVQ.n_subclasses}

    {LVQ.n_classes}

    {LVQ.prototypes_per_class}

    {LVQ2.epsilon}

    slowdown_rate : float
        Parameter scales learning step in order to decrease it
        in case if the two closest subclasses predict target
        value correctly. Defaults to ``0.4``.

    step : float
        Learning rate, defaults to ``0.01``.

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Notes
    -----
    {LVQ21.Notes}
    - Decreasing step and increasing number of training epochs
      can improve the performance.
    """
    step = NumberProperty(minval=0, default=0.01)
    slowdown_rate = NumberProperty(minval=0, default=0.4)

    def train_epoch(self, input_train, target_train):
        """
        Run one pass over the training data. After every sample the
        two closest prototypes are considered for an update.

        Returns
        -------
        Ratio of samples for which the closest prototype, before
        its update, belonged to a wrong class.
        """
        weight = self.weight
        epsilon = self.epsilon
        slowdown_rate = self.slowdown_rate
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(input_train, target_train):
            step = self.training_step
            output = euclid_distance(input_row, weight)
            winner_subclasses = n_argmin(output, n=2, axis=1)

            top1_subclass, top2_subclass = winner_subclasses
            top1_class = subclass_to_class[top1_subclass]
            top2_class = subclass_to_class[top2_subclass]

            # Rows of the weight matrix are indexed by subclass
            # (prototype), not by class. Using class index for the
            # runner-up would read weights of an unrelated prototype.
            top1_weight_update = input_row - weight[top1_subclass, :]
            top2_weight_update = input_row - weight[top2_subclass, :]

            is_first_correct = (top1_class == target)
            is_second_correct = (top2_class == target)

            closest_dist, runner_up_dist = output[0, winner_subclasses]
            double_update_condition_satisfied = (
                (
                    (is_first_correct and not is_second_correct) or
                    (is_second_correct and not is_first_correct)
                ) and
                closest_dist > ((1 - epsilon) * runner_up_dist) and
                runner_up_dist < ((1 + epsilon) * closest_dist)
            )
            two_closest_correct_condition_satisfied = (
                is_first_correct and is_second_correct and
                closest_dist > ((1 - epsilon) * (1 + epsilon) * runner_up_dist)
            )

            if double_update_condition_satisfied:
                # Exactly one of the two prototypes is correct: attract
                # the correct one and repel the other one.
                if is_first_correct:
                    weight[top1_subclass, :] += step * top1_weight_update
                    weight[top2_subclass, :] -= step * top2_weight_update
                else:
                    weight[top1_subclass, :] -= step * top1_weight_update
                    weight[top2_subclass, :] += step * top2_weight_update

            elif two_closest_correct_condition_satisfied:
                # Both prototypes are correct: attract both of them
                # with a reduced step.
                beta = step * slowdown_rate

                weight[top1_subclass, :] += beta * top1_weight_update
                weight[top2_subclass, :] += beta * top2_weight_update

            else:
                # NOTE(review): this branch also runs when the closest
                # prototype is correct, but none of the conditions above
                # holds, which means that a correct winner gets pushed
                # away from the sample. Behaviour is kept as is, confirm
                # that it's intended.
                weight[top1_subclass, :] -= step * top1_weight_update

            n_correct_predictions += is_first_correct
            self.n_updates += 1

        n_samples = len(input_train)
        return 1 - n_correct_predictions / n_samples
Beispiel #23
0
class GRU(BaseRNNLayer):
    """
    Gated Recurrent Unit (GRU) Layer.

    Parameters
    ----------
    {BaseRNNLayer.size}

    weights : dict or Initializer
        Weight parameters for different gates.
        Defaults to :class:`XavierUniform() <neupy.init.XavierUniform>`.

        - In case if application requires the same initialization method
          for all weights, then it's possible to specify initialization
          method that would be automatically applied to all weight
          parameters in the GRU layer.

          .. code-block:: python

              layers.GRU(2, weights=init.Normal(0.1))

        - In case if application requires different initialization
          values for different weights then it's possible to specify
          an exact weight by name.

          .. code-block:: python

              dict(
                  weight_in_to_updategate=init.XavierUniform(),
                  weight_hid_to_updategate=init.XavierUniform(),

                  weight_in_to_resetgate=init.XavierUniform(),
                  weight_hid_to_resetgate=init.XavierUniform(),

                  weight_in_to_hidden_update=init.XavierUniform(),
                  weight_hid_to_hidden_update=init.XavierUniform(),
              )

          If application requires modification to only one (or multiple)
          parameter then it's better to specify the one that you need to
          modify and ignore other parameters

          .. code-block:: python

              dict(weight_in_to_updategate=init.Normal(0.1))

          Other parameters like ``weight_in_to_resetgate`` will be
          equal to their default values.

    biases : dict or Initializer
        Bias parameters for different gates.
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

        - In case if application requires the same initialization method
          for all biases, then it's possible to specify initialization
          method that would be automatically applied to all bias parameters
          in the GRU layer.

          .. code-block:: python

              layers.GRU(2, biases=init.Constant(1))

        - In case if application requires different initialization
          values for different biases then it's possible to specify
          an exact bias by name.

          .. code-block:: python

              dict(
                  bias_updategate=init.Constant(0),
                  bias_resetgate=init.Constant(0),
                  bias_hidden_update=init.Constant(0),
              )

          If application requires modification to only one (or multiple)
          parameter then it's better to specify the one that you need to
          modify and ignore other parameters

          .. code-block:: python

              dict(bias_resetgate=init.Constant(1))

          Other parameters like ``bias_updategate`` will be
          equal to their default values.

    activation_functions : dict, callable
        Activation functions for different gates. Defaults to:

        .. code-block:: python

            # import theano.tensor as T
            dict(
                resetgate=T.nnet.sigmoid,
                updategate=T.nnet.sigmoid,
                hidden_update=T.tanh,
            )

        If application requires modification to only one parameter
        then it's better to specify the one that you need to modify
        and ignore other parameters

        .. code-block:: python

            dict(resetgate=T.tanh)

        Other parameters like ``updategate`` or ``hidden_update``
        will be equal to their default values.

    learn_init : bool
        If ``True``, make ``hid_init`` trainable variable.
        Defaults to ``False``.

    hid_init : array-like, Theano variable, scalar or Initializer
        Initializer for initial hidden state (:math:`h_0`).
        Defaults to :class:`Constant(0) <neupy.init.Constant>`.

    {BaseRNNLayer.only_return_final}

    backwards : bool
        If ``True``, process the sequence backwards and then reverse the
        output again such that the output from the layer is always
        from :math:`x_1` to :math:`x_n`. Defaults to ``False``.

    precompute_input : bool
        if ``True``, precompute ``input_to_hid`` before iterating
        through the sequence. This can result in a speed up at the
        expense of an increase in memory usage.
        Defaults to ``True``.

    unroll_scan : bool
        If ``True`` the recursion is unrolled instead of using scan.
        For some graphs this gives a significant speed up but it
        might also consume more memory. When ``unroll_scan=True``,
        backpropagation always includes the full sequence, so
        ``n_gradient_steps`` must be set to ``-1`` and the input
        sequence length must be known at compile time (i.e.,
        cannot be given as ``None``). Defaults to ``False``.

    {BaseLayer.Parameters}

    Notes
    -----
    Code was adapted from the
    `Lasagne <https://github.com/Lasagne/Lasagne>`_ library.

    Examples
    --------

    Sequence classification

    .. code-block:: python

        from neupy import layers, algorithms

        n_time_steps = 40
        n_categories = 20
        embedded_size = 10

        network = algorithms.RMSProp(
            [
                layers.Input(n_time_steps),
                layers.Embedding(n_categories, embedded_size),
                layers.GRU(20),
                layers.Sigmoid(1),
            ]
        )
    """
    weights = MultiParameterProperty(
        default=dict(
            weight_in_to_updategate=init.XavierUniform(),
            weight_hid_to_updategate=init.XavierUniform(),

            weight_in_to_resetgate=init.XavierUniform(),
            weight_hid_to_resetgate=init.XavierUniform(),

            weight_in_to_hidden_update=init.XavierUniform(),
            weight_hid_to_hidden_update=init.XavierUniform(),
        ))
    biases = MultiParameterProperty(
        default=dict(
            bias_updategate=init.Constant(0),
            bias_resetgate=init.Constant(0),
            bias_hidden_update=init.Constant(0),
        ))
    activation_functions = MultiCallableProperty(
        default=dict(
            resetgate=T.nnet.sigmoid,
            updategate=T.nnet.sigmoid,
            hidden_update=T.tanh,
        ))

    learn_init = Property(default=False, expected_type=bool)
    hid_init = ParameterProperty(default=init.Constant(0))

    backwards = Property(default=False, expected_type=bool)
    unroll_scan = Property(default=False, expected_type=bool)
    precompute_input = Property(default=True, expected_type=bool)

    n_gradient_steps = IntProperty(default=-1)
    gradient_clipping = NumberProperty(default=0, minval=0)

    def initialize(self):
        """
        Create parameters of the layer: input-to-gate weights,
        hidden-to-gate weights and biases for the update gate,
        reset gate and hidden update, plus the initial hidden state.
        """
        super(GRU, self).initialize()

        n_inputs = np.prod(self.input_shape[1:])
        weights = self.weights
        biases = self.biases

        # Update gate parameters
        self.weight_in_to_updategate = self.add_parameter(
            value=weights.weight_in_to_updategate,
            name='weight_in_to_updategate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_updategate = self.add_parameter(
            value=weights.weight_hid_to_updategate,
            name='weight_hid_to_updategate',
            shape=(self.size, self.size))
        self.bias_updategate = self.add_parameter(
            value=biases.bias_updategate, name='bias_updategate',
            shape=(self.size,))

        # Reset gate parameters
        self.weight_in_to_resetgate = self.add_parameter(
            value=weights.weight_in_to_resetgate,
            name='weight_in_to_resetgate',
            shape=(n_inputs, self.size))
        self.weight_hid_to_resetgate = self.add_parameter(
            value=weights.weight_hid_to_resetgate,
            name='weight_hid_to_resetgate',
            shape=(self.size, self.size))
        # NOTE(review): parameter name says "forgetgate" while every
        # other reference calls it "resetgate". The name has been kept,
        # because it may be used as a key for stored parameters.
        # Confirm before renaming it to 'bias_resetgate'.
        self.bias_resetgate = self.add_parameter(
            value=biases.bias_resetgate, name='bias_forgetgate',
            shape=(self.size,))

        # Hidden update gate parameters
        self.weight_in_to_hidden_update = self.add_parameter(
            value=weights.weight_in_to_hidden_update,
            name='weight_in_to_hidden_update',
            shape=(n_inputs, self.size))
        self.weight_hid_to_hidden_update = self.add_parameter(
            value=weights.weight_hid_to_hidden_update,
            name='weight_hid_to_hidden_update',
            shape=(self.size, self.size))
        self.bias_hidden_update = self.add_parameter(
            value=biases.bias_hidden_update, name='bias_hidden_update',
            shape=(self.size,))

        self.add_parameter(value=self.hid_init, shape=(1, self.size),
                           name="hid_init", trainable=self.learn_init)

    def output(self, input_value):
        """
        Build symbolic output of the layer.

        Parameters
        ----------
        input_value : Theano variable
            Input sequences. Dimensions after the second one are
            flattened into a single feature dimension.

        Returns
        -------
        Theano variable
            Hidden state from the last computed step when
            ``only_return_final=True``, otherwise hidden states
            for all time steps with the batch dimension first.
        """
        # Treat all dimensions after the second as flattened
        # feature dimensions
        if input_value.ndim > 3:
            input_value = T.flatten(input_value, 3)

        # Because scan iterates over the first dimension we
        # dimshuffle to (n_time_steps, n_batch, n_features)
        input_value = input_value.dimshuffle(1, 0, 2)
        seq_len, n_batch, _ = input_value.shape

        # Stack input weight matrices into a (num_inputs, 3 * num_units)
        # matrix, which speeds up computation. Blocks are ordered as
        # (reset gate, update gate, hidden update), because the step
        # function reads them with the slice indices 0, 1 and 2
        # respectively. Any other order silently attaches parameters
        # of one gate to the activation function of another one.
        weight_in_stacked = T.concatenate([
            self.weight_in_to_resetgate,
            self.weight_in_to_updategate,
            self.weight_in_to_hidden_update], axis=1)

        # Same for hidden weight matrices
        weight_hid_stacked = T.concatenate([
            self.weight_hid_to_resetgate,
            self.weight_hid_to_updategate,
            self.weight_hid_to_hidden_update], axis=1)

        # Stack biases into a (3 * num_units) vector, same order
        bias_stacked = T.concatenate([
            self.bias_resetgate,
            self.bias_updategate,
            self.bias_hidden_update], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the inputs dot weight matrices before scanning.
            # weight_in_stacked is (n_features, 3 * num_units).
            # Input: (n_time_steps, n_batch, 3 * num_units).
            input_value = T.dot(input_value, weight_in_stacked) + bias_stacked

        # When theano.scan calls step, input_n will be
        # (n_batch, 3 * num_units). We define a slicing function
        # that extract the input to each GRU gate
        def slice_w(x, n):
            s = x[:, n * self.size:(n + 1) * self.size]
            if self.size == 1:
                s = T.addbroadcast(s, 1)  # Theano cannot infer this by itself
            return s

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def one_gru_step(input_n, hid_previous, *args):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1},
            # and W_{hc} h_{t - 1}
            hid_input = T.dot(hid_previous, weight_hid_stacked)

            if self.gradient_clipping:
                input_n = theano.gradient.grad_clip(
                    input_n,
                    -self.gradient_clipping,
                    self.gradient_clipping)

                hid_input = theano.gradient.grad_clip(
                    hid_input,
                    -self.gradient_clipping,
                    self.gradient_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u,
                # and W_{xc}x_t + b_c
                input_n = T.dot(input_n, weight_in_stacked) + bias_stacked

            # Reset and update gates (blocks 0 and 1 of stacked matrices)
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            resetgate = self.activation_functions.resetgate(resetgate)

            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            updategate = self.activation_functions.updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate * hidden_update_hid

            if self.gradient_clipping:
                hidden_update = theano.gradient.grad_clip(
                    hidden_update,
                    -self.gradient_clipping,
                    self.gradient_clipping)

            hidden_update = self.activation_functions.hidden_update(
                hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous + updategate * hidden_update
            return hid

        # Repeat initial hidden state for every sample in the batch
        hid_init = T.dot(T.ones((n_batch, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_sequences = [weight_hid_stacked]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_sequences += [weight_in_stacked, bias_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            n_time_steps = self.input_shape[0]

            # Explicitly unroll the recurrence instead of using scan
            hid_out, = unroll_scan(
                fn=one_gru_step,
                sequences=[input_value],
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                n_steps=n_time_steps)

        else:
            # Scan op iterates over first dimension of input and
            # repeatedly applies the step function
            hid_out, _ = theano.scan(
                fn=one_gru_step,
                sequences=[input_value],
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_sequences,
                truncate_gradient=self.n_gradient_steps,
                strict=True)

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            return hid_out[-1]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

        return hid_out
Beispiel #24
0
class QuasiNewton(StepSelectionBuiltIn, GradientDescent):
    """
    Quasi-Newton algorithm optimization.

    Parameters
    ----------
    update_function : {{'bfgs', 'dfp', 'psb', 'sr1'}}
        Update function. Defaults to ``bfgs``.

    h0_scale : float
        Default Hessian matrix is an identity matrix. The
        ``h0_scale`` parameter scales identity matrix.
        Defaults to ``1``.

    {GradientDescent.connection}

    {GradientDescent.error}

    {GradientDescent.show_epoch}

    {GradientDescent.shuffle_data}

    {GradientDescent.epoch_end_signal}

    {GradientDescent.train_end_signal}

    {GradientDescent.verbose}

    {GradientDescent.addons}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs'
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    update_function = ChoiceProperty(default='bfgs',
                                     choices={
                                         'bfgs': bfgs,
                                         'dfp': dfp,
                                         'psb': psb,
                                         'sr1': sr1,
                                     })
    h0_scale = NumberProperty(default=1, minval=0)

    # The step is computed by ``line_search`` inside ``init_train_updates``,
    # so the inherited ``step`` property is replaced here.
    step = WithdrawProperty()

    def init_variables(self):
        """
        Create the shared variables that carry state between updates:
        the inverse Hessian approximation (scaled identity matrix) and
        zero-initialized copies of the previous parameter vector and
        the previous full gradient.
        """
        super(QuasiNewton, self).init_variables()

        # Cast once so that every shape below is built from the same
        # plain integer.
        n_params = int(count_parameters(self.connection))

        self.variables.update(
            inv_hessian=theano.shared(
                name='algo:quasi-newton/matrix:inv-hessian',
                value=asfloat(self.h0_scale * np.eye(n_params)),
            ),
            prev_params=theano.shared(
                name='algo:quasi-newton/vector:prev-params',
                value=asfloat(np.zeros(n_params)),
            ),
            prev_full_gradient=theano.shared(
                name='algo:quasi-newton/vector:prev-full-gradient',
                value=asfloat(np.zeros(n_params)),
            ),
        )

    def init_train_updates(self):
        """
        Build the list of ``(shared variable, new value)`` update pairs
        for one quasi-Newton training step.

        Returns
        -------
        list
            Parameter updates followed by the updates for the inverse
            Hessian, the previous parameters and the previous gradient.
        """
        network_inputs = self.variables.network_inputs
        network_output = self.variables.network_output
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = parameter_values(self.connection)
        param_vector = T.concatenate([param.flatten() for param in params])

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        # During the first epoch the "previous" vectors still hold their
        # zero initial values, so the inverse Hessian is used as it is.
        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1), inv_hessian,
            self.update_function(inv_hessian, param_vector - prev_params,
                                 full_gradient - prev_full_gradient))
        param_delta = -new_inv_hessian.dot(full_gradient)
        layers_and_parameters = list(iter_parameters(self.layers))

        def prediction(step):
            updated_params = param_vector + step * param_delta

            try:
                # This trick allow us to replace shared variables
                # with theano variables and get output from the network
                start_pos = 0
                for layer, attrname, param in layers_and_parameters:
                    end_pos = start_pos + param.size
                    updated_param_value = T.reshape(
                        updated_params[start_pos:end_pos], param.shape)
                    setattr(layer, attrname, updated_param_value)
                    start_pos = end_pos

                return self.connection.output(*network_inputs)

            finally:
                # Restore previous parameters even when building the
                # output fails, otherwise layers would keep pointing to
                # the temporary expressions.
                for layer, attrname, param in layers_and_parameters:
                    setattr(layer, attrname, param)

        def phi(step):
            return self.error(network_output, prediction(step))

        def derphi(step):
            return T.grad(phi(step), wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates
Beispiel #25
0
class LVQ2(LVQ):
    """
    Learning Vector Quantization 2 (LVQ2) algorithm.
    Improved version for the LVQ algorithm.

    Parameters
    ----------
    epsilon : float
        Ratio between the two closest subclasses that
        triggers double weight update. Defaults to ``0.1``.

    {LVQ.Parameters}

    Notes
    -----
    {LVQ.Notes}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> X = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [2, 2], [1, 2]])
    >>> y = np.array([0, 0, 0, 1, 1, 1])
    >>>
    >>> lvqnet = algorithms.LVQ2(n_inputs=2, n_classes=2)
    >>> lvqnet.train(X, y, epochs=100)
    >>> lvqnet.predict([[2, 1], [-1, -1]])
    array([1, 0])
    """
    epsilon = NumberProperty(default=0.1)

    def one_training_update(self, X_train, y_train):
        """
        Run one pass over the training samples and update the
        prototype weights in place.

        Parameters
        ----------
        X_train : array-like
            Training samples, iterated row by row.

        y_train : array-like
            Target class for every training sample.

        Returns
        -------
        float
            One minus the ratio of samples for which the closest
            subclass belongs to the target class.
        """
        weight = self.weight
        epsilon = self.epsilon
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(X_train, y_train):
            step = self.training_step
            output = euclid_distance(input_row, weight)
            winner_subclasses = n_argmin(output, n=2, axis=1)

            top1_subclass, top2_subclass = winner_subclasses
            top1_class = subclass_to_class[top1_subclass]
            top2_class = subclass_to_class[top2_subclass]

            top1_weight_update = input_row - weight[top1_subclass, :]
            is_correct_prediction = (top1_class == target).item(0)

            closest_dist, runner_up_dist = output[0, winner_subclasses]
            double_update_condition_satisfied = (
                not is_correct_prediction and (top2_class == target)
                and closest_dist > ((1 - epsilon) * runner_up_dist)
                and runner_up_dist < ((1 + epsilon) * closest_dist))

            if double_update_condition_satisfied:
                # Rows of the weight matrix are indexed by subclass, so
                # the runner-up delta has to use the subclass index and
                # not the class label.
                top2_weight_update = input_row - weight[top2_subclass, :]
                weight[top1_subclass, :] -= step * top1_weight_update
                weight[top2_subclass, :] += step * top2_weight_update

            elif is_correct_prediction:
                weight[top1_subclass, :] += step * top1_weight_update

            else:
                weight[top1_subclass, :] -= step * top1_weight_update

            n_correct_predictions += is_correct_prediction

        n_samples = len(X_train)
        return 1 - n_correct_predictions / n_samples
Beispiel #26
0
class GrowingNeuralGas(BaseNetwork):
    """
    Growing Neural Gas (GNG) algorithm.

    Current algorithm has two modifications that hasn't been mentioned
    in the paper, but they help to speed up training.

    - The ``n_start_nodes`` parameter provides possibility to increase
      number of nodes during initialization step. It's useful when
      algorithm takes a lot of time building up large amount of neurons.

    - The ``min_distance_for_update`` parameter allows to speed up
      training when some data samples has neurons very close to them. The
      ``min_distance_for_update`` parameter controls threshold for the
      minimum distance for which we will want to update weights.

    Parameters
    ----------
    n_inputs : int
        Number of features in each sample.

    n_start_nodes : int
        Number of nodes that algorithm generates from the data during
        the initialization step. Defaults to ``2``.

    step : float
        Step (learning rate) for the neuron winner. Defaults to ``0.2``.

    neighbour_step : float
        Step (learning rate) for the neurons that connected via edges
        with neuron winner. This value typically has to be smaller than
        ``step`` value. Defaults to ``0.05``.

    max_edge_age : int
        It means that if edge won't be updated for ``max_edge_age`` iterations
        then it would be removed. The larger the value the more updates we
        allow to do before removing edge. Defaults to ``100``.

    n_iter_before_neuron_added : int
        Each ``n_iter_before_neuron_added`` weight update algorithm add new
        neuron. The smaller the value the more frequently algorithm adds
        new neurons to the network. Defaults to ``1000``.

    error_decay_rate : float
        This error decay rate would be applied to every neuron in the
        graph after each training iteration. It ensures that old errors
        will be reduced over time. Defaults to ``0.995``.

    after_split_error_decay_rate : float
        This decay rate reduces error for neurons with largest errors
        after algorithm added new neuron. This value typically lower than
        ``error_decay_rate``. Defaults to ``0.5``.

    max_nodes : int
        Maximum number of nodes that would be generated during the training.
        This parameter won't stop training when maximum number of nodes
        will be exceeded. Defaults to ``1000``.

    min_distance_for_update : float
        Parameter controls for which neurons we want to apply updates.
        In case if euclidean distance between data sample and closest
        neurons will be less than the ``min_distance_for_update`` value then
        update would be skipped for this data sample. Setting value to zero
        will disable effect provided by this parameter. Defaults to ``0``.

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.signals}

    {Verbose.verbose}

    Methods
    -------
    train(X_train, epochs=100)
        Network learns topological structure of the data. Learned
        structure will be stored in the ``graph`` attribute.

    {BaseSkeleton.fit}

    initialize_nodes(data)
        Network initializes nodes randomly sampling ``n_start_nodes``
        from the data. It would be applied automatically before
        the training in case if graph is empty.

        Note: Node re-initialization can reset network.

    Notes
    -----
    - Unlike other algorithms this network doesn't make predictions.
      Instead, it learns topological structure of the data in form of
      the graph. After that training, structure of the network can be
      extracted from the ``graph`` attribute.

    - In order to speed up training, it might be useful to increase
      the ``n_start_nodes`` parameter.

    - During the training it happens that nodes learn topological
      structure of one part of the data better than the other, mostly
      because of the different data sample density in different places.
      Increasing the ``min_distance_for_update`` can speed up training
      ignoring updates for the neurons that very close to the data sample.
      (below specified ``min_distance_for_update`` value). Training can be
      stopped in case if none of the neurons has been updated during
      the training epoch.

    Attributes
    ----------
    graph : NeuralGasGraph instance
        This attribute stores all neurons and connections between them
        in the form of undirected graph.

    {BaseNetwork.Attributes}

    Examples
    --------
    >>> from neupy import algorithms
    >>> from sklearn.datasets import make_blobs
    >>>
    >>> data, _ = make_blobs(
    ...     n_samples=1000,
    ...     n_features=2,
    ...     centers=2,
    ...     cluster_std=0.4,
    ... )
    >>>
    >>> neural_gas = algorithms.GrowingNeuralGas(
    ...     n_inputs=2,
    ...     shuffle_data=True,
    ...     verbose=True,
    ...     max_edge_age=10,
    ...     n_iter_before_neuron_added=50,
    ...     max_nodes=100,
    ... )
    >>> neural_gas.train(data, epochs=100)
    >>> neural_gas.graph.n_nodes
    100
    >>> len(neural_gas.graph.edges)
    175
    >>> edges = list(neural_gas.graph.edges.keys())
    >>> neuron_1, neuron_2 = edges[0]
    >>>
    >>> neuron_1.weight
    array([[-6.77166299,  2.4121606 ]])
    >>> neuron_2.weight
    array([[-6.829309  ,  2.27839633]])

    References
    ----------
    [1] A Growing Neural Gas Network Learns Topologies, Bernd Fritzke
    """
    n_inputs = IntProperty(minval=1, required=True)
    n_start_nodes = IntProperty(minval=2, default=2)

    step = NumberProperty(default=0.2, minval=0)
    neighbour_step = NumberProperty(default=0.05, minval=0)
    max_edge_age = IntProperty(default=100, minval=1)
    max_nodes = IntProperty(default=1000, minval=1)

    n_iter_before_neuron_added = IntProperty(default=1000, minval=1)
    after_split_error_decay_rate = ProperFractionProperty(default=0.5)
    error_decay_rate = ProperFractionProperty(default=0.995)
    min_distance_for_update = NumberProperty(default=0.0, minval=0)

    def __init__(self, *args, **kwargs):
        super(GrowingNeuralGas, self).__init__(*args, **kwargs)
        # Counts samples that actually triggered a weight update; it
        # drives the schedule for adding new neurons.
        self.n_updates = 0
        self.graph = NeuralGasGraph()

    def format_input_data(self, X):
        """
        Convert input into a 2D array and validate number of features.

        Raises
        ------
        ValueError
            When formatted data is not two-dimensional or when number
            of features differs from ``n_inputs``.
        """
        is_feature1d = self.n_inputs == 1
        X = format_data(X, is_feature1d)

        if X.ndim != 2:
            raise ValueError("Cannot make prediction, because input "
                             "data has more than 2 dimensions")

        n_samples, n_features = X.shape

        if n_features != self.n_inputs:
            raise ValueError("Input data expected to have {} features, "
                             "but got {}".format(self.n_inputs, n_features))

        return X

    def initialize_nodes(self, data):
        """
        Replace the graph with a new one that contains ``n_start_nodes``
        nodes whose weights are sampled from the ``data``.
        """
        self.graph = NeuralGasGraph()

        for sample in sample_data_point(data, n=self.n_start_nodes):
            self.graph.add_node(NeuronNode(sample.reshape(1, -1)))

    def train(self, X_train, epochs=100):
        """
        Validate the data, initialize nodes when graph has none and
        delegate training to the parent class.
        """
        X_train = self.format_input_data(X_train)

        if not self.graph.nodes:
            self.initialize_nodes(X_train)

        return super(GrowingNeuralGas, self).train(
            X_train=X_train, y_train=None,
            X_test=None, y_test=None,
            epochs=epochs)

    def one_training_update(self, X_train, y_train=None):
        """
        Run one pass over the samples and update the graph in place.

        Returns
        -------
        float
            Average euclidean distance between each sample and the
            neuron closest to it.

        Raises
        ------
        StopTraining
            When ``min_distance_for_update`` is non-zero, more than one
            sample was given and none of them triggered an update.
        """
        graph = self.graph
        step = self.step
        neighbour_step = self.neighbour_step

        max_nodes = self.max_nodes
        max_edge_age = self.max_edge_age

        error_decay_rate = self.error_decay_rate
        after_split_error_decay_rate = self.after_split_error_decay_rate
        n_iter_before_neuron_added = self.n_iter_before_neuron_added

        # Distances below are plain euclidean norms (not squared), so
        # the threshold has to be used as it is. Squaring it would make
        # the comparison inconsistent with the documented parameter.
        min_distance_for_update = self.min_distance_for_update

        n_samples = len(X_train)
        total_error = 0
        did_update = False

        for sample in X_train:
            nodes = graph.nodes
            weights = np.concatenate([node.weight for node in nodes])

            distance = np.linalg.norm(weights - sample, axis=1)
            neuron_ids = np.argsort(distance)

            closest_neuron_id, second_closest_id = neuron_ids[:2]
            closest_neuron = nodes[closest_neuron_id]
            second_closest = nodes[second_closest_id]
            total_error += distance[closest_neuron_id]

            if distance[closest_neuron_id] < min_distance_for_update:
                continue

            self.n_updates += 1
            did_update = True

            closest_neuron.error += distance[closest_neuron_id]
            closest_neuron.weight += step * (sample - closest_neuron.weight)

            graph.add_edge(closest_neuron, second_closest)

            # Iterate over a copy, because edges can be removed
            # inside of the loop.
            for to_neuron in list(graph.edges_per_node[closest_neuron]):
                edge_id = graph.find_edge_id(to_neuron, closest_neuron)
                age = graph.edges[edge_id]

                if age >= max_edge_age:
                    graph.remove_edge(to_neuron, closest_neuron)

                    # Neighbour that lost its last edge gets removed
                    if not graph.edges_per_node[to_neuron]:
                        graph.remove_node(to_neuron)

                else:
                    graph.edges[edge_id] += 1
                    to_neuron.weight += neighbour_step * (
                        sample - to_neuron.weight)

            time_to_add_new_neuron = (
                self.n_updates % n_iter_before_neuron_added == 0 and
                graph.n_nodes < max_nodes)

            if time_to_add_new_neuron:
                # New neuron is placed halfway between the neuron with
                # the largest error and its neighbour with the largest
                # error. The edge between them is replaced with two
                # edges that go through the new neuron.
                nodes = graph.nodes
                largest_error_neuron = max(nodes, key=attrgetter('error'))
                neighbour_neuron = max(
                    graph.edges_per_node[largest_error_neuron],
                    key=attrgetter('error'))

                largest_error_neuron.error *= after_split_error_decay_rate
                neighbour_neuron.error *= after_split_error_decay_rate

                new_weight = 0.5 * (
                    largest_error_neuron.weight + neighbour_neuron.weight
                )
                new_neuron = NeuronNode(weight=new_weight.reshape(1, -1))

                graph.remove_edge(neighbour_neuron, largest_error_neuron)
                graph.add_node(new_neuron)
                graph.add_edge(largest_error_neuron, new_neuron)
                graph.add_edge(neighbour_neuron, new_neuron)

            for node in graph.nodes:
                node.error *= error_decay_rate

        if not did_update and min_distance_for_update != 0 and n_samples > 1:
            raise StopTraining(
                "Distance between every data sample and neurons, closest "
                "to them, is less then {}".format(min_distance_for_update))

        return total_error / n_samples

    def predict(self, *args, **kwargs):
        """
        Always raises ``NotImplementedError``, since the network only
        learns the graph structure.
        """
        raise NotImplementedError(
            "Growing Neural Gas algorithm doesn't make prediction. "
            "It only learns graph structure from the data "
            "(class has `graph` attribute). ")
Beispiel #27
0
class LVQ3(LVQ21):
    """
    Learning Vector Quantization 3 (LVQ3) algorithm.
    Improved version for the LVQ algorithm.

    Parameters
    ----------
    slowdown_rate : float
        Parameter scales learning step in order to decrease it
        in case if the two closest subclasses predict target
        value correctly. Defaults to ``0.4``.

    {LVQ21.Parameters}

    Notes
    -----
    {LVQ21.Notes}
    """
    slowdown_rate = NumberProperty(minval=0, default=0.4)

    def train_epoch(self, input_train, target_train):
        """
        Run one pass over the training samples and update the
        prototype weights in place.

        Parameters
        ----------
        input_train : array-like
            Training samples, iterated row by row.

        target_train : array-like
            Target class for every training sample.

        Returns
        -------
        float
            One minus the ratio of samples for which the closest
            subclass belongs to the target class.
        """
        step = self.step
        weight = self.weight
        epsilon = self.epsilon
        slowdown_rate = self.slowdown_rate
        subclass_to_class = self.subclass_to_class

        n_correct_predictions = 0
        for input_row, target in zip(input_train, target_train):
            output = euclid_distance(input_row, weight)
            winner_subclasses = n_argmin(output, n=2, axis=1)

            top1_subclass, top2_subclass = winner_subclasses
            top1_class = subclass_to_class[top1_subclass]
            top2_class = subclass_to_class[top2_subclass]

            # Both deltas are computed before branching, because more
            # than one branch below needs the runner-up delta. Rows of
            # the weight matrix are indexed by subclass, not by class.
            top1_weight_update = input_row - weight[top1_subclass, :]
            top2_weight_update = input_row - weight[top2_subclass, :]

            is_first_correct = (top1_class == target)
            is_second_correct = (top2_class == target)

            closest_dist, runner_up_dist = output[0, winner_subclasses]
            double_update_condition_satisfied = (
                (
                    (is_first_correct and not is_second_correct) or
                    (is_second_correct and not is_first_correct)
                ) and
                closest_dist > ((1 - epsilon) * runner_up_dist) and
                runner_up_dist < ((1 + epsilon) * closest_dist)
            )
            two_closest_correct_condition_satisfied = (
                is_first_correct and is_second_correct and
                closest_dist > ((1 - epsilon) * (1 + epsilon) * runner_up_dist)
            )

            if double_update_condition_satisfied:
                # Exactly one of the two winners is correct: pull the
                # correct one towards the sample, push the other away.
                if is_first_correct:
                    weight[top2_subclass, :] -= step * top2_weight_update
                    weight[top1_subclass, :] += step * top1_weight_update
                else:
                    weight[top1_subclass, :] -= step * top1_weight_update
                    weight[top2_subclass, :] += step * top2_weight_update

            elif two_closest_correct_condition_satisfied:
                # Both winners are correct: move both towards the
                # sample with a reduced step.
                beta = step * slowdown_rate
                weight[top1_subclass, :] += beta * top1_weight_update
                weight[top2_subclass, :] += beta * top2_weight_update

            else:
                weight[top1_subclass, :] -= step * top1_weight_update

            n_correct_predictions += is_first_correct

        n_samples = len(input_train)
        return 1 - n_correct_predictions / n_samples
class BaseNetwork(BaseSkeleton):
    """
    Base class for Neural Network algorithms.

    Parameters
    ----------
    step : float
        Learning rate, defaults to ``0.1``.

    show_epoch : int or str
        This property controls how often the network will
        display information about training. There are two
        main syntaxes for this property.

        - You can define it as a positive integer number. It
          defines how often would you like to see summary
          output in terminal. For instance, number `100` mean
          that network shows summary at 100th, 200th,
          300th ... epochs.

        - String defines number of times you want to see output in
          terminal. For instance, value ``'2 times'`` mean that
          the network will show output twice with approximately
          equal period of epochs and one additional output would
          be after the final epoch.

        Defaults to ``1``.

    shuffle_data : bool
        If it's ``True`` class shuffles all your training data before
        training your network, defaults to ``False``.

    epoch_end_signal : function
        Calls this function when train epoch finishes.

    train_end_signal : function
        Calls this function when train process finishes.

    {Verbose.Parameters}

    Attributes
    ----------
    errors : ErrorHistoryList
        Contains list of training errors. This object has the same
        properties as list and in addition there are three additional
        useful methods: `last`, `previous` and `normalized`.

    train_errors : ErrorHistoryList
        Alias to the ``errors`` attribute.

    validation_errors : ErrorHistoryList
        The same as `errors` attribute, but it contains only validation
        errors.

    last_epoch : int
        Value equals to the last trained epoch. After initialization
        it is equal to ``0``.
    """
    step = NumberProperty(default=0.1, minval=0)

    show_epoch = ShowEpochProperty(minval=1, default=1)
    shuffle_data = Property(default=False, expected_type=bool)

    epoch_end_signal = Property(expected_type=types.FunctionType)
    train_end_signal = Property(expected_type=types.FunctionType)

    def __init__(self, *args, **options):
        self.errors = self.train_errors = ErrorHistoryList()
        self.validation_errors = ErrorHistoryList()
        self.training = AttributeKeyDict()
        self.last_epoch = 0

        super(BaseNetwork, self).__init__(*args, **options)

        if self.verbose:
            show_network_options(self, highlight_options=options)

    def predict(self, input_data):
        """
        Return prediction results for the input data.

        Parameters
        ----------
        input_data : array-like

        Returns
        -------
        array-like
        """
        raise NotImplementedError

    def on_epoch_start_update(self, epoch):
        """
        Function would be trigger before run all training procedure
        related to the current epoch.

        Parameters
        ----------
        epoch : int
            Current epoch number.
        """
        self.last_epoch = epoch

    def train_epoch(self, input_train, target_train=None):
        raise NotImplementedError()

    def prediction_error(self, input_test, target_test):
        raise NotImplementedError()

    @staticmethod
    def _compile_eigen_function():
        """
        Compile function that returns eigenvalues and eigenvectors
        of a matrix. Used only for the Hessian diagnostics.
        """
        sym_matrix = tt.dmatrix("symMatrix")
        sym_eigenvalues, eigenvectors = tt.nlinalg.eig(sym_matrix)
        return theano.function([sym_matrix],
                               [sym_eigenvalues, eigenvectors])

    def _report_hessian_eigenvalues(self, get_eigen):
        """
        Print statistics about eigenvalues of the current Hessian
        matrix and save eigenvalues into the ``.npy`` file.

        Parameters
        ----------
        get_eigen : function
            Function returned by the ``_compile_eigen_function``.
        """
        hessian = self.variables.hessian.get_value()
        ev, _ = get_eigen(hessian)

        n_exact_zeros = np.sum(ev == 0)
        # Negative eigenvalues above half of the smallest eigenvalue
        # are counted together with exact zeros.
        n_small_negative = np.sum((ev < 0) & (ev > (np.min(ev) / 2.0)))

        # Every message is a single pre-formatted string, so output is
        # the same for the print statement and the print function.
        print("positive EV  {}".format(np.sum(ev > 0)))
        print("Just zero EV {}".format(n_exact_zeros))
        print("Zero EV  {}".format(n_exact_zeros + n_small_negative))
        print("Neg EV  {}".format(np.sum(ev < 0)))
        print("Max EV  {}".format(np.max(ev)))
        print("Min EV  {}".format(np.min(ev)))

        # NOTE(review): ``itr`` is not defined in this class, presumably
        # the Hessian network provides it - confirm.
        np.save(str(self.itr) + '.npy', ev)

    def train(self, input_train, target_train=None, input_test=None,
              target_test=None, epochs=100, epsilon=None,
              summary='table'):
        """
        Method train neural network.

        Parameters
        ----------
        input_train : array-like

        target_train : array-like or None

        input_test : array-like or None

        target_test : array-like or None

        epochs : int
            Defaults to `100`.

        epsilon : float or None
            Defaults to ``None``.

        summary : str
            Type of the training summary, ``'table'`` or ``'inline'``.
            Defaults to ``'table'``.

        Raises
        ------
        ValueError
            When ``epochs`` is not positive, when ``epsilon`` is
            specified with less than 3 epochs or when ``summary``
            type is unknown.
        """
        logs = self.logs
        training = self.training = AttributeKeyDict()

        if epochs <= 0:
            raise ValueError("Number of epochs needs to be greater than 0.")

        if epsilon is not None and epochs <= 2:
            raise ValueError("Network should train at teast 3 epochs before "
                             "check the difference between errors")

        logging_info_about_the_data(self, input_train, input_test)
        logging_info_about_training(self, epochs, epsilon)
        logs.newline()

        if summary == 'table':
            summary = SummaryTable(
                table_builder=table.TableBuilder(
                    table.Column(name="Epoch #"),
                    table.NumberColumn(name="Train err", places=4),
                    table.NumberColumn(name="Valid err", places=4),
                    table.TimeColumn(name="Time", width=10),
                    stdout=logs.write
                ),
                network=self,
                delay_limit=1.,
                delay_history_length=10,
            )

        elif summary == 'inline':
            summary = InlineSummary(network=self)

        else:
            raise ValueError("`{}` is unknown summary type"
                             "".format(summary))

        iterepochs = create_training_epochs_iterator(self, epochs, epsilon)
        show_epoch = parse_show_epoch_property(self, epochs, epsilon)
        training.show_epoch = show_epoch

        # Storring attributes and methods in local variables we prevent
        # useless __getattr__ call a lot of times in each loop.
        # This variables speed up loop in case on huge amount of
        # iterations.
        training_errors = self.errors
        validation_errors = self.validation_errors
        shuffle_data = self.shuffle_data

        train_epoch = self.train_epoch
        epoch_end_signal = self.epoch_end_signal
        train_end_signal = self.train_end_signal
        on_epoch_start_update = self.on_epoch_start_update

        is_first_iteration = True
        can_compute_validation_error = (input_test is not None)
        last_epoch_shown = 0

        # Eigenvalue diagnostics apply only to the network whose string
        # representation starts with ``Hessian``, so the eigen function
        # is compiled only in this case.
        track_hessian = (str(self).split('(')[0] == 'Hessian')
        get_eigen = self._compile_eigen_function() if track_hessian else None

        with logs.disable_user_input():
            for epoch in iterepochs:
                validation_error = None
                epoch_start_time = time.time()
                on_epoch_start_update(epoch)

                if shuffle_data:
                    data = shuffle(*as_tuple(input_train, target_train))
                    input_train, target_train = data[:-1], data[-1]

                try:
                    train_error = train_epoch(input_train, target_train)

                    print(epoch)
                    if track_hessian:
                        self._report_hessian_eigenvalues(get_eigen)

                    if can_compute_validation_error:
                        validation_error = self.prediction_error(input_test,
                                                                 target_test)

                    training_errors.append(train_error)
                    validation_errors.append(validation_error)

                    epoch_finish_time = time.time()
                    training.epoch_time = epoch_finish_time - epoch_start_time

                    if epoch % training.show_epoch == 0 or is_first_iteration:
                        summary.show_last()
                        last_epoch_shown = epoch

                    if epoch_end_signal is not None:
                        epoch_end_signal(self)

                    is_first_iteration = False

                except StopTraining as err:
                    # TODO: This notification breaks table view in terminal.
                    # I need to show it in a different way.
                    logs.message("TRAIN", "Epoch #{} stopped. {}"
                                          "".format(epoch, str(err)))
                    break

            if epoch != last_epoch_shown:
                summary.show_last()

            if train_end_signal is not None:
                train_end_signal(self)

            summary.finish()
            logs.newline()
Beispiel #29
0
class Adam(GradientDescent):
    """
    Adam algorithm.

    Parameters
    ----------
    beta1 : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.9``.

    beta2 : float
        Decay rate. Value need to be between ``0`` and ``1``.
        Defaults to ``0.999``.

    epsilon : float
        Value need to be greater than ``0``. Defaults to ``1e-7``.

    step : float
        Learning rate, defaults to ``0.001``.

    {GradientDescent.batch_size}

    {BaseGradientDescent.addons}

    {ConstructibleNetwork.connection}

    {ConstructibleNetwork.error}

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    References
    ----------
    [1] Diederik P. Kingma, Jimmy Lei Ba
        Adam: a Method for Stochastic Optimization.
        https://arxiv.org/pdf/1412.6980.pdf

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Adam((2, 3, 1))
    >>> mnet.train(x_train, y_train)
    """
    step = NumberProperty(default=0.001, minval=0)
    beta1 = ProperFractionProperty(default=0.9)
    beta2 = ProperFractionProperty(default=0.999)
    epsilon = NumberProperty(default=1e-7, minval=0)

    def init_variables(self):
        """
        Extend parent variables with the ``iteration`` counter
        that starts from one.
        """
        super(Adam, self).init_variables()

        iteration_counter = tf.Variable(
            asfloat(1),
            name='iteration',
            dtype=tf.float32,
        )
        self.variables.iteration = iteration_counter

    def init_train_updates(self):
        """
        Build the list of ``(variable, new value)`` pairs for one
        Adam update.

        Returns
        -------
        list
            For every parameter: updates for its first moment, second
            moment and the parameter itself. Update that increments
            the iteration counter goes last.
        """
        beta1 = self.beta1
        beta2 = self.beta2
        epsilon = self.epsilon

        iteration = self.variables.iteration
        learning_rate = self.variables.step

        # Moments start from zeros and decay rates are usually close
        # to one, which makes early moment estimates biased towards
        # zero. The correction below compensates for it: it rescales
        # moments strongly during the first updates and fades away as
        # the iteration counter grows.
        #
        # Both corrections are merged into one scalar factor for
        # computational speed (suggestion from the original paper).
        second_moment_correction = tf.sqrt(1. - beta2 ** iteration)
        first_moment_correction = 1. - beta1 ** iteration
        bias_correction = second_moment_correction / first_moment_correction

        updates = []

        for _, parameter, gradient in self.iter_params_and_grads():
            parameter_name = parameter.op.name

            prev_first_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-first-moment".format(parameter_name),
                dtype=tf.float32,
            )
            prev_second_moment = tf.Variable(
                tf.zeros(parameter.shape),
                name="{}/prev-second-moment".format(parameter_name),
                dtype=tf.float32,
            )

            # Exponential moving averages of the gradient
            # and of the squared gradient
            first_moment = (
                beta1 * prev_first_moment + (1. - beta1) * gradient)
            second_moment = (
                beta2 * prev_second_moment + (1. - beta2) * gradient ** 2)

            scaled_gradient = bias_correction * first_moment / (
                tf.sqrt(second_moment) + epsilon)
            new_parameter = parameter - learning_rate * scaled_gradient

            updates += [
                (prev_first_moment, first_moment),
                (prev_second_moment, second_moment),
                (parameter, new_parameter),
            ]

        updates.append((iteration, iteration + 1))
        return updates
Beispiel #30
0
class QuasiNewton(WolfeLineSearchForStep, BaseGradientDescent):
    """
    Quasi-Newton algorithm. On every iteration the quasi-Newton method
    refines an approximation of the inverse Hessian matrix using an
    iterative update rule. The algorithm has no ``step`` parameter.
    Instead, a line search picks a step that satisfies the strong Wolfe
    conditions. Parameters that control the Wolfe search start with
    the ``wolfe_`` prefix.

    Parameters
    ----------
    update_function : ``bfgs``, ``dfp``, ``sr1``
        Update rule used for the iterative approximation of the
        inverse Hessian matrix. Defaults to ``bfgs``.

        - ``bfgs`` - Rank 2 update formula. It can suffer from
          round-off error and inaccurate line searches.

        - ``dfp`` - DFP is a method very similar to BFGS. It is also a
          rank 2 update formula and it can suffer from round-off error
          and inaccurate line searches.

        - ``sr1`` - Symmetric rank 1 (SR1). Updates the inverse Hessian
          matrix by adding a symmetric rank-1 matrix. It's possible
          that no rank 1 update exists for the matrix; in this case
          the update is not applied and the original inverse Hessian
          is returned.

    h0_scale : float
        The initial Hessian matrix is an identity matrix. The
        ``h0_scale`` parameter scales this identity matrix.
        Defaults to ``1``.

    epsilon : float
        Controls numerical stability for the ``update_function`` parameter.
        Defaults to ``1e-7``.

    {WolfeLineSearchForStep.Parameters}

    {BaseGradientDescent.connection}

    {BaseGradientDescent.error}

    {BaseGradientDescent.show_epoch}

    {BaseGradientDescent.shuffle_data}

    {BaseGradientDescent.epoch_end_signal}

    {BaseGradientDescent.train_end_signal}

    {BaseGradientDescent.verbose}

    {BaseGradientDescent.addons}

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs'
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    References
    ----------
    [1] Yang Ding, Enkeleida Lushi, Qingguo Li,
        Investigation of quasi-Newton methods for unconstrained optimization.
        http://people.math.sfu.ca/~elushi/project_833.pdf

    [2] Jorge Nocedal, Stephen J. Wright, Numerical Optimization.
        Chapter 6, Quasi-Newton Methods, p. 135-163
    """
    # Maps the user-facing option name to the function that produces
    # the next inverse Hessian approximation.
    update_function = ChoiceProperty(
        default='bfgs',
        choices=dict(bfgs=bfgs, dfp=dfp, sr1=sr1),
    )
    epsilon = NumberProperty(default=1e-7, minval=0)
    h0_scale = NumberProperty(default=1, minval=0)

    # The step is selected by the line search, so the ``step`` option
    # is withdrawn from this class.
    step = WithdrawProperty()

    def init_variables(self):
        """
        Create the state that has to persist between training updates.

        In addition to the variables created by the parent classes, it
        registers three variables in ``self.variables``:

        - ``inv_hessian`` - square matrix with one row per network
          parameter, initialized with the identity matrix multiplied
          by ``h0_scale``.
        - ``prev_params`` - vector of zeros with one entry per
          network parameter.
        - ``prev_full_gradient`` - vector of zeros with one entry per
          network parameter.
        """
        super(QuasiNewton, self).init_variables()
        n_weights = count_parameters(self.connection)

        # NOTE(review): ``asfloat`` presumably casts the scale to the
        # float type used by the library - confirm against the helper.
        scaled_identity = asfloat(self.h0_scale) * tf.eye(n_weights)

        inv_hessian = tf.Variable(
            scaled_identity,
            name="quasi-newton/inv-hessian",
            dtype=tf.float32,
        )
        prev_params = tf.Variable(
            tf.zeros([n_weights]),
            name="quasi-newton/prev-params",
            dtype=tf.float32,
        )
        prev_full_gradient = tf.Variable(
            tf.zeros([n_weights]),
            name="quasi-newton/prev-full-gradient",
            dtype=tf.float32,
        )

        self.variables.update(
            inv_hessian=inv_hessian,
            prev_params=prev_params,
            prev_full_gradient=prev_full_gradient,
        )

    def init_train_updates(self):
        """
        Build the operations that perform one quasi-Newton update.

        The inverse Hessian approximation is refreshed with the selected
        ``update_function`` (on epoch ``1`` the stored matrix is used
        as is), the search direction is computed from it and the step
        along this direction is obtained from ``find_optimal_step``.

        Returns
        -------
        Collection returned by ``setup_parameter_updates`` extended
        with the assign operations that store the new inverse Hessian,
        the current parameters and the current gradient.
        """
        variables = self.variables
        inv_hessian = variables.inv_hessian
        prev_params = variables.prev_params
        prev_full_gradient = variables.prev_full_gradient

        weights = parameter_values(self.connection)
        flat_weights = make_single_vector(weights)

        weight_gradients = tf.gradients(variables.error_func, weights)
        flat_gradient = make_single_vector(weight_gradients)

        # During epoch 1 the stored inverse Hessian is kept unchanged
        # (the "previous" parameters and gradient still hold their
        # initial values at that point).
        is_first_epoch = tf.equal(variables.epoch, 1)
        refreshed_inv_hessian = self.update_function(
            inv_H=inv_hessian,
            delta_w=flat_weights - prev_params,
            delta_grad=flat_gradient - prev_full_gradient,
            epsilon=self.epsilon,
        )
        new_inv_hessian = tf.where(
            is_first_epoch, inv_hessian, refreshed_inv_hessian)

        search_direction = -dot(new_inv_hessian, flat_gradient)
        optimal_step = self.find_optimal_step(flat_weights, search_direction)

        new_flat_weights = flat_weights + optimal_step * search_direction
        updates = setup_parameter_updates(weights, new_flat_weights)

        # These tensors have to be evaluated before the history
        # variables are overwritten. Tensorflow tries to run operations
        # in parallel and without explicit dependencies the update
        # order can be mixed, e.g. the previous gradient could end up
        # being equal to the current gradient.
        evaluate_first = [new_inv_hessian, flat_weights, flat_gradient]
        with tf.control_dependencies(evaluate_first):
            history_updates = [
                inv_hessian.assign(new_inv_hessian),
                prev_params.assign(flat_weights),
                prev_full_gradient.assign(flat_gradient),
            ]

        updates.extend(history_updates)
        return updates