Example #1
class LazyLearningMixin(SharedDocs):
    """
    Mixin for lazy learning Neural Network algorithms.

    Notes
    -----
    - The network uses lazy learning, which means that it doesn't
      need iterative training. It simply stores the parameters
      and uses them to make predictions.

    Methods
    -------
    train(input_train, target_train, copy=True)
        The network simply stores all the information about the data and
        uses it for prediction. The ``copy`` parameter copies the input
        data before saving it inside the network.
    """
    step = WithdrawProperty()
    show_epoch = WithdrawProperty()
    shuffle_data = WithdrawProperty()
    train_end_signal = WithdrawProperty()
    epoch_end_signal = WithdrawProperty()

    def __init__(self, *args, **kwargs):
        self.input_train = None
        self.target_train = None
        super(LazyLearningMixin, self).__init__(*args, **kwargs)

    def train(self, input_train, target_train, copy=True):
        if input_train.shape[0] != target_train.shape[0]:
            raise ValueError("Number of samples in the input and target "
                             "datasets are different")

        if copy:
            # Copy the data so that later modifications of the original
            # arrays don't affect the stored training set.
            input_train = np.copy(input_train)
            target_train = np.copy(target_train)

        self.input_train = input_train
        self.target_train = target_train
Example #2
class ConjugateGradient(WolfeLineSearchForStep, BaseOptimizer):

    """
    Conjugate Gradient algorithm.

    Parameters
    ----------
    update_function : ``fletcher_reeves``, ``polak_ribiere``,\
    ``hentenes_stiefel``, ``dai_yuan``, ``liu_storey``
        Update function. Defaults to ``fletcher_reeves``.

    epsilon : float
        Ensures computational stability during the division in
        ``update_function`` when the denominator is a very small number.
        Defaults to ``1e-7``.

    {WolfeLineSearchForStep.Parameters}

    {BaseOptimizer.network}

    {BaseOptimizer.loss}

    {BaseOptimizer.show_epoch}

    {BaseOptimizer.shuffle_data}

    {BaseOptimizer.signals}

    {BaseOptimizer.verbose}

    {BaseOptimizer.regularizer}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Examples
    --------
    >>> from sklearn import datasets, preprocessing
    >>> from sklearn.model_selection import train_test_split
    >>> from neupy import algorithms, layers
    >>>
    >>> dataset = datasets.load_boston()
    >>> data, target = dataset.data, dataset.target
    >>>
    >>> data_scaler = preprocessing.MinMaxScaler()
    >>> target_scaler = preprocessing.MinMaxScaler()
    >>>
    >>> x_train, x_test, y_train, y_test = train_test_split(
    ...     data_scaler.fit_transform(data),
    ...     target_scaler.fit_transform(target.reshape(-1, 1)),
    ...     test_size=0.15
    ... )
    >>>
    >>> cgnet = algorithms.ConjugateGradient(
    ...     network=[
    ...         layers.Input(13),
    ...         layers.Sigmoid(50),
    ...         layers.Sigmoid(1),
    ...     ],
    ...     update_function='fletcher_reeves',
    ...     verbose=False
    ... )
    >>>
    >>> cgnet.train(x_train, y_train, epochs=100)
    >>> y_predict = cgnet.predict(x_test).round(1)
    >>>
    >>> real = target_scaler.inverse_transform(y_test)
    >>> predicted = target_scaler.inverse_transform(y_predict)

    References
    ----------
    [1] Jorge Nocedal, Stephen J. Wright, Numerical Optimization.
        Chapter 5, Conjugate Gradient Methods, p. 101-133
    """
    epsilon = NumberProperty(default=1e-7, minval=0)
    update_function = ChoiceProperty(
        default='fletcher_reeves',
        choices={
            'fletcher_reeves': fletcher_reeves,
            'polak_ribiere': polak_ribiere,
            'hentenes_stiefel': hentenes_stiefel,
            'liu_storey': liu_storey,
            'dai_yuan': dai_yuan,
        }
    )
    step = WithdrawProperty()

    def init_functions(self):
        n_parameters = self.network.n_parameters
        self.variables.update(
            prev_delta=tf.Variable(
                tf.zeros([n_parameters]),
                name="conj-grad/prev-delta",
                dtype=tf.float32,
            ),
            prev_gradient=tf.Variable(
                tf.zeros([n_parameters]),
                name="conj-grad/prev-gradient",
                dtype=tf.float32,
            ),
            iteration=tf.Variable(
                asfloat(self.last_epoch),
                name='conj-grad/current-iteration',
                dtype=tf.float32
            ),
        )
        super(ConjugateGradient, self).init_functions()

    def init_train_updates(self):
        iteration = self.variables.iteration
        previous_delta = self.variables.prev_delta
        previous_gradient = self.variables.prev_gradient

        n_parameters = self.network.n_parameters
        variables = self.network.variables
        parameters = [var for var in variables.values() if var.trainable]
        param_vector = make_single_vector(parameters)

        gradients = tf.gradients(self.variables.loss, parameters)
        full_gradient = make_single_vector(gradients)

        beta = self.update_function(
            previous_gradient, full_gradient, previous_delta, self.epsilon)

        parameter_delta = tf.where(
            tf.equal(tf.mod(iteration, n_parameters), 0),
            -full_gradient,
            -full_gradient + beta * previous_delta
        )

        step = self.find_optimal_step(param_vector, parameter_delta)
        updated_parameters = param_vector + step * parameter_delta
        updates = setup_parameter_updates(parameters, updated_parameters)

        # We have to compute these values first, otherwise
        # parallelization in tensorflow can mix up the update order
        # and, for example, the previous gradient can end up equal to
        # the current gradient value. It happens because tensorflow
        # tries to execute operations in parallel.
        with tf.control_dependencies([full_gradient, parameter_delta]):
            updates.extend([
                previous_gradient.assign(full_gradient),
                previous_delta.assign(parameter_delta),
                iteration.assign(iteration + 1),
            ])

        return updates
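
The ``update_function`` options above compute the conjugate-direction
coefficient ``beta``. Below is a plain-NumPy sketch of two of those rules
using the textbook formulas; the signatures of NeuPy's own helpers may differ.

import numpy as np

def fletcher_reeves_beta(prev_grad, grad, epsilon=1e-7):
    # beta = ||g_new||^2 / (||g_old||^2 + eps)
    return grad.dot(grad) / (prev_grad.dot(prev_grad) + epsilon)

def polak_ribiere_beta(prev_grad, grad, epsilon=1e-7):
    # beta = g_new . (g_new - g_old) / (||g_old||^2 + eps)
    return grad.dot(grad - prev_grad) / (prev_grad.dot(prev_grad) + epsilon)

g_old = np.array([0.40, -0.20, 0.10])
g_new = np.array([0.30, -0.10, 0.05])
print(fletcher_reeves_beta(g_old, g_new))
print(polak_ribiere_beta(g_old, g_new))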
Example #3
class QuasiNewton(StepSelectionBuiltIn, GradientDescent):
    """
    Quasi-Newton optimization algorithm.

    Parameters
    ----------
    update_function : {{'bfgs', 'dfp', 'psb', 'sr1'}}
        Update function. Defaults to ``bfgs``.

    h0_scale : float
        The default Hessian matrix is an identity matrix. The
        ``h0_scale`` parameter scales this identity matrix.
        Defaults to ``1``.

    {GradientDescent.connection}

    {GradientDescent.error}

    {GradientDescent.show_epoch}

    {GradientDescent.shuffle_data}

    {GradientDescent.epoch_end_signal}

    {GradientDescent.train_end_signal}

    {GradientDescent.verbose}

    {GradientDescent.addons}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs'
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    update_function = ChoiceProperty(default='bfgs',
                                     choices={
                                         'bfgs': bfgs,
                                         'dfp': dfp,
                                         'psb': psb,
                                         'sr1': sr1,
                                     })
    h0_scale = NumberProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_variables(self):
        super(QuasiNewton, self).init_variables()
        n_params = count_parameters(self.connection)
        self.variables.update(
            inv_hessian=theano.shared(
                name='algo:quasi-newton/matrix:inv-hessian',
                value=asfloat(self.h0_scale * np.eye(int(n_params))),
            ),
            prev_params=theano.shared(
                name='algo:quasi-newton/vector:prev-params',
                value=asfloat(np.zeros(n_params)),
            ),
            prev_full_gradient=theano.shared(
                name='algo:quasi-newton/vector:prev-full-gradient',
                value=asfloat(np.zeros(n_params)),
            ),
        )

    def init_train_updates(self):
        network_inputs = self.variables.network_inputs
        network_output = self.variables.network_output
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = parameter_values(self.connection)
        param_vector = T.concatenate([param.flatten() for param in params])

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1), inv_hessian,
            self.update_function(inv_hessian, param_vector - prev_params,
                                 full_gradient - prev_full_gradient))
        param_delta = -new_inv_hessian.dot(full_gradient)
        layers_and_parameters = list(iter_parameters(self.layers))

        def prediction(step):
            updated_params = param_vector + step * param_delta

            # This trick allows us to replace shared variables
            # with theano variables and get the output from the network
            start_pos = 0
            for layer, attrname, param in layers_and_parameters:
                end_pos = start_pos + param.size
                updated_param_value = T.reshape(
                    updated_params[start_pos:end_pos], param.shape)
                setattr(layer, attrname, updated_param_value)
                start_pos = end_pos

            output = self.connection.output(*network_inputs)

            # Restore previous parameters
            for layer, attrname, param in layers_and_parameters:
                setattr(layer, attrname, param)

            return output

        def phi(step):
            return self.error(network_output, prediction(step))

        def derphi(step):
            error_func = self.error(network_output, prediction(step))
            return T.grad(error_func, wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates
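
For reference, a plain-NumPy sketch of the BFGS inverse-Hessian update that
``update_function='bfgs'`` is assumed to implement: ``H`` approximates the
inverse Hessian, ``s`` is the parameter step and ``y`` is the gradient
difference.

import numpy as np

def bfgs_inverse_update(H, s, y, epsilon=1e-7):
    # Standard BFGS update for the inverse Hessian approximation.
    rho = 1.0 / (y.dot(s) + epsilon)
    identity = np.eye(len(s))
    left = identity - rho * np.outer(s, y)
    right = identity - rho * np.outer(y, s)
    return left.dot(H).dot(right) + rho * np.outer(s, s)

H = np.eye(3)                        # h0_scale * identity
s = np.array([0.10, -0.05, 0.02])    # param_vector - prev_params
y = np.array([0.30, -0.20, 0.10])    # full_gradient - prev_full_gradient
H = bfgs_inverse_update(H, s, y)
print(H)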
Example #4
class LevenbergMarquardt(StepSelectionBuiltIn, BaseGradientDescent):
    """
    The Levenberg-Marquardt algorithm is a variation of Newton's method.
    It minimizes the MSE error. The algorithm approximates the Hessian
    matrix using the dot product of two Jacobian matrices.

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    - Network minimizes only Mean Squared Error (MSE) loss function.

    - Efficient for small training datasets, because it
      computes the gradient for each sample separately.

    - Efficient for small-sized networks.

    Parameters
    ----------
    {BaseGradientDescent.connection}

    mu : float
        Controls the inversion of the ``J.T * J`` matrix.
        Defaults to ``0.01``.

    mu_update_factor : float
        Factor by which ``mu`` is decreased when an update decreases the
        error; otherwise ``mu`` is increased by the same factor.
        Defaults to ``1.2``.

    error : {{``mse``}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.

    {BaseGradientDescent.show_epoch}

    {BaseGradientDescent.shuffle_data}

    {BaseGradientDescent.epoch_end_signal}

    {BaseGradientDescent.train_end_signal}

    {BaseGradientDescent.verbose}

    {BaseGradientDescent.addons}

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> lmnet = algorithms.LevenbergMarquardt((2, 3, 1))
    >>> lmnet.train(x_train, y_train)

    See Also
    --------
    :network:`BaseGradientDescent` : BaseGradientDescent algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=1.2, minval=1)
    error = ChoiceProperty(default='mse', choices={'mse': errors.mse})

    step = WithdrawProperty()

    def init_variables(self):
        super(LevenbergMarquardt, self).init_variables()
        self.variables.update(
            mu=tf.Variable(self.mu, name='lev-marq/mu'),
            last_error=tf.Variable(np.nan, name='lev-marq/last-error'),
        )

    def init_train_updates(self):
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = tf.where(
            tf.less(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        err_for_each_sample = flatten((network_output - prediction_func) ** 2)

        params = parameter_values(self.connection)
        param_vector = make_single_vector(params)

        J = compute_jacobian(err_for_each_sample, params)
        J_T = tf.transpose(J)
        n_params = J.shape[1]

        parameter_update = tf.matrix_solve(
            tf.matmul(J_T, J) + new_mu * tf.eye(n_params.value),
            tf.matmul(J_T, tf.expand_dims(err_for_each_sample, 1))
        )
        updated_params = param_vector - flatten(parameter_update)

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates

    def on_epoch_start_update(self, epoch):
        super(LevenbergMarquardt, self).on_epoch_start_update(epoch)

        last_error = self.errors.last()
        if last_error is not None:
            self.variables.last_error.load(last_error, tensorflow_session())
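
A plain-NumPy sketch of a single Levenberg-Marquardt update with the same
damping schedule as above: solve ``(J^T J + mu * I) delta = J^T r`` and adjust
``mu`` depending on whether the error improved. All names here are
illustrative, not NeuPy's API.

import numpy as np

def levenberg_marquardt_step(params, residuals, jacobian, mu):
    n_params = len(params)
    jtj = jacobian.T.dot(jacobian)
    delta = np.linalg.solve(jtj + mu * np.eye(n_params),
                            jacobian.T.dot(residuals))
    return params - delta

def update_mu(mu, last_error, error, factor=1.2):
    # Increase damping when the error got worse, decrease it otherwise.
    return mu * factor if error > last_error else mu / factor

params = np.array([0.5, -0.3])
residuals = np.array([0.20, -0.10, 0.05])   # one residual per sample
jacobian = np.array([[1.0, 2.0],            # d(residual_i) / d(param_j)
                     [0.5, 1.0],
                     [0.2, 0.3]])
print(levenberg_marquardt_step(params, residuals, jacobian, mu=0.01))
print(update_mu(0.01, last_error=0.5, error=0.4))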
Example #5
class B(A):
    prop = WithdrawProperty()
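
For readers unfamiliar with ``WithdrawProperty``, here is a hypothetical
pure-Python analogue of what it is assumed to do in these snippets: hide an
option that the parent class exposes. This is an illustration of the pattern,
not NeuPy's actual implementation.

class WithdrawnAttribute(object):
    # Descriptor that hides an option inherited from the parent class.
    def __set_name__(self, owner, name):
        self.name = name

    def __get__(self, instance, owner=None):
        raise AttributeError("Option `{}` was withdrawn in `{}`"
                             "".format(self.name, owner.__name__))

class A(object):
    prop = 0.1                       # configurable option on the parent

class B(A):
    prop = WithdrawnAttribute()      # subclass no longer exposes it

try:
    B().prop
except AttributeError as error:
    print(error)                     # Option `prop` was withdrawn in `B`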
Example #6
class QuasiNewton(WolfeLineSearchForStep, BaseGradientDescent):
    """
    Quasi-Newton algorithm. On every iteration the quasi-Newton method
    approximates the inverse Hessian matrix with iterative updates. It doesn't
    have a ``step`` parameter. Instead, the algorithm applies a line search
    for a step that satisfies the strong Wolfe conditions. Parameters that
    control the Wolfe search start with the ``wolfe_`` prefix.

    Parameters
    ----------
    update_function : ``bfgs``, ``dfp``, ``sr1``
        Update function for the iterative inverse hessian matrix
        approximation. Defaults to ``bfgs``.

        - ``bfgs`` - A rank-2 update formula. It can suffer from
          round-off error and inaccurate line searches.

        - ``dfp`` - DFP is a method very similar to BFGS. It's also a rank-2
          update formula and can suffer from round-off error and inaccurate
          line searches.

        - ``sr1`` - Symmetric rank 1 (SR1). Updates the inverse Hessian
          matrix by adding a symmetric rank-1 matrix. When no valid rank-1
          update exists, the update is skipped and the original inverse
          Hessian is returned.

    h0_scale : float
        The default Hessian matrix is an identity matrix. The
        ``h0_scale`` parameter scales this identity matrix.
        Defaults to ``1``.

    epsilon : float
        Controls numerical stability for the ``update_function`` parameter.
        Defaults to ``1e-7``.

    {WolfeLineSearchForStep.Parameters}

    {BaseGradientDescent.connection}

    {BaseGradientDescent.error}

    {BaseGradientDescent.show_epoch}

    {BaseGradientDescent.shuffle_data}

    {BaseGradientDescent.epoch_end_signal}

    {BaseGradientDescent.train_end_signal}

    {BaseGradientDescent.verbose}

    {BaseGradientDescent.addons}

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> qnnet = algorithms.QuasiNewton(
    ...     (2, 3, 1),
    ...     update_function='bfgs'
    ... )
    >>> qnnet.train(x_train, y_train, epochs=10)

    References
    ----------
    [1] Yang Ding, Enkeleida Lushi, Qingguo Li,
        Investigation of quasi-Newton methods for unconstrained optimization.
        http://people.math.sfu.ca/~elushi/project_833.pdf

    [2] Jorge Nocedal, Stephen J. Wright, Numerical Optimization.
        Chapter 6, Quasi-Newton Methods, p. 135-163
    """
    update_function = ChoiceProperty(default='bfgs',
                                     choices={
                                         'bfgs': bfgs,
                                         'dfp': dfp,
                                         'sr1': sr1,
                                     })
    epsilon = NumberProperty(default=1e-7, minval=0)
    h0_scale = NumberProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_variables(self):
        super(QuasiNewton, self).init_variables()
        n_parameters = count_parameters(self.connection)

        self.variables.update(
            inv_hessian=tf.Variable(
                asfloat(self.h0_scale) * tf.eye(n_parameters),
                name="quasi-newton/inv-hessian",
                dtype=tf.float32,
            ),
            prev_params=tf.Variable(
                tf.zeros([n_parameters]),
                name="quasi-newton/prev-params",
                dtype=tf.float32,
            ),
            prev_full_gradient=tf.Variable(
                tf.zeros([n_parameters]),
                name="quasi-newton/prev-full-gradient",
                dtype=tf.float32,
            ),
        )

    def init_train_updates(self):
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = parameter_values(self.connection)
        param_vector = make_single_vector(params)

        gradients = tf.gradients(self.variables.error_func, params)
        full_gradient = make_single_vector(gradients)

        new_inv_hessian = tf.where(
            tf.equal(self.variables.epoch, 1), inv_hessian,
            self.update_function(inv_H=inv_hessian,
                                 delta_w=param_vector - prev_params,
                                 delta_grad=full_gradient - prev_full_gradient,
                                 epsilon=self.epsilon))
        param_delta = -dot(new_inv_hessian, full_gradient)
        step = self.find_optimal_step(param_vector, param_delta)
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        # We have to compute these values first, otherwise
        # parallelization in tensorflow can mix up the update order
        # and, for example, the previous gradient can end up equal to
        # the current gradient value. It happens because tensorflow
        # tries to execute operations in parallel.
        required_variables = [new_inv_hessian, param_vector, full_gradient]
        with tf.control_dependencies(required_variables):
            updates.extend([
                inv_hessian.assign(new_inv_hessian),
                prev_params.assign(param_vector),
                prev_full_gradient.assign(full_gradient),
            ])

        return updates
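
A plain-NumPy sketch of the SR1 inverse-Hessian update described in the
``update_function`` notes above: add a symmetric rank-1 matrix, and skip the
update when the denominator is too close to zero (the case where no valid
rank-1 update exists).

import numpy as np

def sr1_inverse_update(H, s, y, epsilon=1e-7):
    v = s - H.dot(y)
    denominator = v.dot(y)

    if abs(denominator) < epsilon:
        # No valid rank-1 update, keep the original inverse Hessian.
        return H

    return H + np.outer(v, v) / denominator

H = np.eye(3)
s = np.array([0.10, -0.05, 0.02])    # parameter step
y = np.array([0.30, -0.20, 0.10])    # gradient difference
print(sr1_inverse_update(H, s, y))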
Example #7
class Hessian(BaseOptimizer):
    """
    Hessian gradient descent optimization, also known as Newton's method.
    This algorithm uses the second-order derivative (the Hessian matrix) in
    order to choose the correct step during each training iteration. Because
    of this, the method doesn't have a ``step`` parameter.

    Parameters
    ----------
    penalty_const : float
        The inverse Hessian could be a singular matrix. For this reason
        the algorithm adds a penalty to the Hessian matrix: an identity
        matrix multiplied by this constant. Defaults to ``1``.

    {BaseOptimizer.network}

    {BaseOptimizer.loss}

    {BaseOptimizer.regularizer}

    {BaseOptimizer.show_epoch}

    {BaseOptimizer.shuffle_data}

    {BaseOptimizer.signals}

    {BaseOptimizer.verbose}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Notes
    -----
    - Method requires all training data during propagation, which means
      it cannot be trained with mini-batches.

    - This method calculates the full Hessian matrix, which means it will
      compute an NxN matrix, where N is the number of parameters in the
      network.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.Hessian(network)
    >>> optimizer.train(x_train, y_train)

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)
    step = WithdrawProperty()

    def init_train_updates(self):
        penalty_const = asfloat(self.penalty_const)

        n_parameters = self.network.n_parameters
        variables = self.network.variables
        parameters = [var for var in variables.values() if var.trainable]
        param_vector = make_single_vector(parameters)

        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.loss, parameters
        )
        parameter_update = tf.matrix_solve(
            hessian_matrix + penalty_const * tf.eye(n_parameters),
            tf.reshape(full_gradient, [-1, 1])
        )
        updated_parameters = param_vector - flatten(parameter_update)
        updates = setup_parameter_updates(parameters, updated_parameters)

        return updates
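
A plain-NumPy sketch of the regularized Newton step computed above: solve
``(H + penalty_const * I) delta = g`` and subtract ``delta`` from the
parameter vector.

import numpy as np

def newton_step(params, gradient, hessian, penalty_const=1.0):
    n_params = len(params)
    delta = np.linalg.solve(hessian + penalty_const * np.eye(n_params),
                            gradient)
    return params - delta

params = np.array([0.5, -0.3])
gradient = np.array([0.2, -0.1])
hessian = np.array([[2.0, 0.1],
                    [0.1, 1.5]])
print(newton_step(params, gradient, hessian, penalty_const=1.0))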
Example #8
class Deconvolution(Convolution):
    """
    Deconvolution layer. It's commonly called this in the literature, but
    it's just the gradient of the convolution and not an actual deconvolution.

    Parameters
    ----------
    {Convolution.size}

    {Convolution.padding}

    {Convolution.stride}

    weight : array-like, Tensorflow variable, scalar or Initializer
        Defines the layer's weights. The weight shape will be equal to
        ``(filter rows, filter columns, output channels, input channels)``.
        You can find the default initialization methods
        :ref:`here <init-methods>`. Defaults to
        :class:`HeNormal(gain=2) <neupy.init.HeNormal>`.

    {ParameterBasedLayer.bias}

    {BaseLayer.Parameters}

    Methods
    -------
    {ParameterBasedLayer.Methods}

    Examples
    --------
    >>> from neupy import layers
    >>>
    >>> layers.join(
    ...     layers.Input((28, 28, 3)),
    ...     layers.Convolution((3, 3, 16)),
    ...     layers.Deconvolution((3, 3, 1)),
    ... )

    Attributes
    ----------
    {ParameterBasedLayer.Attributes}
    """
    dilation = WithdrawProperty()

    def output_shape_per_dim(self, *args, **kwargs):
        return deconv_output_shape(*args, **kwargs)

    @property
    def weight_shape(self):
        # Compared to the regular convolution, the weights
        # have switched input and output channels.
        return as_tuple(self.size, self.input_shape[-1])

    def output(self, input_value):
        input_shape = tf.shape(input_value)
        # We need to get information about the output shape from the input
        # tensor's shape, because for some inputs height and width might be
        # specified as None and the shape value won't be computed for these
        # dimensions.
        output_shape = self.find_output_from_input_shape(
            tf.unstack(input_shape[1:]))

        batch_size = input_shape[0]
        padding = self.padding

        if isinstance(self.padding, (list, tuple)):
            height_pad, width_pad = self.padding

            # VALID option will make sure that
            # deconvolution won't use any padding.
            padding = 'VALID'

            # conv2d_transpose doesn't know about extra paddings that we added
            # in the convolution. For this reason we have to expand our
            # expected output shape and later we will remove these paddings
            # manually after transpose convolution.
            output_shape = (
                output_shape[0] + 2 * height_pad,
                output_shape[1] + 2 * width_pad,
                output_shape[2],
            )

        output = tf.nn.conv2d_transpose(
            input_value,
            self.weight,
            as_tuple(batch_size, output_shape),
            as_tuple(1, self.stride, 1),
            padding,
            data_format="NHWC"
        )

        if isinstance(self.padding, (list, tuple)):
            h_pad, w_pad = self.padding

            if h_pad > 0:
                output = output[:, h_pad:-h_pad, :, :]

            if w_pad > 0:
                output = output[:, :, w_pad:-w_pad, :]

        if self.bias is not None:
            bias = tf.reshape(self.bias, (1, 1, 1, -1))
            output += bias

        return output
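
A short sketch of the shape arithmetic behind transposed convolution, assuming
the usual TensorFlow conventions. This is what ``deconv_output_shape`` is
expected to compute, not its actual implementation: the transposed convolution
inverts the shape formula of the forward convolution.

def conv_output_size(input_size, filter_size, stride, padding):
    if padding == 'SAME':
        return -(-input_size // stride)              # ceil(input / stride)
    return (input_size - filter_size) // stride + 1  # 'VALID'

def deconv_output_size(input_size, filter_size, stride, padding):
    if padding == 'SAME':
        return input_size * stride
    return (input_size - 1) * stride + filter_size   # 'VALID'

# Round trip: 28 -> 26 with a 3x3 VALID convolution, then back to 28.
conv_out = conv_output_size(28, 3, 1, 'VALID')
print(conv_out, deconv_output_size(conv_out, 3, 1, 'VALID'))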
Example #9
class LevenbergMarquardt(BaseOptimizer):
    """
    The Levenberg-Marquardt algorithm is a variation of Newton's method.
    It minimizes the MSE error. The algorithm approximates the Hessian
    matrix using the dot product of two Jacobian matrices.

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    - Network minimizes only Mean Squared Error (MSE) loss function.

    - Efficient for small training datasets, because it
      computes the gradient for each sample separately.

    - Efficient for small-sized networks.

    Parameters
    ----------
    {BaseOptimizer.network}

    mu : float
        Controls the inversion of the ``J.T * J`` matrix.
        Defaults to ``0.01``.

    mu_update_factor : float
        Factor by which ``mu`` is decreased when an update decreases the
        error; otherwise ``mu`` is increased by the same factor.
        Defaults to ``1.2``.

    error : {{``mse``}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.

    {BaseOptimizer.show_epoch}

    {BaseOptimizer.shuffle_data}

    {BaseOptimizer.signals}

    {BaseOptimizer.verbose}

    Attributes
    ----------
    {BaseOptimizer.Attributes}

    Methods
    -------
    {BaseOptimizer.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>> from neupy.layers import *
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> network = Input(2) >> Sigmoid(3) >> Sigmoid(1)
    >>> optimizer = algorithms.LevenbergMarquardt(network)
    >>> optimizer.train(x_train, y_train)

    See Also
    --------
    :network:`BaseOptimizer` : BaseOptimizer algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=1.2, minval=1)
    loss = ChoiceProperty(default='mse', choices={'mse': objectives.mse})

    step = WithdrawProperty()
    regularizer = WithdrawProperty()

    def init_functions(self):
        self.variables.update(
            mu=tf.Variable(self.mu, name='lev-marq/mu'),
            last_error=tf.Variable(np.nan, name='lev-marq/last-error'),
        )
        super(LevenbergMarquardt, self).init_functions()

    def init_train_updates(self):
        training_outputs = self.network.training_outputs
        last_error = self.variables.last_error
        error_func = self.variables.loss
        mu = self.variables.mu

        new_mu = tf.where(
            tf.less(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        err_for_each_sample = flatten((self.target - training_outputs)**2)

        variables = self.network.variables
        params = [var for var in variables.values() if var.trainable]
        param_vector = make_single_vector(params)

        J = compute_jacobian(err_for_each_sample, params)
        J_T = tf.transpose(J)
        n_params = J.shape[1]

        parameter_update = tf.matrix_solve(
            tf.matmul(J_T, J) + new_mu * tf.eye(n_params.value),
            tf.matmul(J_T, tf.expand_dims(err_for_each_sample, 1)))
        updated_params = param_vector - flatten(parameter_update)

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates

    def one_training_update(self, X_train, y_train):
        if self.errors.train:
            last_error = self.errors.train[-1]
            self.variables.last_error.load(last_error, tensorflow_session())

        return super(LevenbergMarquardt,
                     self).one_training_update(X_train, y_train)
Example #10
class RBFKMeans(StepSelectionBuiltIn, BaseNetwork):
    """
    Radial basis function K-means for clustering.

    Parameters
    ----------
    n_clusters : int
        Number of clusters.

    {BaseNetwork.show_epoch}

    {BaseNetwork.shuffle_data}

    {BaseNetwork.epoch_end_signal}

    {BaseNetwork.train_end_signal}

    {Verbose.verbose}

    Attributes
    ----------
    centers : array-like with shape (n_clusters, n_features)
        Cluster centers.

    Methods
    -------
    train(input_train, epsilon=1e-5, epochs=100)
        Trains network.

    {BaseSkeleton.predict}

    {BaseSkeleton.fit}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy.algorithms import RBFKMeans
    >>>
    >>> data = np.array([
    ...     [0.11, 0.20],
    ...     [0.25, 0.32],
    ...     [0.64, 0.60],
    ...     [0.12, 0.42],
    ...     [0.70, 0.73],
    ...     [0.30, 0.27],
    ...     [0.43, 0.81],
    ...     [0.44, 0.87],
    ...     [0.12, 0.92],
    ...     [0.56, 0.67],
    ...     [0.36, 0.35],
    ... ])
    >>> rbfk_net = RBFKMeans(n_clusters=2, verbose=False)
    >>> rbfk_net.train(data, epsilon=1e-5)
    >>> rbfk_net.centers
    array([[ 0.228     ,  0.312     ],
           [ 0.48166667,  0.76666667]])
    >>>
    >>> new_data = np.array([[0.1, 0.1], [0.9, 0.9]])
    >>> rbfk_net.predict(new_data)
    array([[ 0.],
           [ 1.]])
    """
    n_clusters = IntProperty(minval=2)
    step = WithdrawProperty()

    def __init__(self, **options):
        self.centers = None
        super(RBFKMeans, self).__init__(**options)

    def predict(self, input_data):
        input_data = format_data(input_data)

        centers = self.centers
        classes = np.zeros((input_data.shape[0], 1))

        for i, value in enumerate(input_data):
            classes[i] = np.argmin(norm(centers - value, axis=1))

        return classes

    def train_epoch(self, input_train, target_train):
        centers = self.centers
        old_centers = centers.copy()
        output_train = self.predict(input_train)

        for i, center in enumerate(centers):
            positions = np.argwhere(output_train[:, 0] == i)

            if len(positions) == 0:
                continue

            class_data = np.take(input_train, positions, axis=0)
            centers[i, :] = (1 / len(class_data)) * np.sum(class_data, axis=0)

        return np.abs(old_centers - centers)

    def train(self, input_train, epsilon=1e-5, epochs=100):
        n_clusters = self.n_clusters
        input_train = format_data(input_train)
        n_samples = input_train.shape[0]

        if n_samples <= n_clusters:
            raise ValueError("Number of samples in the dataset is less than "
                             "spcified number of clusters. Got {} samples, "
                             "expected at least {} (for {} clusters)"
                             "".format(n_samples, n_clusters + 1, n_clusters))

        self.centers = input_train[:n_clusters, :].copy()
        super(RBFKMeans, self).train(input_train,
                                     epsilon=epsilon,
                                     epochs=epochs)
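
A plain-NumPy sketch of what a single ``train_epoch`` above does: assign every
sample to its nearest center, then move each non-empty center to the mean of
its assigned samples.

import numpy as np

def kmeans_epoch(data, centers):
    # distances[i, j] is the distance between sample i and center j.
    distances = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
    labels = distances.argmin(axis=1)

    new_centers = centers.copy()
    for index in range(len(centers)):
        members = data[labels == index]
        if len(members) > 0:
            new_centers[index] = members.mean(axis=0)

    return new_centers

data = np.array([[0.11, 0.20], [0.25, 0.32], [0.64, 0.60], [0.70, 0.73]])
centers = data[:2].copy()            # same initialization as in ``train``
print(kmeans_epoch(data, centers))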
Example #11
class Hessian(StepSelectionBuiltIn, GradientDescent):
    """
    Hessian gradient descent optimization. This GD algorithm variation uses
    second-derivative information to choose a better gradient direction and,
    as a consequence, a better weight update after each epoch.

    Parameters
    ----------
    penalty_const : float
        The inverse Hessian could be a singular matrix. For this reason
        the algorithm adds a penalty to the Hessian matrix: an identity
        matrix multiplied by this constant. Defaults to ``1``.

    {GradientDescent.connection}

    {GradientDescent.error}

    {GradientDescent.show_epoch}

    {GradientDescent.shuffle_data}

    {GradientDescent.epoch_end_signal}

    {GradientDescent.train_end_signal}

    {GradientDescent.verbose}

    {GradientDescent.addons}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Hessian((2, 3, 1))
    >>> mnet.train(x_train, y_train)

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_train_updates(self):
        n_parameters = count_parameters(self.connection)
        parameters = parameter_values(self.connection)
        param_vector = T.concatenate([param.flatten() for param in parameters])
        penalty_const = asfloat(self.penalty_const)
        print(n_parameters)
        self.variables.hessian = theano.shared(
            value=asfloat(np.zeros((n_parameters, n_parameters))),
            name='hessian_inverse')
        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters)
        updated_parameters = hessian_matrix
        updates = setup_parameter_updates([self.variables.hessian],
                                          updated_parameters)

        return updates
Example #12
class LevenbergMarquardt(StepSelectionBuiltIn, GradientDescent):
    """
    Levenberg-Marquardt algorithm.

    Notes
    -----
    - Network minimizes only Mean Squared Error function.
    - Efficient for small training datasets, because it
      computes the gradient for each sample separately.
    - Efficient for small-sized networks.

    Parameters
    ----------
    {GradientDescent.connection}

    mu : float
        Controls the inversion of the ``J.T * J`` matrix.
        Defaults to ``0.01``.

    mu_update_factor : float
        Factor by which ``mu`` is decreased when an update decreases the
        error; otherwise ``mu`` is increased by the same factor.
        Defaults to ``1.2``.

    error : {{``mse``}}
        Levenberg-Marquardt works only for quadratic functions.
        Defaults to ``mse``.

    {GradientDescent.show_epoch}

    {GradientDescent.shuffle_data}

    {GradientDescent.epoch_end_signal}

    {GradientDescent.train_end_signal}

    {GradientDescent.verbose}

    {GradientDescent.addons}

    Attributes
    ----------
    {GradientDescent.Attributes}

    Methods
    -------
    {GradientDescent.Methods}

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> lmnet = algorithms.LevenbergMarquardt((2, 3, 1))
    >>> lmnet.train(x_train, y_train)

    See Also
    --------
    :network:`GradientDescent` : GradientDescent algorithm.
    """
    mu = BoundedProperty(default=0.01, minval=0)
    mu_update_factor = BoundedProperty(default=1.2, minval=1)
    error = ChoiceProperty(default='mse', choices={'mse': errors.mse})

    step = WithdrawProperty()

    def init_variables(self):
        super(LevenbergMarquardt, self).init_variables()
        self.variables.update(
            mu=theano.shared(name='lev-marq/mu', value=asfloat(self.mu)),
            last_error=theano.shared(name='lev-marq/last-error', value=np.nan),
        )

    def init_train_updates(self):
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = ifelse(
            T.lt(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        se_for_each_sample = ((network_output - prediction_func)**2).ravel()

        params = parameter_values(self.connection)
        param_vector = T.concatenate([param.flatten() for param in params])

        J = compute_jacobian(se_for_each_sample, params)
        n_params = J.shape[1]

        updated_params = param_vector - slinalg.solve(
            J.T.dot(J) + new_mu * T.eye(n_params), J.T.dot(se_for_each_sample))

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates

    def on_epoch_start_update(self, epoch):
        super(LevenbergMarquardt, self).on_epoch_start_update(epoch)

        last_error = self.errors.last()
        if last_error is not None:
            self.variables.last_error.set_value(last_error)
Example #13
class Hessian(StepSelectionBuiltIn, BaseGradientDescent):
    """
    Hessian gradient descent optimization, also known as Newton's method.
    This algorithm uses the second-order derivative (the Hessian matrix) in
    order to choose the correct step during each training iteration. Because
    of this, the method doesn't have a ``step`` parameter.

    Parameters
    ----------
    penalty_const : float
        The inverse Hessian could be a singular matrix. For this reason
        the algorithm adds a penalty to the Hessian matrix: an identity
        matrix multiplied by this constant. Defaults to ``1``.

    {BaseGradientDescent.connection}

    {BaseGradientDescent.error}

    {BaseGradientDescent.show_epoch}

    {BaseGradientDescent.shuffle_data}

    {BaseGradientDescent.epoch_end_signal}

    {BaseGradientDescent.train_end_signal}

    {BaseGradientDescent.verbose}

    {BaseGradientDescent.addons}

    Attributes
    ----------
    {BaseGradientDescent.Attributes}

    Methods
    -------
    {BaseGradientDescent.Methods}

    Notes
    -----
    - Method requires all training data during propagation, which means
      it's not allowed to use mini-batches.

    - This method calculates the full Hessian matrix, which means it will
      compute an NxN matrix, where N is the number of parameters in the
      network.

    Examples
    --------
    >>> import numpy as np
    >>> from neupy import algorithms
    >>>
    >>> x_train = np.array([[1, 2], [3, 4]])
    >>> y_train = np.array([[1], [0]])
    >>>
    >>> mnet = algorithms.Hessian((2, 3, 1))
    >>> mnet.train(x_train, y_train)

    See Also
    --------
    :network:`HessianDiagonal` : Hessian diagonal approximation.
    """
    penalty_const = BoundedProperty(default=1, minval=0)

    step = WithdrawProperty()

    def init_train_updates(self):
        penalty_const = asfloat(self.penalty_const)

        n_parameters = count_parameters(self.connection)
        parameters = parameter_values(self.connection)
        param_vector = make_single_vector(parameters)

        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters)
        parameter_update = tf.matrix_solve(
            hessian_matrix + penalty_const * tf.eye(n_parameters),
            tf.reshape(full_gradient, [-1, 1]))
        updated_parameters = param_vector - flatten(parameter_update)
        updates = setup_parameter_updates(parameters, updated_parameters)

        return updates