Exemple #1
0
    def get_updates(self, gradients):
        """
        Compute the AdaDelta updates (see the paper for details).

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their
            gradients.

        Returns
        -------
        updates : OrderdDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up ADADELTA for optimizer...')
        updates = OrderedDict()
        for param in gradients.keys():
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.)

            if param.name is not None:
                mean_square_grad.name = 'mean_square_grad_' + param.name
                mean_square_dx.name = 'mean_square_dx_' + param.name

            # Accumulate gradient
            new_mean_squared_grad = (
                self.decay * mean_square_grad +
                (1 - self.decay) * T.sqr(gradients[param])
            )

            # Compute update
            epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = - (rms_dx_tm1 / rms_grad_t) * gradients[param]

            # Accumulate updates
            new_mean_square_dx = (
                self.decay * mean_square_dx +
                (1 - self.decay) * T.sqr(delta_x_t)
            )

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

        return updates
Exemple #2
0
    def get_updates(self, gradients):
        """
        Compute the AdaDelta updates (see the paper for details).

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their
            gradients.

        Returns
        -------
        updates : OrderdDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up ADADELTA for optimizer...')
        updates = OrderedDict()
        for param in gradients.keys():
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.)

            if param.name is not None:
                mean_square_grad.name = 'mean_square_grad_' + param.name
                mean_square_dx.name = 'mean_square_dx_' + param.name

            # Accumulate gradient
            new_mean_squared_grad = (
                self.decay * mean_square_grad +
                (1 - self.decay) * T.sqr(gradients[param]))

            # Compute update
            epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = -(rms_dx_tm1 / rms_grad_t) * gradients[param]

            # Accumulate updates
            new_mean_square_dx = (self.decay * mean_square_dx +
                                  (1 - self.decay) * T.sqr(delta_x_t))

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t

        return updates
Exemple #3
0
    def __init__(self, model, dataset,
                 config=None, defaults=defaults,
                 n_epoch=None, batch_size=None, minimum_batch_size=None,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=None, lr_decay=None, lr_factor=None,
                 decay=None, gamma_clip=None, damping=None, grad_clip=None, start_var_reduction=None,
                 delta_clip=None, use_adagrad=None, skip_nan_inf=None, upper_bound_tau=None,
                 lower_bound_tau=None, use_corrected_grad=None):
        # get everything together with the Optimizer class
        super(AdaSecant, self).__init__(model, dataset, config=config, defaults=defaults,
                                        n_epoch=n_epoch, batch_size=batch_size, minimum_batch_size=minimum_batch_size,
                                        save_frequency=save_frequency, early_stop_length=early_stop_length,
                                        early_stop_threshold=early_stop_threshold, learning_rate=learning_rate,
                                        lr_decay=lr_decay, lr_factor=lr_factor, decay=decay, gamma_clip=gamma_clip,
                                        damping=damping, grad_clip=grad_clip, start_var_reduction=start_var_reduction,
                                        delta_clip=delta_clip, use_adagrad=use_adagrad, skip_nan_inf=skip_nan_inf,
                                        upper_bound_tau=upper_bound_tau, lower_bound_tau=lower_bound_tau,
                                        use_corrected_grad=use_corrected_grad)

        # We have to bound the tau to prevent it to
        # grow to an arbitrarily large number, oftenwise
        # that causes numerical instabilities for very deep
        # networks. Note that once tau become very large, it will keep,
        # increasing indefinitely.

        assert self.decay >= 0., "Decay needs to be >=0."
        assert self.decay < 1., "Decay needs to be <1."
        self.decay = sharedX(self.decay, "decay")
Exemple #4
0
    def get_updates(self, gradients):
        """
        Provides the symbolic (theano) description of the updates needed to
        perform this learning rule. See Notes for side-effects.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their
            gradients.

        Returns
        -------
        updates : OrderdDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.

        Notes
        -----
        This method has the side effect of storing the moving average
        of the square gradient in `self.mean_square_grads`. This is
        necessary in order for the monitoring channels to be able
        to track the value of these moving averages.
        Therefore, this method should only get called once for each
        instance of RMSProp.
        """
        log.debug('Setting up RMSProp for optimizer...')
        updates = OrderedDict()
        for param in gradients:

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)

            if param.name is None:
                raise ValueError("Model parameters must be named.")
            mean_square_grad.name = 'mean_square_grad_' + param.name

            if param.name in self.mean_square_grads:
                log.warning("Calling get_updates more than once on the "
                            "gradients of `%s` may make monitored values "
                            "incorrect." % param.name)
            # Store variable in self.mean_square_grads for monitoring.
            self.mean_square_grads[param.name] = mean_square_grad

            # Accumulate gradient
            new_mean_squared_grad = (
                self.decay * mean_square_grad +
                (1 - self.decay) * T.sqr(gradients[param]))

            # Compute update
            scaled_lr = self.lr_scalers.get(param, 1.) * self.learning_rate
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
            delta_x_t = -scaled_lr * gradients[param] / rms_grad_t

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[param] = param + delta_x_t

        return updates
    def __init__(self, model, dataset,
                 config=None, defaults=defaults,
                 n_epoch=None, batch_size=None, minimum_batch_size=None,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=None, lr_decay=None, lr_factor=None,
                 momentum=None, momentum_decay=None, momentum_factor=None, nesterov_momentum=None):
        # superclass init
        super(SGD, self).__init__(model, dataset, config=config, defaults=defaults,
                                  n_epoch=n_epoch, batch_size=batch_size, minimum_batch_size=minimum_batch_size,
                                  save_frequency=save_frequency, early_stop_length=early_stop_length,
                                  early_stop_threshold=early_stop_threshold, learning_rate=learning_rate,
                                  lr_decay=lr_decay, lr_factor=lr_factor, momentum=momentum,
                                  momentum_decay=momentum_decay, momentum_factor=momentum_factor,
                                  nesterov_momentum=nesterov_momentum)
        # everything is in self! yay!

        # Momentum - smoothing over the parameter changes (see Hinton)
        if self.momentum:
            self.momentum = sharedX(self.momentum, 'momentum')
            if self.momentum_decay is not None and \
                            self.momentum_decay is not False and \
                            self.momentum_factor is not None:
                self.momentum_decay = get_decay_function(self.momentum_decay,
                                                         self.momentum,
                                                         self.momentum.get_value(),
                                                         self.momentum_factor)
            else:
                self.momentum_decay = False
        else:
            self.momentum = 1
Exemple #6
0
    def get_updates(self, grads):
        """
        Provides the symbolic (theano) description of the updates needed to
        perform this learning rule. See Notes for side-effects.

        Parameters
        ----------
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.

        Returns
        -------
        updates : OrderdDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.

        Notes
        -----
        This method has the side effect of storing the moving average
        of the square gradient in `self.mean_square_grads`. This is
        necessary in order for the monitoring channels to be able
        to track the value of these moving averages.
        Therefore, this method should only get called once for each
        instance of RMSProp.
        """
        log.debug('Setting up RMSProp for optimizer...')
        updates = OrderedDict()
        for param in grads:

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)

            if param.name is None:
                raise ValueError("Model parameters must be named.")
            mean_square_grad.name = 'mean_square_grad_' + param.name

            if param.name in self.mean_square_grads:
                log.warning("Calling get_updates more than once on the "
                              "gradients of `%s` may make monitored values "
                              "incorrect." % param.name)
            # Store variable in self.mean_square_grads for monitoring.
            self.mean_square_grads[param.name] = mean_square_grad

            # Accumulate gradient
            new_mean_squared_grad = (self.decay * mean_square_grad +
                                     (1 - self.decay) * T.sqr(grads[param]))

            # Compute update
            scaled_lr = self.lr_scalers.get(param, 1.) * self.learning_rate
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
            delta_x_t = - scaled_lr * grads[param] / rms_grad_t

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[param] = param + delta_x_t

        return updates
Exemple #7
0
    def __init__(self,
                 train_X,
                 train_Y=None,
                 valid_X=None,
                 valid_Y=None,
                 test_X=None,
                 test_Y=None):
        log.info('Wrapping matrix from memory')
        super(self.__class__, self).__init__()

        # make sure the inputs are arrays
        train_X = numpy.array(train_X)
        self._train_shape = train_X.shape
        self.train_X = sharedX(train_X)
        if train_Y:
            self.train_Y = sharedX(numpy.array(train_Y))

        if valid_X:
            valid_X = numpy.array(valid_X)
            self._valid_shape = valid_X.shape
            self.valid_X = sharedX(valid_X)
        if valid_Y:
            self.valid_Y = sharedX(numpy.array(valid_Y))

        if test_X:
            test_X = numpy.array(test_X)
            self._test_shape = test_X.shape
            self.test_X = sharedX(test_X)
        if test_Y:
            self.test_Y = sharedX(numpy.array(test_Y))
Exemple #8
0
    def get_updates(self, gradients):
        """
        Based on Pylearn2
        (https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/training_algorithms/learning_rule.py)

        Implements momentum as described in Section 9 of
        "A Practical Guide to Training Restricted Boltzmann Machines",
        Geoffrey Hinton.
        Parameters are updated by the formula:
        inc := momentum * inc - learning_rate * d cost / d param
        param := param + inc

        Also has the option to implement Nesterov momentum (accelerated momentum), which works better in a lot of cases.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their
            gradients.

        Returns
        -------
        updates : OrderdDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent with momentum for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in six.iteritems(gradients):
            velocity = sharedX(param.get_value() * 0.)

            assert param.dtype == velocity.dtype
            assert gradient.dtype == param.dtype

            if param.name is not None:
                velocity.name = 'vel_' + param.name

            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[velocity] = self.momentum * velocity - scaled_lr * gradient

            inc = updates[velocity]
            if self.nesterov_momentum:
                log.debug('Using Nesterov momentum for parameter %s', str(param))
                inc = self.momentum * inc - scaled_lr * gradient

            assert inc.dtype == velocity.dtype
            updates[param] = param + inc

        return updates
Exemple #9
0
    def __init__(self, model, dataset, decay=None, max_scaling=None, iterator_class=SequentialIterator,
                 config=None, defaults=_defaults, rng=None, n_epoch=None, batch_size=None, minimum_batch_size=None,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None, learning_rate=None,
                 flag_para_load=None):
        if not decay:
            if config:
                decay = config.get('decay', defaults.get('decay'))
            elif defaults:
                decay = defaults.get('decay')
            else:
                log.error("RMSProp missing 'decay' parameter in config or defaults!")
                raise AssertionError
        assert decay >= 0.
        assert decay < 1.
        self.decay = sharedX(decay)

        if not max_scaling:
            if config:
                max_scaling = config.get('max_scaling', defaults.get('max_scaling'))
            elif defaults:
                max_scaling = defaults.get('max_scaling')
            else:
                log.error("RMSProp missing 'max_scaling' parameter in config or defaults!")
                raise AssertionError
        assert max_scaling > 0.
        self.epsilon = 1. / max_scaling

        self.mean_square_grads = OrderedDict()

        # need to call the SGD constructor after parameters are extracted because the constructor calls get_updates()!
        super(RMSProp, self).__init__(model=model,
                                      dataset=dataset,
                                      iterator_class=iterator_class,
                                      config=config,
                                      defaults=defaults,
                                      rng=rng,
                                      n_epoch=n_epoch,
                                      batch_size=batch_size,
                                      minimum_batch_size=minimum_batch_size,
                                      save_frequency=save_frequency,
                                      early_stop_length=early_stop_length,
                                      early_stop_threshold=early_stop_threshold,
                                      learning_rate=learning_rate,
                                      flag_para_load=flag_para_load)
Exemple #10
0
    def get_updates(self, grads):
        """
        From Pylearn2 (https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/training_algorithms/learning_rule.py)

        Implements momentum as described in Section 9 of
        "A Practical Guide to Training Restricted Boltzmann Machines",
        Geoffrey Hinton.
        Parameters are updated by the formula:
        inc := momentum * inc - learning_rate * d cost / d param
        param := param + inc

        Also has the option to implement Nesterov momentum (accelerated momentum), which works better in a lot of cases.

        :param grads: OrderedDict
        An OrderedDict of (parameter, gradient) for the model's gradients
        :return: OrderedDict
        Updates at each training step
        """
        log.debug(
            'Setting up Stochastic Gradient Descent with momentum for optimizer...'
        )
        updates = OrderedDict()
        for (param, gradient) in six.iteritems(grads):
            vel = sharedX(param.get_value() * 0.)
            assert param.dtype == vel.dtype
            assert gradient.dtype == param.dtype
            if param.name is not None:
                vel.name = 'vel_' + param.name

            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[vel] = self.momentum * vel - scaled_lr * gradient

            inc = updates[vel]
            if self.nesterov_momentum:
                log.debug('Using Nesterov momentum')
                inc = self.momentum * inc - scaled_lr * gradient

            assert inc.dtype == vel.dtype
            updates[param] = param + inc

        return updates
Exemple #11
0
    def __init__(self, model, dataset,
                 n_epoch=10, batch_size=100, minimum_batch_size=1,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=.1, lr_decay="exponential", lr_factor=.995,
                 momentum=0.5, momentum_decay="linear", momentum_factor=0, nesterov_momentum=True):
        """
        Initialize SGD.

        Parameters
        ----------
        model : Model
            The Model to train.
        dataset : Dataset
            The Dataset to use when training the Model.
        n_epoch : int
            how many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        minimum_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_frequency : int
            How many epochs to train between each new save of the Model's parameters.
        early_stop_threshold : float
            The factor by how much the best validation training score needs to improve to determine early stopping.
        early_stop_length : int
            The patience or number of epochs to wait after the early_stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The type of decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for options.
        lr_factor : float
            The amount to use for the decay function when changing the learning rate over epochs. See
            `opendeep.utils.decay` for its effect for given decay functions.
        momentum : float
            The momentum to use during gradient updates.
        momentum_decay : str
            The type of decay function to use for changing the momentum over epochs. See
            `opendeep.utils.decay` for options.
        momentum_factor : float
            The amount to use for the decay function when changing the momentum over epochs. See
            `opendeep.utils.decay` for its effect for given decay functions.
        nesterov_momentum : bool
            Whether or not to use Nesterov momentum.
        """
        # superclass init
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(SGD, self).__init__(**initial_parameters)

        # Momentum - smoothing over the parameter changes (see Hinton)
        if momentum:
            self.momentum = sharedX(momentum, 'momentum')
            if momentum_decay is not None and \
                            momentum_decay is not False and \
                            momentum_factor is not None:
                self.momentum_decay = get_decay_function(momentum_decay,
                                                         self.momentum,
                                                         self.momentum.get_value(),
                                                         momentum_factor)
            else:
                self.momentum_decay = False
        else:
            self.momentum = 0
            self.momentum_decay = False

        self.nesterov_momentum = nesterov_momentum
Exemple #12
0
    def __init__(self, model, dataset,
                 config=None, defaults=None,
                 n_epoch=None, batch_size=None, minimum_batch_size=None,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=None, lr_decay=None, lr_factor=None,
                 **kwargs):
        # Default values to use for some training parameters
        _defaults = {"n_epoch": 1000,
                     "batch_size": 100,
                     "minimum_batch_size": 1,
                     "save_frequency": 10,
                     "early_stop_threshold": .9995,
                     "early_stop_length": 30,
                     "learning_rate": 0.001,
                     "lr_decay": "exponential",
                     "lr_factor": 1,  # no learning rate decay by default
                     }

        log.debug("Initializing optimizer %s", str(type(self)))

        assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
        self.model = model
        self.dataset = dataset
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"

        # set self.args to be the combination of the defaults and the config dictionaries from the subclass
        in_args = combine_config_and_defaults(config, defaults)
        self.args = combine_config_and_defaults(in_args, _defaults)

        # if the args are none, make it a blank dictionary
        if self.args is None:
            self.args = {}

        # now that our required variables are out of the way, do the same thing for everything else passed via kwargs
        for arg, val in kwargs.items():
            if (val is not None or str(arg) not in self.args) and str(arg) != 'kwargs':
                self.args[str(arg)] = val
            # flatten kwargs if it was passed as a variable
            elif str(arg) == 'kwargs':
                inner_kwargs = kwargs['kwargs']
                for key, item in inner_kwargs.items():
                    if item is not None or str(key) not in self.args:
                        self.args[str(key)] = item

        # now take care of overriding explicits passed in
        if n_epoch is not None:
            self.args['n_epoch'] = n_epoch
        if batch_size is not None:
            self.args['batch_size'] = batch_size
        if minimum_batch_size is not None:
            self.args['minimum_batch_size'] = minimum_batch_size
        if save_frequency is not None:
            self.args['save_frequency'] = save_frequency
        if early_stop_threshold is not None:
            self.args['early_stop_threshold'] = early_stop_threshold
        if early_stop_length is not None:
            self.args['early_stop_length'] = early_stop_length
        if learning_rate is not None:
            self.args['learning_rate'] = learning_rate
        if lr_decay is not None:
            self.args['lr_decay'] = lr_decay
        if lr_factor is not None:
            self.args['lr_factor'] = lr_factor

        # Magic! Now self.args contains the combination of all the initialization variables, overridden like so:
        # _defaults < defaults < config < kwargs (explicits passed to model's __init__)

        # log the arguments
        log.debug("optimizer config args: %s", str(self.args))

        # Finally, to make things really easy, update the class 'self' with everything in self.args to make
        # all the parameters accessible via self.<param>
        self.__dict__.update(self.args)

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(self.learning_rate, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if self.lr_decay:
            self.learning_rate_decay = get_decay_function(self.lr_decay,
                                                          self.learning_rate,
                                                          self.learning_rate.get_value(),
                                                          self.lr_factor)
        else:
            self.learning_rate_decay = False
Exemple #13
0
    def __init__(self,
                 model,
                 dataset,
                 n_epoch=10,
                 batch_size=100,
                 minimum_batch_size=1,
                 save_frequency=None,
                 early_stop_threshold=None,
                 early_stop_length=None,
                 learning_rate=1e-6,
                 lr_decay=None,
                 lr_factor=None,
                 decay=0.95,
                 gamma_clip=1.8,
                 damping=1e-7,
                 grad_clip=None,
                 start_var_reduction=0,
                 delta_clip=None,
                 use_adagrad=False,
                 skip_nan_inf=False,
                 upper_bound_tau=1e8,
                 lower_bound_tau=1.5,
                 use_corrected_grad=True):
        """
        Initialize AdaSecant.

        Parameters
        ----------
        model : Model
            The Model to train.
        dataset : Dataset
            The Dataset to use when training the Model.
        n_epoch : int
            how many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        minimum_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_frequency : int
            How many epochs to train between each new save of the Model's parameters.
        early_stop_threshold : float
            The factor by how much the best validation training score needs to improve to determine early stopping.
        early_stop_length : int
            The patience or number of epochs to wait after the early_stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The type of decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for options.
        lr_factor : float
            The amount to use for the decay function when changing the learning rate over epochs. See
            `opendeep.utils.decay` for its effect for given decay functions.
        decay : float, optional
            Decay rate :math:`\\rho` in Algorithm 1 of the aforementioned
            paper. Decay 0.95 seems to work fine for several tasks.
        gamma_clip : float, optional
            The clipping threshold for the gamma. In general 1.8 seems to
            work fine for several tasks.
        start_var_reduction: float, optional,
            How many updates later should the variance reduction start from?
        delta_clip: float, optional,
            The threshold to clip the deltas after.
        grad_clip: float, optional,
            Apply gradient clipping for RNNs (not necessary for feedforward networks). But this is
            a constraint on the norm of the gradient per layer.
            Based on:
            Pascanu, Razvan, Tomas Mikolov, and Yoshua Bengio. "On the difficulty of training
            recurrent neural networks." arXiv preprint arXiv:1211.5063 (2012).
        use_adagrad: bool, optional
            Either to use clipped adagrad or not.
        use_corrected_grad: bool, optional
            Either to use correction for gradients (referred as variance
            reduction in the workshop paper).
        """
        # get everything together with the Optimizer class
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(AdaSecant, self).__init__(**initial_parameters)

        assert decay >= 0., "Decay needs to be >=0."
        assert decay < 1., "Decay needs to be <1."
        self.decay = sharedX(decay, "decay")

        self.damping = damping
        self.skip_nan_inf = skip_nan_inf

        if grad_clip:
            assert grad_clip > 0.
            assert grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
        self.grad_clip = grad_clip

        self.use_adagrad = use_adagrad
        self.use_corrected_grad = use_corrected_grad
        self.gamma_clip = gamma_clip
        self.start_var_reduction = start_var_reduction
        self.delta_clip = delta_clip

        # We have to bound the tau to prevent it to
        # grow to an arbitrarily large number, oftenwise
        # that causes numerical instabilities for very deep
        # networks. Note that once tau become very large, it will keep,
        # increasing indefinitely.
        self.lower_bound_tau = lower_bound_tau
        self.upper_bound_tau = upper_bound_tau
Exemple #14
0
    def get_updates(self, gradients):
        """
        Compute AdaSecant updates (see the paper for details).

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their gradients.

        Returns
        -------
        OrderdDict
            A dictionary mapping from the old model parameters
            to their new values after a single iteration of the learning rule.
        """
        updates = OrderedDict({})
        eps = self.damping
        step = sharedX(0., name="step")

        if self.skip_nan_inf:
            #If norm of the gradients of a parameter is inf or nan don't update that parameter
            #That might be useful for RNNs.
            gradients = OrderedDict({
                p:
                T.switch(T.or_(T.isinf(gradients[p]), T.isnan(gradients[p])),
                         0, gradients[p])
                for p in gradients.keys()
            })

        #Block-normalize gradients:
        gradients = OrderedDict({
            p: gradients[p] / (gradients[p].norm(2) + eps)
            for p in gradients.keys()
        })
        nparams = len(gradients.keys())

        #Apply the gradient clipping, this is only necessary for RNNs and sometimes for very deep
        #networks
        if self.grad_clip:
            gnorm = sum([g.norm(2) for g in gradients.values()])

            gradients = OrderedDict({p: T.switch(gnorm/nparams > self.grad_clip,
                                 g * self.grad_clip * nparams / gnorm , g)\
                                 for p, g in gradients.iteritems()})

        for param in gradients.keys():
            gradients[param].name = "grad_%s" % param.name
            mean_grad = sharedX(param.get_value() * 0. + eps,
                                name="mean_grad_%s" % param.name)
            # mean_corrected_grad = sharedX(param.get_value() * 0 + eps, name="mean_corrected_grad_%s" % param.name)
            slow_constant = 2.1

            if self.use_adagrad:
                # sum_square_grad := \sum_i g_i^2
                sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                          name="sum_square_grad_%s" %
                                          param.name)
            """
            Initialization of accumulators
            """
            taus_x_t = sharedX(
                (numpy.ones_like(param.get_value()) + eps) * slow_constant,
                name="taus_x_t_" + param.name)
            self.taus_x_t = taus_x_t

            #Variance reduction parameters
            #Numerator of the gamma:
            gamma_nume_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                     name="gamma_nume_sqr_" + param.name)

            #Denominator of the gamma:
            gamma_deno_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                     name="gamma_deno_sqr_" + param.name)

            #For the covariance parameter := E[\gamma \alpha]_{t-1}
            cov_num_t = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                name="cov_num_t_" + param.name)

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(numpy.zeros_like(param.get_value()) +
                                       eps,
                                       name="msg_" + param.name)

            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0.,
                                     name="msd_" + param.name)
            if self.use_corrected_grad:
                old_grad = sharedX(param.get_value() * 0. + eps)

            #The uncorrected gradient of previous of the previous update:
            old_plain_grad = sharedX(param.get_value() * 0. + eps)
            mean_curvature = sharedX(param.get_value() * 0. + eps)
            mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

            # Initialize the E[\Delta]_{t-1}
            mean_dx = sharedX(param.get_value() * 0.)

            # Block-wise normalize the gradient:
            norm_grad = gradients[param]

            #For the first time-step, assume that delta_x_t := norm_grad
            cond = T.eq(step, 0)
            msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
            mdx = cond * norm_grad + (1 - cond) * mean_dx
            """
            Compute the new updated values.
            """
            # E[g_i^2]_t
            new_mean_squared_grad = (mean_square_grad * self.decay +
                                     T.sqr(norm_grad) * (1 - self.decay))
            new_mean_squared_grad.name = "msg_" + param.name
            # E[g_i]_t
            new_mean_grad = (mean_grad * self.decay + norm_grad *
                             (1 - self.decay))
            new_mean_grad.name = "nmg_" + param.name

            mg = new_mean_grad
            mgsq = new_mean_squared_grad

            # Keep the rms for numerator and denominator of gamma.
            new_gamma_nume_sqr = (gamma_nume_sqr * (1 - 1 / taus_x_t) + T.sqr(
                (norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
            new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

            new_gamma_deno_sqr = (gamma_deno_sqr * (1 - 1 / taus_x_t) + T.sqr(
                (mg - norm_grad) * (old_grad - mg)) / taus_x_t)
            new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

            gamma = T.sqrt(gamma_nume_sqr) / T.sqrt(gamma_deno_sqr + eps)
            gamma.name = "gamma_" + param.name

            if self.gamma_clip:
                gamma = T.minimum(gamma, self.gamma_clip)

            momentum_step = gamma * mg
            corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

            #For starting the variance reduction.
            if self.start_var_reduction > -1:
                cond = T.le(self.start_var_reduction, step)
                corrected_grad = cond * corrected_grad_cand + (
                    1 - cond) * norm_grad
            else:
                corrected_grad = norm_grad

            new_sum_squared_grad = None
            if self.use_adagrad:
                g = corrected_grad
                # Accumulate gradient
                new_sum_squared_grad = (sum_square_grad + T.sqr(g))

                rms_g_t = T.sqrt(new_sum_squared_grad)
                rms_g_t = T.maximum(rms_g_t, 1.0)

            # Use the gradients from the previous update
            # to compute the \nabla f(x_t) - \nabla f(x_{t-1})
            cur_curvature = norm_grad - old_plain_grad
            cur_curvature_sqr = T.sqr(cur_curvature)

            new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                                 (cur_curvature / taus_x_t))
            new_curvature_ave.name = "ncurve_ave_" + param.name

            #Average average curvature
            nc_ave = new_curvature_ave

            new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                     (cur_curvature_sqr / taus_x_t))
            new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

            #Unbiased average squared curvature
            nc_sq_ave = new_curvature_sqr_ave

            epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate
            scaled_lr = self.lr_scalers.get(param, 1.) * sharedX(1.0)
            rms_dx_tm1 = T.sqrt(msdx + epsilon)

            rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

            #This is where the update step is being defined
            delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t /
                                      (new_curvature_sqr_ave + epsilon))
            delta_x_t.name = "delta_x_t_" + param.name

            # This part seems to be necessary for only RNNs
            # For feedforward networks this does not seem to be important.
            if self.delta_clip:
                log.info("Clipping will be applied on the adaptive step size.")
                delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    log.info("Clipped adagrad is disabled.")
                    delta_x_t = delta_x_t * corrected_grad
            else:
                log.info(
                    "Clipping will not be applied on the adaptive step size.")
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    log.info("Clipped adagrad will not be used.")
                    delta_x_t = delta_x_t * corrected_grad

            new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(
                1 + eps, "stabilized")

            #To compute the E[\Delta^2]_t
            new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                                  (T.sqr(delta_x_t) / taus_x_t))

            #To compute the E[\Delta]_t
            new_mean_dx = (mean_dx * (1 - 1 / taus_x_t) + (delta_x_t /
                                                           (taus_x_t)))

            #Perform the outlier detection:
            #This outlier detection is slightly different:
            new_taus_t = T.switch(
                T.or_(
                    abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                    abs(cur_curvature - nc_ave) >
                    (2 * T.sqrt(nc_sq_ave - nc_ave**2))), sharedX(2.2),
                new_taus_t)

            #Apply the bound constraints on tau:
            new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
            new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

            new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                             (delta_x_t * cur_curvature) * (1 / taus_x_t))

            update_step = delta_x_t

            # Apply updates
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[mean_dx] = new_mean_dx
            updates[gamma_nume_sqr] = new_gamma_nume_sqr
            updates[gamma_deno_sqr] = new_gamma_deno_sqr
            updates[taus_x_t] = new_taus_t
            updates[cov_num_t] = new_cov_num_t
            updates[mean_grad] = new_mean_grad
            updates[old_plain_grad] = norm_grad
            updates[mean_curvature] = new_curvature_ave
            updates[mean_curvature_sqr] = new_curvature_sqr_ave
            updates[param] = param + update_step
            updates[step] = step + 1

            if self.use_adagrad:
                updates[sum_square_grad] = new_sum_squared_grad

            if self.use_corrected_grad:
                updates[old_grad] = corrected_grad

        return updates
Exemple #15
0
    def __init__(self, model, dataset,
                 n_epoch=1000, batch_size=100, minimum_batch_size=1,
                 save_frequency=10, early_stop_threshold=.9995, early_stop_length=30,
                 learning_rate=1e-3, lr_decay='exponential', lr_factor=1,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        model : Model
            The Model to train.
        dataset : Dataset
            The Dataset to use when training the Model.
        n_epoch : int
            how many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        minimum_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_frequency : int
            How many epochs to train between each new save of the Model's parameters.
        early_stop_threshold : float
            The factor by how much the best validation training score needs to improve to determine early stopping.
        early_stop_length : int
            The patience or number of epochs to wait after the early_stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The type of decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for options.
        lr_factor : float
            The amount to use for the decay function when changing the learning rate over epochs. See
            `opendeep.utils.decay` for its effect for given decay functions.
        """
        log.info("Initializing optimizer %s", str(type(self)))

        if early_stop_threshold is None:
            early_stop_threshold = 1.
        if save_frequency is None:
            save_frequency = 1000000
        if early_stop_length is None:
            early_stop_length = 100

        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("optimizer config args: %s", str(self.args))

        assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"
        self.model = model
        self.dataset = dataset

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          self.learning_rate.get_value(),
                                                          lr_factor)
        else:
            self.learning_rate_decay = False

        self.noise_switches = raise_to_list(self.model.get_noise_switch())
        self.batch_size = batch_size
        self.minimum_batch_size = minimum_batch_size
        self.n_epoch = n_epoch
        self.save_frequency = save_frequency
        self.early_stop_threshold = early_stop_threshold
        self.early_stop_length = early_stop_length
Exemple #16
0
    def __init__(self, config=None, defaults=_default,
                 inputs_hook=None, params_hook=None,
                 input_size=None, output_size=None,
                 activation=None,
                 cost=None, cost_args=None,
                 weights_init=None, weights_mean=None, weights_std=None, weights_interval=None,
                 bias_init=None,
                 noise=None, noise_level=None, mrg=None,
                 outdir=None,
                 **kwargs):
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals()
        initial_parameters.pop('self')
        super(BasicLayer, self).__init__(**initial_parameters)
        # all configuration parameters are now in self!

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        if self.inputs_hook is not None:  # inputs_hook is a tuple of (Shape, Input)
            assert len(self.inputs_hook) == 2, 'Expected inputs_hook to be tuple!'  # make sure inputs_hook is a tuple
            self.input_size = self.inputs_hook[0] or self.input_size
            self.input = self.inputs_hook[1]
        else:
            # make the input a symbolic matrix
            self.input = T.fmatrix('X')

        # now that we have the input specs, define the output 'target' variable to be used in supervised training!
        self.target = T.fmatrix('Y')

        # either grab the output's desired size from the parameter directly, or copy n_in
        self.output_size = self.output_size or self.input_size

        # other specifications
        # activation function!
        activation_func = get_activation_function(self.activation)
        # cost function!
        cost_func = get_cost_function(self.cost)

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if self.params_hook is not None:
            # make sure the params_hook has W (weights matrix) and b (bias vector)
            assert len(self.params_hook) == 2, \
                "Expected 2 params (W and b) for BasicLayer, found {0!s}!".format(len(self.params_hook))
            W, b = self.params_hook
        else:
            W = get_weights(weights_init=self.weights_init,
                            shape=(self.input_size, self.output_size),
                            name="W",
                            # if gaussian
                            mean=self.weights_mean,
                            std=self.weights_std,
                            # if uniform
                            interval=self.weights_interval)

            # grab the bias vector
            b = get_bias(shape=self.output_size, name="b", init_values=self.bias_init)

        # Finally have the two parameters - weights matrix W and bias vector b. That is all!
        self.params = [W, b]

        ###############
        # computation #
        ###############
        # Here is the meat of the computation transforming input -> output
        # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing
        # the result through our activation function (normally something nonlinear such as: max(0, output))
        self.output = activation_func(T.dot(self.input, W) + b)

        # Now deal with noise if we added it:
        if self.noise:
            log.debug('Adding noise switch.')
            if self.noise_level is not None:
                noise_func = get_noise(self.noise, self.noise_level, self.mrg)
            else:
                noise_func = get_noise(self.noise, mrg=self.mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.switch = sharedX(value=1, name="basiclayer_noise_switch")
            self.output = T.switch(self.switch,
                                   noise_func(input=self.output),
                                   self.output)

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_func(output=self.output, target=self.target, **self.cost_args)

        log.debug("Initialized a basic fully-connected layer with shape %s and activation: %s",
                  str((self.input_size, self.output_size)), str(self.activation))
Exemple #17
0
    def get_updates(self, grads):
        """
        .. todo::
            WRITEME
        Parameters
        ----------
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        """

        updates = OrderedDict({})
        eps = self.damping
        step = sharedX(0., name="step")

        if self.skip_nan_inf:
            #If norm of the gradients of a parameter is inf or nan don't update that parameter
            #That might be useful for RNNs.
            grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
                T.isnan(grads[p])), 0, grads[p]) for
                p in grads.keys()})

        #Block-normalize gradients:
        grads = OrderedDict({p: grads[p] / (grads[p].norm(2) + eps) for p in grads.keys()})
        nparams = len(grads.keys())

        #Apply the gradient clipping, this is only necessary for RNNs and sometimes for very deep
        #networks
        if self.grad_clip:
            assert self.grad_clip > 0.
            assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
            gnorm = sum([g.norm(2) for g in grads.values()])

            grads = OrderedDict({p: T.switch(gnorm/nparams > self.grad_clip,
                                 g * self.grad_clip * nparams / gnorm , g)\
                                 for p, g in grads.iteritems()})

        for param in grads.keys():
            grads[param].name = "grad_%s" % param.name
            mean_grad = sharedX(param.get_value() * 0. + eps, name="mean_grad_%s" % param.name)
            # mean_corrected_grad = sharedX(param.get_value() * 0 + eps, name="mean_corrected_grad_%s" % param.name)
            slow_constant = 2.1

            if self.use_adagrad:
                # sum_square_grad := \sum_i g_i^2
                sum_square_grad = sharedX(param.get_value(borrow=True) * 0., name="sum_square_grad_%s" % param.name)

            """
            Initialization of accumulators
            """
            taus_x_t = sharedX((numpy.ones_like(param.get_value()) + eps) * slow_constant,
                               name="taus_x_t_" + param.name)
            self.taus_x_t = taus_x_t

            #Variance reduction parameters
            #Numerator of the gamma:
            gamma_nume_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                     name="gamma_nume_sqr_" + param.name)

            #Denominator of the gamma:
            gamma_deno_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                     name="gamma_deno_sqr_" + param.name)

            #For the covariance parameter := E[\gamma \alpha]_{t-1}
            cov_num_t = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                name="cov_num_t_" + param.name)

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                       name="msg_" + param.name)

            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)
            if self.use_corrected_grad:
                old_grad = sharedX(param.get_value() * 0. + eps)

            #The uncorrected gradient of previous of the previous update:
            old_plain_grad = sharedX(param.get_value() * 0. + eps)
            mean_curvature = sharedX(param.get_value() * 0. + eps)
            mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

            # Initialize the E[\Delta]_{t-1}
            mean_dx = sharedX(param.get_value() * 0.)

            # Block-wise normalize the gradient:
            norm_grad = grads[param]

            #For the first time-step, assume that delta_x_t := norm_grad
            cond = T.eq(step, 0)
            msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
            mdx  = cond * norm_grad + (1 - cond) * mean_dx

            """
            Compute the new updated values.
            """
            # E[g_i^2]_t
            new_mean_squared_grad = (mean_square_grad * self.decay + T.sqr(norm_grad) * (1 - self.decay))
            new_mean_squared_grad.name = "msg_" + param.name
            # E[g_i]_t
            new_mean_grad = (mean_grad * self.decay + norm_grad * (1 - self.decay))
            new_mean_grad.name = "nmg_" + param.name

            mg = new_mean_grad
            mgsq = new_mean_squared_grad

            # Keep the rms for numerator and denominator of gamma.
            new_gamma_nume_sqr = (
                gamma_nume_sqr * (1 - 1 / taus_x_t) + T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t
            )
            new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

            new_gamma_deno_sqr = (
                gamma_deno_sqr * (1 - 1 / taus_x_t) + T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t
            )
            new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

            gamma = T.sqrt(gamma_nume_sqr) / T.sqrt(gamma_deno_sqr + eps)
            gamma.name = "gamma_" + param.name

            if self.gamma_clip:
                gamma = T.minimum(gamma, self.gamma_clip)

            momentum_step = gamma * mg
            corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

            #For starting the variance reduction.
            if self.start_var_reduction > -1:
                cond = T.le(self.start_var_reduction, step)
                corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
            else:
                corrected_grad = norm_grad

            new_sum_squared_grad = None
            if self.use_adagrad:
                g = corrected_grad
                # Accumulate gradient
                new_sum_squared_grad = (sum_square_grad + T.sqr(g))

                rms_g_t = T.sqrt(new_sum_squared_grad)
                rms_g_t = T.maximum(rms_g_t, 1.0)

            # Use the gradients from the previous update
            # to compute the \nabla f(x_t) - \nabla f(x_{t-1})
            cur_curvature = norm_grad - old_plain_grad
            cur_curvature_sqr = T.sqr(cur_curvature)

            new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) + (cur_curvature / taus_x_t))
            new_curvature_ave.name = "ncurve_ave_" + param.name

            #Average average curvature
            nc_ave = new_curvature_ave

            new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) + (cur_curvature_sqr / taus_x_t))
            new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

            #Unbiased average squared curvature
            nc_sq_ave = new_curvature_sqr_ave

            epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate
            scaled_lr = self.lr_scalers.get(param, 1.) * sharedX(1.0)
            rms_dx_tm1 = T.sqrt(msdx + epsilon)

            rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

            #This is where the update step is being defined
            delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t / (new_curvature_sqr_ave + epsilon))
            delta_x_t.name = "delta_x_t_" + param.name

            # This part seems to be necessary for only RNNs
            # For feedforward networks this does not seem to be important.
            if self.delta_clip:
                log.info("Clipping will be applied on the adaptive step size.")
                delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    log.info("Clipped adagrad is disabled.")
                    delta_x_t = delta_x_t * corrected_grad
            else:
                log.info("Clipping will not be applied on the adaptive step size.")
                if self.use_adagrad:
                    delta_x_t = delta_x_t * corrected_grad / rms_g_t
                else:
                    log.info("Clipped adagrad will not be used.")
                    delta_x_t = delta_x_t * corrected_grad

            new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

            #To compute the E[\Delta^2]_t
            new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) + (T.sqr(delta_x_t) / taus_x_t))

            #To compute the E[\Delta]_t
            new_mean_dx = (mean_dx * (1 - 1 / taus_x_t) + (delta_x_t / (taus_x_t)))

            #Perform the outlier detection:
            #This outlier detection is slightly different:
            new_taus_t = T.switch(T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                                        abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                                        sharedX(2.2), new_taus_t)

            #Apply the bound constraints on tau:
            new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
            new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

            new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) + (delta_x_t * cur_curvature) * (1 / taus_x_t))

            update_step = delta_x_t

            # Apply updates
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[mean_dx] = new_mean_dx
            updates[gamma_nume_sqr] = new_gamma_nume_sqr
            updates[gamma_deno_sqr] = new_gamma_deno_sqr
            updates[taus_x_t] = new_taus_t
            updates[cov_num_t] = new_cov_num_t
            updates[mean_grad] = new_mean_grad
            updates[old_plain_grad] = norm_grad
            updates[mean_curvature] = new_curvature_ave
            updates[mean_curvature_sqr] = new_curvature_sqr_ave
            updates[param] = param + update_step
            updates[step] = step + 1

            if self.use_adagrad:
                updates[sum_square_grad] = new_sum_squared_grad

            if self.use_corrected_grad:
                updates[old_grad] = corrected_grad

        return updates
    def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gsn/',
                 input_size=None, hidden_size=1000,
                 layers=2, walkbacks=4,
                 visible_activation='sigmoid', hidden_activation='tanh',
                 input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1),
                 tied_weights=True,
                 weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
                 bias_init=0.0,
                 cost_function='binary_crossentropy', cost_args=None,
                 add_noise=True, noiseless_h1=True,
                 hidden_noise='gaussian', hidden_noise_level=2, input_noise='salt_and_pepper', input_noise_level=0.4,
                 noise_decay='exponential', noise_annealing=1,
                 image_width=None, image_height=None,
                 **kwargs):
        """
        Initialize a GSN.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere.
            This is used for linking different models together (e.g. setting the DAE model's hidden layers to the RNN's
            output layer gives a generative recurrent model.) For now, it needs to include the shape
            information (normally the dimensionality of the hiddens i.e. n_hidden).
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this is optional.
            The :class:`Model` requires an `output_size`, which gets set to this value because the DAE is an
            unsupervised model. The output is a reconstruction of the input.
        hidden_size : int
            The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger than
            `input_size`, which is known as *overcomplete*.
        visible_activation : str or callable
            The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer.
            This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        layers : int
            The number of hidden layers to use.
        walkbacks : int
            The number of walkbacks to perform (the variable K in Bengio's paper above). A walkback is a Gibbs sample
            from the DAE, which means the model generates inputs in sequence, where each generated input is compared
            to the original input to create the reconstruction cost for training. For running the model, the very last
            generated input in the Gibbs chain is used as the output.
        input_sampling : bool
            During walkbacks, whether to sample from the generated input to create a new starting point for the next
            walkback (next step in the Gibbs chain). This generally makes walkbacks more effective by making the
            process more stochastic - more likely to find spurious modes in the model's representation.
        mrg : random
            A random number generator that is used when adding noise into the network and for sampling from the input.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        tied_weights : bool
            DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This boolean
            determines if V = W.T, which 'ties' V to W and reduces the number of parameters necessary during training.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the reconstruction cost of the model. This should be appropriate
            for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse' for real-valued inputs.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        add_noise : bool
            Whether to add noise (corrupt) the input before passing it through the computation graph during training.
            This should most likely be set to the default of True, because this is a *denoising* autoencoder after all.
        noiseless_h1 : bool
            Whether to not add noise (corrupt) the hidden layer during computation.
        hidden_noise : str
            What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See opendeep.utils.noise
            for options. This should be appropriate for the hidden unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc.
        hidden_noise_level : float
            The amount of noise to use for the noise function specified by `hidden_noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        input_noise : str
            What type of noise to use for corrupting the input before computation (if `add_noise`).
            See opendeep.utils.noise for options. This should be appropriate for the input units, i.e. salt-and-pepper
            for binary units, etc.
        input_noise_level : float
            The amount of noise used to corrupt the input. This could be the masking probability for salt-and-pepper,
            standard deviation for Gaussian, interval for Uniform, etc.
        noise_decay : str or False
            Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the DAE learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
        noise_annealing : float
            The amount to reduce the `input_noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        image_width : int
            If the input should be represented as an image, the width of the input image. If not specified, it will be
            close to the square factor of the `input_size`.
        image_height : int
            If the input should be represented as an image, the height of the input image. If not specified, it will be
            close to the square factor of the `input_size`.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(GSN, self).__init__(**initial_parameters)

        # when the input should be thought of as an image, either use the specified width and height,
        # or try to make as square as possible.
        if image_height is None and image_width is None:
            (_h, _w) = closest_to_square_factors(self.input_size)
            self.image_width  = _w
            self.image_height = _h
        else:
            self.image_height = image_height
            self.image_width = image_width

        ############################
        # Theano variables and RNG #
        ############################
        if self.inputs_hook is None:
            self.X = T.matrix('X')
        else:
            # inputs_hook is a (shape, input) tuple
            self.X = self.inputs_hook[1]
        
        ##########################
        # Network specifications #
        ##########################
        # generally, walkbacks should be at least 2*layers
        if layers % 2 == 0:
            if walkbacks < 2*layers:
                log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                            'Generaly want 2X walkbacks to layers',
                            str(layers), str(walkbacks))
        else:
            if walkbacks < 2*layers-1:
                log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                            'Generaly want 2X walkbacks to layers',
                            str(layers), str(walkbacks))

        self.add_noise = add_noise
        self.noise_annealing = as_floatX(noise_annealing)  # noise schedule parameter
        self.hidden_noise_level = sharedX(hidden_noise_level, dtype=theano.config.floatX)
        self.hidden_noise = get_noise(name=hidden_noise, noise_level=self.hidden_noise_level, mrg=mrg)
        self.input_noise_level = sharedX(input_noise_level, dtype=theano.config.floatX)
        self.input_noise = get_noise(name=input_noise, noise_level=self.input_noise_level, mrg=mrg)

        self.walkbacks = walkbacks
        self.tied_weights = tied_weights
        self.layers = layers
        self.noiseless_h1 = noiseless_h1
        self.input_sampling = input_sampling
        self.noise_decay = noise_decay

        # if there was a hiddens_hook, unpack the hidden layers in the tensor
        if self.hiddens_hook is not None:
            hidden_size = self.hiddens_hook[0]
            self.hiddens_flag = True
        else:
            self.hiddens_flag = False

        # determine the sizes of each layer in a list.
        #  layer sizes, from h0 to hK (h0 is the visible layer)
        hidden_size = list(raise_to_list(hidden_size))
        if len(hidden_size) == 1:
            self.layer_sizes = [self.input_size] + hidden_size * self.layers
        else:
            assert len(hidden_size) == self.layers, "Hiddens sizes and number of hidden layers mismatch." + \
                                                    "Hiddens %d and layers %d" % (len(hidden_size), self.layers)
            self.layer_sizes = [self.input_size] + hidden_size

        if self.hiddens_hook is not None:
            self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

        #########################
        # Activation functions! #
        #########################
        # hidden unit activation
        self.hidden_activation = get_activation_function(hidden_activation)
        # Visible layer activation
        self.visible_activation = get_activation_function(visible_activation)
        # make sure the sampling functions are appropriate for the activation functions.
        if is_binary(self.visible_activation):
            self.visible_sampling = mrg.binomial
        else:
            # TODO: implement non-binary activation
            log.error("Non-binary visible activation not supported yet!")
            raise NotImplementedError("Non-binary visible activation not supported yet!")

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args or dict()

        ###############
        # Parameters! #
        ###############
        # make sure to deal with params_hook!
        if self.params_hook is not None:
            # if tied weights, expect layers*2 + 1 params
            if self.tied_weights:
                assert len(self.params_hook) == 2*layers + 1, \
                    "Tied weights: expected {0!s} params, found {1!s}!".format(2*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:layers]
                self.bias_list = self.params_hook[layers:]
            # if untied weights, expect layers*3 + 1 params
            else:
                assert len(self.params_hook) == 3*layers + 1, \
                    "Untied weights: expected {0!s} params, found {1!s}!".format(3*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:2*layers]
                self.bias_list = self.params_hook[2*layers:]
        # otherwise, construct our params
        else:
            # initialize a list of weights and biases based on layer_sizes for the GSN
            self.weights_list = [get_weights(weights_init=weights_init,
                                             shape=(self.layer_sizes[i], self.layer_sizes[i+1]),
                                             name="W_{0!s}_{1!s}".format(i, i+1),
                                             rng=mrg,
                                             # if gaussian
                                             mean=weights_mean,
                                             std=weights_std,
                                             # if uniform
                                             interval=weights_interval)
                                 for i in range(layers)]
            # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now)
            if not tied_weights:
                self.weights_list.extend(
                    [get_weights(weights_init=weights_init,
                                 shape=(self.layer_sizes[i+1], self.layer_sizes[i]),
                                 name="W_{0!s}_{1!s}".format(i+1, i),
                                 rng=mrg,
                                 # if gaussian
                                 mean=weights_mean,
                                 std=weights_std,
                                 # if uniform
                                 interval=weights_interval)
                     for i in reversed(range(layers))]
                )
            # initialize each layer bias to 0's.
            self.bias_list = [get_bias(shape=(self.layer_sizes[i],),
                                       name='b_' + str(i),
                                       init_values=bias_init)
                              for i in range(layers+1)]

        # build the params of the model into a list
        self.params = self.weights_list + self.bias_list
        log.debug("gsn params: %s", str(self.params))

        # using the properties, build the computational graph
        self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph()
Exemple #19
0
    def __init__(self,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 outdir='outputs/gsn/',
                 input_size=None,
                 hidden_size=1000,
                 layers=2,
                 walkbacks=4,
                 visible_activation='sigmoid',
                 hidden_activation='tanh',
                 input_sampling=True,
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 tied_weights=True,
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 cost_function='binary_crossentropy',
                 cost_args=None,
                 add_noise=True,
                 noiseless_h1=True,
                 hidden_noise='gaussian',
                 hidden_noise_level=2,
                 input_noise='salt_and_pepper',
                 input_noise_level=0.4,
                 noise_decay='exponential',
                 noise_annealing=1,
                 image_width=None,
                 image_height=None,
                 **kwargs):
        """
        Initialize a GSN.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere.
            This is used for linking different models together (e.g. setting the DAE model's hidden layers to the RNN's
            output layer gives a generative recurrent model.) For now, it needs to include the shape
            information (normally the dimensionality of the hiddens i.e. n_hidden).
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this is optional.
            The :class:`Model` requires an `output_size`, which gets set to this value because the DAE is an
            unsupervised model. The output is a reconstruction of the input.
        hidden_size : int
            The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger than
            `input_size`, which is known as *overcomplete*.
        visible_activation : str or callable
            The nonlinear (or linear) visible activation to perform after the dot product from hiddens -> visible layer.
            This activation function should be appropriate for the input unit types, i.e. 'sigmoid' for binary inputs.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The nonlinear (or linear) hidden activation to perform after the dot product from visible -> hiddens layer.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        layers : int
            The number of hidden layers to use.
        walkbacks : int
            The number of walkbacks to perform (the variable K in Bengio's paper above). A walkback is a Gibbs sample
            from the DAE, which means the model generates inputs in sequence, where each generated input is compared
            to the original input to create the reconstruction cost for training. For running the model, the very last
            generated input in the Gibbs chain is used as the output.
        input_sampling : bool
            During walkbacks, whether to sample from the generated input to create a new starting point for the next
            walkback (next step in the Gibbs chain). This generally makes walkbacks more effective by making the
            process more stochastic - more likely to find spurious modes in the model's representation.
        mrg : random
            A random number generator that is used when adding noise into the network and for sampling from the input.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        tied_weights : bool
            DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This boolean
            determines if V = W.T, which 'ties' V to W and reduces the number of parameters necessary during training.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the reconstruction cost of the model. This should be appropriate
            for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse' for real-valued inputs.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        add_noise : bool
            Whether to add noise (corrupt) the input before passing it through the computation graph during training.
            This should most likely be set to the default of True, because this is a *denoising* autoencoder after all.
        noiseless_h1 : bool
            Whether to not add noise (corrupt) the hidden layer during computation.
        hidden_noise : str
            What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See opendeep.utils.noise
            for options. This should be appropriate for the hidden unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc.
        hidden_noise_level : float
            The amount of noise to use for the noise function specified by `hidden_noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        input_noise : str
            What type of noise to use for corrupting the input before computation (if `add_noise`).
            See opendeep.utils.noise for options. This should be appropriate for the input units, i.e. salt-and-pepper
            for binary units, etc.
        input_noise_level : float
            The amount of noise used to corrupt the input. This could be the masking probability for salt-and-pepper,
            standard deviation for Gaussian, interval for Uniform, etc.
        noise_decay : str or False
            Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the DAE learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
        noise_annealing : float
            The amount to reduce the `input_noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        image_width : int
            If the input should be represented as an image, the width of the input image. If not specified, it will be
            close to the square factor of the `input_size`.
        image_height : int
            If the input should be represented as an image, the height of the input image. If not specified, it will be
            close to the square factor of the `input_size`.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(GSN, self).__init__(**initial_parameters)

        # when the input should be thought of as an image, either use the specified width and height,
        # or try to make as square as possible.
        if image_height is None and image_width is None:
            (_h, _w) = closest_to_square_factors(self.input_size)
            self.image_width = _w
            self.image_height = _h
        else:
            self.image_height = image_height
            self.image_width = image_width

        ############################
        # Theano variables and RNG #
        ############################
        if self.inputs_hook is None:
            self.X = T.matrix('X')
        else:
            # inputs_hook is a (shape, input) tuple
            self.X = self.inputs_hook[1]

        ##########################
        # Network specifications #
        ##########################
        # generally, walkbacks should be at least 2*layers
        if layers % 2 == 0:
            if walkbacks < 2 * layers:
                log.warning(
                    'Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                    'Generaly want 2X walkbacks to layers', str(layers),
                    str(walkbacks))
        else:
            if walkbacks < 2 * layers - 1:
                log.warning(
                    'Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                    'Generaly want 2X walkbacks to layers', str(layers),
                    str(walkbacks))

        self.add_noise = add_noise
        self.noise_annealing = as_floatX(
            noise_annealing)  # noise schedule parameter
        self.hidden_noise_level = sharedX(hidden_noise_level,
                                          dtype=theano.config.floatX)
        self.hidden_noise = get_noise(name=hidden_noise,
                                      noise_level=self.hidden_noise_level,
                                      mrg=mrg)
        self.input_noise_level = sharedX(input_noise_level,
                                         dtype=theano.config.floatX)
        self.input_noise = get_noise(name=input_noise,
                                     noise_level=self.input_noise_level,
                                     mrg=mrg)

        self.walkbacks = walkbacks
        self.tied_weights = tied_weights
        self.layers = layers
        self.noiseless_h1 = noiseless_h1
        self.input_sampling = input_sampling
        self.noise_decay = noise_decay

        # if there was a hiddens_hook, unpack the hidden layers in the tensor
        if self.hiddens_hook is not None:
            hidden_size = self.hiddens_hook[0]
            self.hiddens_flag = True
        else:
            self.hiddens_flag = False

        # determine the sizes of each layer in a list.
        #  layer sizes, from h0 to hK (h0 is the visible layer)
        hidden_size = list(raise_to_list(hidden_size))
        if len(hidden_size) == 1:
            self.layer_sizes = [self.input_size] + hidden_size * self.layers
        else:
            assert len(hidden_size) == self.layers, "Hiddens sizes and number of hidden layers mismatch." + \
                                                    "Hiddens %d and layers %d" % (len(hidden_size), self.layers)
            self.layer_sizes = [self.input_size] + hidden_size

        if self.hiddens_hook is not None:
            self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

        #########################
        # Activation functions! #
        #########################
        # hidden unit activation
        self.hidden_activation = get_activation_function(hidden_activation)
        # Visible layer activation
        self.visible_activation = get_activation_function(visible_activation)
        # make sure the sampling functions are appropriate for the activation functions.
        if is_binary(self.visible_activation):
            self.visible_sampling = mrg.binomial
        else:
            # TODO: implement non-binary activation
            log.error("Non-binary visible activation not supported yet!")
            raise NotImplementedError(
                "Non-binary visible activation not supported yet!")

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args or dict()

        ###############
        # Parameters! #
        ###############
        # make sure to deal with params_hook!
        if self.params_hook is not None:
            # if tied weights, expect layers*2 + 1 params
            if self.tied_weights:
                assert len(self.params_hook) == 2*layers + 1, \
                    "Tied weights: expected {0!s} params, found {1!s}!".format(2*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:layers]
                self.bias_list = self.params_hook[layers:]
            # if untied weights, expect layers*3 + 1 params
            else:
                assert len(self.params_hook) == 3*layers + 1, \
                    "Untied weights: expected {0!s} params, found {1!s}!".format(3*layers+1, len(self.params_hook))
                self.weights_list = self.params_hook[:2 * layers]
                self.bias_list = self.params_hook[2 * layers:]
        # otherwise, construct our params
        else:
            # initialize a list of weights and biases based on layer_sizes for the GSN
            self.weights_list = [
                get_weights(
                    weights_init=weights_init,
                    shape=(self.layer_sizes[i], self.layer_sizes[i + 1]),
                    name="W_{0!s}_{1!s}".format(i, i + 1),
                    rng=mrg,
                    # if gaussian
                    mean=weights_mean,
                    std=weights_std,
                    # if uniform
                    interval=weights_interval) for i in range(layers)
            ]
            # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now)
            if not tied_weights:
                self.weights_list.extend([
                    get_weights(
                        weights_init=weights_init,
                        shape=(self.layer_sizes[i + 1], self.layer_sizes[i]),
                        name="W_{0!s}_{1!s}".format(i + 1, i),
                        rng=mrg,
                        # if gaussian
                        mean=weights_mean,
                        std=weights_std,
                        # if uniform
                        interval=weights_interval)
                    for i in reversed(range(layers))
                ])
            # initialize each layer bias to 0's.
            self.bias_list = [
                get_bias(shape=(self.layer_sizes[i], ),
                         name='b_' + str(i),
                         init_values=bias_init) for i in range(layers + 1)
            ]

        # build the params of the model into a list
        self.params = self.weights_list + self.bias_list
        log.debug("gsn params: %s", str(self.params))

        # using the properties, build the computational graph
        self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph(
        )
Exemple #20
0
    def __init__(self, model, dataset,
                 n_epoch=10, batch_size=100, minimum_batch_size=1,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=1e-6, lr_decay=None, lr_factor=None,
                 decay=0.95, gamma_clip=1.8, damping=1e-7, grad_clip=None, start_var_reduction=0,
                 delta_clip=None, use_adagrad=False, skip_nan_inf=False,
                 upper_bound_tau=1e8, lower_bound_tau=1.5, use_corrected_grad=True):
        """
        Initialize AdaSecant.

        Parameters
        ----------
        model : Model
            The Model to train.
        dataset : Dataset
            The Dataset to use when training the Model.
        n_epoch : int
            how many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        minimum_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_frequency : int
            How many epochs to train between each new save of the Model's parameters.
        early_stop_threshold : float
            The factor by how much the best validation training score needs to improve to determine early stopping.
        early_stop_length : int
            The patience or number of epochs to wait after the early_stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The type of decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for options.
        lr_factor : float
            The amount to use for the decay function when changing the learning rate over epochs. See
            `opendeep.utils.decay` for its effect for given decay functions.
        decay : float, optional
            Decay rate :math:`\\rho` in Algorithm 1 of the aforementioned
            paper. Decay 0.95 seems to work fine for several tasks.
        gamma_clip : float, optional
            The clipping threshold for the gamma. In general 1.8 seems to
            work fine for several tasks.
        start_var_reduction: float, optional,
            How many updates later should the variance reduction start from?
        delta_clip: float, optional,
            The threshold to clip the deltas after.
        grad_clip: float, optional,
            Apply gradient clipping for RNNs (not necessary for feedforward networks). But this is
            a constraint on the norm of the gradient per layer.
            Based on:
            Pascanu, Razvan, Tomas Mikolov, and Yoshua Bengio. "On the difficulty of training
            recurrent neural networks." arXiv preprint arXiv:1211.5063 (2012).
        use_adagrad: bool, optional
            Either to use clipped adagrad or not.
        use_corrected_grad: bool, optional
            Either to use correction for gradients (referred as variance
            reduction in the workshop paper).
        """
        # get everything together with the Optimizer class
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(AdaSecant, self).__init__(**initial_parameters)

        assert decay >= 0., "Decay needs to be >=0."
        assert decay < 1., "Decay needs to be <1."
        self.decay = sharedX(decay, "decay")

        self.damping = damping
        self.skip_nan_inf = skip_nan_inf

        if grad_clip:
            assert grad_clip > 0.
            assert grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
        self.grad_clip = grad_clip

        self.use_adagrad = use_adagrad
        self.use_corrected_grad = use_corrected_grad
        self.gamma_clip = gamma_clip
        self.start_var_reduction = start_var_reduction
        self.delta_clip = delta_clip

        # We have to bound the tau to prevent it to
        # grow to an arbitrarily large number, oftenwise
        # that causes numerical instabilities for very deep
        # networks. Note that once tau become very large, it will keep,
        # increasing indefinitely.
        self.lower_bound_tau = lower_bound_tau
        self.upper_bound_tau = upper_bound_tau
Exemple #21
0
    def __init__(self,
                 inputs_hook=None,
                 params_hook=None,
                 outdir='outputs/basic',
                 input_size=None,
                 output_size=None,
                 activation='rectifier',
                 cost='mse',
                 cost_args=None,
                 weights_init='uniform',
                 weights_mean=0,
                 weights_std=5e-3,
                 weights_interval='montreal',
                 bias_init=0.0,
                 noise=None,
                 noise_level=None,
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 **kwargs):
        """
        Initialize a basic layer.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together. For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. input_size).
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the layer. If shape is provided in `inputs_hook`,
            this is optional.
        output_size : int
            The size (dimensionality) of the output from the layer.
        activation : str or callable
            The activation function to use after the dot product going from input -> output. This can be a string
            representing an option from opendeep.utils.activation, or your own function as long as it is callable.
        cost : str or callable
            The cost function to use when training the layer. This should be appropriate for the output type, i.e.
            mse for real-valued outputs, binary cross-entropy for binary outputs, etc.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        weights_init : str
            Determines the method for initializing input -> output weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        noise : str
            What type of noise to use for corrupting the output (if not None). See opendeep.utils.noise
            for options. This should be appropriate for the output activation, i.e. Gaussian for tanh or other
            real-valued activations, etc. Often, you will use 'dropout' here as a regularization in BasicLayers.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(BasicLayer, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        if inputs_hook is not None:  # inputs_hook is a tuple of (Shape, Input)
            assert len(
                inputs_hook
            ) == 2, 'Expected inputs_hook to be tuple!'  # make sure inputs_hook is a tuple
            self.input = inputs_hook[1]
        else:
            # make the input a symbolic matrix
            self.input = T.matrix('X')

        # now that we have the input specs, define the output 'target' variable to be used in supervised training!
        self.target = T.matrix('Y')

        # either grab the output's desired size from the parameter directly, or copy input_size
        self.output_size = self.output_size or self.input_size

        # other specifications
        # activation function!
        activation_func = get_activation_function(activation)
        # cost function!
        cost_func = get_cost_function(cost)
        cost_args = cost_args or dict()

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if params_hook is not None:
            # make sure the params_hook has W (weights matrix) and b (bias vector)
            assert len(params_hook) == 2, \
                "Expected 2 params (W and b) for BasicLayer, found {0!s}!".format(len(params_hook))
            W, b = params_hook
        else:
            W = get_weights(
                weights_init=weights_init,
                shape=(self.input_size, self.output_size),
                name="W",
                rng=mrg,
                # if gaussian
                mean=weights_mean,
                std=weights_std,
                # if uniform
                interval=weights_interval)

            # grab the bias vector
            b = get_bias(shape=output_size, name="b", init_values=bias_init)

        # Finally have the two parameters - weights matrix W and bias vector b. That is all!
        self.params = [W, b]

        ###############
        # computation #
        ###############
        # Here is the meat of the computation transforming input -> output
        # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing
        # the result through our activation function (normally something nonlinear such as: max(0, output))
        self.output = activation_func(T.dot(self.input, W) + b)

        # Now deal with noise if we added it:
        if noise:
            log.debug('Adding noise switch.')
            if noise_level is not None:
                noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.switch = sharedX(value=1, name="basiclayer_noise_switch")
            self.output = T.switch(self.switch, noise_func(input=self.output),
                                   self.output)

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_func(output=self.output,
                              target=self.target,
                              **cost_args)

        log.debug(
            "Initialized a basic fully-connected layer with shape %s and activation: %s",
            str((self.input_size, self.output_size)), str(activation))
Exemple #22
0
    def __init__(self,
                 model,
                 dataset,
                 iterator_class=SequentialIterator,
                 config=None,
                 defaults=_defaults,
                 rng=None,
                 n_epoch=None,
                 batch_size=None,
                 minimum_batch_size=None,
                 save_frequency=None,
                 early_stop_threshold=None,
                 early_stop_length=None,
                 learning_rate=None,
                 lr_decay=None,
                 lr_factor=None,
                 momentum=None,
                 momentum_decay=None,
                 momentum_factor=None,
                 nesterov_momentum=None,
                 flag_para_load=None):
        # superclass init
        super(SGD, self).__init__(config=config, defaults=defaults)
        # config and defaults are now combined in self.args! yay!

        self.model = model
        self.dataset = dataset
        self.iterator = iterator_class

        # Training epochs - how many times to iterate over the whole dataset
        self.n_epoch = n_epoch or self.args.get('n_epoch')

        # Dataset iteration batch sizes - number of examples in each calculation
        self.batch_size = batch_size or self.args.get('batch_size')
        self.minimum_batch_size = minimum_batch_size or self.args.get(
            'minimum_batch_size')

        # Number of epochs between saving model parameters
        self.save_frequency = save_frequency or self.args.get('save_frequency')

        # Early stopping threshold and patience - by how much does the cost have to improve over a number of epochs
        self.early_stop_threshold = early_stop_threshold or self.args.get(
            'early_stop_threshold')
        self.early_stop_length = early_stop_length or self.args.get(
            'early_stop_length')

        # Learning rate - how drastic of a step do the parameters change
        lr = learning_rate or self.args.get('learning_rate')
        self.learning_rate = sharedX(lr, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if lr_decay or self.args.get('lr_decay'):
            self.learning_rate_decay = get_decay_function(
                lr_decay or self.args.get('lr_decay'), self.learning_rate,
                self.learning_rate.get_value(), lr_factor
                or self.args.get('lr_factor'))

        # Momentum - smoothing over the parameter changes (see Hinton)
        self.momentum = sharedX(momentum or self.args.get('momentum'),
                                'momentum')
        if self.args.get('momentum_decay'):
            self.momentum_decay = get_decay_function(
                momentum_decay or self.args.get('momentum_decay'),
                self.momentum, self.momentum.get_value(), momentum_factor
                or self.args.get('momentum_factor'))
        self.nesterov_momentum = nesterov_momentum or self.args.get(
            'nesterov_momentum')

        # RNG for working on random iterator
        if rng is None:
            random.seed(123)
            self.rng = random
        else:
            self.rng = rng

        self.params = self.model.get_params()

        # Now create the training cost function for the model to use while training - update parameters
        log.info("%s params: %s", str(type(self.model)), str(self.params))
        # gradient!
        gradient = grad(self.model.get_train_cost(), self.params)
        grads = OrderedDict(zip(self.params, gradient))

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(grads)

        # Combine the updates from the model also if applicable
        train_updates = model.get_updates()
        if train_updates:
            train_updates.update(gradient_updates)
        else:
            train_updates = gradient_updates

        # Compile the training function!
        log.info('Compiling f_learn function for model %s...',
                 str(type(self.model)))
        t = time.time()
        self.f_learn = function(inputs=model.get_inputs(),
                                updates=train_updates,
                                outputs=self.model.get_train_cost(),
                                name='f_learn')
        log.info('f_learn compilation took %s',
                 make_time_units_string(time.time() - t))

        # Determine if this function is unsupervised or not by looking at the number of inputs to the f_learn function.
        # If there is only one input, it is unsupervised, otherwise, it is supervised.
        # This workaround was provided by Pascal Lamblin on the theano-users google group
        num_inputs = len(
            [i for i in self.f_learn.maker.inputs if not i.shared])
        if num_inputs == 1:
            log.debug("Model is unsupervised: 1 input to f_learn.")
            self.unsupervised = True
        elif num_inputs == 2:
            log.debug("Model is supervised: 2 inputs to f_learn.")
            self.unsupervised = False
        else:
            log.error(
                "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised.",
                str(type(self.model)), str(num_inputs))
            raise AssertionError(
                "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised."
                % str(type(self.model)), str(num_inputs))

        # grab the function(s) to use to monitor different model values during training
        self.monitors = self.model.get_monitors()
Exemple #23
0
    def __init__(
        self,
        inputs_hook=None,
        params_hook=None,
        outdir="outputs/basic",
        input_size=None,
        output_size=None,
        activation="rectifier",
        cost="mse",
        cost_args=None,
        weights_init="uniform",
        weights_mean=0,
        weights_std=5e-3,
        weights_interval="montreal",
        bias_init=0.0,
        noise=None,
        noise_level=None,
        mrg=RNG_MRG.MRG_RandomStreams(1),
        **kwargs
    ):
        """
        Initialize a basic layer.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together. For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. input_size).
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as a training model with dropout applied
            to layers and one without for testing, where the parameters are shared between the two.
        outdir : str
            The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
            be saved.
        input_size : int
            The size (dimensionality) of the input to the layer. If shape is provided in `inputs_hook`,
            this is optional.
        output_size : int
            The size (dimensionality) of the output from the layer.
        activation : str or callable
            The activation function to use after the dot product going from input -> output. This can be a string
            representing an option from opendeep.utils.activation, or your own function as long as it is callable.
        cost : str or callable
            The cost function to use when training the layer. This should be appropriate for the output type, i.e.
            mse for real-valued outputs, binary cross-entropy for binary outputs, etc.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        weights_init : str
            Determines the method for initializing input -> output weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        noise : str
            What type of noise to use for corrupting the output (if not None). See opendeep.utils.noise
            for options. This should be appropriate for the output activation, i.e. Gaussian for tanh or other
            real-valued activations, etc. Often, you will use 'dropout' here as a regularization in BasicLayers.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        """
        # init Model to combine the defaults and config dictionaries with the initial parameters.
        initial_parameters = locals().copy()
        initial_parameters.pop("self")
        super(BasicLayer, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        # grab info from the inputs_hook, or from parameters
        if inputs_hook is not None:  # inputs_hook is a tuple of (Shape, Input)
            assert len(inputs_hook) == 2, "Expected inputs_hook to be tuple!"  # make sure inputs_hook is a tuple
            self.input = inputs_hook[1]
        else:
            # make the input a symbolic matrix
            self.input = T.matrix("X")

        # now that we have the input specs, define the output 'target' variable to be used in supervised training!
        self.target = T.matrix("Y")

        # either grab the output's desired size from the parameter directly, or copy input_size
        self.output_size = self.output_size or self.input_size

        # other specifications
        # activation function!
        activation_func = get_activation_function(activation)
        # cost function!
        cost_func = get_cost_function(cost)
        cost_args = cost_args or dict()

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if params_hook is not None:
            # make sure the params_hook has W (weights matrix) and b (bias vector)
            assert len(params_hook) == 2, "Expected 2 params (W and b) for BasicLayer, found {0!s}!".format(
                len(params_hook)
            )
            W, b = params_hook
        else:
            W = get_weights(
                weights_init=weights_init,
                shape=(self.input_size, self.output_size),
                name="W",
                rng=mrg,
                # if gaussian
                mean=weights_mean,
                std=weights_std,
                # if uniform
                interval=weights_interval,
            )

            # grab the bias vector
            b = get_bias(shape=output_size, name="b", init_values=bias_init)

        # Finally have the two parameters - weights matrix W and bias vector b. That is all!
        self.params = [W, b]

        ###############
        # computation #
        ###############
        # Here is the meat of the computation transforming input -> output
        # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing
        # the result through our activation function (normally something nonlinear such as: max(0, output))
        self.output = activation_func(T.dot(self.input, W) + b)

        # Now deal with noise if we added it:
        if noise:
            log.debug("Adding noise switch.")
            if noise_level is not None:
                noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.switch = sharedX(value=1, name="basiclayer_noise_switch")
            self.output = T.switch(self.switch, noise_func(input=self.output), self.output)

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_func(output=self.output, target=self.target, **cost_args)

        log.debug(
            "Initialized a basic fully-connected layer with shape %s and activation: %s",
            str((self.input_size, self.output_size)),
            str(activation),
        )