Example #1
0
 def get_decay_params(self):
     """
     Build the decay schedules this object exposes.

     Returns
     -------
     list
         A single-element list holding the noise schedule produced by `get_decay_function` from
         `self.noise_decay`, `self.input_noise_level`, the 'input_noise_level' entry of `self.args`,
         and `self.noise_annealing`.
     """
     # noise scheduling is the only schedule exposed here
     return [
         get_decay_function(
             self.noise_decay,
             self.input_noise_level,
             self.args.get('input_noise_level'),
             self.noise_annealing,
         )
     ]
    def __init__(self, model, dataset,
                 config=None, defaults=defaults,
                 n_epoch=None, batch_size=None, minimum_batch_size=None,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=None, lr_decay=None, lr_factor=None,
                 momentum=None, momentum_decay=None, momentum_factor=None, nesterov_momentum=None):
        """
        Initialize SGD.

        Every argument is forwarded unchanged to the superclass initializer. The code below then reads
        `self.momentum`, `self.momentum_decay` and `self.momentum_factor` from the instance, so the
        superclass is expected to have stored the resolved settings as attributes.

        Parameters
        ----------
        model, dataset, config, defaults, n_epoch, batch_size, minimum_batch_size, save_frequency,
        early_stop_threshold, early_stop_length, learning_rate, lr_decay, lr_factor
            Passed straight through to the superclass initializer.
        momentum : float
            The momentum coefficient. A falsy resolved value disables momentum.
        momentum_decay : str or False
            The type of decay function to apply to the momentum; None or False disables the decay.
        momentum_factor : float
            The amount handed to the momentum decay function; None disables the decay.
        nesterov_momentum : bool
            Forwarded to the superclass initializer.
        """
        # superclass init
        super(SGD, self).__init__(model, dataset, config=config, defaults=defaults,
                                  n_epoch=n_epoch, batch_size=batch_size, minimum_batch_size=minimum_batch_size,
                                  save_frequency=save_frequency, early_stop_length=early_stop_length,
                                  early_stop_threshold=early_stop_threshold, learning_rate=learning_rate,
                                  lr_decay=lr_decay, lr_factor=lr_factor, momentum=momentum,
                                  momentum_decay=momentum_decay, momentum_factor=momentum_factor,
                                  nesterov_momentum=nesterov_momentum)
        # everything is in self! yay!

        # Momentum - smoothing over the parameter changes (see Hinton)
        if self.momentum:
            # wrap the coefficient in a shared variable so a decay schedule can modify it
            self.momentum = sharedX(self.momentum, 'momentum')
            if self.momentum_decay is not None and \
                            self.momentum_decay is not False and \
                            self.momentum_factor is not None:
                self.momentum_decay = get_decay_function(self.momentum_decay,
                                                         self.momentum,
                                                         self.momentum.get_value(),
                                                         self.momentum_factor)
            else:
                self.momentum_decay = False
        else:
            # Momentum disabled: use a zero coefficient and make sure no decay type string is left
            # behind in momentum_decay (previously the coefficient was set to 1 and momentum_decay
            # kept whatever value the superclass stored).
            self.momentum = 0
            self.momentum_decay = False
 def get_decay_params(self):
     """
     Build the decay schedules this object exposes.

     Returns
     -------
     list
         A single-element list holding the noise schedule produced by `get_decay_function` from
         `self.noise_decay`, `self.input_noise_level`, the 'input_noise_level' entry of `self.args`,
         and `self.noise_annealing`.
     """
     # noise scheduling is the only schedule exposed here
     return [
         get_decay_function(
             self.noise_decay,
             self.input_noise_level,
             self.args.get('input_noise_level'),
             self.noise_annealing,
         )
     ]
    def __init__(self,
                 dataset,
                 loss,
                 model=None,
                 epochs=10,
                 batch_size=100,
                 min_batch_size=1,
                 save_freq=None,
                 stop_threshold=None,
                 stop_patience=None,
                 learning_rate=.1,
                 lr_decay="exponential",
                 lr_decay_factor=.995,
                 momentum=0.5,
                 momentum_decay="linear",
                 momentum_factor=0,
                 nesterov_momentum=True,
                 grad_clip=None,
                 hard_clip=False):
        """
        Set up stochastic gradient descent with optional (Nesterov) momentum.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` the Model is trained on.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` used to compare the model against a 'target'.
        model : Model
            The :class:`opendeep.models.Model` being trained. Required when this Optimizer is not handed
            to a Model's .train() method.
        epochs : int
            Number of training passes over the dataset.
        batch_size : int
            Number of training examples processed in parallel.
        min_batch_size : int
            Smallest number of examples allowed at a time (greater than 1 for things like time series).
        save_freq : int
            Number of epochs between saves of the Model's parameters.
        stop_threshold : float
            Factor by which the best validation score must improve when deciding on early stopping.
        stop_patience : int
            Number of epochs to wait once `stop_threshold` has been reached before stopping.
        learning_rate : float
            Multiplier applied to gradient values when adjusting parameters.
        lr_decay : str
            Kind of decay function applied to the learning rate across epochs
            (options in `opendeep.utils.decay`).
        lr_decay_factor : float
            Amount passed to the learning rate decay function (its effect depends on the decay type,
            see `opendeep.utils.decay`).
        momentum : float
            Momentum used during gradient updates. A falsy value disables momentum.
        momentum_decay : str
            Kind of decay function applied to the momentum across epochs
            (options in `opendeep.utils.decay`). None or False disables the decay.
        momentum_factor : float
            Amount passed to the momentum decay function (see `opendeep.utils.decay`).
            None disables the decay.
        nesterov_momentum : bool
            Whether Nesterov momentum is used.
        grad_clip : float, optional
            Whether gradients are clipped: clips with a maximum of grad_clip or the parameter norm.
        hard_clip : bool
            Whether clipping uses a hard cutoff or rescaling.
        """
        # Snapshot the arguments before any other local exists, then hand all of them (minus self)
        # to the superclass.
        initial_parameters = dict(locals())
        del initial_parameters['self']
        super(SGD, self).__init__(**initial_parameters)

        # Momentum - smoothing over the parameter changes (see Hinton)
        if not momentum:
            self.momentum = 0
            self.momentum_decay = False
        else:
            self.momentum = sharedX(momentum, 'momentum')
            decay_disabled = (momentum_decay is None or
                              momentum_decay is False or
                              momentum_factor is None)
            if decay_disabled:
                self.momentum_decay = False
            else:
                self.momentum_decay = get_decay_function(momentum_decay,
                                                         self.momentum,
                                                         self.momentum.get_value(),
                                                         momentum_factor)

        self.nesterov_momentum = nesterov_momentum
Example #5
0
    def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/lstm/',
                 input_size=None, hidden_size=None, output_size=None,
                 activation='sigmoid', hidden_activation='relu', inner_hidden_activation='sigmoid',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3,
                 r_bias_init=0.0,
                 cost_function='mse', cost_args=None,
                 noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Initialize an LSTM recurrent network.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets,
            this will be the initial starting value for hidden layers.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters. The list must be in the same order as
            `self.params` (including the backward recurrent weights when `direction` is 'bidirectional').
        outdir : str
            The location to produce outputs from training or running the :class:`LSTM`. If None, nothing will be
            saved.
        input_size : int
            The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional.
        hidden_size : int
            The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional.
        output_size : int
            The size (dimensionality) of the output.
        activation : str or callable
            The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The activation to perform for the hidden units.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        inner_hidden_activation : str or callable
            The activation to perform for the hidden gates.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the output cost of the model.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        noise : str
            What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise
            for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        noise_decay : str or False
            Whether to use `noise` scheduling (decay `noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
        noise_decay_amount : float
            The amount to reduce the `noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        direction : str
            The direction this recurrent model should go over its inputs.
            Can be 'forward', 'backward', or 'bidirectional'.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
            connecting previous hidden states to the current hidden state, and not the weights from current
            input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
            `+-clip_recurrent_grads`.

        Raises
        ------
        NotImplementedError
            If the variable supplied through `inputs_hook` has zero dimensions.
        """
        # capture the arguments before any other local is created and forward them all to the superclass
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(LSTM, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        backward = direction.lower() == 'backward'
        bidirectional = direction.lower() == 'bidirectional'

        #########################################
        # activation, cost, and noise functions #
        #########################################
        # recurrent hidden activation function!
        self.hidden_activation_func = get_activation_function(hidden_activation)
        self.inner_hidden_activation_func = get_activation_function(inner_hidden_activation)

        # output activation function!
        activation_func = get_activation_function(activation)

        # Cost function
        cost_function = get_cost_function(cost_function)
        cost_args = cost_args or dict()

        # Now deal with noise if we added it:
        if noise:
            log.debug('Adding %s noise switch.', str(noise))
            if noise_level is not None:
                # shared variable so that a noise schedule can change the level during training
                noise_level = sharedX(value=noise_level)
                noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch")

            # noise scheduling
            if noise_decay and noise_level is not None:
                self.noise_schedule = get_decay_function(noise_decay,
                                                         noise_level,
                                                         noise_level.get_value(),
                                                         noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # grab info from the inputs_hook
        # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
        # being the temporal dimension.
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]

            if self.input.ndim == 1:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])
                self.input_size = 1

            elif self.input.ndim == 2:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

            elif self.input.ndim > 3:
                self.input = self.input.flatten(3)
                # NOTE(review): flatten(3) merges the trailing dimensions, which suggests a product of
                # sizes rather than a sum - confirm what self.input_size holds at this point.
                self.input_size = sum(self.input_size)

            elif self.input.ndim != 3:
                # A 3D input is already (timesteps, batch_size, data_dim) and is used as-is; only a
                # 0-dimensional input reaches this point. (Previously 3D inputs were rejected here too.)
                raise NotImplementedError("Recurrent input with %d dimensions not supported!" % self.input.ndim)
            xs = self.input
        else:
            # Assume input coming from optimizer is (batches, timesteps, data)
            # so, we need to reshape to (timesteps, batches, data)
            self.input = T.tensor3("Xs")
            xs = self.input.dimshuffle(1, 0, 2)

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        self.target = T.tensor3("Ys")
        ys = self.target.dimshuffle(1, 0, 2)

        ################
        # hiddens hook #
        ################
        # set an initial value for the recurrent hiddens from hook
        if self.hiddens_hook is not None:
            h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]
        else:
            # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
            self.hidden_size = hidden_size

        ##################
        # for generating #
        ##################
        # symbolic scalar for how many recurrent steps to use during generation from the model
        self.n_steps = T.iscalar("generate_n_steps")

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if self.params_hook is not None:
            # NOTE(review): clip_recurrent_grads is not applied to hooked parameters - confirm intended.
            if not bidirectional:
                (W_x_c, W_x_i, W_x_f, W_x_o,
                 U_h_c, U_h_i, U_h_f, U_h_o,
                 W_h_y, b_c, b_i, b_f, b_o,
                 b_y) = self.params_hook
                recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o]
            else:
                (W_x_c, W_x_i, W_x_f, W_x_o,
                 U_h_c, U_h_i, U_h_f, U_h_o,
                 U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b,
                 W_h_y, b_c, b_i, b_f, b_o,
                 b_y) = self.params_hook
                recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o, U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b]
        # otherwise, construct our params
        else:
            # all input-to-hidden weights
            W_x_c, W_x_i, W_x_f, W_x_o = [
                get_weights(weights_init=weights_init,
                            shape=(self.input_size, self.hidden_size),
                            name="W_x_%s" % sub,
                            # if gaussian
                            mean=weights_mean,
                            std=weights_std,
                            # if uniform
                            interval=weights_interval)
                for sub in ['c', 'i', 'f', 'o']
            ]
            # all hidden-to-hidden weights
            U_h_c, U_h_i, U_h_f, U_h_o = [
                get_weights(weights_init=r_weights_init,
                            shape=(self.hidden_size, self.hidden_size),
                            name="U_h_%s" % sub,
                            # if gaussian
                            mean=r_weights_mean,
                            std=r_weights_std,
                            # if uniform
                            interval=r_weights_interval)
                for sub in ['c', 'i', 'f', 'o']
            ]
            # hidden-to-output weights
            W_h_y = get_weights(weights_init=weights_init,
                                shape=(self.hidden_size, self.output_size),
                                name="W_h_y",
                                # if gaussian
                                mean=weights_mean,
                                std=weights_std,
                                # if uniform
                                interval=weights_interval)
            # biases
            b_c, b_i, b_f, b_o = [
                get_bias(shape=(self.hidden_size,),
                         name="b_%s" % sub,
                         init_values=r_bias_init)
                for sub in ['c', 'i', 'f', 'o']
            ]
            # output bias
            b_y = get_bias(shape=(self.output_size,),
                           name="b_y",
                           init_values=bias_init)
            # keep references to the raw shared variables for self.params; the clipped versions below
            # are only used inside the computation graph
            recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o]
            # clip gradients if we are doing that
            if clip_recurrent_grads:
                clip = abs(clip_recurrent_grads)
                U_h_c, U_h_i, U_h_f, U_h_o = [theano.gradient.grad_clip(p, -clip, clip) for p in recurrent_params]
            # bidirectional params - created whether or not gradient clipping is enabled (this block used to
            # be nested under the clipping branch, leaving the backward weights undefined without clipping)
            if bidirectional:
                # all hidden-to-hidden weights
                U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b = [
                    get_weights(weights_init=r_weights_init,
                                shape=(self.hidden_size, self.hidden_size),
                                name="U_h_%s_b" % sub,
                                # if gaussian
                                mean=r_weights_mean,
                                std=r_weights_std,
                                # if uniform
                                interval=r_weights_interval)
                    for sub in ['c', 'i', 'f', 'o']
                ]
                recurrent_params += [U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b]
                if clip_recurrent_grads:
                    clip = abs(clip_recurrent_grads)
                    U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b = [theano.gradient.grad_clip(p, -clip, clip) for p in
                                                          [U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b]]

        # put all the parameters into our list, and make sure it is in the same order as when we try to load
        # them from a params_hook!!!
        self.params = [W_x_c, W_x_i, W_x_f, W_x_o] + recurrent_params + [W_h_y, b_c, b_i, b_f, b_o, b_y]

        # make h_init the right sized tensor
        if not self.hiddens_hook:
            h_init = T.zeros_like(T.dot(xs[0], W_x_c))

        c_init = T.zeros_like(T.dot(xs[0], W_x_c))

        ###############
        # computation #
        ###############
        # move some computation outside of scan to speed it up!
        x_c = T.dot(xs, W_x_c) + b_c
        x_i = T.dot(xs, W_x_i) + b_i
        x_f = T.dot(xs, W_x_f) + b_f
        x_o = T.dot(xs, W_x_o) + b_o

        # now do the recurrent stuff
        (self.hiddens, _), self.updates = theano.scan(
            fn=self.recurrent_step,
            sequences=[x_c, x_i, x_f, x_o],
            outputs_info=[h_init, c_init],
            non_sequences=[U_h_c, U_h_i, U_h_f, U_h_o],
            go_backwards=backward,
            name="lstm_scan",
            strict=True
        )

        # if bidirectional, do the same in reverse!
        if bidirectional:
            (hiddens_b, _), updates_b = theano.scan(
                fn=self.recurrent_step,
                sequences=[x_c, x_i, x_f, x_o],
                outputs_info=[h_init, c_init],
                non_sequences=[U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b],
                go_backwards=not backward,
                name="lstm_scan_back",
                strict=True
            )
            # flip the hiddens to be the right direction
            hiddens_b = hiddens_b[::-1]
            # update stuff
            self.updates.update(updates_b)
            self.hiddens += hiddens_b

        # add noise (like dropout) if we wanted it!
        if noise:
            self.hiddens = T.switch(self.noise_switch,
                                    noise_func(input=self.hiddens),
                                    self.hiddens)

        # now compute the outputs from the leftover (top level) hiddens
        self.output = activation_func(
            T.dot(self.hiddens, W_h_y) + b_y
        )

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_function(output=self.output, target=ys, **cost_args)

        log.info("Initialized an LSTM!")
Example #6
0
    def __init__(self, model, dataset,
                 n_epoch=10, batch_size=100, minimum_batch_size=1,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=.1, lr_decay="exponential", lr_factor=.995,
                 momentum=0.5, momentum_decay="linear", momentum_factor=0, nesterov_momentum=True):
        """
        Set up stochastic gradient descent with optional (Nesterov) momentum.

        Parameters
        ----------
        model : Model
            The Model being trained.
        dataset : Dataset
            The Dataset the Model is trained on.
        n_epoch : int
            Number of training passes over the dataset.
        batch_size : int
            Number of training examples processed in parallel.
        minimum_batch_size : int
            Smallest number of examples allowed at a time (greater than 1 for things like time series).
        save_frequency : int
            Number of epochs between saves of the Model's parameters.
        early_stop_threshold : float
            Factor by which the best validation score must improve when deciding on early stopping.
        early_stop_length : int
            Number of epochs to wait once `early_stop_threshold` has been reached before stopping.
        learning_rate : float
            Multiplier applied to gradient values when adjusting parameters.
        lr_decay : str
            Kind of decay function applied to the learning rate across epochs
            (options in `opendeep.utils.decay`).
        lr_factor : float
            Amount passed to the learning rate decay function (its effect depends on the decay type,
            see `opendeep.utils.decay`).
        momentum : float
            Momentum used during gradient updates. A falsy value disables momentum.
        momentum_decay : str
            Kind of decay function applied to the momentum across epochs
            (options in `opendeep.utils.decay`). None or False disables the decay.
        momentum_factor : float
            Amount passed to the momentum decay function (see `opendeep.utils.decay`).
            None disables the decay.
        nesterov_momentum : bool
            Whether Nesterov momentum is used.
        """
        # Snapshot the arguments before any other local exists, then hand all of them (minus self)
        # to the superclass.
        initial_parameters = dict(locals())
        del initial_parameters['self']
        super(SGD, self).__init__(**initial_parameters)

        # Momentum - smoothing over the parameter changes (see Hinton)
        if not momentum:
            self.momentum = 0
            self.momentum_decay = False
        else:
            self.momentum = sharedX(momentum, 'momentum')
            decay_disabled = (momentum_decay is None or
                              momentum_decay is False or
                              momentum_factor is None)
            if decay_disabled:
                self.momentum_decay = False
            else:
                self.momentum_decay = get_decay_function(momentum_decay,
                                                         self.momentum,
                                                         self.momentum.get_value(),
                                                         momentum_factor)

        self.nesterov_momentum = nesterov_momentum
Example #7
0
    def __init__(self, model, dataset,
                 config=None, defaults=None,
                 n_epoch=None, batch_size=None, minimum_batch_size=None,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=None, lr_decay=None, lr_factor=None,
                 **kwargs):
        """
        Initialize the optimizer by merging its settings from several sources.

        Settings are layered as: built-in fallbacks < `defaults` < `config` < `kwargs`, and finally any
        explicit keyword given here that is not None. The merged dictionary is stored in `self.args` and
        every entry is also copied onto the instance as an attribute.

        Parameters
        ----------
        model : Model
            The opendeep Model to train (checked with an assert).
        dataset : Dataset
            The opendeep Dataset to train on (checked with an assert).
        config, defaults
            Configuration sources combined through `combine_config_and_defaults`.
        n_epoch, batch_size, minimum_batch_size, save_frequency, early_stop_threshold, early_stop_length,
        learning_rate, lr_decay, lr_factor
            Explicit overrides; a value of None leaves the merged setting untouched.
        kwargs
            Additional settings. A nested entry named 'kwargs' is flattened into the settings.
        """
        # Default values to use for some training parameters
        fallback_settings = dict(
            n_epoch=1000,
            batch_size=100,
            minimum_batch_size=1,
            save_frequency=10,
            early_stop_threshold=.9995,
            early_stop_length=30,
            learning_rate=0.001,
            lr_decay="exponential",
            lr_factor=1,  # no learning rate decay by default
        )

        log.debug("Initializing optimizer %s", str(type(self)))

        assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
        self.model = model
        self.dataset = dataset
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"

        # self.args is the subclass-provided config/defaults layered on top of the fallbacks
        self.args = combine_config_and_defaults(combine_config_and_defaults(config, defaults),
                                                fallback_settings)
        # if the args are none, make it a blank dictionary
        if self.args is None:
            self.args = {}

        # fold in everything passed via kwargs; a None value never overwrites an existing setting
        for name, value in kwargs.items():
            key = str(name)
            if key == 'kwargs':
                # flatten kwargs if it was passed as a variable
                for inner_name, inner_value in value.items():
                    inner_key = str(inner_name)
                    if inner_value is not None or inner_key not in self.args:
                        self.args[inner_key] = inner_value
            elif value is not None or key not in self.args:
                self.args[key] = value

        # explicit keywords win over everything else, but only when they were actually given
        explicit_overrides = (
            ('n_epoch', n_epoch),
            ('batch_size', batch_size),
            ('minimum_batch_size', minimum_batch_size),
            ('save_frequency', save_frequency),
            ('early_stop_threshold', early_stop_threshold),
            ('early_stop_length', early_stop_length),
            ('learning_rate', learning_rate),
            ('lr_decay', lr_decay),
            ('lr_factor', lr_factor),
        )
        for setting, given in explicit_overrides:
            if given is not None:
                self.args[setting] = given

        # log the arguments
        log.debug("optimizer config args: %s", str(self.args))

        # expose every setting as self.<param>
        self.__dict__.update(self.args)

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(self.learning_rate, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if not self.lr_decay:
            self.learning_rate_decay = False
        else:
            self.learning_rate_decay = get_decay_function(self.lr_decay,
                                                          self.learning_rate,
                                                          self.learning_rate.get_value(),
                                                          self.lr_factor)
Example #8
0
    def __init__(self, dataset, loss=None, model=None,
                 epochs=1000, batch_size=100, min_batch_size=1,
                 save_freq=10, stop_threshold=None, stop_patience=50,
                 learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
                 grad_clip=None, hard_clip=False,
                 **kwargs):
        """
        Initialize the Optimizer.

        The constructor records its configuration in ``self.args``, checks that the model, the loss and the
        dataset agree on the number of inputs and targets, and creates the shared learning rate (plus its
        decay function, if one was requested). When no `model` is supplied, only ``self.args`` is set and the
        constructor returns early.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` to use when training the Model.
        loss : Loss, optional
            The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
            If it is not a Loss instance, the result of ``model.get_loss()`` is used instead.
        model : Model
            The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
            Model's .train() method.
        epochs : int
            How many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int, optional
            How many epochs to train between each new save of the Model's parameters. A falsy value
            (None or 0) is replaced with 1000000.
        stop_threshold : float, optional
            The factor by how much the best validation training score needs to improve to determine early
            stopping. A falsy value (None or 0) is replaced with ``numpy.inf``.
        stop_patience : int, optional
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
            A falsy value (None or 0) is replaced with 1.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for classes of decay and documentation.
        lr_decay_factor : float
            The amount of decay to use for the ``lr_decay`` type of decay.
        grad_clip : float, optional
            Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff or rescaling.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        **kwargs
            Any extra keyword arguments. They are merged into ``self.args`` through `add_kwargs_to_dict`.

        Raises
        ------
        AssertionError
            If `model`, `dataset` or `loss` is not an instance of the expected class, if no loss is available
            from either `loss` or the model, or if the number of inputs/targets in the dataset does not match
            what the model and loss expect.
        """
        log.info("Initializing optimizer %s", str(self.__class__.__name__))

        # Deal with early stopping None initializations (no early stopping).
        # Note that these are truthiness checks, so 0 is treated the same as None.
        if not stop_threshold:
            stop_threshold = numpy.inf
        if not save_freq:
            save_freq = 1000000
        if not stop_patience:
            stop_patience = 1

        # Put all init parameters in self.args so we can log the initial configuration.
        # This snapshot has to be taken before any additional local variable is created in this method,
        # otherwise those extra locals would be recorded in self.args as well.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("Optimizer config args: %s", str(self.args))
        # if the optimizer wasn't initialized with a Model (train() being called from the model class itself),
        # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there
        # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
        # the best option because other methods besides stochastic ones can exist for optimizers in the future.
        # TODO: fix this up - feels like a hack just to make model.train() work...
        if not model:
            return
        # Otherwise, things are proceeding as normal. Carry on...

        assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                         "Found %s" % str(model.__class__.__name__)
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                             "Found %s" % str(dataset.__class__.__name__)
        # deal with loss expression/targets
        if loss is not None:
            assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                           "Found %s" % str(loss.__class__.__name__)
        if isinstance(loss, Loss):
            self.loss_targets = loss.get_targets()
            self.loss_expression = loss.get_loss()
        else:
            # Ask the model for its loss a single time so that the targets and the expression are guaranteed
            # to come from the same result.
            model_loss = model.get_loss()
            assert model_loss is not None, "No Loss specified, and the model does not have one implemented."
            if isinstance(model_loss, tuple):
                # a tuple is read as (targets, expression)
                self.loss_targets = raise_to_list(model_loss[0])
                self.loss_expression = model_loss[1]
            else:
                self.loss_targets = None
                self.loss_expression = model_loss

        model_inputs = raise_to_list(model.get_inputs())
        n_model_inputs = len(model_inputs)

        # Targets that are also model inputs are not counted as targets.
        # NOTE: when self.loss_targets is a non-empty list, `model_targets` is that very same list object, so
        # the removals below also modify self.loss_targets in place.
        model_targets = self.loss_targets or []
        for model_input in model_inputs:
            if model_input in model_targets:
                model_targets.remove(model_input)

        n_model_targets = len(model_targets)
        # Compare by value: an identity check (`is 0`) against an int literal only works by accident of
        # the interpreter's small-integer caching.
        self.unsupervised = (n_model_targets == 0)
        # make sure the number of inputs/targets matches up with the dataset properties
        # train
        assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \
            "Dataset has %d train inputs, while model expects %d" % \
            (len(raise_to_list(dataset.train_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \
                "Dataset has %d train targets, while model expects %d" % \
                (len(raise_to_list(dataset.train_targets) or []), n_model_targets)
        # valid (only checked when the dataset provides a validation split)
        if dataset.valid_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \
                "Dataset has %d valid inputs, while model expects %d" % \
                (len(raise_to_list(dataset.valid_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \
                    "Dataset has %d valid targets, while model expects %d" % \
                    (len(raise_to_list(dataset.valid_targets) or []), n_model_targets)
        # test (only checked when the dataset provides a test split)
        if dataset.test_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \
                "Dataset has %d test inputs, while model expects %d" % \
                (len(raise_to_list(dataset.test_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \
                    "Dataset has %d test targets, while model expects %d" % \
                    (len(raise_to_list(dataset.test_targets) or []), n_model_targets)

        # now we are happy, we can add them to `self`
        self.model = model
        self.dataset = dataset
        self.loss = loss

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        # whether to scale individual model parameters' learning rates.
        self.lr_scalers = self.model.get_lr_scalers()
        # whether to decay
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          learning_rate,
                                                          lr_decay_factor)
        else:
            self.learning_rate_decay = False

        # rest of initial parameters needed for training.
        self.batch_size = batch_size
        self.min_batch_size = min_batch_size
        self.n_epoch = epochs
        self.save_frequency = save_freq
        self.early_stop_threshold = stop_threshold
        self.early_stop_length = stop_patience
        self.grad_clip = grad_clip
        self.hard_clip = hard_clip
    def __init__(self, dataset, loss, model=None,
                 epochs=10, batch_size=100, min_batch_size=1,
                 save_freq=None, stop_threshold=None, stop_patience=None,
                 learning_rate=.1, lr_decay="exponential", lr_decay_factor=.995,
                 momentum=0.5, momentum_decay="linear", momentum_factor=0, nesterov_momentum=True,
                 grad_clip=None, hard_clip=False):
        """
        Set up stochastic gradient descent with optional (Nesterov) momentum.

        All constructor arguments are forwarded to the parent initializer; this method then adds the
        momentum-related attributes ``momentum``, ``momentum_decay`` and ``nesterov_momentum``.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` that provides the training data.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` used to compare the model against a 'target' result.
        model : Model
            The :class:`opendeep.models.Model` being trained. Required unless this optimizer is handed to a
            Model's .train() method.
        epochs : int
            Number of training passes over the dataset.
        batch_size : int
            Number of training examples processed in parallel.
        min_batch_size : int
            Smallest number of examples needed at a time (greater than 1 for things like time series).
        save_freq : int
            Number of epochs between saves of the Model's parameters.
        stop_threshold : float
            Factor by which the best validation score has to improve, used to decide on early stopping.
        stop_patience : int
            Number of epochs to wait once `stop_threshold` has been reached before stopping.
        learning_rate : float
            Multiplier applied to the gradients when adjusting parameters.
        lr_decay : str
            Kind of decay function applied to the learning rate over epochs. See `opendeep.utils.decay`.
        lr_decay_factor : float
            Amount handed to the learning rate decay function. See `opendeep.utils.decay` for its effect.
        momentum : float
            Momentum used during gradient updates. A falsy value disables momentum (stored as 0).
        momentum_decay : str
            Kind of decay function applied to the momentum over epochs. See `opendeep.utils.decay`.
            None or False disables momentum decay.
        momentum_factor : float
            Amount handed to the momentum decay function. See `opendeep.utils.decay` for its effect.
            None disables momentum decay.
        nesterov_momentum : bool
            Whether Nesterov momentum should be used.
        grad_clip : float, optional
            Whether to clip gradients. This will clip with a maximum of grad_clip or the parameter norm.
        hard_clip : bool
            Whether clipping uses a hard cutoff or rescaling.
        """
        # Snapshot the constructor arguments before any other local name exists, drop `self`, and pass
        # everything on to the parent initializer as keyword arguments.
        init_kwargs = dict(locals())
        del init_kwargs['self']
        super(SGD, self).__init__(**init_kwargs)

        # Momentum - smoothing over the parameter changes (see Hinton)
        if not momentum:
            # momentum switched off: plain number, nothing to decay
            self.momentum = 0
            self.momentum_decay = False
        else:
            self.momentum = sharedX(momentum, 'momentum')
            # decay needs both a decay type and a factor
            decay_disabled = (momentum_decay is None or
                              momentum_decay is False or
                              momentum_factor is None)
            if decay_disabled:
                self.momentum_decay = False
            else:
                starting_momentum = self.momentum.get_value()
                self.momentum_decay = get_decay_function(momentum_decay,
                                                         self.momentum,
                                                         starting_momentum,
                                                         momentum_factor)

        self.nesterov_momentum = nesterov_momentum
Example #10
0
    def __init__(self,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 outdir='outputs/rnn/',
                 input_size=None,
                 hidden_size=None,
                 output_size=None,
                 layers=1,
                 activation='sigmoid',
                 hidden_activation='relu',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity',
                 r_weights_interval='montreal',
                 r_weights_mean=0,
                 r_weights_std=5e-3,
                 r_bias_init=0.0,
                 cost_function='mse',
                 cost_args=None,
                 noise='dropout',
                 noise_level=None,
                 noise_decay=False,
                 noise_decay_amount=.99,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Build a simple recurrent network.

        All constructor arguments are forwarded to the parent initializer. The settings are then stored on
        the instance, the symbolic input and target variables are prepared, and the computation graph is
        built through ``self.build_computation_graph()``.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information that lets the model take its input from elsewhere, which is how different
            models get linked together (e.g. feeding a DAE's hidden layer into a Softmax model's input layer
            yields a supervised classifier). For now the shape information has to be included (normally the
            input dimensionality, i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information that lets the model take its hidden representation from elsewhere. For
            recurrent nets this is the initial value of the hidden layers.
        params_hook : List(theano shared variable)
            Model parameters (shared theano variables) to use when constructing this model instead of
            creating new ones. Useful for having two versions of a model that share parameters.
        outdir : str
            Where outputs from training or running the :class:`RNN` are written. If None, nothing is saved.
        input_size : int
            Dimensionality of the input. Optional if the shape is given through `inputs_hook`.
        hidden_size : int
            Dimensionality of the hidden layers. Optional if the shape is given through `hiddens_hook`.
        output_size : int
            Dimensionality of the output.
        layers : int
            How many hidden layers to stack.
        activation : str or callable
            The (non)linear activation applied after the dot product from hiddens to the output layer. It
            should suit the output unit type, e.g. 'sigmoid' for binary. See opendeep.utils.activation for the
            available functions; any callable may be passed instead.
        hidden_activation : str or callable
            The activation applied in the hidden layers. See opendeep.utils.activation for the available
            functions; any callable may be passed instead.
        mrg : random
            Random number generator used when adding noise. Theano's sandbox.rng_mrg.MRG_RandomStreams is
            the recommended choice.
        weights_init : str
            How the model weights are initialized. See opendeep.utils.nnet for options.
        weights_interval : str or float
            The +- interval used with a Uniform `weights_init`. See opendeep.utils.nnet for options.
        weights_mean : float
            The mean used with a Gaussian `weights_init`.
        weights_std : float
            The standard deviation used with a Gaussian `weights_init`.
        bias_init : float
            Starting value of the bias parameter. The default of 0.0 is usually preferred.
        r_weights_init : str
            How the recurrent weights are initialized. See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            The +- interval used with a Uniform `r_weights_init`. See opendeep.utils.nnet for options.
        r_weights_mean : float
            The mean used with a Gaussian `r_weights_init`.
        r_weights_std : float
            The standard deviation used with a Gaussian `r_weights_init`.
        r_bias_init : float
            Starting value of the recurrent bias parameter. The default of 0.0 is usually preferred.
        cost_function : str or callable
            The function that computes the output cost of the model. See opendeep.utils.cost for options;
            any callable may be passed instead.
        cost_args : dict
            Extra named keyword arguments handed to `cost_function`. A falsy value becomes an empty dict.
        noise : str
            The kind of noise applied to the hidden layers and outputs. See opendeep.utils.noise for options.
            It should suit the unit activation, e.g. Gaussian for tanh or other real-valued activations.
            A falsy value disables noise entirely.
        noise_level : float
            How much noise the function selected by `noise` applies: the standard deviation for gaussian
            noise, the interval for uniform noise, the dropout amount, etc.
        noise_decay : str or False
            Whether to schedule the noise (decay `noise_level` over the course of training) and, if so, which
            type of decay to use. See opendeep.utils.decay for options. Noise scheduling helps the model learn
            larger variance features first and smaller ones later (almost a kind of curriculum learning), and
            may help it converge faster. Only takes effect when `noise_level` is given.
        noise_decay_amount : float
            How much `noise_level` is reduced after each training epoch by the decay function selected in
            `noise_decay`.
        direction : str
            The direction in which the model walks over its inputs: 'forward', 'backward', or
            'bidirectional'. With 'bidirectional', two passes are made over the sequence and the two sets of
            hiddens are merged before the final decoder.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients of the parameters that unroll over timesteps (the weights from
            previous hidden states to the current hidden state, not the weights from current input to
            hiddens). If a float, those gradients are hard clipped to the range `+-clip_recurrent_grads`.

        Raises
        ------
        AssertionError
            When asserting various properties of input parameters. See error messages.
        NotImplementedError
            When the variable supplied through `inputs_hook` has a number of dimensions that is not handled.
        """
        # Snapshot the constructor arguments before any other local name exists, drop `self`, and pass
        # everything on to the parent initializer as keyword arguments.
        init_kwargs = dict(locals())
        del init_kwargs['self']
        super(RNN, self).__init__(**init_kwargs)

        # ---- plain settings kept on the instance ----
        self.layers = layers
        self.noise = noise
        self.direction = direction
        self.backward = (direction == "backward")
        self.bidirectional = (direction == "bidirectional")

        # input -> hidden weights
        self.weights_init = weights_init
        self.weights_interval = weights_interval
        self.weights_mean = weights_mean
        self.weights_std = weights_std

        # hidden -> hidden (recurrent) weights
        self.r_weights_init = r_weights_init
        self.r_weights_interval = r_weights_interval
        self.r_weights_mean = r_weights_mean
        self.r_weights_std = r_weights_std

        # biases
        self.bias_init = bias_init
        self.r_bias_init = r_bias_init

        # ---- activation and cost functions ----
        # activation used by the recurrent hidden layers
        self.hidden_activation_func = get_activation_function(hidden_activation)
        # activation used by the output layer
        self.activation_func = get_activation_function(activation)
        # cost function together with its extra keyword arguments
        self.cost_function = get_cost_function(cost_function)
        self.cost_args = cost_args or dict()

        # ---- noise (only when a noise type was requested) ----
        if self.noise:
            log.debug('Adding %s noise switch.', str(noise))
            level_given = noise_level is not None
            noise_kwargs = {'mrg': mrg}
            if level_given:
                # keep the level in a shared variable so that it can be decayed later on
                noise_level = sharedX(value=noise_level)
                noise_kwargs['noise_level'] = noise_level
            self.noise_func = get_noise(noise, **noise_kwargs)
            # The noise is applied through a switch that starts out enabled; this is for the cost and
            # gradient functions that get computed later.
            # (not sure whether the gradient really depends on the initial value of the switch)
            self.noise_switch = sharedX(value=1,
                                        name="basiclayer_noise_switch")

            # noise scheduling needs both a decay type and an explicit noise level
            if noise_decay and level_given:
                starting_level = noise_level.get_value()
                self.noise_schedule = get_decay_function(noise_decay,
                                                         noise_level,
                                                         starting_level,
                                                         noise_decay_amount)

        # ---- input variable ----
        # Recurrent computation always treats the leading tensor dimension as time and works on a 3D
        # tensor of (timesteps, batch_size, data_dim).
        if self.inputs_hook is None:
            # Input coming from the optimizer is assumed to be (batches, timesteps, data), so swap the
            # first two axes to get (timesteps, batches, data).
            self.input = T.tensor3("Xs").dimshuffle(1, 0, 2)
        else:
            self.input = self.inputs_hook[1]
            n_dims = self.input.ndim

            if n_dims == 1:
                # only a time axis: add batch and data axes of length 1
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'),
                                           [1, 2])
                self.input_size = 1
            elif n_dims == 2:
                # assumed to be (timesteps, data_dim), i.e. batch_size is 1: insert the batch axis
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)
            elif n_dims == 3:
                # already (timesteps, batch_size, data_dim)
                pass
            elif n_dims > 3:
                # assumed to be (timesteps, batch_size, data...): flatten down to 3D
                self.input = self.input.flatten(3)
                # NOTE(review): this sums the entries of self.input_size, which presumably holds the
                # sizes of the trailing dimensions - confirm that a sum (and not a product) is intended.
                self.input_size = sum(self.input_size)
            else:
                raise NotImplementedError(
                    "Recurrent input with %d dimensions not supported!" %
                    self.input.ndim)

        # ---- target variable ----
        # Targets for supervised training arrive as (batches, timesteps, output), the same axis order as
        # the input from the optimizer, so they get the same swap of the first two axes.
        self.target = T.tensor3("Ys").dimshuffle(1, 0, 2)

        # ---- hidden state ----
        if self.hiddens_hook is None:
            # h_init is dealt with after the parameters are made (it has to match the computed hiddens)
            self.hidden_size = hidden_size
        else:
            # the hook supplies both the initial hidden value and the hidden size
            self.h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]

        # ---- generation ----
        # symbolic scalar holding the number of recurrent steps to run when generating from the model
        self.n_steps = T.iscalar("generate_n_steps")

        # ---- build the graph ----
        graph_parts = self.build_computation_graph()
        self.output, self.hiddens, self.updates, self.cost, self.params = graph_parts
Example #11
0
    def __init__(self,
                 inputs_hook=None,
                 hiddens_hook=None,
                 params_hook=None,
                 outdir='outputs/gru/',
                 input_size=None,
                 hidden_size=None,
                 output_size=None,
                 activation='sigmoid',
                 hidden_activation='relu',
                 inner_hidden_activation='sigmoid',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity',
                 r_weights_interval='montreal',
                 r_weights_mean=0,
                 r_weights_std=5e-3,
                 r_bias_init=0.0,
                 cost_function='mse',
                 cost_args=None,
                 noise='dropout',
                 noise_level=None,
                 noise_decay=False,
                 noise_decay_amount=.99,
                 forward=True,
                 clip_recurrent_grads=False):
        """
        Initialize a simple recurrent network.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in).
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets,
            this will be the initial starting value for hidden layers.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters.
        outdir : str
            The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be saved.
        input_size : int
            The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional.
        hidden_size : int
            The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional.
        output_size : int
            The size (dimensionality) of the output.
        activation : str or callable
            The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The activation to perform for the hidden units.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        inner_hidden_activation : str or callable
            The activation to perform for the hidden gates.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the output cost of the model.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
        noise : str
            What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise
            for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc.
        noise_level : float
            The amount of noise to use for the noise function specified by `hidden_noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
        noise_decay : str or False
            Whether to use `noise` scheduling (decay `noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
        noise_decay_amount : float
            The amount to reduce the `noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        forward : bool
            The direction this recurrent model should go over its inputs. True means forward, False mean backward.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
            connecting previous hidden states to the current hidden state, and not the weights from current
            input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
            `+-clip_recurrent_grads`.
        """
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(GRU, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################

        #########################################
        # activation, cost, and noise functions #
        #########################################
        # recurrent hidden activation function!
        self.hidden_activation_func = get_activation_function(
            hidden_activation)
        self.inner_hidden_activation_func = get_activation_function(
            inner_hidden_activation)

        # output activation function!
        activation_func = get_activation_function(activation)

        # Cost function
        cost_function = get_cost_function(cost_function)
        cost_args = cost_args or dict()

        # Now deal with noise if we added it:
        if noise:
            log.debug('Adding %s noise switch.' % str(noise))
            if noise_level is not None:
                noise_level = sharedX(value=noise_level)
                noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.noise_switch = sharedX(value=1, name="gru_noise_switch")

            # noise scheduling
            if noise_decay and noise_level is not None:
                self.noise_schedule = get_decay_function(
                    noise_decay, noise_level, noise_level.get_value(),
                    noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # grab info from the inputs_hook
        # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
        # being the temporal dimension.
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]

            if self.input.ndim == 1:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'),
                                           [1, 2])
                self.input_size = 1

            elif self.input.ndim == 2:
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

            elif self.input.ndim > 3:
                self.input = self.input.flatten(3)
                self.input_size = sum(self.input_size)
            else:
                raise NotImplementedError(
                    "Recurrent input with %d dimensions not supported!" %
                    self.input.ndim)
            xs = self.input
        else:
            # Assume input coming from optimizer is (batches, timesteps, data)
            # so, we need to reshape to (timesteps, batches, data)
            self.input = T.tensor3("Xs")
            xs = self.input.dimshuffle(1, 0, 2)

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        self.target = T.tensor3("Ys")
        ys = self.target.dimshuffle(1, 0, 2)

        ################
        # hiddens hook #
        ################
        # set an initial value for the recurrent hiddens from hook
        if self.hiddens_hook is not None:
            h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]
        else:
            # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
            self.hidden_size = hidden_size

        ##################
        # for generating #
        ##################
        # symbolic scalar for how many recurrent steps to use during generation from the model
        self.n_steps = T.iscalar("generate_n_steps")

        ####################################################
        # parameters - make sure to deal with params_hook! #
        ####################################################
        if self.params_hook is not None:
            (W_x_z, W_x_r, W_x_h, U_h_z, U_h_r, U_h_h, W_h_y, b_z, b_r, b_h,
             b_y) = self.params_hook
            recurrent_params = [U_h_z, U_h_r, U_h_h]
        # otherwise, construct our params
        else:
            # all input-to-hidden weights
            W_x_z, W_x_r, W_x_h = [
                get_weights(
                    weights_init=weights_init,
                    shape=(self.input_size, self.hidden_size),
                    name="W_x_%s" % sub,
                    # if gaussian
                    mean=weights_mean,
                    std=weights_std,
                    # if uniform
                    interval=weights_interval) for sub in ['z', 'r', 'h']
            ]
            # all hidden-to-hidden weights
            U_h_z, U_h_r, U_h_h = [
                get_weights(
                    weights_init=r_weights_init,
                    shape=(self.hidden_size, self.hidden_size),
                    name="U_h_%s" % sub,
                    # if gaussian
                    mean=r_weights_mean,
                    std=r_weights_std,
                    # if uniform
                    interval=r_weights_interval) for sub in ['z', 'r', 'h']
            ]
            # hidden-to-output weights
            W_h_y = get_weights(
                weights_init=weights_init,
                shape=(self.hidden_size, self.output_size),
                name="W_h_y",
                # if gaussian
                mean=weights_mean,
                std=weights_std,
                # if uniform
                interval=weights_interval)
            # biases
            b_z, b_r, b_h = [
                get_bias(shape=(self.hidden_size, ),
                         name="b_%s" % sub,
                         init_values=r_bias_init) for sub in ['z', 'r', 'h']
            ]
            # output bias
            b_y = get_bias(shape=(self.output_size, ),
                           name="b_y",
                           init_values=bias_init)
            # clip gradients if we are doing that
            recurrent_params = [U_h_z, U_h_r, U_h_h]
            if clip_recurrent_grads:
                clip = abs(clip_recurrent_grads)
                U_h_z, U_h_r, U_h_h = [
                    theano.gradient.grad_clip(p, -clip, clip)
                    for p in recurrent_params
                ]

        # put all the parameters into our list, and make sure it is in the same order as when we try to load
        # them from a params_hook!!!
        self.params = [W_x_z, W_x_r, W_x_h
                       ] + recurrent_params + [W_h_y, b_z, b_r, b_h, b_y]

        # make h_init the right sized tensor
        if not self.hiddens_hook:
            h_init = T.zeros_like(T.dot(xs[0], W_x_h))

        ###############
        # computation #
        ###############
        # move some computation outside of scan to speed it up!
        x_z = T.dot(xs, W_x_z) + b_z
        x_r = T.dot(xs, W_x_r) + b_r
        x_h = T.dot(xs, W_x_h) + b_h

        # now do the recurrent stuff
        self.hiddens, self.updates = theano.scan(
            fn=self.recurrent_step,
            sequences=[x_z, x_r, x_h],
            outputs_info=[h_init],
            non_sequences=[U_h_z, U_h_r, U_h_h],
            go_backwards=not forward,
            name="gru_scan",
            strict=True)

        # add noise (like dropout) if we wanted it!
        if noise:
            self.hiddens = T.switch(self.noise_switch,
                                    noise_func(input=self.hiddens),
                                    self.hiddens)

        # now compute the outputs from the leftover (top level) hiddens
        self.output = activation_func(T.dot(self.hiddens, W_h_y) + b_y)

        # now to define the cost of the model - use the cost function to compare our output with the target value.
        self.cost = cost_function(output=self.output, target=ys, **cost_args)

        log.info("Initialized a GRU!")
Example #12
0
    def __init__(self, model, dataset,
                 n_epoch=1000, batch_size=100, minimum_batch_size=1,
                 save_frequency=10, early_stop_threshold=.9995, early_stop_length=30,
                 learning_rate=1e-3, lr_decay='exponential', lr_factor=1,
                 **kwargs):
        """
        Set up the Optimizer's training configuration for a model/dataset pair.

        All constructor arguments (together with any extra keyword arguments) are recorded in
        ``self.args`` and logged. The learning rate is wrapped with ``sharedX`` and, when `lr_decay`
        is truthy, a decay function is built for it with ``get_decay_function``.

        Parameters
        ----------
        model : Model
            The Model to train. Asserted to be an opendeep Model instance.
        dataset : Dataset
            The Dataset to use when training the Model. Asserted to be an opendeep Dataset instance.
        n_epoch : int
            Number of training iterations over the dataset.
        batch_size : int
            Number of examples from the training dataset to use in parallel.
        minimum_batch_size : int
            Fewest examples required at a time (for things like time series, this would be > 1).
        save_frequency : int
            Number of epochs to train between saves of the Model's parameters.
            None is replaced with 1000000.
        early_stop_threshold : float
            Factor by which the best validation training score needs to improve to determine
            early stopping. None is replaced with 1.
        early_stop_length : int
            Patience: number of epochs to wait after `early_stop_threshold` has been reached before
            stopping. None is replaced with 100.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            Type of decay function for changing the learning rate over epochs (see
            `opendeep.utils.decay`). A falsy value sets ``self.learning_rate_decay`` to False.
        lr_factor : float
            Amount passed to the decay function when changing the learning rate over epochs.
        kwargs : dict
            Extra keyword arguments; merged into ``self.args`` through ``add_kwargs_to_dict``.
        """
        log.info("Initializing optimizer %s", str(type(self)))

        # Substitute fallback values for the settings that were explicitly given as None.
        early_stop_threshold = 1. if early_stop_threshold is None else early_stop_threshold
        save_frequency = 1000000 if save_frequency is None else save_frequency
        early_stop_length = 100 if early_stop_length is None else early_stop_length

        # Snapshot the constructor arguments. locals() is evaluated before `snapshot` is bound,
        # so the snapshot holds exactly the parameters (with the fallbacks applied above).
        snapshot = locals().copy()
        del snapshot['self']
        kwargs = snapshot.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, snapshot)
        log.info("optimizer config args: %s", str(self.args))

        # Type-check the two required collaborators before storing them.
        assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"
        self.model, self.dataset = model, dataset

        # Learning rate - kept as a shared variable so the decay function can be handed the variable.
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        # False marks "no decay configured".
        self.learning_rate_decay = get_decay_function(
            lr_decay,
            self.learning_rate,
            self.learning_rate.get_value(),
            lr_factor
        ) if lr_decay else False

        # Noise switches reported by the model, always stored as a list.
        self.noise_switches = raise_to_list(self.model.get_noise_switch())

        # Remaining plain training settings.
        self.batch_size = batch_size
        self.minimum_batch_size = minimum_batch_size
        self.n_epoch = n_epoch
        self.save_frequency = save_frequency
        self.early_stop_threshold = early_stop_threshold
        self.early_stop_length = early_stop_length
Example #13
0
    def __init__(self, dataset, loss=None, model=None,
                 epochs=1000, batch_size=100, min_batch_size=1,
                 save_freq=10, stop_threshold=None, stop_patience=50,
                 learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
                 grad_clip=None, hard_clip=False,
                 **kwargs):
        """
        Initialize the Optimizer.

        The constructor arguments are always recorded in ``self.args`` and logged. If no `model` is
        given, initialization stops there (no other attribute is set). Otherwise the model, dataset and
        loss are validated against each other and the training settings are stored on ``self``.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` to use when training the Model.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
            If None, the loss is taken from ``model.get_loss()``.
        model : Model
            The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
            Model's .train() method.
        epochs : int
            How many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int, optional
            How many epochs to train between each new save of the Model's parameters.
            A falsy value is replaced with 1000000.
        stop_threshold : float, optional
            The factor by how much the best validation training score needs to improve to determine early stopping.
            A falsy value is replaced with ``numpy.inf``.
        stop_patience : int, optional
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
            A falsy value is replaced with 1.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for classes of decay and documentation. A falsy value disables decay.
        lr_decay_factor : float
            The amount of decay to use for the ``lr_decay`` type of decay.
        grad_clip : float, optional
            Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff or rescaling.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        kwargs : dict
            Extra keyword arguments; merged into ``self.args`` through ``add_kwargs_to_dict``.

        Raises
        ------
        AssertionError
            If `model`, `dataset` or `loss` has the wrong type, if neither `loss` nor the model provides a
            loss, or if the number of model inputs/targets does not match the dataset.
        """
        log.info("Initializing optimizer %s", str(self.__class__.__name__))

        # Deal with early stopping None initializations (no early stopping).
        if not stop_threshold:
            stop_threshold = numpy.inf
        if not save_freq:
            save_freq = 1000000
        if not stop_patience:
            stop_patience = 1

        # Put all init parameters in self.args so we can log the initial configuration.
        # NOTE: locals() must be captured before any other local name is bound in this method,
        # otherwise that name would end up in self.args.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("Optimizer config args: %s", str(self.args))
        # if the optimizer wasn't initialized with a Model (train() being called from the model class itself),
        # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there
        # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
        # the best option because other methods besides stochastic ones can exist for optimizers in the future.
        # TODO: fix this up - feels like a hack just to make model.train() work...
        if not model:
            return
        # Otherwise, things are proceeding as normal. Carry on...

        assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                         "Found %s" % str(model.__class__.__name__)
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                             "Found %s" % str(dataset.__class__.__name__)
        # deal with loss expression/targets
        if loss is not None:
            assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                           "Found %s" % str(loss.__class__.__name__)
        if isinstance(loss, Loss):
            self.loss_targets = loss.get_targets()
            self.loss_expression = loss.get_loss()
        else:
            # fall back to the loss defined by the model itself
            assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented."
            if isinstance(model.get_loss(), tuple):
                # tuple form is treated as (targets, expression)
                self.loss_targets = raise_to_list(model.get_loss()[0])
                self.loss_expression = model.get_loss()[1]
            else:
                self.loss_targets = None
                self.loss_expression = model.get_loss()

        model_inputs = raise_to_list(model.get_inputs())
        n_model_inputs = len(model_inputs)

        # NOTE: when self.loss_targets is a non-empty list, model_targets is that same list object,
        # so removing entries below also removes them from self.loss_targets.
        model_targets = self.loss_targets or []
        # a target that is also a model input does not count as a separate target
        for model_input in model_inputs:
            if model_input in model_targets:
                model_targets.remove(model_input)

        n_model_targets = len(model_targets)
        # compare by value: identity checks against int literals are implementation-dependent
        self.unsupervised = (n_model_targets == 0)
        # make sure the number of inputs/targets matches up with the dataset properties
        # train
        assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \
            "Dataset has %d train inputs, while model expects %d" % \
            (len(raise_to_list(dataset.train_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \
                "Dataset has %d train targets, while model expects %d" % \
                (len(raise_to_list(dataset.train_targets) or []), n_model_targets)
        # valid
        if dataset.valid_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \
                "Dataset has %d valid inputs, while model expects %d" % \
                (len(raise_to_list(dataset.valid_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \
                    "Dataset has %d valid targets, while model expects %d" % \
                    (len(raise_to_list(dataset.valid_targets) or []), n_model_targets)
        # test
        if dataset.test_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \
                "Dataset has %d test inputs, while model expects %d" % \
                (len(raise_to_list(dataset.test_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \
                    "Dataset has %d test targets, while model expects %d" % \
                    (len(raise_to_list(dataset.test_targets) or []), n_model_targets)

        # now we are happy, we can add them to `self`
        self.model = model
        self.dataset = dataset
        self.loss = loss

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        # whether to scale individual model parameters' learning rates.
        self.lr_scalers = self.model.get_lr_scalers()
        # whether to decay (False marks "no decay configured")
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          learning_rate,
                                                          lr_decay_factor)
        else:
            self.learning_rate_decay = False

        # rest of initial parameters needed for training.
        self.batch_size = batch_size
        self.min_batch_size = min_batch_size
        self.n_epoch = epochs
        self.save_frequency = save_freq
        self.early_stop_threshold = stop_threshold
        self.early_stop_length = stop_patience
        self.grad_clip = grad_clip
        self.hard_clip = hard_clip
Example #14
0
    def __init__(self,
                 model,
                 dataset,
                 iterator_class=SequentialIterator,
                 config=None,
                 defaults=_defaults,
                 rng=None,
                 n_epoch=None,
                 batch_size=None,
                 minimum_batch_size=None,
                 save_frequency=None,
                 early_stop_threshold=None,
                 early_stop_length=None,
                 learning_rate=None,
                 lr_decay=None,
                 lr_factor=None,
                 momentum=None,
                 momentum_decay=None,
                 momentum_factor=None,
                 nesterov_momentum=None,
                 flag_para_load=None):
        """
        Initialize SGD: resolve hyperparameters, build the updates and compile the training function.

        Each hyperparameter is taken from its keyword argument when that argument is truthy, and from
        ``self.args`` (config and defaults combined by the superclass) otherwise. Note that this means a
        falsy explicit value (0, False, None, '') falls back to the ``self.args`` entry.

        Parameters
        ----------
        model
            The model to train. It is queried for lr scalers, params, train cost, updates, inputs
            and monitors.
        dataset
            The dataset to train on; stored as ``self.dataset``.
        iterator_class
            Stored as ``self.iterator``.
        config
            Configuration passed to the superclass initializer.
        defaults
            Default configuration passed to the superclass initializer.
        rng
            Random number generator for working on a random iterator. If None, the global ``random``
            module is seeded with 123 and used.
        n_epoch
            Training epochs - how many times to iterate over the whole dataset.
        batch_size
            Number of examples in each calculation.
        minimum_batch_size
            Minimum dataset iteration batch size.
        save_frequency
            Number of epochs between saving model parameters.
        early_stop_threshold
            Early stopping threshold - by how much the cost has to improve over a number of epochs.
        early_stop_length
            Early stopping patience (number of epochs).
        learning_rate
            Learning rate - how drastic of a step the parameters change. Stored as a shared variable.
        lr_decay
            Type of decay for the learning rate. ``self.learning_rate_decay`` is only created when
            a decay type is configured.
        lr_factor
            Amount handed to the learning rate decay function.
        momentum
            Momentum - smoothing over the parameter changes. Stored as a shared variable.
        momentum_decay
            Type of decay for the momentum. ``self.momentum_decay`` is only created when a decay
            type is configured.
        momentum_factor
            Amount handed to the momentum decay function.
        nesterov_momentum
            Stored as ``self.nesterov_momentum``.
        flag_para_load
            Not used by this constructor.

        Raises
        ------
        AssertionError
            If the compiled training function has a number of non-shared inputs other than 1 or 2.
        """
        # superclass init
        super(SGD, self).__init__(config=config, defaults=defaults)
        # config and defaults are now combined in self.args! yay!

        self.model = model
        self.dataset = dataset
        self.iterator = iterator_class

        # Training epochs - how many times to iterate over the whole dataset
        self.n_epoch = n_epoch or self.args.get('n_epoch')

        # Dataset iteration batch sizes - number of examples in each calculation
        self.batch_size = batch_size or self.args.get('batch_size')
        self.minimum_batch_size = minimum_batch_size or self.args.get(
            'minimum_batch_size')

        # Number of epochs between saving model parameters
        self.save_frequency = save_frequency or self.args.get('save_frequency')

        # Early stopping threshold and patience - by how much does the cost have to improve over a number of epochs
        self.early_stop_threshold = early_stop_threshold or self.args.get(
            'early_stop_threshold')
        self.early_stop_length = early_stop_length or self.args.get(
            'early_stop_length')

        # Learning rate - how drastic of a step do the parameters change
        lr = learning_rate or self.args.get('learning_rate')
        self.learning_rate = sharedX(lr, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if lr_decay or self.args.get('lr_decay'):
            self.learning_rate_decay = get_decay_function(
                lr_decay or self.args.get('lr_decay'), self.learning_rate,
                self.learning_rate.get_value(), lr_factor
                or self.args.get('lr_factor'))

        # Momentum - smoothing over the parameter changes (see Hinton)
        self.momentum = sharedX(momentum or self.args.get('momentum'),
                                'momentum')
        # Honor an explicitly passed momentum_decay as well as the configured one
        # (same resolution rule as the learning rate decay above).
        if momentum_decay or self.args.get('momentum_decay'):
            self.momentum_decay = get_decay_function(
                momentum_decay or self.args.get('momentum_decay'),
                self.momentum, self.momentum.get_value(), momentum_factor
                or self.args.get('momentum_factor'))
        self.nesterov_momentum = nesterov_momentum or self.args.get(
            'nesterov_momentum')

        # RNG for working on random iterator
        if rng is None:
            # NOTE: this seeds the process-wide `random` module, not a private generator.
            random.seed(123)
            self.rng = random
        else:
            self.rng = rng

        self.params = self.model.get_params()

        # Now create the training cost function for the model to use while training - update parameters
        log.info("%s params: %s", str(type(self.model)), str(self.params))
        # gradient!
        gradient = grad(self.model.get_train_cost(), self.params)
        grads = OrderedDict(zip(self.params, gradient))

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(grads)

        # Combine the updates from the model also if applicable
        train_updates = model.get_updates()
        if train_updates:
            train_updates.update(gradient_updates)
        else:
            train_updates = gradient_updates

        # Compile the training function!
        log.info('Compiling f_learn function for model %s...',
                 str(type(self.model)))
        t = time.time()
        self.f_learn = function(inputs=model.get_inputs(),
                                updates=train_updates,
                                outputs=self.model.get_train_cost(),
                                name='f_learn')
        log.info('f_learn compilation took %s',
                 make_time_units_string(time.time() - t))

        # Determine if this function is unsupervised or not by looking at the number of inputs to the f_learn function.
        # If there is only one input, it is unsupervised, otherwise, it is supervised.
        # This workaround was provided by Pascal Lamblin on the theano-users google group
        num_inputs = len(
            [i for i in self.f_learn.maker.inputs if not i.shared])
        if num_inputs == 1:
            log.debug("Model is unsupervised: 1 input to f_learn.")
            self.unsupervised = True
        elif num_inputs == 2:
            log.debug("Model is supervised: 2 inputs to f_learn.")
            self.unsupervised = False
        else:
            log.error(
                "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised.",
                str(type(self.model)), str(num_inputs))
            # Both placeholders need a tuple on the right of `%`; with a single value the
            # formatting itself fails with TypeError before the AssertionError is raised.
            raise AssertionError(
                "Number of inputs to f_learn on model %s was %s. Needs to be 1 for unsupervised or 2 for supervised."
                % (str(type(self.model)), str(num_inputs)))

        # grab the function(s) to use to monitor different model values during training
        self.monitors = self.model.get_monitors()
Example #15
0
    def __init__(self, inputs=None,
                 noise='dropout', noise_level=0.5, noise_decay=False, noise_decay_amount=0.99,
                 mrg=RNG_MRG.MRG_RandomStreams(1), switch=True):
        """
        Build a layer whose output is its input with noise applied.

        Parameters
        ----------
        inputs : tuple(shape, `Theano.TensorType`)
            The (shape, variable) pair describing the input to this layer. `shape` is a tuple with one
            entry per dimension of the `Theano.TensorType`: an integer where the size is known, None
            where it is not. For a matrix with unknown batch size and a fixed feature size of 784,
            `shape` would be (None, 784) and the full form of `inputs` would be
            [((None, 784), <TensorType(float32, matrix)>)].
        noise : str
            The type of noise to apply to the output; see opendeep.utils.noise for options. It should
            suit the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc.
        noise_level : float
            How much noise the function chosen by `noise` applies. This could be the standard deviation
            for gaussian noise, the interval for uniform noise, the dropout amount, etc. When None,
            the noise function is built without a `noise_level` argument.
        noise_decay : str or False
            Whether to schedule the noise (decay `noise_level` over the course of training); a string
            names the decay type, see opendeep.utils.decay for options. Noise scheduling effectively
            helps the model learn larger variance features first and smaller ones later (almost a kind
            of curriculum learning), and may help it converge faster.
        noise_decay_amount : float
            How much to reduce `noise_level` after each training epoch, according to the decay function
            chosen by `noise_decay`.
        mrg : random
            The random number generator used when adding noise.
            Theano's sandbox.rng_mrg.MRG_RandomStreams is recommended.
        switch : boolean
            True creates a switch so noise can be on during training and off during testing. False
            applies the noise at both training and testing times.
        """
        super(Noise, self).__init__(
            inputs=inputs,
            outputs=inputs[0],
            noise=noise,
            noise_level=noise_level,
            noise_decay=noise_decay,
            noise_decay_amount=noise_decay_amount,
            mrg=mrg,
            switch=switch
        )
        # the superclass leaves a list in self.inputs; keep only the variable of its first entry
        first_input = self.inputs[0]
        self.inputs = first_input[1]
        log.debug('Adding %s noise switch.' % str(noise))

        # collect the arguments for the noise function; a noise level becomes a shared variable
        noise_kwargs = {}
        if noise_level is not None:
            noise_level = sharedX(value=noise_level)
            noise_kwargs['noise_level'] = noise_level
        noise_kwargs['mrg'] = mrg
        noise_func = get_noise(noise, **noise_kwargs)

        # the switch starts at 1 (noise applied) so the cost and gradient functions computed later
        # see the noisy path (original author was unsure whether the initial value matters)
        if switch:
            self.noise_switch = sharedX(value=1, name="noise_switch")

        # noise scheduling - only possible when there is a shared noise level to decay
        if noise_decay:
            if noise_level is not None:
                self.noise_schedule = get_decay_function(
                    noise_decay,
                    noise_level,
                    noise_level.get_value(),
                    noise_decay_amount
                )

        # finally corrupt the input, routing through the switch when one was requested
        corrupted = noise_func(input=self.inputs)
        self.outputs = Tswitch(self.noise_switch, corrupted, self.inputs) if switch else corrupted
Example #16
0
    def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/rnn/',
                 input_size=None, hidden_size=None, output_size=None,
                 layers=1,
                 activation='sigmoid', hidden_activation='relu',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3,
                 r_bias_init=0.0,
                 cost_function='mse', cost_args=None,
                 noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Initialize a simple recurrent network.

        All constructor arguments are forwarded unchanged to the superclass constructor, after which this
        method stores the configuration on `self`, resolves the activation/cost/noise functions, prepares the
        symbolic input and target variables, and finally calls `build_computation_graph()`.

        Parameters
        ----------
        inputs_hook : Tuple of (shape, variable)
            Routing information for the model to accept inputs from elsewhere. This is used for linking
            different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a
            newly supervised classification model). For now, it needs to include the shape information (normally the
            dimensionality of the input i.e. n_in). The variable's leading dimension is treated as the temporal
            dimension; 1D and 2D variables are expanded to 3D and variables with more than 3 dimensions are
            flattened to 3D.
        hiddens_hook : Tuple of (shape, variable)
            Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets,
            this will be the initial starting value for hidden layers. When given, the shape element is used as
            the hidden size instead of `hidden_size`.
        params_hook : List(theano shared variable)
            A list of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters.
        outdir : str
            The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be saved.
        input_size : int
            The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional.
        hidden_size : int
            The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional.
        output_size : int
            The size (dimensionality) of the output.
        layers : int
            The number of stacked hidden layers to use.
        activation : str or callable
            The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        hidden_activation : str or callable
            The activation to perform for the hidden layers.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
            Note that the default generator is created once, when the method is defined, so every instance that
            does not pass its own `mrg` uses that same generator object.
        weights_init : str
            Determines the method for initializing model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
        cost_function : str or callable
            The function to use when calculating the output cost of the model.
            See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable.
        cost_args : dict
            Any additional named keyword arguments to pass to the specified `cost_function`.
            A falsy value (including the default None) is replaced by an empty dict.
        noise : str
            What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise
            for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other
            real-valued activations, etc. If falsy, no noise function, noise switch or noise schedule is created.
        noise_level : float
            The amount of noise to use for the noise function specified by `noise`. This could be the
            standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
            If None, the noise function is requested without an explicit level and no noise schedule is created,
            even when `noise_decay` is set.
        noise_decay : str or False
            Whether to use `noise` scheduling (decay `noise_level` during the course of training),
            and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
            Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first,
            and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster.
        noise_decay_amount : float
            The amount to reduce the `noise_level` after each training epoch based on the decay function specified
            in `noise_decay`.
        direction : str
            The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or
            'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence,
            computing two sets of hiddens and merging them before running through the final decoder.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
            connecting previous hidden states to the current hidden state, and not the weights from current
            input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
            `+-clip_recurrent_grads`.

        Raises
        ------
        AssertionError
            When asserting various properties of input parameters. See error messages.
        NotImplementedError
            When the variable supplied through `inputs_hook` has fewer than one dimension.
        """
        # Snapshot the constructor arguments. This has to stay the very first statement: any local variable
        # defined before it would also end up in the snapshot and be forwarded to the superclass.
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(RNN, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        self.direction = direction
        # convenience flags derived from `direction`; any other value leaves both False
        self.bidirectional = (direction == "bidirectional")
        self.backward = (direction == "backward")
        self.layers = layers
        self.noise = noise

        self.weights_init = weights_init
        self.weights_mean = weights_mean
        self.weights_std = weights_std
        self.weights_interval = weights_interval

        self.r_weights_init = r_weights_init
        self.r_weights_mean = r_weights_mean
        self.r_weights_std = r_weights_std
        self.r_weights_interval = r_weights_interval

        self.bias_init = bias_init
        self.r_bias_init = r_bias_init

        #########################################
        # activation, cost, and noise functions #
        #########################################
        # recurrent hidden activation function!
        self.hidden_activation_func = get_activation_function(hidden_activation)

        # output activation function!
        self.activation_func = get_activation_function(activation)

        # Cost function
        self.cost_function = get_cost_function(cost_function)
        # fall back to an empty dict so later code can always treat cost_args as a mapping
        self.cost_args = cost_args or dict()

        # Now deal with noise if we added it:
        if self.noise:
            log.debug('Adding %s noise switch.' % str(noise))
            if noise_level is not None:
                # wrap the level in a shared variable so that the noise schedule below can refer to it
                noise_level = sharedX(value=noise_level)
                self.noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
            else:
                # no explicit level: let get_noise use whatever level it defaults to
                self.noise_func = get_noise(noise, mrg=mrg)
            # apply the noise as a switch!
            # default to apply noise. this is for the cost and gradient functions to be computed later
            # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
            self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch")

            # noise scheduling
            # only possible when a level was given, because the schedule needs the shared variable and its
            # initial value
            if noise_decay and noise_level is not None:
                self.noise_schedule = get_decay_function(noise_decay,
                                                         noise_level,
                                                         noise_level.get_value(),
                                                         noise_decay_amount)

        ###############
        # inputs hook #
        ###############
        # grab info from the inputs_hook
        # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
        # being the temporal dimension.
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        # NOTE: `self.inputs_hook` and `self.input_size` are not assigned in this method before they are read;
        # presumably the superclass constructor sets them from the forwarded arguments - confirm.
        if self.inputs_hook is not None:
            self.input = self.inputs_hook[1]

            if self.input.ndim == 1:
                # (timesteps,) -> (timesteps, 1, 1): insert batch and data axes, then mark them as
                # non-broadcastable
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])
                self.input_size = 1

            elif self.input.ndim == 2:
                # (timesteps, data_dim) -> (timesteps, 1, data_dim): insert a non-broadcastable batch axis
                self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

            elif self.input.ndim == 3:
                # already in the expected 3D layout
                pass

            elif self.input.ndim > 3:
                # keep the first two axes and fold every remaining axis into the third one
                self.input = self.input.flatten(3)
                # NOTE(review): the flattened axis would normally have the product of the folded dimensions
                # as its size, whereas this takes the sum of `self.input_size` - confirm what
                # `self.input_size` holds at this point and whether `sum` is intended.
                self.input_size = sum(self.input_size)
            else:
                # only reachable when ndim is smaller than 1
                raise NotImplementedError("Recurrent input with %d dimensions not supported!" % self.input.ndim)
        else:
            # Assume input coming from optimizer is (batches, timesteps, data)
            # so, we need to reshape to (timesteps, batches, data)
            xs = T.tensor3("Xs")
            xs = xs.dimshuffle(1, 0, 2)
            self.input = xs

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        ys = T.tensor3("Ys")
        ys = ys.dimshuffle(1, 0, 2)
        self.target = ys

        ################
        # hiddens hook #
        ################
        # set an initial value for the recurrent hiddens from hook
        if self.hiddens_hook is not None:
            # the hook is a (shape, variable) pair: the variable seeds the hiddens, the shape gives their size
            self.h_init = self.hiddens_hook[1]
            self.hidden_size = self.hiddens_hook[0]
        else:
            # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
            # (self.h_init is therefore not assigned in this branch)
            self.hidden_size = hidden_size

        ##################
        # for generating #
        ##################
        # symbolic scalar for how many recurrent steps to use during generation from the model
        self.n_steps = T.iscalar("generate_n_steps")

        # build the symbolic graph last, once all of the configuration above is available on self
        self.output, self.hiddens, self.updates, self.cost, self.params = self.build_computation_graph()
Example #17
0
    def __init__(self, inputs=None,
                 noise='dropout', noise_level=0.5, noise_decay=False, noise_decay_amount=0.99,
                 mrg=RNG_MRG.MRG_RandomStreams(1), switch=True):
        """
        Create a layer that applies the requested noise function to its input.

        The result is stored in `self.outputs`. With `switch` enabled, the output is a symbolic switch between
        the noisy input and the untouched input, controlled by the shared variable `self.noise_switch`
        (initialized to 1, i.e. noise on).

        Parameters
        ----------
        inputs : tuple(shape, `Theano.TensorType`)
            tuple(shape, `Theano.TensorType`) describing the inputs to use for this layer.
            `shape` will be a monad tuple representing known sizes for each dimension in the `Theano.TensorType`.
            The length of `shape` should be equal to number of dimensions in `Theano.TensorType`, where the shape
            element is an integer representing the size for its dimension, or None if the shape isn't known.
            For example, if you have a matrix with unknown batch size but fixed feature size of 784, `shape` would
            be: (None, 784). The full form of `inputs` would be:
            [((None, 784), <TensorType(float32, matrix)>)].
            Only the first (shape, variable) pair is used; its shape also becomes `self.output_size`.
        noise : str
            Name of the noise function to apply (see opendeep.utils.noise for the options). Pick one that suits
            the unit activation, e.g. Gaussian for tanh or other real-valued activations.
        noise_level : float
            Strength of the noise as understood by the chosen noise function (standard deviation for gaussian
            noise, interval for uniform noise, dropout amount, ...). When None, the noise function is requested
            without an explicit level and no noise schedule is created.
        noise_decay : str or False
            Type of decay to use for scheduling `noise_level` over the course of training (see
            opendeep.utils.decay for the options), or False for no scheduling. Noise scheduling effectively helps
            the model learn larger variance features first, and then smaller ones later (almost as a kind of
            curriculum learning). May help it converge faster.
        noise_decay_amount : float
            How much the decay function given by `noise_decay` reduces `noise_level` after each training epoch.
        mrg : random
            Random number generator used by the noise function.
            Theano's sandbox.rng_mrg.MRG_RandomStreams is recommended.
        switch : boolean
            True to create a switch so noise can be turned on during training and off during testing. If False,
            the noise is always applied.
        """
        super(Noise, self).__init__(
            inputs=inputs,
            noise=noise,
            noise_level=noise_level,
            noise_decay=noise_decay,
            noise_decay_amount=noise_decay_amount,
            mrg=mrg,
            switch=switch,
        )
        # the superclass leaves `self.inputs` as a list; this layer only works with its first entry
        shape, variable = self.inputs[0]
        self.output_size = shape
        self.inputs = variable
        log.debug('Adding %s noise switch.' % str(noise))

        # gather the keyword arguments for the noise factory; the level is only passed on when one was given
        noise_kwargs = {}
        level_given = noise_level is not None
        if level_given:
            # a shared variable, so that the schedule below can refer to the level
            noise_level = sharedX(value=noise_level)
            noise_kwargs['noise_level'] = noise_level
        noise_kwargs['mrg'] = mrg
        noise_func = get_noise(noise, **noise_kwargs)

        # the switch starts out as 1 so that noise is applied by default
        # (this is for the cost and gradient functions to be computed later)
        if switch:
            self.noise_switch = sharedX(value=1, name="noise_switch")

        # noise scheduling needs both a decay type and an explicit level
        if noise_decay and level_given:
            self.noise_schedule = get_decay_function(
                noise_decay, noise_level, noise_level.get_value(), noise_decay_amount
            )

        # corrupt the input once, then either route it through the switch or expose it directly
        corrupted = noise_func(input=self.inputs)
        self.outputs = Tswitch(self.noise_switch, corrupted, self.inputs) if switch else corrupted