def get_updates(self, gradients):
    """
    Compute the AdaDelta updates (see the paper for details).

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their gradients.

    Returns
    -------
    updates : OrderedDict
        A dictionary mapping from the old model parameters to their new values
        after a single iteration of the learning rule.
    """
    log.debug('Setting up ADADELTA for optimizer...')
    updates = OrderedDict()
    for param in gradients.keys():
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(param.get_value() * 0.)
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0.)

        if param.name is not None:
            mean_square_grad.name = 'mean_square_grad_' + param.name
            mean_square_dx.name = 'mean_square_dx_' + param.name

        # Accumulate gradient
        new_mean_squared_grad = (
            self.decay * mean_square_grad +
            (1 - self.decay) * T.sqr(gradients[param])
        )

        # Compute update
        epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate
        rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
        rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
        delta_x_t = -(rms_dx_tm1 / rms_grad_t) * gradients[param]

        # Accumulate updates
        new_mean_square_dx = (
            self.decay * mean_square_dx +
            (1 - self.decay) * T.sqr(delta_x_t)
        )

        # Apply update
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[param] = param + delta_x_t

    return updates

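# A minimal NumPy sketch of the same AdaDelta rule (Zeiler 2012), outside of
# Theano, minimizing f(x) = x^2. The variable names mirror the function above;
# the `decay` and `epsilon` values here are illustrative assumptions, not taken
# from the optimizer's actual configuration.
import numpy as np

def adadelta_step(x, grad, E_g2, E_dx2, decay=0.95, epsilon=1e-6):
    """One AdaDelta step; returns updated (x, E_g2, E_dx2)."""
    E_g2 = decay * E_g2 + (1 - decay) * grad ** 2          # E[g^2]_t
    dx = -(np.sqrt(E_dx2 + epsilon) / np.sqrt(E_g2 + epsilon)) * grad
    E_dx2 = decay * E_dx2 + (1 - decay) * dx ** 2          # E[(dx)^2]_t
    return x + dx, E_g2, E_dx2

x, E_g2, E_dx2 = 5.0, 0.0, 0.0
for _ in range(500):
    x, E_g2, E_dx2 = adadelta_step(x, 2 * x, E_g2, E_dx2)  # grad of x^2 is 2x
print(x)  # moves toward 0 (slowly at first, while the Delta-x accumulator warms up)
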
def __call__(self, shape, name=None):
    """
    Create the shared variable with given shape as an Identity matrix.

    Parameters
    ----------
    shape : tuple
        A tuple giving the shape information for this variable.
    name : str, optional
        The name to give the shared variable.

    Returns
    -------
    shared variable
        The shared variable with given shape and name as an Identity matrix.
    """
    log.debug("Creating variable {!s} with shape {!s} as Identity".format(name, shape))
    weights = numpy.eye(N=shape[0], M=int(numpy.prod(shape[1:])), k=0, dtype=config.floatX)

    if self.add_noise:
        if isinstance(self.add_noise, partial):
            weights = self.add_noise(input=weights)
        else:
            log.error("Add noise to identity weights was not a functools.partial object. Ignoring...")

    # multiply by gain factor
    if self.gain != 1.:
        log.debug("Multiplying {!s} by {!s}".format(name, self.gain))
    val = weights * self.gain
    return sharedX(value=val, name=name)

def get_weights_gaussian(shape, mean=None, std=None, name="W", rng=None, gain=1.):
    """
    This initializes a shared variable with the given shape for weights drawn from a
    Gaussian distribution with mean and std.

    Parameters
    ----------
    shape : tuple
        A tuple giving the shape information for this weight matrix.
    mean : float
        The mean to use for the Gaussian distribution.
    std : float
        The standard deviation to use for the Gaussian distribution.
    name : str
        The name to give the shared variable.
    rng : random
        A given random number generator to use with .normal method.
    gain : float
        A multiplicative factor to affect the whole weights matrix.

    Returns
    -------
    shared variable
        The theano shared variable with given shape and drawn from a Gaussian distribution.
    """
    default_mean = 0
    default_std = 0.05
    # use `is None` checks so an explicit 0 isn't silently replaced by a default
    mean = default_mean if mean is None else mean
    std = default_std if std is None else std

    log.debug("Creating weights %s with shape %s from Gaussian mean=%s, std=%s",
              name, str(shape), str(mean), str(std))
    if rng is None:
        rng = numpy.random

    if std != 0:
        if isinstance(rng, type(numpy.random)):
            val = numpy.asarray(rng.normal(loc=mean, scale=std, size=shape), dtype=config.floatX)
        else:
            # a theano rng uses avg/std keywords and returns a symbolic variable
            val = numpy.asarray(rng.normal(avg=mean, std=std, size=shape).eval(), dtype=config.floatX)
    else:
        val = as_floatX(mean * numpy.ones(shape, dtype=config.floatX))

    # check if a theano rng was used
    if isinstance(val, TensorVariable):
        val = val.eval()

    val = val * gain
    # make it into a shared variable
    return sharedX(value=val, name=name)

def get_weights_identity(shape, name="W", add_noise=None, gain=1.):
    """
    This will return a weights matrix as close to the identity as possible.
    If a non-square shape, it will make a matrix of the form (I 0).

    Identity matrix for weights is useful for RNNs with ReLU!
    http://arxiv.org/abs/1504.00941

    Parameters
    ----------
    shape : tuple
        Tuple giving the shape information for the weight matrix.
    name : str
        Name to give the shared variable.
    add_noise : functools.partial
        A partially applied noise function (just missing the input parameter) to add noise to the
        identity initialization. Noise functions can be found in opendeep.utils.noise.
    gain : float
        A multiplicative factor to affect the whole weights matrix.

    Returns
    -------
    shared variable
        The theano shared variable identity matrix with given shape.
    """
    log.debug("Creating Identity matrix weights %s with shape %s", name, str(shape))
    weights = numpy.eye(N=shape[0], M=int(numpy.prod(shape[1:])), k=0, dtype=theano.config.floatX)

    if add_noise:
        if isinstance(add_noise, partial):
            weights = add_noise(input=weights)
        else:
            log.error("Add noise to identity weights was not a functools.partial object. Ignoring...")

    val = weights * gain
    return sharedX(value=val, name=name)

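# A small, self-contained sketch of the (I 0) construction used above, with an
# optional functools.partial noise hook. `gaussian_noise` here is a hypothetical
# stand-in for the functions in opendeep.utils.noise, just to show the pattern.
from functools import partial
import numpy as np

def gaussian_noise(input, std=0.01):
    return input + np.random.normal(scale=std, size=input.shape)

shape = (3, 5)                                   # non-square: yields (I 0)
weights = np.eye(N=shape[0], M=int(np.prod(shape[1:])), k=0)
noise_fn = partial(gaussian_noise, std=0.01)     # everything but `input` applied
weights = noise_fn(input=weights)
print(weights.shape)                             # (3, 5): a 3x3 identity padded with zeros
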
def get_updates(self, gradients):
    """
    Provides the symbolic (theano) description of the updates needed to
    perform this learning rule. See Notes for side-effects.

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their gradients.

    Returns
    -------
    updates : OrderedDict
        A dictionary mapping from the old model parameters to their new values
        after a single iteration of the learning rule.

    Notes
    -----
    This method has the side effect of storing the moving average of the square
    gradient in `self.mean_square_grads`. This is necessary in order for the
    monitoring channels to be able to track the value of these moving averages.
    Therefore, this method should only get called once for each instance of RMSProp.
    """
    log.debug('Setting up RMSProp for optimizer...')
    updates = OrderedDict()
    for param in gradients:
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(param.get_value() * 0.)

        if param.name is None:
            raise ValueError("Model parameters must be named.")
        mean_square_grad.name = 'mean_square_grad_' + param.name

        if param.name in self.mean_square_grads:
            log.warning("Calling get_updates more than once on the "
                        "gradients of `%s` may make monitored values "
                        "incorrect." % param.name)

        # Store variable in self.mean_square_grads for monitoring.
        self.mean_square_grads[param.name] = mean_square_grad

        # Accumulate gradient
        new_mean_squared_grad = (self.decay * mean_square_grad +
                                 (1 - self.decay) * T.sqr(gradients[param]))

        # Compute update
        scaled_lr = self.lr_scalers.get(param, 1.) * self.learning_rate
        rms_grad_t = T.sqrt(new_mean_squared_grad)
        rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
        delta_x_t = -scaled_lr * gradients[param] / rms_grad_t

        # Apply update
        updates[mean_square_grad] = new_mean_squared_grad
        updates[param] = param + delta_x_t

    return updates

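# A minimal NumPy sketch of the RMSProp rule implemented above, again on
# f(x) = x^2. The hyperparameter values are illustrative assumptions.
import numpy as np

def rmsprop_step(x, grad, E_g2, lr=0.01, decay=0.9, epsilon=1e-6):
    """One RMSProp step; returns updated (x, E_g2)."""
    E_g2 = decay * E_g2 + (1 - decay) * grad ** 2   # running E[g^2]
    rms = max(np.sqrt(E_g2), epsilon)               # floor mirrors T.maximum above
    return x - lr * grad / rms, E_g2

x, E_g2 = 5.0, 0.0
for _ in range(1000):
    x, E_g2 = rmsprop_step(x, 2 * x, E_g2)
print(x)  # close to 0 (within roughly lr of the minimum)
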
def get_bias(shape, name="b", init_values=None):
    """
    This creates a theano shared variable for the bias parameter - normally
    initialized to zeros, but you can specify other values.

    Parameters
    ----------
    shape : tuple
        The shape to use for the bias vector/matrix.
    name : str
        The name to give the shared variable.
    init_values : float or array_like
        Values to add to the zeros, if you want a nonzero bias initially.

    Returns
    -------
    shared variable
        The theano shared variable with given shape.
    """
    default_init = 0
    # use an `is None` check so an explicit 0 isn't confused with "not given"
    init_values = default_init if init_values is None else init_values
    log.debug("Initializing bias %s variable with shape %s", name, str(shape))
    # init to zeros plus the offset
    val = as_floatX(numpy.ones(shape=shape, dtype=theano.config.floatX) * init_values)
    return sharedX(value=val, name=name)

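# A quick check of the "zeros plus offset" comment above: multiplying ones by
# the offset is the same as adding the offset to zeros, and it also broadcasts
# when init_values is array_like rather than a scalar.
import numpy as np

shape = (4,)
offset = 0.1
assert np.allclose(np.ones(shape) * offset, np.zeros(shape) + offset)

per_unit = np.array([0., 1., 0., 1.])            # array_like offset
print(np.ones(shape) * per_unit)                 # [0. 1. 0. 1.]
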
def get_weights_uniform(shape, interval='montreal', name="W", rng=None, gain=1.):
    """
    This initializes a shared variable with a given shape for weights drawn from a
    Uniform distribution with low = -interval and high = interval.

    Interval can either be a number to use, or a string key to one of the predefined
    formulas in the _uniform_interval dictionary.

    Parameters
    ----------
    shape : tuple
        A tuple giving the shape information for this weight matrix.
    interval : float or str
        Either a number for your own custom interval, or a string key to one of the predefined formulas.
    name : str
        The name to give the shared variable.
    rng : random
        The random number generator to use with a .uniform method.
    gain : float
        A multiplicative factor to affect the whole weights matrix.

    Returns
    -------
    shared variable
        The theano shared variable with given shape and name drawn from a uniform distribution.

    Raises
    ------
    NotImplementedError
        If the string name for the interval couldn't be found in the dictionary.
    """
    if rng is None:
        rng = numpy.random

    # If the interval parameter is a string, grab the appropriate formula from the function dictionary,
    # and apply the appropriate shape numbers to it.
    if isinstance(interval, six.string_types):
        interval_func = _uniform_interval.get(interval)
        if interval_func is None:
            log.error('Could not find uniform interval formula %s, try one of %s instead.' %
                      (str(interval), str(_uniform_interval.keys())))
            raise NotImplementedError('Could not find uniform interval formula %s, try one of %s instead.' %
                                      (str(interval), str(_uniform_interval.keys())))
        else:
            log.debug("Creating weights %s with shape %s from Uniform distribution with formula name: %s",
                      name, str(shape), str(interval))
            interval = interval_func(shape)
    else:
        log.debug("Creating weights %s with shape %s from Uniform distribution with given interval +- %s",
                  name, str(shape), str(interval))

    # build the uniform weights tensor
    val = as_floatX(rng.uniform(low=-interval, high=interval, size=shape))

    # check if a theano rng was used
    if isinstance(val, T.TensorVariable):
        val = val.eval()

    val = val * gain
    # make it into a shared variable
    return sharedX(value=val, name=name)

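# The formulas behind the string keys live in the module-level _uniform_interval
# dict, which isn't shown here. As an assumption for illustration, the 'montreal'
# key follows the Glorot & Bengio (2010) heuristic, sqrt(6 / (fan_in + fan_out)):
import numpy as np

def montreal_interval(shape):
    # hypothetical reconstruction of the 'montreal' formula
    fan_in, fan_out = shape[0], int(np.prod(shape[1:]))
    return np.sqrt(6.0 / (fan_in + fan_out))

interval = montreal_interval((784, 500))
weights = np.random.uniform(low=-interval, high=interval, size=(784, 500))
print(interval)  # ~0.0683 for a 784x500 matrix
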
def get_weights_orthogonal(shape, name="W", rng=None, gain=1.):
    """
    This returns orthonormal random values to initialize a weight matrix (using SVD).

    Some discussion here:
    http://www.reddit.com/r/MachineLearning/comments/2qsje7/how_do_you_initialize_your_neural_network_weights/

    From Lasagne: For n-dimensional shapes where n > 2, the n-1 trailing axes are flattened.
    For convolutional layers, this corresponds to the fan-in, so this makes the initialization
    usable for both dense and convolutional layers.

    Parameters
    ----------
    shape : tuple
        Tuple giving the shape information for the weight matrix.
    name : str
        Name to give the shared variable.
    rng : random
        A given random number generator to use with .normal method.
    gain : float
        A multiplicative factor to affect the whole weights matrix.

    Returns
    -------
    shared variable
        The theano shared variable orthogonal matrix with given shape.
    """
    log.debug("Creating Orthogonal matrix weights %s with shape %s", name, str(shape))
    if rng is None:
        rng = numpy.random

    if len(shape) == 1:
        shape = (shape[0], shape[0])
    else:
        # flatten shapes bigger than 2
        # From Lasagne: For n-dimensional shapes where n > 2, the n-1 trailing axes are flattened.
        # For convolutional layers, this corresponds to the fan-in, so this makes the initialization
        # usable for both dense and convolutional layers.
        shape = (shape[0], numpy.prod(shape[1:]))

    # Sample from the standard normal distribution
    if isinstance(rng, type(numpy.random)):
        a = numpy.asarray(rng.normal(loc=0., scale=1., size=shape), dtype=config.floatX)
    else:
        a = numpy.asarray(rng.normal(avg=0., std=1., size=shape).eval(), dtype=config.floatX)

    u, _, _ = numpy.linalg.svd(a, full_matrices=False)
    val = u * gain
    return sharedX(value=val, name=name)

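# A short check of why the SVD trick works: the left singular vectors `u` of a
# random Gaussian matrix form an orthonormal basis, so u.T @ u is the identity
# (up to floating-point error). Pure NumPy, no Theano required.
import numpy as np

a = np.random.normal(loc=0., scale=1., size=(256, 128))
u, _, _ = np.linalg.svd(a, full_matrices=False)       # u has shape (256, 128)
print(np.allclose(u.T @ u, np.eye(128), atol=1e-6))   # True: columns are orthonormal
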
def get_updates(self, gradients):
    """
    Based on Pylearn2
    (https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/training_algorithms/learning_rule.py)

    Implements momentum as described in Section 9 of
    "A Practical Guide to Training Restricted Boltzmann Machines", Geoffrey Hinton.

    Parameters are updated by the formula:
    inc := momentum * inc - learning_rate * d cost / d param
    param := param + inc

    Also has the option to implement Nesterov momentum (accelerated momentum), which works better
    in a lot of cases.

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their gradients.

    Returns
    -------
    updates : OrderedDict
        A dictionary mapping from the old model parameters to their new values
        after a single iteration of the learning rule.
    """
    log.debug('Setting up Stochastic Gradient Descent with momentum for optimizer...')
    updates = OrderedDict()
    for (param, gradient) in six.iteritems(gradients):
        velocity = sharedX(param.get_value() * 0.)
        assert param.dtype == velocity.dtype
        assert gradient.dtype == param.dtype

        if param.name is not None:
            velocity.name = 'vel_' + param.name

        scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
        updates[velocity] = self.momentum * velocity - scaled_lr * gradient

        inc = updates[velocity]
        if self.nesterov_momentum:
            log.debug('Using Nesterov momentum for parameter %s', str(param))
            inc = self.momentum * inc - scaled_lr * gradient

        assert inc.dtype == velocity.dtype
        updates[param] = param + inc

    return updates

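# A minimal NumPy sketch of the classical and Nesterov variants implemented
# above, again minimizing f(x) = x^2. Hyperparameters are illustrative.
import numpy as np

def momentum_step(x, grad, vel, lr=0.01, momentum=0.9, nesterov=False):
    """One (Nesterov) momentum step; returns updated (x, vel)."""
    vel = momentum * vel - lr * grad        # inc := momentum * inc - lr * grad
    inc = vel
    if nesterov:
        # Nesterov re-applies momentum and the gradient to the fresh velocity,
        # matching the Pylearn2-style formulation used in the code above.
        inc = momentum * inc - lr * grad
    return x + inc, vel

x, vel = 5.0, 0.0
for _ in range(300):
    x, vel = momentum_step(x, 2 * x, vel, nesterov=True)
print(x)  # approaches 0
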
def __call__(self, shape, name=None):
    """
    Parameters
    ----------
    shape : tuple
        Tuple giving the shape information for the weight matrix.
    name : str
        Name to give the shared variable.

    Returns
    -------
    shared variable
        The shared variable matrix with given shape.
    """
    log.debug("Initializing bias %s variable with shape %s", name, str(shape))
    # init to zeros plus the offset
    val = as_floatX(numpy.ones(shape=shape, dtype=config.floatX) * self.init_values)
    return sharedX(value=val, name=name)

def __call__(self, shape, name=None):
    """
    Create the shared variable with given shape from the uniform distribution
    with interval described in __init__.

    Parameters
    ----------
    shape : tuple
        A tuple giving the shape information for this variable.
    name : str, optional
        The name to give the shared variable.

    Returns
    -------
    shared variable
        The shared variable with given shape and name drawn from a uniform distribution.
    """
    # if the min and max are determined by a function of shape
    if hasattr(self, "interval_func"):
        # predefine interval so the error message below can't hit a NameError
        # if interval_func itself raises
        interval = None
        try:
            interval = self.interval_func(shape)
            if isinstance(interval, number_types):
                self._parse_single_number(interval)
            elif isinstance(interval, Iterable):
                self._parse_tuple(interval)
        except Exception:
            msg = "Expected interval function to output a number or Iterable of numbers, " \
                  "found {!s}".format(type(interval))
            log.error(msg)
            raise AttributeError(msg)

    # build the uniform weights tensor
    log.debug("Creating variable {!s} with shape {!s} from Uniform interval [{!s}, {!s}]".format(
        name, shape, self.min, self.max
    ))
    val = as_floatX(self.rng.uniform(low=self.min, high=self.max, size=shape))

    # check if a theano rng was used
    if isinstance(val, TensorVariable):
        val = val.eval()

    # multiply by gain factor
    if self.gain != 1.:
        log.debug("Multiplying {!s} by {!s}".format(name, self.gain))
        val = val * self.gain

    # make it into a shared variable
    return sharedX(value=val, name=name)

def __call__(self, shape, name=None):
    """
    Parameters
    ----------
    shape : tuple
        Tuple giving the shape information for the weight matrix.
    name : str
        Name to give the shared variable.

    Returns
    -------
    shared variable
        The shared variable orthogonal matrix with given shape.
    """
    log.debug("Creating Orthogonal matrix weights {!s} with shape {!s}".format(name, shape))
    if len(shape) == 1:
        shape = (shape[0], shape[0])
    else:
        # flatten shapes bigger than 2
        # From Lasagne: For n-dimensional shapes where n > 2, the n-1 trailing axes are flattened.
        # For convolutional layers, this corresponds to the fan-in, so this makes the initialization
        # usable for both dense and convolutional layers.
        shape = (shape[0], numpy.prod(shape[1:]))

    # Sample from the standard normal distribution
    if isinstance(self.rng, type(numpy.random)):
        a = numpy.asarray(self.rng.normal(loc=0., scale=1., size=shape), dtype=config.floatX)
    else:
        a = numpy.asarray(self.rng.normal(avg=0., std=1., size=shape).eval(), dtype=config.floatX)

    u, _, _ = numpy.linalg.svd(a, full_matrices=False)

    # multiply by gain factor
    if self.gain != 1.:
        log.debug("Multiplying {!s} by {!s}".format(name, self.gain))
    val = u * self.gain
    return sharedX(value=val, name=name)

def __call__(self, shape, name=None):
    """
    Create the shared variable with given shape from the Gaussian distribution
    described in __init__.

    Parameters
    ----------
    shape : tuple
        A tuple giving the shape information for this variable.
    name : str, optional
        The name to give the shared variable.

    Returns
    -------
    shared variable
        The shared variable with given shape and name drawn from a Gaussian (normal) distribution.
    """
    log.debug("Creating variable {!s} with shape {!s} from Gaussian mean={!s}, std={!s}".format(
        name, shape, self.mean, self.std
    ))
    if self.std != 0:
        if isinstance(self.rng, type(numpy.random)):
            val = numpy.asarray(self.rng.normal(loc=self.mean, scale=self.std, size=shape),
                                dtype=config.floatX)
        else:
            val = numpy.asarray(self.rng.normal(avg=self.mean, std=self.std, size=shape).eval(),
                                dtype=config.floatX)
    else:
        val = as_floatX(self.mean * numpy.ones(shape, dtype=config.floatX))

    # check if a theano rng was used
    if isinstance(val, TensorVariable):
        val = val.eval()

    # multiply by gain factor
    if self.gain != 1.:
        log.debug("Multiplying {!s} by {!s}".format(name, self.gain))
        val = val * self.gain

    # make it into a shared variable
    return sharedX(value=val, name=name)

def __init__(self, dataset, loss, model=None,
             epochs=10, batch_size=100, min_batch_size=1,
             save_freq=None, stop_threshold=None, stop_patience=None,
             learning_rate=.1, lr_decay="exponential", lr_decay_factor=.995,
             momentum=0.5, momentum_decay="linear", momentum_factor=0, nesterov_momentum=True,
             grad_clip=None, hard_clip=False):
    """
    Initialize SGD.

    Parameters
    ----------
    dataset : Dataset
        The :class:`opendeep.data.Dataset` to use when training the Model.
    loss : Loss
        The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
    model : Model
        The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
        Model's .train() method.
    epochs : int
        How many training iterations over the dataset to perform.
    batch_size : int
        How many examples from the training dataset to use in parallel.
    min_batch_size : int
        The minimum number of examples required at a time (for things like time series, this would be > 1).
    save_freq : int
        How many epochs to train between each new save of the Model's parameters.
    stop_threshold : float
        The factor by how much the best validation training score needs to improve to determine early stopping.
    stop_patience : int
        The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
    learning_rate : float
        The multiplicative amount to adjust parameters based on their gradient values.
    lr_decay : str
        The type of decay function to use for changing the learning rate over epochs.
        See `opendeep.utils.decay` for options.
    lr_decay_factor : float
        The amount to use for the decay function when changing the learning rate over epochs.
        See `opendeep.utils.decay` for its effect for given decay functions.
    momentum : float
        The momentum to use during gradient updates.
    momentum_decay : str
        The type of decay function to use for changing the momentum over epochs.
        See `opendeep.utils.decay` for options.
    momentum_factor : float
        The amount to use for the decay function when changing the momentum over epochs.
        See `opendeep.utils.decay` for its effect for given decay functions.
    nesterov_momentum : bool
        Whether or not to use Nesterov momentum.
    grad_clip : float, optional
        Whether to clip gradients. This will clip with a maximum of grad_clip or the parameter norm.
    hard_clip : bool
        Whether to use a hard cutoff or rescaling for clipping gradients.
    """
    # superclass init
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(SGD, self).__init__(**initial_parameters)

    # Momentum - smoothing over the parameter changes (see Hinton)
    if momentum:
        self.momentum = sharedX(momentum, 'momentum')
        if momentum_decay is not None and \
                momentum_decay is not False and \
                momentum_factor is not None:
            self.momentum_decay = get_decay_function(momentum_decay,
                                                     self.momentum,
                                                     self.momentum.get_value(),
                                                     momentum_factor)
        else:
            self.momentum_decay = False
    else:
        self.momentum = 0
        self.momentum_decay = False

    self.nesterov_momentum = nesterov_momentum

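# A small sketch of what an "exponential" learning-rate decay schedule looks
# like, assuming the common form lr_t = lr_0 * factor**t. The real schedules
# live in opendeep.utils.decay; this just illustrates the effect of the
# lr_decay_factor=.995 default over training epochs.
lr, factor = 0.1, 0.995
for epoch in range(100):
    lr = lr * factor
print(round(lr, 5))  # ~0.06058 after 100 epochs
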
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/rnn/',
             input_size=None, hidden_size=None, output_size=None,
             layers=1, activation='sigmoid', hidden_activation='relu',
             mrg=RNG_MRG.MRG_RandomStreams(1),
             weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
             bias_init=0.0,
             r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3,
             r_bias_init=0.0,
             cost_function='mse', cost_args=None,
             noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99,
             direction='forward',
             clip_recurrent_grads=False):
    """
    Initialize a simple recurrent network.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Routing information for the model to accept inputs from elsewhere.
        This is used for linking different models together (e.g. setting the Softmax model's input layer to
        the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include
        the shape information (normally the dimensionality of the input i.e. n_in).
    hiddens_hook : Tuple of (shape, variable)
        Routing information for the model to accept its hidden representation from elsewhere. For recurrent
        nets, this will be the initial starting value for hidden layers.
    params_hook : List(theano shared variable)
        A list of model parameters (shared theano variables) that you should use when constructing this model
        (instead of initializing your own shared variables). This parameter is useful when you want to have
        two versions of the model that use the same parameters.
    outdir : str
        The location to produce outputs from training or running the :class:`RNN`. If None, nothing will
        be saved.
    input_size : int
        The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional.
    hidden_size : int
        The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this
        is optional.
    output_size : int
        The size (dimensionality) of the output.
    layers : int
        The number of stacked hidden layers to use.
    activation : str or callable
        The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer.
        This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
        See opendeep.utils.activation for a list of available activation functions. Alternatively, you can
        pass your own function to be used as long as it is callable.
    hidden_activation : str or callable
        The activation to perform for the hidden layers. See opendeep.utils.activation for a list of available
        activation functions. Alternatively, you can pass your own function to be used as long as it
        is callable.
    mrg : random
        A random number generator that is used when adding noise.
        I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    weights_init : str
        Determines the method for initializing model weights. See opendeep.utils.nnet for options.
    weights_interval : str or float
        If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        If Gaussian `weights_init`, the mean value to use.
    weights_std : float
        If Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
    r_weights_init : str
        Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options.
    r_weights_interval : str or float
        If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    r_weights_mean : float
        If Gaussian `r_weights_init`, the mean value to use.
    r_weights_std : float
        If Gaussian `r_weights_init`, the standard deviation to use.
    r_bias_init : float
        The initial value to use for the recurrent bias parameter. Most often, the default of 0.0
        is preferred.
    cost_function : str or callable
        The function to use when calculating the output cost of the model. See opendeep.utils.cost for
        options. You can also specify your own function, which needs to be callable.
    cost_args : dict
        Any additional named keyword arguments to pass to the specified `cost_function`.
    noise : str
        What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise for options.
        This should be appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued
        activations, etc.
    noise_level : float
        The amount of noise to use for the noise function specified by `noise`. This could be the standard
        deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
    noise_decay : str or False
        Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so,
        the string input specifies what type of decay to use. See opendeep.utils.decay for options.
        Noise decay (known as noise scheduling) effectively helps the model learn larger variance features
        first, and then smaller ones later (almost as a kind of curriculum learning). May help it
        converge faster.
    noise_decay_amount : float
        The amount to reduce the `noise_level` after each training epoch based on the decay function
        specified in `noise_decay`.
    direction : str
        The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or
        'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence,
        computing two sets of hiddens and merging them before running through the final decoder.
    clip_recurrent_grads : False or float, optional
        Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
        connecting previous hidden states to the current hidden state, and not the weights from current
        input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the
        range `+-clip_recurrent_grads`.

    Raises
    ------
    AssertionError
        When asserting various properties of input parameters. See error messages.
    """
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(RNN, self).__init__(**initial_parameters)

    ##################
    # specifications #
    ##################
    self.direction = direction
    self.bidirectional = (direction == "bidirectional")
    self.backward = (direction == "backward")
    self.layers = layers
    self.noise = noise
    self.weights_init = weights_init
    self.weights_mean = weights_mean
    self.weights_std = weights_std
    self.weights_interval = weights_interval
    self.r_weights_init = r_weights_init
    self.r_weights_mean = r_weights_mean
    self.r_weights_std = r_weights_std
    self.r_weights_interval = r_weights_interval
    self.bias_init = bias_init
    self.r_bias_init = r_bias_init

    #########################################
    # activation, cost, and noise functions #
    #########################################
    # recurrent hidden activation function!
    self.hidden_activation_func = get_activation_function(hidden_activation)
    # output activation function!
    self.activation_func = get_activation_function(activation)

    # Cost function
    self.cost_function = get_cost_function(cost_function)
    self.cost_args = cost_args or dict()

    # Now deal with noise if we added it:
    if self.noise:
        log.debug('Adding %s noise switch.' % str(noise))
        if noise_level is not None:
            noise_level = sharedX(value=noise_level)
            self.noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
        else:
            self.noise_func = get_noise(noise, mrg=mrg)
        # apply the noise as a switch!
        # default to apply noise. this is for the cost and gradient functions to be computed later
        # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
        self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch")

        # noise scheduling
        if noise_decay and noise_level is not None:
            self.noise_schedule = get_decay_function(noise_decay,
                                                     noise_level,
                                                     noise_level.get_value(),
                                                     noise_decay_amount)

    ###############
    # inputs hook #
    ###############
    # grab info from the inputs_hook
    # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension
    # being the temporal dimension.
    # input is 3D tensor of (timesteps, batch_size, data_dim)
    # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
    # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
    if self.inputs_hook is not None:
        self.input = self.inputs_hook[1]

        if self.input.ndim == 1:
            self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])
            self.input_size = 1
        elif self.input.ndim == 2:
            self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)
        elif self.input.ndim == 3:
            pass
        elif self.input.ndim > 3:
            self.input = self.input.flatten(3)
            self.input_size = sum(self.input_size)
        else:
            raise NotImplementedError("Recurrent input with %d dimensions not supported!" % self.input.ndim)
    else:
        # Assume input coming from optimizer is (batches, timesteps, data)
        # so, we need to reshape to (timesteps, batches, data)
        xs = T.tensor3("Xs")
        xs = xs.dimshuffle(1, 0, 2)
        self.input = xs

        # The target outputs for supervised training - in the form of (batches, timesteps, output) which is
        # the same dimension ordering as the expected input from optimizer.
        # therefore, we need to swap it like we did to input xs.
        ys = T.tensor3("Ys")
        ys = ys.dimshuffle(1, 0, 2)
        self.target = ys

    ################
    # hiddens hook #
    ################
    # set an initial value for the recurrent hiddens from hook
    if self.hiddens_hook is not None:
        self.h_init = self.hiddens_hook[1]
        self.hidden_size = self.hiddens_hook[0]
    else:
        # deal with h_init after parameters are made (have to make the same size as hiddens that are computed)
        self.hidden_size = hidden_size

    ##################
    # for generating #
    ##################
    # symbolic scalar for how many recurrent steps to use during generation from the model
    self.n_steps = T.iscalar("generate_n_steps")

    self.output, self.hiddens, self.updates, self.cost, self.params = self.build_computation_graph()

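# A quick NumPy illustration of the (batches, timesteps, data) ->
# (timesteps, batches, data) reordering that the dimshuffle(1, 0, 2) calls
# above perform; numpy's transpose does the same axis permutation.
import numpy as np

batch, time, dim = 4, 10, 8
xs = np.zeros((batch, time, dim))        # how the optimizer supplies data
xs_t = xs.transpose(1, 0, 2)             # equivalent of dimshuffle(1, 0, 2)
print(xs_t.shape)                        # (10, 4, 8): time now leads, ready for scan
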
def __init__(self, dataset, loss, model=None,
             epochs=10, batch_size=100, min_batch_size=1,
             save_freq=None, stop_threshold=None, stop_patience=None,
             learning_rate=1e-6, lr_decay=None, lr_decay_factor=None,
             decay=0.95, gamma_clip=1.8, damping=1e-7, grad_clip=None, hard_clip=False,
             start_var_reduction=0, delta_clip=None, use_adagrad=False, skip_nan_inf=False,
             upper_bound_tau=1e8, lower_bound_tau=1.5, use_corrected_grad=True):
    """
    Initialize AdaSecant.

    Parameters
    ----------
    dataset : Dataset
        The :class:`opendeep.data.Dataset` to use when training the Model.
    loss : Loss
        The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
    model : Model
        The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
        Model's .train() method.
    epochs : int
        How many training iterations over the dataset to perform.
    batch_size : int
        How many examples from the training dataset to use in parallel.
    min_batch_size : int
        The minimum number of examples required at a time (for things like time series, this would be > 1).
    save_freq : int
        How many epochs to train between each new save of the Model's parameters.
    stop_threshold : float
        The factor by how much the best validation training score needs to improve to determine
        early stopping.
    stop_patience : int
        The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
    learning_rate : float
        The multiplicative amount to adjust parameters based on their gradient values.
    lr_decay : str
        The type of decay function to use for changing the learning rate over epochs.
        See `opendeep.utils.decay` for options.
    lr_decay_factor : float
        The amount to use for the decay function when changing the learning rate over epochs.
        See `opendeep.utils.decay` for its effect for given decay functions.
    decay : float, optional
        Decay rate :math:`\\rho` in Algorithm 1 of the aforementioned paper. A decay of 0.95 seems to
        work fine for several tasks.
    gamma_clip : float, optional
        The clipping threshold for the gamma. In general 1.8 seems to work fine for several tasks.
    start_var_reduction : float, optional
        How many updates later should the variance reduction start from?
    delta_clip : float, optional
        The threshold to clip the deltas after.
    grad_clip : float, optional
        Apply gradient clipping for RNNs (not necessary for feedforward networks). This is a constraint
        on the norm of the gradient per layer.
    hard_clip : bool
        Whether to use a hard cutoff or rescaling for clipping gradients.
    use_adagrad : bool, optional
        Whether to use clipped AdaGrad or not.
    use_corrected_grad : bool, optional
        Whether to use a correction for gradients (referred to as variance reduction in the workshop paper).
    """
    # get everything together with the Optimizer class
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(AdaSecant, self).__init__(**initial_parameters)

    assert decay >= 0., "Decay needs to be >=0."
    assert decay < 1., "Decay needs to be <1."
    self.decay = sharedX(decay, "decay")

    self.damping = damping
    self.skip_nan_inf = skip_nan_inf

    # if grad_clip:
    #     assert grad_clip > 0.
    #     assert grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
    # self.grad_clip = grad_clip

    self.use_adagrad = use_adagrad
    self.use_corrected_grad = use_corrected_grad
    self.gamma_clip = gamma_clip
    self.start_var_reduction = start_var_reduction
    self.delta_clip = delta_clip

    # We have to bound tau to prevent it from growing to an arbitrarily large number,
    # otherwise that causes numerical instabilities for very deep networks.
    # Note that once tau becomes very large, it will keep increasing indefinitely.
    self.lower_bound_tau = lower_bound_tau
    self.upper_bound_tau = upper_bound_tau

def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gsn/',
             input_size=None, hidden_size=1000, layers=2, walkbacks=4,
             visible_activation='sigmoid', hidden_activation='tanh',
             input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1),
             tied_weights=True,
             weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
             bias_init=0.0,
             cost_function='binary_crossentropy', cost_args=None,
             add_noise=True, noiseless_h1=True,
             hidden_noise='gaussian', hidden_noise_level=2,
             input_noise='salt_and_pepper', input_noise_level=0.4,
             noise_decay='exponential', noise_annealing=1,
             image_width=None, image_height=None,
             **kwargs):
    """
    Initialize a GSN.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Routing information for the model to accept inputs from elsewhere.
        This is used for linking different models together (e.g. setting the Softmax model's input layer to
        the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include
        the shape information (normally the dimensionality of the input i.e. n_in).
    hiddens_hook : Tuple of (shape, variable)
        Routing information for the model to accept its hidden representation from elsewhere.
        This is used for linking different models together (e.g. setting the DAE model's hidden layers to
        the RNN's output layer gives a generative recurrent model.) For now, it needs to include the shape
        information (normally the dimensionality of the hiddens i.e. n_hidden).
    params_hook : List(theano shared variable)
        A list of model parameters (shared theano variables) that you should use when constructing this
        model (instead of initializing your own shared variables). This parameter is useful when you want
        to have two versions of the model that use the same parameters - such as a training model with
        dropout applied to layers and one without for testing, where the parameters are shared between
        the two.
    outdir : str
        The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
        be saved.
    input_size : int
        The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this is
        optional. The :class:`Model` requires an `output_size`, which gets set to this value because the
        DAE is an unsupervised model. The output is a reconstruction of the input.
    hidden_size : int
        The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger
        than `input_size`, which is known as *overcomplete*.
    visible_activation : str or callable
        The nonlinear (or linear) visible activation to perform after the dot product from hiddens ->
        visible layer. This activation function should be appropriate for the input unit types, i.e.
        'sigmoid' for binary inputs. See opendeep.utils.activation for a list of available activation
        functions. Alternatively, you can pass your own function to be used as long as it is callable.
    hidden_activation : str or callable
        The nonlinear (or linear) hidden activation to perform after the dot product from visible ->
        hiddens layer. See opendeep.utils.activation for a list of available activation functions.
        Alternatively, you can pass your own function to be used as long as it is callable.
    layers : int
        The number of hidden layers to use.
    walkbacks : int
        The number of walkbacks to perform (the variable K in Bengio's paper above). A walkback is a
        Gibbs sample from the DAE, which means the model generates inputs in sequence, where each
        generated input is compared to the original input to create the reconstruction cost for training.
        For running the model, the very last generated input in the Gibbs chain is used as the output.
    input_sampling : bool
        During walkbacks, whether to sample from the generated input to create a new starting point for
        the next walkback (next step in the Gibbs chain). This generally makes walkbacks more effective
        by making the process more stochastic - more likely to find spurious modes in the model's
        representation.
    mrg : random
        A random number generator that is used when adding noise into the network and for sampling from
        the input. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    tied_weights : bool
        DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This boolean
        determines if V = W.T, which 'ties' V to W and reduces the number of parameters necessary during
        training.
    weights_init : str
        Determines the method for initializing model weights. See opendeep.utils.nnet for options.
    weights_interval : str or float
        If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        If Gaussian `weights_init`, the mean value to use.
    weights_std : float
        If Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
    cost_function : str or callable
        The function to use when calculating the reconstruction cost of the model. This should be
        appropriate for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse' for
        real-valued inputs. See opendeep.utils.cost for options. You can also specify your own function,
        which needs to be callable.
    cost_args : dict
        Any additional named keyword arguments to pass to the specified `cost_function`.
    add_noise : bool
        Whether to add noise (corrupt) the input before passing it through the computation graph during
        training. This should most likely be set to the default of True, because this is a *denoising*
        autoencoder after all.
    noiseless_h1 : bool
        Whether to not add noise (corrupt) the hidden layer during computation.
    hidden_noise : str
        What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See
        opendeep.utils.noise for options. This should be appropriate for the hidden unit activation, i.e.
        Gaussian for tanh or other real-valued activations, etc.
    hidden_noise_level : float
        The amount of noise to use for the noise function specified by `hidden_noise`. This could be the
        standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
    input_noise : str
        What type of noise to use for corrupting the input before computation (if `add_noise`). See
        opendeep.utils.noise for options. This should be appropriate for the input units, i.e.
        salt-and-pepper for binary units, etc.
    input_noise_level : float
        The amount of noise used to corrupt the input. This could be the masking probability for
        salt-and-pepper, standard deviation for Gaussian, interval for Uniform, etc.
    noise_decay : str or False
        Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of training),
        and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for
        options. Noise decay (known as noise scheduling) effectively helps the DAE learn larger variance
        features first, and then smaller ones later (almost as a kind of curriculum learning). May help
        it converge faster.
    noise_annealing : float
        The amount to reduce the `input_noise_level` after each training epoch based on the decay function
        specified in `noise_decay`.
    image_width : int
        If the input should be represented as an image, the width of the input image. If not specified,
        it will be close to the square factor of the `input_size`.
    image_height : int
        If the input should be represented as an image, the height of the input image. If not specified,
        it will be close to the square factor of the `input_size`.
    """
    # init Model to combine the defaults and config dictionaries with the initial parameters.
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(GSN, self).__init__(**initial_parameters)

    # when the input should be thought of as an image, either use the specified width and height,
    # or try to make as square as possible.
    if image_height is None and image_width is None:
        (_h, _w) = closest_to_square_factors(self.input_size)
        self.image_width = _w
        self.image_height = _h
    else:
        self.image_height = image_height
        self.image_width = image_width

    ############################
    # Theano variables and RNG #
    ############################
    if self.inputs_hook is None:
        self.X = T.matrix('X')
    else:
        # inputs_hook is a (shape, input) tuple
        self.X = self.inputs_hook[1]

    ##########################
    # Network specifications #
    ##########################
    # generally, walkbacks should be at least 2*layers
    if layers % 2 == 0:
        if walkbacks < 2 * layers:
            log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                        'Generally want 2X walkbacks to layers',
                        str(layers), str(walkbacks))
    else:
        if walkbacks < 2 * layers - 1:
            log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                        'Generally want 2X walkbacks to layers',
                        str(layers), str(walkbacks))

    self.add_noise = add_noise
    self.noise_annealing = as_floatX(noise_annealing)  # noise schedule parameter
    self.hidden_noise_level = sharedX(hidden_noise_level, dtype=theano.config.floatX)
    self.hidden_noise = get_noise(name=hidden_noise, noise_level=self.hidden_noise_level, mrg=mrg)
    self.input_noise_level = sharedX(input_noise_level, dtype=theano.config.floatX)
    self.input_noise = get_noise(name=input_noise, noise_level=self.input_noise_level, mrg=mrg)

    self.walkbacks = walkbacks
    self.tied_weights = tied_weights
    self.layers = layers
    self.noiseless_h1 = noiseless_h1
    self.input_sampling = input_sampling
    self.noise_decay = noise_decay

    # if there was a hiddens_hook, unpack the hidden layers in the tensor
    if self.hiddens_hook is not None:
        hidden_size = self.hiddens_hook[0]
        self.hiddens_flag = True
    else:
        self.hiddens_flag = False

    # determine the sizes of each layer in a list.
    # layer sizes, from h0 to hK (h0 is the visible layer)
    hidden_size = list(raise_to_list(hidden_size))
    if len(hidden_size) == 1:
        self.layer_sizes = [self.input_size] + hidden_size * self.layers
    else:
        assert len(hidden_size) == self.layers, "Hiddens sizes and number of hidden layers mismatch. " + \
            "Hiddens %d and layers %d" % (len(hidden_size), self.layers)
        self.layer_sizes = [self.input_size] + hidden_size

    if self.hiddens_hook is not None:
        self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

    #########################
    # Activation functions! #
    #########################
    # hidden unit activation
    self.hidden_activation = get_activation_function(hidden_activation)
    # Visible layer activation
    self.visible_activation = get_activation_function(visible_activation)
    # make sure the sampling functions are appropriate for the activation functions.
    if is_binary(self.visible_activation):
        self.visible_sampling = mrg.binomial
    else:
        # TODO: implement non-binary activation
        log.error("Non-binary visible activation not supported yet!")
        raise NotImplementedError("Non-binary visible activation not supported yet!")

    # Cost function
    self.cost_function = get_cost_function(cost_function)
    self.cost_args = cost_args or dict()

    ###############
    # Parameters! #
    ###############
    # make sure to deal with params_hook!
    if self.params_hook is not None:
        # if tied weights, expect layers*2 + 1 params
        if self.tied_weights:
            assert len(self.params_hook) == 2 * layers + 1, \
                "Tied weights: expected {0!s} params, found {1!s}!".format(2 * layers + 1,
                                                                           len(self.params_hook))
            self.weights_list = self.params_hook[:layers]
            self.bias_list = self.params_hook[layers:]
        # if untied weights, expect layers*3 + 1 params
        else:
            assert len(self.params_hook) == 3 * layers + 1, \
                "Untied weights: expected {0!s} params, found {1!s}!".format(3 * layers + 1,
                                                                             len(self.params_hook))
            self.weights_list = self.params_hook[:2 * layers]
            self.bias_list = self.params_hook[2 * layers:]
    # otherwise, construct our params
    else:
        # initialize a list of weights and biases based on layer_sizes for the GSN
        self.weights_list = [get_weights(weights_init=weights_init,
                                         shape=(self.layer_sizes[i], self.layer_sizes[i + 1]),
                                         name="W_{0!s}_{1!s}".format(i, i + 1),
                                         rng=mrg,
                                         # if gaussian
                                         mean=weights_mean,
                                         std=weights_std,
                                         # if uniform
                                         interval=weights_interval)
                             for i in range(layers)]
        # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now)
        if not tied_weights:
            self.weights_list.extend([get_weights(weights_init=weights_init,
                                                  shape=(self.layer_sizes[i + 1], self.layer_sizes[i]),
                                                  name="W_{0!s}_{1!s}".format(i + 1, i),
                                                  rng=mrg,
                                                  # if gaussian
                                                  mean=weights_mean,
                                                  std=weights_std,
                                                  # if uniform
                                                  interval=weights_interval)
                                      for i in reversed(range(layers))])
        # initialize each layer bias to 0's.
        self.bias_list = [get_bias(shape=(self.layer_sizes[i],),
                                   name='b_' + str(i),
                                   init_values=bias_init)
                          for i in range(layers + 1)]

    # build the params of the model into a list
    self.params = self.weights_list + self.bias_list
    log.debug("gsn params: %s", str(self.params))

    # using the properties, build the computational graph
    self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph()

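# A small sketch of how the layer_sizes list above comes out. With a single
# hidden_size it is repeated for every layer; with a list it must match the
# layer count. Values are illustrative.
def build_layer_sizes(input_size, hidden_size, layers):
    hidden_size = hidden_size if isinstance(hidden_size, list) else [hidden_size]
    if len(hidden_size) == 1:
        return [input_size] + hidden_size * layers
    assert len(hidden_size) == layers, "Hiddens sizes and number of hidden layers mismatch."
    return [input_size] + hidden_size

print(build_layer_sizes(784, 1000, layers=2))        # [784, 1000, 1000]
print(build_layer_sizes(784, [500, 250], layers=2))  # [784, 500, 250]
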
def __init__(self, inputs=None, noise='dropout', noise_level=0.5,
             noise_decay=False, noise_decay_amount=0.99,
             mrg=RNG_MRG.MRG_RandomStreams(1), switch=True):
    """
    Parameters
    ----------
    inputs : tuple(shape, `Theano.TensorType`)
        tuple(shape, `Theano.TensorType`) describing the inputs to use for this layer.
        `shape` will be a monad tuple representing known sizes for each dimension in the
        `Theano.TensorType`. The length of `shape` should be equal to number of dimensions in
        `Theano.TensorType`, where the shape element is an integer representing the size for its
        dimension, or None if the shape isn't known. For example, if you have a matrix with unknown
        batch size but fixed feature size of 784, `shape` would be: (None, 784). The full form of
        `inputs` would be: [((None, 784), <TensorType(float32, matrix)>)].
    noise : str
        What type of noise to use for the output. See opendeep.utils.noise for options. This should be
        appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc.
    noise_level : float
        The amount of noise to use for the noise function specified by `noise`. This could be the
        standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
    noise_decay : str or False
        Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so,
        the string input specifies what type of decay to use. See opendeep.utils.decay for options.
        Noise decay (known as noise scheduling) effectively helps the model learn larger variance features
        first, and then smaller ones later (almost as a kind of curriculum learning). May help it
        converge faster.
    noise_decay_amount : float
        The amount to reduce the `noise_level` after each training epoch based on the decay function
        specified in `noise_decay`.
    mrg : random
        A random number generator that is used when adding noise.
        I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    switch : boolean
        Whether to create a switch to turn noise on during training and off during testing (True).
        If False, noise will be applied at both training and testing times.
    """
    super(Noise, self).__init__(inputs=inputs, outputs=inputs[0],
                                noise=noise, noise_level=noise_level,
                                noise_decay=noise_decay, noise_decay_amount=noise_decay_amount,
                                mrg=mrg, switch=switch)
    # self.inputs is a list from superclass initialization, grab the first element
    self.inputs = self.inputs[0][1]

    log.debug('Adding %s noise switch.' % str(noise))
    if noise_level is not None:
        noise_level = sharedX(value=noise_level)
        noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
    else:
        noise_func = get_noise(noise, mrg=mrg)

    # apply the noise as a switch!
    # default to apply noise. this is for the cost and gradient functions to be computed later
    # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
    if switch:
        self.noise_switch = sharedX(value=1, name="noise_switch")

    # noise scheduling
    if noise_decay and noise_level is not None:
        self.noise_schedule = get_decay_function(noise_decay, noise_level,
                                                 noise_level.get_value(), noise_decay_amount)

    # apply noise to the inputs!
    if switch:
        self.outputs = Tswitch(self.noise_switch,
                               noise_func(input=self.inputs),
                               self.inputs)
    else:
        self.outputs = noise_func(input=self.inputs)

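# A NumPy sketch of the train/test noise switch above: a shared scalar selects
# between the noisy and clean paths, so flipping it to 0 at test time disables
# the noise without rebuilding the graph. Dropout here is the usual masking;
# the p=0.5 value is illustrative.
import numpy as np

def dropout(x, p=0.5, rng=np.random):
    return x * rng.binomial(n=1, p=1 - p, size=x.shape)

x = np.ones((2, 4))
noise_switch = 1                                    # 1 = training, 0 = testing
out = np.where(noise_switch, dropout(x, p=0.5), x)  # analogue of the switch above
print(out)
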
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gru/', input_size=None, hidden_size=None, output_size=None, activation='sigmoid', hidden_activation='relu', inner_hidden_activation='sigmoid', mrg=RNG_MRG.MRG_RandomStreams(1), weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3, bias_init=0.0, r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3, r_bias_init=0.0, cost_function='mse', cost_args=None, noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99, forward=True, clip_recurrent_grads=False): """ Initialize a simple recurrent network. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the shape information (normally the dimensionality of the input i.e. n_in). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets, this will be the initial starting value for hidden layers. params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters. outdir : str The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be saved. input_size : int The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional. hidden_size : int The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional. output_size : int The size (dimensionality) of the output. activation : str or callable The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer. This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The activation to perform for the hidden units. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. inner_hidden_activation : str or callable The activation to perform for the hidden gates. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. mrg : random A random number generator that is used when adding noise. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. 
r_weights_init : str Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options. r_weights_interval : str or float If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options. r_weights_mean : float If Gaussian `r_weights_init`, the mean value to use. r_weights_std : float If Gaussian `r_weights_init`, the standard deviation to use. r_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. cost_function : str or callable The function to use when calculating the output cost of the model. See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. noise : str What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc. noise_level : float The amount of noise to use for the noise function specified by `noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. noise_decay : str or False Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options. Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first, and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster. noise_decay_amount : float The amount to reduce the `noise_level` after each training epoch based on the decay function specified in `noise_decay`. forward : bool The direction this recurrent model should go over its inputs. True means forward, False means backward. clip_recurrent_grads : False or float, optional Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights connecting previous hidden states to the current hidden state, and not the weights from current input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range `+-clip_recurrent_grads`. """ initial_parameters = locals().copy() initial_parameters.pop('self') super(GRU, self).__init__(**initial_parameters) ################## # specifications # ################## ######################################### # activation, cost, and noise functions # ######################################### # recurrent hidden activation function! self.hidden_activation_func = get_activation_function( hidden_activation) self.inner_hidden_activation_func = get_activation_function( inner_hidden_activation) # output activation function! activation_func = get_activation_function(activation) # Cost function cost_function = get_cost_function(cost_function) cost_args = cost_args or dict() # Now deal with noise if we added it: if noise: log.debug('Adding %s noise switch.' % str(noise)) if noise_level is not None: noise_level = sharedX(value=noise_level) noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg) else: noise_func = get_noise(noise, mrg=mrg) # apply the noise as a switch! # default to apply noise. 
this is for the cost and gradient functions to be computed later # (not sure if the above statement is accurate such that gradient depends on initial value of switch) self.noise_switch = sharedX(value=1, name="gru_noise_switch") # noise scheduling if noise_decay and noise_level is not None: self.noise_schedule = get_decay_function( noise_decay, noise_level, noise_level.get_value(), noise_decay_amount) ############### # inputs hook # ############### # grab info from the inputs_hook # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension # being the temporal dimension. # input is 3D tensor of (timesteps, batch_size, data_dim) # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D. # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D. if self.inputs_hook is not None: self.input = self.inputs_hook[1] if self.input.ndim == 1: self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2]) self.input_size = 1 elif self.input.ndim == 2: self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1) elif self.input.ndim == 3: pass elif self.input.ndim > 3: self.input = self.input.flatten(3) self.input_size = sum(self.input_size) else: raise NotImplementedError( "Recurrent input with %d dimensions not supported!" % self.input.ndim) xs = self.input else: # Assume input coming from optimizer is (batches, timesteps, data) # so, we need to reshape to (timesteps, batches, data) self.input = T.tensor3("Xs") xs = self.input.dimshuffle(1, 0, 2) # The target outputs for supervised training - in the form of (batches, timesteps, output) which is # the same dimension ordering as the expected input from optimizer. # therefore, we need to swap it like we did to input xs. self.target = T.tensor3("Ys") ys = self.target.dimshuffle(1, 0, 2) ################ # hiddens hook # ################ # set an initial value for the recurrent hiddens from hook if self.hiddens_hook is not None: h_init = self.hiddens_hook[1] self.hidden_size = self.hiddens_hook[0] else: # deal with h_init after parameters are made (have to make the same size as hiddens that are computed) self.hidden_size = hidden_size ################## # for generating # ################## # symbolic scalar for how many recurrent steps to use during generation from the model self.n_steps = T.iscalar("generate_n_steps") #################################################### # parameters - make sure to deal with params_hook! 
# #################################################### if self.params_hook is not None: (W_x_z, W_x_r, W_x_h, U_h_z, U_h_r, U_h_h, W_h_y, b_z, b_r, b_h, b_y) = self.params_hook recurrent_params = [U_h_z, U_h_r, U_h_h] # otherwise, construct our params else: # all input-to-hidden weights W_x_z, W_x_r, W_x_h = [ get_weights( weights_init=weights_init, shape=(self.input_size, self.hidden_size), name="W_x_%s" % sub, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for sub in ['z', 'r', 'h'] ] # all hidden-to-hidden weights U_h_z, U_h_r, U_h_h = [ get_weights( weights_init=r_weights_init, shape=(self.hidden_size, self.hidden_size), name="U_h_%s" % sub, # if gaussian mean=r_weights_mean, std=r_weights_std, # if uniform interval=r_weights_interval) for sub in ['z', 'r', 'h'] ] # hidden-to-output weights W_h_y = get_weights( weights_init=weights_init, shape=(self.hidden_size, self.output_size), name="W_h_y", # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) # biases b_z, b_r, b_h = [ get_bias(shape=(self.hidden_size, ), name="b_%s" % sub, init_values=r_bias_init) for sub in ['z', 'r', 'h'] ] # output bias b_y = get_bias(shape=(self.output_size, ), name="b_y", init_values=bias_init) # clip gradients if we are doing that recurrent_params = [U_h_z, U_h_r, U_h_h] if clip_recurrent_grads: clip = abs(clip_recurrent_grads) U_h_z, U_h_r, U_h_h = [ theano.gradient.grad_clip(p, -clip, clip) for p in recurrent_params ] # put all the parameters into our list, and make sure it is in the same order as when we try to load # them from a params_hook!!! self.params = [W_x_z, W_x_r, W_x_h ] + recurrent_params + [W_h_y, b_z, b_r, b_h, b_y] # make h_init the right sized tensor if not self.hiddens_hook: h_init = T.zeros_like(T.dot(xs[0], W_x_h)) ############### # computation # ############### # move some computation outside of scan to speed it up! x_z = T.dot(xs, W_x_z) + b_z x_r = T.dot(xs, W_x_r) + b_r x_h = T.dot(xs, W_x_h) + b_h # now do the recurrent stuff self.hiddens, self.updates = theano.scan( fn=self.recurrent_step, sequences=[x_z, x_r, x_h], outputs_info=[h_init], non_sequences=[U_h_z, U_h_r, U_h_h], go_backwards=not forward, name="gru_scan", strict=True) # add noise (like dropout) if we wanted it! if noise: self.hiddens = T.switch(self.noise_switch, noise_func(input=self.hiddens), self.hiddens) # now compute the outputs from the leftover (top level) hiddens self.output = activation_func(T.dot(self.hiddens, W_h_y) + b_y) # now to define the cost of the model - use the cost function to compare our output with the target value. self.cost = cost_function(output=self.output, target=ys, **cost_args) log.info("Initialized a GRU!")
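# --- Sketch (added for illustration): the scan above calls self.recurrent_step,
# which is defined elsewhere in the class. This is a plausible implementation
# following the standard GRU equations, not necessarily the class's exact code.
# The input projections x_*_t already include the biases (computed before scan).
def recurrent_step(self, x_z_t, x_r_t, x_h_t, h_tm1, U_h_z, U_h_r, U_h_h):
    # update gate: how much of the candidate state replaces the old state
    z_t = self.inner_hidden_activation_func(x_z_t + T.dot(h_tm1, U_h_z))
    # reset gate: how much of the previous state feeds the candidate
    r_t = self.inner_hidden_activation_func(x_r_t + T.dot(h_tm1, U_h_r))
    # candidate hidden state, with the previous state gated by r_t
    h_tilde = self.hidden_activation_func(x_h_t + T.dot(r_t * h_tm1, U_h_h))
    # interpolate between the previous state and the candidate
    return (1. - z_t) * h_tm1 + z_t * h_tilde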
def __init__(self, dataset, loss=None, model=None, epochs=1000, batch_size=100, min_batch_size=1, save_freq=10, stop_threshold=None, stop_patience=50, learning_rate=1e-3, lr_decay=None, lr_decay_factor=None, grad_clip=None, hard_clip=False, **kwargs): """ Initialize the Optimizer. Parameters ---------- dataset : Dataset The :class:`opendeep.data.Dataset` to use when training the Model. loss : Loss The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result. model : Model The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a Model's .train() method. epochs : int How many training iterations over the dataset to perform. batch_size : int How many examples from the training dataset to use in parallel. min_batch_size : int The minimum number of examples required at a time (for things like time series, this would be > 1). save_freq : int, optional How many epochs to train between each new save of the Model's parameters. stop_threshold : float, optional The factor by which the best validation training score needs to improve to determine early stopping. stop_patience : int, optional The patience or number of epochs to wait after the stop_threshold has been reached before stopping. learning_rate : float The multiplicative amount to adjust parameters based on their gradient values. lr_decay : str The decay function to use for changing the learning rate over epochs. See `opendeep.utils.decay` for classes of decay and documentation. lr_decay_factor : float The amount of decay to use for the ``lr_decay`` type of decay. grad_clip : float, optional Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff or rescaling. hard_clip : bool Whether to use a hard cutoff or rescaling for clipping gradients. """ log.info("Initializing optimizer %s", str(self.__class__.__name__)) # Deal with early stopping None initializations (no early stopping). if not stop_threshold: stop_threshold = numpy.inf if not save_freq: save_freq = 1000000 if not stop_patience: stop_patience = 1 # Put all init parameters in self.args so we can log the initial configuration. self.args = locals().copy() self.args.pop('self') kwargs = self.args.pop('kwargs') self.args = add_kwargs_to_dict(kwargs, self.args) # log the arguments log.info("Optimizer config args: %s", str(self.args)) # if the optimizer wasn't initialized with a Model (train() being called from the model class itself), # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't # the best option because other methods besides stochastic ones can exist for optimizers in the future.) # TODO: fix this up - feels like a hack just to make model.train() work... if not model: return # Otherwise, things are proceeding as normal. Carry on... assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \ "Found %s" % str(model.__class__.__name__) assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \ "Found %s" % str(dataset.__class__.__name__) # deal with loss expression/targets if loss is not None: assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! 
" \ "Found %s" % str(loss.__class__.__name__) if isinstance(loss, Loss): self.loss_targets = loss.get_targets() self.loss_expression = loss.get_loss() else: assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented." if isinstance(model.get_loss(), tuple): self.loss_targets = raise_to_list(model.get_loss()[0]) self.loss_expression = model.get_loss()[1] else: self.loss_targets = None self.loss_expression = model.get_loss() model_inputs = raise_to_list(model.get_inputs()) n_model_inputs = len(model_inputs) model_targets = self.loss_targets or [] for input in model_inputs: if input in model_targets: model_targets.remove(input) n_model_targets = len(model_targets) self.unsupervised = (n_model_targets is 0) # make sure the number of inputs/targets matches up with the dataset properties # train assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \ "Dataset has %d train inputs, while model expects %d" % \ (len(raise_to_list(dataset.train_inputs)), n_model_inputs) if not self.unsupervised: assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \ "Dataset has %d train targets, while model expects %d" % \ (len(raise_to_list(dataset.train_targets) or []), n_model_targets) # valid if dataset.valid_inputs is not None: assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \ "Dataset has %d valid inputs, while model expects %d" % \ (len(raise_to_list(dataset.valid_inputs)), n_model_inputs) if not self.unsupervised: assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \ "Dataset has %d valid targets, while model expects %d" % \ (len(raise_to_list(dataset.valid_targets) or []), n_model_targets) # test if dataset.test_inputs is not None: assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \ "Dataset has %d test inputs, while model expects %d" % \ (len(raise_to_list(dataset.test_inputs)), n_model_inputs) if not self.unsupervised: assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \ "Dataset has %d test targets, while model expects %d" % \ (len(raise_to_list(dataset.test_targets) or []), n_model_targets) # now we are happy, we can add them to `self` self.model = model self.dataset = dataset self.loss = loss # Learning rate - how drastic of a step do the parameters change self.learning_rate = sharedX(learning_rate, 'learning_rate') # whether to scale individual model parameters' learning rates. self.lr_scalers = self.model.get_lr_scalers() # whether to decay if lr_decay: self.learning_rate_decay = get_decay_function(lr_decay, self.learning_rate, learning_rate, lr_decay_factor) else: self.learning_rate_decay = False # rest of initial parameters needed for training. self.batch_size = batch_size self.min_batch_size = min_batch_size self.n_epoch = epochs self.save_frequency = save_freq self.early_stop_threshold = stop_threshold self.early_stop_length = stop_patience self.grad_clip = grad_clip self.hard_clip = hard_clip
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/rnn/', input_size=None, hidden_size=None, output_size=None, layers=1, activation='sigmoid', hidden_activation='relu', mrg=RNG_MRG.MRG_RandomStreams(1), weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3, bias_init=0.0, r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3, r_bias_init=0.0, cost_function='mse', cost_args=None, noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99, direction='forward', clip_recurrent_grads=False): """ Initialize a simple recurrent network. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the shape information (normally the dimensionality of the input i.e. n_in). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets, this will be the initial starting value for hidden layers. params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters. outdir : str The location to produce outputs from training or running the :class:`RNN`. If None, nothing will be saved. input_size : int The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional. hidden_size : int The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional. output_size : int The size (dimensionality) of the output. layers : int The number of stacked hidden layers to use. activation : str or callable The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer. This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The activation to perform for the hidden layers. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. mrg : random A random number generator that is used when adding noise. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. r_weights_init : str Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options. r_weights_interval : str or float If Uniform `r_weights_init`, the +- interval to use. 
See opendeep.utils.nnet for options. r_weights_mean : float If Gaussian `r_weights_init`, the mean value to use. r_weights_std : float If Gaussian `r_weights_init`, the standard deviation to use. r_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. cost_function : str or callable The function to use when calculating the output cost of the model. See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. noise : str What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc. noise_level : float The amount of noise to use for the noise function specified by `noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. noise_decay : str or False Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options. Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first, and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster. noise_decay_amount : float The amount to reduce the `noise_level` after each training epoch based on the decay function specified in `noise_decay`. direction : str The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or 'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence, computing two sets of hiddens and merging them before running through the final decoder. clip_recurrent_grads : False or float, optional Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights connecting previous hidden states to the current hidden state, and not the weights from current input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range `+-clip_recurrent_grads`. Raises ------ AssertionError When asserting various properties of input parameters. See error messages. """ initial_parameters = locals().copy() initial_parameters.pop('self') super(RNN, self).__init__(**initial_parameters) ################## # specifications # ################## self.direction = direction self.bidirectional = (direction == "bidirectional") self.backward = (direction == "backward") self.layers = layers self.noise = noise self.weights_init = weights_init self.weights_mean = weights_mean self.weights_std = weights_std self.weights_interval = weights_interval self.r_weights_init = r_weights_init self.r_weights_mean = r_weights_mean self.r_weights_std = r_weights_std self.r_weights_interval = r_weights_interval self.bias_init = bias_init self.r_bias_init = r_bias_init ######################################### # activation, cost, and noise functions # ######################################### # recurrent hidden activation function! self.hidden_activation_func = get_activation_function( hidden_activation) # output activation function! 
self.activation_func = get_activation_function(activation) # Cost function self.cost_function = get_cost_function(cost_function) self.cost_args = cost_args or dict() # Now deal with noise if we added it: if self.noise: log.debug('Adding %s noise switch.' % str(noise)) if noise_level is not None: noise_level = sharedX(value=noise_level) self.noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg) else: self.noise_func = get_noise(noise, mrg=mrg) # apply the noise as a switch! # default to apply noise. this is for the cost and gradient functions to be computed later # (not sure if the above statement is accurate such that gradient depends on initial value of switch) self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch") # noise scheduling if noise_decay and noise_level is not None: self.noise_schedule = get_decay_function( noise_decay, noise_level, noise_level.get_value(), noise_decay_amount) ############### # inputs hook # ############### # grab info from the inputs_hook # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension # being the temporal dimension. # input is 3D tensor of (timesteps, batch_size, data_dim) # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D. # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D. if self.inputs_hook is not None: self.input = self.inputs_hook[1] if self.input.ndim == 1: self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2]) self.input_size = 1 elif self.input.ndim == 2: self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1) elif self.input.ndim == 3: pass elif self.input.ndim > 3: self.input = self.input.flatten(3) self.input_size = sum(self.input_size) else: raise NotImplementedError( "Recurrent input with %d dimensions not supported!" % self.input.ndim) else: # Assume input coming from optimizer is (batches, timesteps, data) # so, we need to reshape to (timesteps, batches, data) xs = T.tensor3("Xs") xs = xs.dimshuffle(1, 0, 2) self.input = xs # The target outputs for supervised training - in the form of (batches, timesteps, output) which is # the same dimension ordering as the expected input from optimizer. # therefore, we need to swap it like we did to input xs. ys = T.tensor3("Ys") ys = ys.dimshuffle(1, 0, 2) self.target = ys ################ # hiddens hook # ################ # set an initial value for the recurrent hiddens from hook if self.hiddens_hook is not None: self.h_init = self.hiddens_hook[1] self.hidden_size = self.hiddens_hook[0] else: # deal with h_init after parameters are made (have to make the same size as hiddens that are computed) self.hidden_size = hidden_size ################## # for generating # ################## # symbolic scalar for how many recurrent steps to use during generation from the model self.n_steps = T.iscalar("generate_n_steps") self.output, self.hiddens, self.updates, self.cost, self.params = self.build_computation_graph( )
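# --- Sketch (added for illustration): the docstring above says 'bidirectional'
# makes two passes and merges the hiddens before the final decoder. A minimal
# version of that merge; build_computation_graph's actual logic may differ.
# `step`, `x_proj`, and `h_init` stand in for the recurrent step function, the
# precomputed input projections, and the initial hidden state.
def bidirectional_hiddens(step, x_proj, h_init):
    fwd, _ = theano.scan(fn=step, sequences=[x_proj], outputs_info=[h_init])
    bwd, _ = theano.scan(fn=step, sequences=[x_proj], outputs_info=[h_init], go_backwards=True)
    # go_backwards=True yields outputs in reversed time order; flip before merging
    return fwd + bwd[::-1]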
def __init__( self, inputs_hook=None, params_hook=None, outdir="outputs/basic", input_size=None, output_size=None, activation="rectifier", cost="mse", cost_args=None, weights_init="uniform", weights_mean=0, weights_std=5e-3, weights_interval="montreal", bias_init=0.0, noise=None, noise_level=None, mrg=RNG_MRG.MRG_RandomStreams(1), **kwargs ): """ Initialize a basic layer. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together. For now, it needs to include the shape information (normally the dimensionality of the input i.e. input_size). params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters - such as a training model with dropout applied to layers and one without for testing, where the parameters are shared between the two. outdir : str The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will be saved. input_size : int The size (dimensionality) of the input to the layer. If shape is provided in `inputs_hook`, this is optional. output_size : int The size (dimensionality) of the output from the layer. activation : str or callable The activation function to use after the dot product going from input -> output. This can be a string representing an option from opendeep.utils.activation, or your own function as long as it is callable. cost : str or callable The cost function to use when training the layer. This should be appropriate for the output type, i.e. mse for real-valued outputs, binary cross-entropy for binary outputs, etc. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. weights_init : str Determines the method for initializing input -> output weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. noise : str What type of noise to use for corrupting the output (if not None). See opendeep.utils.noise for options. This should be appropriate for the output activation, i.e. Gaussian for tanh or other real-valued activations, etc. Often, you will use 'dropout' here as a regularization in BasicLayers. noise_level : float The amount of noise to use for the noise function specified by `noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. mrg : random A random number generator that is used when adding noise. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. """ # init Model to combine the defaults and config dictionaries with the initial parameters. 
initial_parameters = locals().copy() initial_parameters.pop("self") super(Dense, self).__init__(**initial_parameters) ################## # specifications # ################## # grab info from the inputs_hook, or from parameters if inputs_hook is not None: # inputs_hook is a tuple of (Shape, Input) assert len(inputs_hook) == 2, "Expected inputs_hook to be tuple!" # make sure inputs_hook is a tuple self.input = inputs_hook[1] else: # make the input a symbolic matrix self.input = T.matrix("X") # now that we have the input specs, define the output 'target' variable to be used in supervised training! if kwargs.get("out_as_probs") == False: self.target = T.vector("Y", dtype="int64") else: self.target = T.matrix("Y") # either grab the output's desired size from the parameter directly, or copy input_size self.output_size = self.output_size or self.input_size # other specifications # activation function! activation_func = get_activation_function(activation) # cost function! cost_func = get_cost_function(cost) cost_args = cost_args or dict() #################################################### # parameters - make sure to deal with params_hook! # #################################################### if params_hook is not None: # make sure the params_hook has W (weights matrix) and b (bias vector) assert len(params_hook) == 2, "Expected 2 params (W and b) for Dense, found {0!s}!".format(len(params_hook)) W, b = params_hook else: W = get_weights( weights_init=weights_init, shape=(self.input_size, self.output_size), name="W", rng=mrg, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval, ) # grab the bias vector - use self.output_size, since the local output_size may be None when the size was derived from the input b = get_bias(shape=(self.output_size,), name="b", init_values=bias_init) # Finally have the two parameters - weights matrix W and bias vector b. That is all! self.params = [W, b] ############### # computation # ############### # Here is the meat of the computation transforming input -> output # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing # the result through our activation function (normally something nonlinear such as: max(0, output)) self.output = activation_func(T.dot(self.input, W) + b) # Now deal with noise if we added it: if noise: log.debug("Adding noise switch.") if noise_level is not None: noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg) else: noise_func = get_noise(noise, mrg=mrg) # apply the noise as a switch! # default to apply noise. this is for the cost and gradient functions to be computed later # (not sure if the above statement is accurate such that gradient depends on initial value of switch) self.switch = sharedX(value=1, name="basiclayer_noise_switch") self.output = T.switch(self.switch, noise_func(input=self.output), self.output) # now to define the cost of the model - use the cost function to compare our output with the target value. self.cost = cost_func(output=self.output, target=self.target, **cost_args) log.debug( "Initialized a basic fully-connected layer with shape %s and activation: %s", str((self.input_size, self.output_size)), str(activation), )
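# --- Usage sketch (added for illustration; the import path is an assumption):
from opendeep.layers.basic import Dense  # assumed module path

layer = Dense(input_size=784, output_size=256, activation="rectifier", noise="dropout", noise_level=0.5)
# layer.output is the symbolic activation, layer.params holds [W, b],
# and layer.switch can be zeroed to disable the dropout at test time.
layer.switch.set_value(0.)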
def get_updates(self, gradients): """ Compute AdaSecant updates (see the paper for details). Parameters ---------- gradients : dict A dictionary mapping from the model's parameters to their gradients. Returns ------- OrderedDict A dictionary mapping from the old model parameters to their new values after a single iteration of the learning rule. """ updates = OrderedDict({}) eps = self.damping step = sharedX(0., name="step") if self.skip_nan_inf: #If norm of the gradients of a parameter is inf or nan don't update that parameter #That might be useful for RNNs. gradients = OrderedDict({ p: T.switch(T.or_(T.isinf(gradients[p]), T.isnan(gradients[p])), 0, gradients[p]) for p in gradients.keys() }) #Block-normalize gradients: gradients = OrderedDict({ p: gradients[p] / (gradients[p].norm(2) + eps) for p in gradients.keys() }) # nparams = len(gradients.keys()) # # #Apply the gradient clipping, this is only necessary for RNNs and sometimes for very deep # #networks # if self.grad_clip: # gnorm = sum([g.norm(2) for g in gradients.values()]) # # gradients = OrderedDict({p: T.switch(gnorm/nparams > self.grad_clip, # g * self.grad_clip * nparams / gnorm , g)\ # for p, g in gradients.items()}) for param in gradients.keys(): gradients[param].name = "grad_%s" % param.name mean_grad = sharedX(param.get_value() * 0. + eps, name="mean_grad_%s" % param.name) # mean_corrected_grad = sharedX(param.get_value() * 0 + eps, name="mean_corrected_grad_%s" % param.name) slow_constant = 2.1 if self.use_adagrad: # sum_square_grad := \sum_i g_i^2 sum_square_grad = sharedX(param.get_value(borrow=True) * 0., name="sum_square_grad_%s" % param.name) """ Initialization of accumulators """ taus_x_t = sharedX( (numpy.ones_like(param.get_value()) + eps) * slow_constant, name="taus_x_t_" + param.name) self.taus_x_t = taus_x_t #Variance reduction parameters #Numerator of the gamma: gamma_nume_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps, name="gamma_nume_sqr_" + param.name) #Denominator of the gamma: gamma_deno_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps, name="gamma_deno_sqr_" + param.name) #For the covariance parameter := E[\gamma \alpha]_{t-1} cov_num_t = sharedX(numpy.zeros_like(param.get_value()) + eps, name="cov_num_t_" + param.name) # mean_squared_grad := E[g^2]_{t-1} mean_square_grad = sharedX(numpy.zeros_like(param.get_value()) + eps, name="msg_" + param.name) # mean_square_dx := E[(\Delta x)^2]_{t-1} mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name) if self.use_corrected_grad: old_grad = sharedX(param.get_value() * 0. + eps) #The uncorrected gradient of the previous update: old_plain_grad = sharedX(param.get_value() * 0. + eps) mean_curvature = sharedX(param.get_value() * 0. + eps) mean_curvature_sqr = sharedX(param.get_value() * 0. + eps) # Initialize the E[\Delta]_{t-1} mean_dx = sharedX(param.get_value() * 0.) # Block-wise normalize the gradient: norm_grad = gradients[param] #For the first time-step, assume that delta_x_t := norm_grad cond = T.eq(step, 0) msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx mdx = cond * norm_grad + (1 - cond) * mean_dx """ Compute the new updated values. 
""" # E[g_i^2]_t new_mean_squared_grad = (mean_square_grad * self.decay + T.sqr(norm_grad) * (1 - self.decay)) new_mean_squared_grad.name = "msg_" + param.name # E[g_i]_t new_mean_grad = (mean_grad * self.decay + norm_grad * (1 - self.decay)) new_mean_grad.name = "nmg_" + param.name mg = new_mean_grad mgsq = new_mean_squared_grad # Keep the rms for numerator and denominator of gamma. new_gamma_nume_sqr = (gamma_nume_sqr * (1 - 1 / taus_x_t) + T.sqr( (norm_grad - old_grad) * (old_grad - mg)) / taus_x_t) new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name new_gamma_deno_sqr = (gamma_deno_sqr * (1 - 1 / taus_x_t) + T.sqr( (mg - norm_grad) * (old_grad - mg)) / taus_x_t) new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name gamma = T.sqrt(gamma_nume_sqr) / T.sqrt(gamma_deno_sqr + eps) gamma.name = "gamma_" + param.name if self.gamma_clip: gamma = T.minimum(gamma, self.gamma_clip) momentum_step = gamma * mg corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma) #For starting the variance reduction. if self.start_var_reduction > -1: cond = T.le(self.start_var_reduction, step) corrected_grad = cond * corrected_grad_cand + ( 1 - cond) * norm_grad else: corrected_grad = norm_grad new_sum_squared_grad = None if self.use_adagrad: g = corrected_grad # Accumulate gradient new_sum_squared_grad = (sum_square_grad + T.sqr(g)) rms_g_t = T.sqrt(new_sum_squared_grad) rms_g_t = T.maximum(rms_g_t, 1.0) # Use the gradients from the previous update # to compute the \nabla f(x_t) - \nabla f(x_{t-1}) cur_curvature = norm_grad - old_plain_grad cur_curvature_sqr = T.sqr(cur_curvature) new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) + (cur_curvature / taus_x_t)) new_curvature_ave.name = "ncurve_ave_" + param.name #Average average curvature nc_ave = new_curvature_ave new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) + (cur_curvature_sqr / taus_x_t)) new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name #Unbiased average squared curvature nc_sq_ave = new_curvature_sqr_ave epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate scaled_lr = self.lr_scalers.get(param, 1.) * sharedX(1.0) rms_dx_tm1 = T.sqrt(msdx + epsilon) rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon) #This is where the update step is being defined delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t / (new_curvature_sqr_ave + epsilon)) delta_x_t.name = "delta_x_t_" + param.name # This part seems to be necessary for only RNNs # For feedforward networks this does not seem to be important. 
if self.delta_clip: log.info("Clipping will be applied on the adaptive step size.") delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip) if self.use_adagrad: delta_x_t = delta_x_t * corrected_grad / rms_g_t else: log.info("Clipped adagrad is disabled.") delta_x_t = delta_x_t * corrected_grad else: log.info( "Clipping will not be applied on the adaptive step size.") if self.use_adagrad: delta_x_t = delta_x_t * corrected_grad / rms_g_t else: log.info("Clipped adagrad will not be used.") delta_x_t = delta_x_t * corrected_grad new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX( 1 + eps, "stabilized") #To compute the E[\Delta^2]_t new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) + (T.sqr(delta_x_t) / taus_x_t)) #To compute the E[\Delta]_t new_mean_dx = (mean_dx * (1 - 1 / taus_x_t) + (delta_x_t / (taus_x_t))) #Perform the outlier detection: #This outlier detection is slightly different: new_taus_t = T.switch( T.or_( abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)), abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))), sharedX(2.2), new_taus_t) #Apply the bound constraints on tau: new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t) new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t) new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) + (delta_x_t * cur_curvature) * (1 / taus_x_t)) update_step = delta_x_t # Apply updates updates[mean_square_grad] = new_mean_squared_grad updates[mean_square_dx] = new_mean_square_dx updates[mean_dx] = new_mean_dx updates[gamma_nume_sqr] = new_gamma_nume_sqr updates[gamma_deno_sqr] = new_gamma_deno_sqr updates[taus_x_t] = new_taus_t updates[cov_num_t] = new_cov_num_t updates[mean_grad] = new_mean_grad updates[old_plain_grad] = norm_grad updates[mean_curvature] = new_curvature_ave updates[mean_curvature_sqr] = new_curvature_sqr_ave updates[param] = param + update_step updates[step] = step + 1 if self.use_adagrad: updates[sum_square_grad] = new_sum_squared_grad if self.use_corrected_grad: updates[old_grad] = corrected_grad return updates
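# --- Restating the step computed above for readability (no new logic). Per
# parameter, with running averages E[.] maintained by the adaptive time
# constants taus_x_t:
#
#   delta_x_t = -s * ( sqrt(E[dx^2] + eps) / sqrt(E[curv^2] + eps)
#                      - E[dx * curv] / (E[curv^2] + eps) ) * corrected_grad
#
# where curv = grad_t - grad_{t-1} is the secant curvature estimate, s is the
# per-parameter lr scaler, eps is the lr-scaled learning_rate entering through
# `epsilon`, and corrected_grad blends the block-normalized gradient with the
# gamma-weighted momentum term once variance reduction has started (divided
# further by rms_g_t when use_adagrad is set).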
def __init__(self, dataset, model=None, epochs=10, batch_size=100, min_batch_size=1, save_freq=None, stop_threshold=None, stop_patience=None, learning_rate=1e-6, lr_decay=None, lr_decay_factor=None, decay=0.95, gamma_clip=1.8, damping=1e-7, grad_clip=None, hard_clip=False, start_var_reduction=0, delta_clip=None, use_adagrad=False, skip_nan_inf=False, upper_bound_tau=1e8, lower_bound_tau=1.5, use_corrected_grad=True): """ Initialize AdaSecant. Parameters ---------- dataset : Dataset The Dataset to use when training the Model. model : Model The Model to train. Needed if the Optimizer isn't being passed to a Model's .train() method. epochs : int How many training iterations over the dataset to perform. batch_size : int How many examples from the training dataset to use in parallel. min_batch_size : int The minimum number of examples required at a time (for things like time series, this would be > 1). save_freq : int How many epochs to train between each new save of the Model's parameters. stop_threshold : float The factor by which the best validation training score needs to improve to determine early stopping. stop_patience : int The patience or number of epochs to wait after the stop_threshold has been reached before stopping. learning_rate : float The multiplicative amount to adjust parameters based on their gradient values. lr_decay : str The type of decay function to use for changing the learning rate over epochs. See `opendeep.utils.decay` for options. lr_decay_factor : float The amount to use for the decay function when changing the learning rate over epochs. See `opendeep.utils.decay` for its effect for given decay functions. decay : float, optional Decay rate :math:`\\rho` in Algorithm 1 of the aforementioned paper. A decay of 0.95 seems to work fine for several tasks. gamma_clip : float, optional The clipping threshold for the gamma. In general 1.8 seems to work fine for several tasks. start_var_reduction : float, optional The number of updates after which the variance reduction should start. delta_clip : float, optional The threshold to clip the deltas after. grad_clip : float, optional Apply gradient clipping for RNNs (not necessary for feedforward networks). This is a constraint on the norm of the gradient per layer. hard_clip : bool Whether to use a hard cutoff or rescaling for clipping gradients. use_adagrad : bool, optional Whether to use clipped adagrad. use_corrected_grad : bool, optional Whether to use the corrected gradient (referred to as variance reduction in the workshop paper). """ # get everything together with the Optimizer class initial_parameters = locals().copy() initial_parameters.pop('self') super(AdaSecant, self).__init__(**initial_parameters) assert decay >= 0., "Decay needs to be >=0." assert decay < 1., "Decay needs to be <1." self.decay = sharedX(decay, "decay") self.damping = damping self.skip_nan_inf = skip_nan_inf # if grad_clip: # assert grad_clip > 0. # assert grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1." # self.grad_clip = grad_clip self.use_adagrad = use_adagrad self.use_corrected_grad = use_corrected_grad self.gamma_clip = gamma_clip self.start_var_reduction = start_var_reduction self.delta_clip = delta_clip # We have to bound the tau to prevent it from growing to an arbitrarily large number, # otherwise that causes numerical instabilities for very deep networks. Note that once # tau becomes very large, it will keep increasing indefinitely. self.lower_bound_tau = lower_bound_tau self.upper_bound_tau = upper_bound_tau
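# --- Usage sketch (added for illustration): wiring AdaSecant to a Dataset and
# Model. The import path and the train() entry point are assumptions about the
# surrounding library, not guaranteed API.
def train_with_adasecant(dataset, model):
    from opendeep.optimization.adasecant import AdaSecant  # assumed module path
    optimizer = AdaSecant(dataset=dataset, model=model, epochs=10, batch_size=100,
                          learning_rate=1e-6, use_adagrad=False)
    optimizer.train()  # assuming the Optimizer base class exposes the training loop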
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/lstm/', input_size=None, hidden_size=None, output_size=None, activation='sigmoid', hidden_activation='relu', inner_hidden_activation='sigmoid', mrg=RNG_MRG.MRG_RandomStreams(1), weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3, bias_init=0.0, r_weights_init='identity', r_weights_interval='montreal', r_weights_mean=0, r_weights_std=5e-3, r_bias_init=0.0, cost_function='mse', cost_args=None, noise='dropout', noise_level=None, noise_decay=False, noise_decay_amount=.99, direction='forward', clip_recurrent_grads=False): """ Initialize a long short-term memory (LSTM) network. Parameters ---------- inputs_hook : Tuple of (shape, variable) Routing information for the model to accept inputs from elsewhere. This is used for linking different models together (e.g. setting the Softmax model's input layer to the DAE's hidden layer gives a newly supervised classification model). For now, it needs to include the shape information (normally the dimensionality of the input i.e. n_in). hiddens_hook : Tuple of (shape, variable) Routing information for the model to accept its hidden representation from elsewhere. For recurrent nets, this will be the initial starting value for hidden layers. params_hook : List(theano shared variable) A list of model parameters (shared theano variables) that you should use when constructing this model (instead of initializing your own shared variables). This parameter is useful when you want to have two versions of the model that use the same parameters. outdir : str The location to produce outputs from training or running the :class:`LSTM`. If None, nothing will be saved. input_size : int The size (dimensionality) of the input. If shape is provided in `inputs_hook`, this is optional. hidden_size : int The size (dimensionality) of the hidden layers. If shape is provided in `hiddens_hook`, this is optional. output_size : int The size (dimensionality) of the output. activation : str or callable The nonlinear (or linear) activation to perform after the dot product from hiddens -> output layer. This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. hidden_activation : str or callable The activation to perform for the hidden units. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. inner_hidden_activation : str or callable The activation to perform for the hidden gates. See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass your own function to be used as long as it is callable. mrg : random A random number generator that is used when adding noise. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams. weights_init : str Determines the method for initializing model weights. See opendeep.utils.nnet for options. weights_interval : str or float If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options. weights_mean : float If Gaussian `weights_init`, the mean value to use. weights_std : float If Gaussian `weights_init`, the standard deviation to use. bias_init : float The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred. 
r_weights_init : str Determines the method for initializing recurrent model weights. See opendeep.utils.nnet for options. r_weights_interval : str or float If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options. r_weights_mean : float If Gaussian `r_weights_init`, the mean value to use. r_weights_std : float If Gaussian `r_weights_init`, the standard deviation to use. r_bias_init : float The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred. cost_function : str or callable The function to use when calculating the output cost of the model. See opendeep.utils.cost for options. You can also specify your own function, which needs to be callable. cost_args : dict Any additional named keyword arguments to pass to the specified `cost_function`. noise : str What type of noise to use for the hidden layers and outputs. See opendeep.utils.noise for options. This should be appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued activations, etc. noise_level : float The amount of noise to use for the noise function specified by `hidden_noise`. This could be the standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc. noise_decay : str or False Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if so, the string input specifies what type of decay to use. See opendeep.utils.decay for options. Noise decay (known as noise scheduling) effectively helps the model learn larger variance features first, and then smaller ones later (almost as a kind of curriculum learning). May help it converge faster. noise_decay_amount : float The amount to reduce the `noise_level` after each training epoch based on the decay function specified in `noise_decay`. direction : str The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or 'bidirectional'. clip_recurrent_grads : False or float, optional Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights connecting previous hidden states to the current hidden state, and not the weights from current input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range `+-clip_recurrent_grads`. """ initial_parameters = locals().copy() initial_parameters.pop('self') super(LSTM, self).__init__(**initial_parameters) ################## # specifications # ################## backward = direction.lower() == 'backward' bidirectional = direction.lower() == 'bidirectional' ######################################### # activation, cost, and noise functions # ######################################### # recurrent hidden activation function! self.hidden_activation_func = get_activation_function(hidden_activation) self.inner_hidden_activation_func = get_activation_function(inner_hidden_activation) # output activation function! activation_func = get_activation_function(activation) # Cost function cost_function = get_cost_function(cost_function) cost_args = cost_args or dict() # Now deal with noise if we added it: if noise: log.debug('Adding %s noise switch.' % str(noise)) if noise_level is not None: noise_level = sharedX(value=noise_level) noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg) else: noise_func = get_noise(noise, mrg=mrg) # apply the noise as a switch! # default to apply noise. 
this is for the cost and gradient functions to be computed later # (not sure if the above statement is accurate such that gradient depends on initial value of switch) self.noise_switch = sharedX(value=1, name="basiclayer_noise_switch") # noise scheduling if noise_decay and noise_level is not None: self.noise_schedule = get_decay_function(noise_decay, noise_level, noise_level.get_value(), noise_decay_amount) ############### # inputs hook # ############### # grab info from the inputs_hook # in the case of an inputs_hook, recurrent will always work with the leading tensor dimension # being the temporal dimension. # input is 3D tensor of (timesteps, batch_size, data_dim) # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D. # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D. if self.inputs_hook is not None: self.input = self.inputs_hook[1] if self.input.ndim == 1: self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2]) self.input_size = 1 elif self.input.ndim == 2: self.input = T.unbroadcast(self.input.dimshuffle(0, 'x', 1), 1) elif self.input.ndim > 3: self.input = self.input.flatten(3) self.input_size = sum(self.input_size) else: raise NotImplementedError("Recurrent input with %d dimensions not supported!" % self.input.ndim) xs = self.input else: # Assume input coming from optimizer is (batches, timesteps, data) # so, we need to reshape to (timesteps, batches, data) self.input = T.tensor3("Xs") xs = self.input.dimshuffle(1, 0, 2) # The target outputs for supervised training - in the form of (batches, timesteps, output) which is # the same dimension ordering as the expected input from optimizer. # therefore, we need to swap it like we did to input xs. self.target = T.tensor3("Ys") ys = self.target.dimshuffle(1, 0, 2) ################ # hiddens hook # ################ # set an initial value for the recurrent hiddens from hook if self.hiddens_hook is not None: h_init = self.hiddens_hook[1] self.hidden_size = self.hiddens_hook[0] else: # deal with h_init after parameters are made (have to make the same size as hiddens that are computed) self.hidden_size = hidden_size ################## # for generating # ################## # symbolic scalar for how many recurrent steps to use during generation from the model self.n_steps = T.iscalar("generate_n_steps") #################################################### # parameters - make sure to deal with params_hook! 
# #################################################### if self.params_hook is not None: if not bidirectional: (W_x_c, W_x_i, W_x_f, W_x_o, U_h_c, U_h_i, U_h_f, U_h_o, W_h_y, b_c, b_i, b_f, b_o, b_y) = self.params_hook recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o] else: (W_x_c, W_x_i, W_x_f, W_x_o, U_h_c, U_h_i, U_h_f, U_h_o, U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b, W_h_y, b_c, b_i, b_f, b_o, b_y) = self.params_hook recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o, U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b] # otherwise, construct our params else: # all input-to-hidden weights W_x_c, W_x_i, W_x_f, W_x_o = [ get_weights(weights_init=weights_init, shape=(self.input_size, self.hidden_size), name="W_x_%s" % sub, # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) for sub in ['c', 'i', 'f', 'o'] ] # all hidden-to-hidden weights U_h_c, U_h_i, U_h_f, U_h_o = [ get_weights(weights_init=r_weights_init, shape=(self.hidden_size, self.hidden_size), name="U_h_%s" % sub, # if gaussian mean=r_weights_mean, std=r_weights_std, # if uniform interval=r_weights_interval) for sub in ['c', 'i', 'f', 'o'] ] # hidden-to-output weights W_h_y = get_weights(weights_init=weights_init, shape=(self.hidden_size, self.output_size), name="W_h_y", # if gaussian mean=weights_mean, std=weights_std, # if uniform interval=weights_interval) # biases b_c, b_i, b_f, b_o = [ get_bias(shape=(self.hidden_size,), name="b_%s" % sub, init_values=r_bias_init) for sub in ['c', 'i', 'f', 'o'] ] # output bias b_y = get_bias(shape=(self.output_size,), name="b_y", init_values=bias_init) # clip gradients if we are doing that recurrent_params = [U_h_c, U_h_i, U_h_f, U_h_o] if clip_recurrent_grads: clip = abs(clip_recurrent_grads) U_h_c, U_h_i, U_h_f, U_h_o = [theano.gradient.grad_clip(p, -clip, clip) for p in recurrent_params] # bidirectional params if bidirectional: # all hidden-to-hidden weights U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b = [ get_weights(weights_init=r_weights_init, shape=(self.hidden_size, self.hidden_size), name="U_h_%s_b" % sub, # if gaussian mean=r_weights_mean, std=r_weights_std, # if uniform interval=r_weights_interval) for sub in ['c', 'i', 'f', 'o'] ] recurrent_params += [U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b] if clip_recurrent_grads: clip = abs(clip_recurrent_grads) U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b = [theano.gradient.grad_clip(p, -clip, clip) for p in [U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b]] # put all the parameters into our list, and make sure it is in the same order as when we try to load # them from a params_hook!!! self.params = [W_x_c, W_x_i, W_x_f, W_x_o] + recurrent_params + [W_h_y, b_c, b_i, b_f, b_o, b_y] # make h_init the right sized tensor if not self.hiddens_hook: h_init = T.zeros_like(T.dot(xs[0], W_x_c)) c_init = T.zeros_like(T.dot(xs[0], W_x_c)) ############### # computation # ############### # move some computation outside of scan to speed it up! x_c = T.dot(xs, W_x_c) + b_c x_i = T.dot(xs, W_x_i) + b_i x_f = T.dot(xs, W_x_f) + b_f x_o = T.dot(xs, W_x_o) + b_o # now do the recurrent stuff (self.hiddens, _), self.updates = theano.scan( fn=self.recurrent_step, sequences=[x_c, x_i, x_f, x_o], outputs_info=[h_init, c_init], non_sequences=[U_h_c, U_h_i, U_h_f, U_h_o], go_backwards=backward, name="lstm_scan", strict=True ) # if bidirectional, do the same in reverse! 
if bidirectional: (hiddens_b, _), updates_b = theano.scan( fn=self.recurrent_step, sequences=[x_c, x_i, x_f, x_o], outputs_info=[h_init, c_init], non_sequences=[U_h_c_b, U_h_i_b, U_h_f_b, U_h_o_b], go_backwards=not backward, name="lstm_scan_back", strict=True ) # flip the hiddens to be the right direction hiddens_b = hiddens_b[::-1] # update stuff self.updates.update(updates_b) self.hiddens += hiddens_b # add noise (like dropout) if we wanted it! if noise: self.hiddens = T.switch(self.noise_switch, noise_func(input=self.hiddens), self.hiddens) # now compute the outputs from the leftover (top level) hiddens self.output = activation_func( T.dot(self.hiddens, W_h_y) + b_y ) # now to define the cost of the model - use the cost function to compare our output with the target value. self.cost = cost_function(output=self.output, target=ys, **cost_args) log.info("Initialized an LSTM!")
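# `self.recurrent_step` is referenced by the scan calls above but does not appear in this excerpt.
# Below is a minimal sketch of a standard LSTM step whose argument order matches those scan calls
# (sequences, then outputs_info, then non_sequences); it illustrates the technique and is not
# necessarily the library's exact implementation.
def recurrent_step(self, x_c_t, x_i_t, x_f_t, x_o_t, h_tm1, c_tm1, U_h_c, U_h_i, U_h_f, U_h_o):
    """One LSTM timestep: compute gates, blend the cell state, and emit the new hidden state."""
    # gates apply the inner hidden activation (e.g. sigmoid) to the precomputed input terms
    # plus the recurrent contribution from the previous hidden state
    i_t = self.inner_hidden_activation_func(x_i_t + T.dot(h_tm1, U_h_i))  # input gate
    f_t = self.inner_hidden_activation_func(x_f_t + T.dot(h_tm1, U_h_f))  # forget gate
    o_t = self.inner_hidden_activation_func(x_o_t + T.dot(h_tm1, U_h_o))  # output gate
    # candidate cell value uses the hidden activation (e.g. relu)
    c_candidate = self.hidden_activation_func(x_c_t + T.dot(h_tm1, U_h_c))
    # keep part of the old cell state (forget gate) and mix in the candidate (input gate)
    c_t = f_t * c_tm1 + i_t * c_candidate
    # expose the (squashed) cell state through the output gate
    h_t = o_t * self.hidden_activation_func(c_t)
    return h_t, c_t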
def __init__(self, inputs=None, noise='dropout', noise_level=0.5, noise_decay=False,
             noise_decay_amount=0.99, mrg=RNG_MRG.MRG_RandomStreams(1), switch=True):
    """
    Parameters
    ----------
    inputs : tuple(shape, `Theano.TensorType`)
        tuple(shape, `Theano.TensorType`) describing the inputs to use for this layer.
        `shape` will be a monad tuple representing known sizes for each dimension in the
        `Theano.TensorType`. The length of `shape` should be equal to the number of dimensions in
        `Theano.TensorType`, where each shape element is an integer representing the size for its
        dimension, or None if the shape isn't known. For example, if you have a matrix with an
        unknown batch size but a fixed feature size of 784, `shape` would be: (None, 784).
        The full form of `inputs` would be: [((None, 784), <TensorType(float32, matrix)>)].
    noise : str
        What type of noise to use for the output. See opendeep.utils.noise for options. This should
        be appropriate for the unit activation, i.e. Gaussian for tanh or other real-valued
        activations, etc.
    noise_level : float
        The amount of noise to use for the noise function specified by `noise`. This could be the
        standard deviation for gaussian noise, the interval for uniform noise, the dropout amount, etc.
    noise_decay : str or False
        Whether to use `noise` scheduling (decay `noise_level` during the course of training), and if
        so, the string input specifies what type of decay to use. See opendeep.utils.decay for options.
        Noise decay (known as noise scheduling) effectively helps the model learn larger variance
        features first, and then smaller ones later (almost as a kind of curriculum learning). May
        help it converge faster.
    noise_decay_amount : float
        The amount to reduce the `noise_level` after each training epoch based on the decay function
        specified in `noise_decay`.
    mrg : random
        A random number generator that is used when adding noise.
        I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    switch : bool
        Whether to create a switch to turn noise on during training and off during testing (True).
        If False, noise will be applied at both training and testing times.
    """
    super(Noise, self).__init__(inputs=inputs, noise=noise, noise_level=noise_level,
                                noise_decay=noise_decay, noise_decay_amount=noise_decay_amount,
                                mrg=mrg, switch=switch)
    # self.inputs is a list from superclass initialization, grab the first element
    self.output_size, self.inputs = self.inputs[0]

    log.debug('Adding %s noise switch.' % str(noise))
    if noise_level is not None:
        noise_level = sharedX(value=noise_level)
        noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
    else:
        noise_func = get_noise(noise, mrg=mrg)

    # apply the noise as a switch!
    # default to applying noise - this is for the cost and gradient functions to be computed later
    # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
    if switch:
        self.noise_switch = sharedX(value=1, name="noise_switch")

    # noise scheduling
    if noise_decay and noise_level is not None:
        self.noise_schedule = get_decay_function(noise_decay,
                                                 noise_level,
                                                 noise_level.get_value(),
                                                 noise_decay_amount)

    # apply noise to the inputs!
    if switch:
        self.outputs = T.switch(self.noise_switch,
                                noise_func(input=self.inputs),
                                self.inputs)
    else:
        self.outputs = noise_func(input=self.inputs)
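# A hedged usage sketch of the train/test switch described above. Only the Noise layer's documented
# behavior (a shared-variable switch gating the noise op, toggled without recompiling) comes from
# this excerpt; the 784-feature matrix and variable names are illustrative assumptions.
import theano
import theano.tensor as T

x = T.matrix('x')
# `inputs` follows the [((shape), tensor)] convention from the docstring above
layer = Noise(inputs=[((None, 784), x)], noise='dropout', noise_level=0.5, switch=True)
f = theano.function(inputs=[x], outputs=layer.outputs)

layer.noise_switch.set_value(1.)  # training mode: dropout is applied inside f
layer.noise_switch.set_value(0.)  # evaluation mode: f passes inputs through unchanged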
def __init__(self, dataset, loss, model=None, epochs=10, batch_size=100, min_batch_size=1,
             save_freq=None, stop_threshold=None, stop_patience=None,
             learning_rate=.1, lr_decay="exponential", lr_decay_factor=.995,
             momentum=0.5, momentum_decay="linear", momentum_factor=0, nesterov_momentum=True,
             grad_clip=None, hard_clip=False):
    """
    Initialize SGD.

    Parameters
    ----------
    dataset : Dataset
        The :class:`opendeep.data.Dataset` to use when training the Model.
    loss : Loss
        The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
    model : Model
        The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
        Model's .train() method.
    epochs : int
        How many training iterations over the dataset to perform.
    batch_size : int
        How many examples from the training dataset to use in parallel.
    min_batch_size : int
        The minimum number of examples required at a time (for things like time series, this would be > 1).
    save_freq : int
        How many epochs to train between each new save of the Model's parameters.
    stop_threshold : float
        The factor by which the best validation training score needs to improve to determine early stopping.
    stop_patience : int
        The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
    learning_rate : float
        The multiplicative amount to adjust parameters based on their gradient values.
    lr_decay : str
        The type of decay function to use for changing the learning rate over epochs.
        See `opendeep.utils.decay` for options.
    lr_decay_factor : float
        The amount to use for the decay function when changing the learning rate over epochs.
        See `opendeep.utils.decay` for its effect for given decay functions.
    momentum : float
        The momentum to use during gradient updates.
    momentum_decay : str
        The type of decay function to use for changing the momentum over epochs.
        See `opendeep.utils.decay` for options.
    momentum_factor : float
        The amount to use for the decay function when changing the momentum over epochs.
        See `opendeep.utils.decay` for its effect for given decay functions.
    nesterov_momentum : bool
        Whether or not to use Nesterov momentum.
    grad_clip : float, optional
        Whether to clip gradients. This will clip to a maximum of `grad_clip` or the parameter norm.
    hard_clip : bool
        Whether to use a hard cutoff or rescaling for clipping gradients.
    """
    # superclass init
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(SGD, self).__init__(**initial_parameters)

    # Momentum - smoothing over the parameter changes (see Hinton)
    if momentum:
        self.momentum = sharedX(momentum, 'momentum')
        if momentum_decay is not None and \
                momentum_decay is not False and \
                momentum_factor is not None:
            self.momentum_decay = get_decay_function(
                momentum_decay, self.momentum, self.momentum.get_value(), momentum_factor)
        else:
            self.momentum_decay = False
    else:
        self.momentum = 0
        self.momentum_decay = False

    self.nesterov_momentum = nesterov_momentum
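# The parameter updates themselves happen elsewhere (e.g. in a get_updates method not shown in this
# excerpt). As a reference for what these hyperparameters control, here is a minimal numpy sketch of
# classical vs. Nesterov momentum; it is the standard formulation, not necessarily the library's
# exact update rule.
import numpy as np

def sgd_momentum_step(param, grad, velocity, learning_rate=0.1, momentum=0.5, nesterov=True):
    """One SGD step with momentum on a numpy parameter array; returns (new_param, new_velocity)."""
    # accumulate a decaying average of past gradient steps
    new_velocity = momentum * velocity - learning_rate * grad
    if nesterov:
        # Nesterov: take the gradient step plus a "look ahead" along the momentum direction
        new_param = param + momentum * new_velocity - learning_rate * grad
    else:
        new_param = param + new_velocity
    return new_param, new_velocity

# usage: call once per minibatch, shrinking learning_rate/momentum each epoch in the same spirit
# as the lr_decay and momentum_decay schedules configured above
w, v = np.ones(3), np.zeros(3)
w, v = sgd_momentum_step(w, grad=np.array([0.2, -0.1, 0.0]), velocity=v)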
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gsn/',
             input_size=None, hidden_size=1000, layers=2, walkbacks=4,
             visible_activation='sigmoid', hidden_activation='tanh',
             input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1),
             tied_weights=True,
             weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
             bias_init=0.0,
             cost_function='binary_crossentropy', cost_args=None,
             add_noise=True, noiseless_h1=True,
             hidden_noise='gaussian', hidden_noise_level=2,
             input_noise='salt_and_pepper', input_noise_level=0.4,
             noise_decay='exponential', noise_annealing=1,
             image_width=None, image_height=None,
             **kwargs):
    """
    Initialize a GSN.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Routing information for the model to accept inputs from elsewhere. This is used for linking
        different models together (e.g. setting the Softmax model's input layer to the DAE's hidden
        layer gives a newly supervised classification model). For now, it needs to include the shape
        information (normally the dimensionality of the input, i.e. n_in).
    hiddens_hook : Tuple of (shape, variable)
        Routing information for the model to accept its hidden representation from elsewhere. This is
        used for linking different models together (e.g. setting the DAE model's hidden layers to the
        RNN's output layer gives a generative recurrent model.) For now, it needs to include the shape
        information (normally the dimensionality of the hiddens, i.e. n_hidden).
    params_hook : List(theano shared variable)
        A list of model parameters (shared theano variables) that you should use when constructing this
        model (instead of initializing your own shared variables). This parameter is useful when you
        want to have two versions of the model that use the same parameters - such as a training model
        with dropout applied to layers and one without for testing, where the parameters are shared
        between the two.
    outdir : str
        The directory you want outputs (parameters, images, etc.) to save to.
        If None, nothing will be saved.
    input_size : int
        The size (dimensionality) of the input to the DAE. If shape is provided in `inputs_hook`, this
        is optional. The :class:`Model` requires an `output_size`, which gets set to this value because
        the DAE is an unsupervised model. The output is a reconstruction of the input.
    hidden_size : int
        The size (dimensionality) of the hidden layer for the DAE. Generally, you want it to be larger
        than `input_size`, which is known as *overcomplete*.
    visible_activation : str or callable
        The nonlinear (or linear) visible activation to perform after the dot product from hiddens ->
        visible layer. This activation function should be appropriate for the input unit types, i.e.
        'sigmoid' for binary inputs. See opendeep.utils.activation for a list of available activation
        functions. Alternatively, you can pass your own function to be used as long as it is callable.
    hidden_activation : str or callable
        The nonlinear (or linear) hidden activation to perform after the dot product from visible ->
        hiddens layer. See opendeep.utils.activation for a list of available activation functions.
        Alternatively, you can pass your own function to be used as long as it is callable.
    layers : int
        The number of hidden layers to use.
    walkbacks : int
        The number of walkbacks to perform (the variable K in Bengio's paper above). A walkback is a
        Gibbs sample from the DAE, which means the model generates inputs in sequence, where each
        generated input is compared to the original input to create the reconstruction cost for
        training. For running the model, the very last generated input in the Gibbs chain is used
        as the output.
    input_sampling : bool
        During walkbacks, whether to sample from the generated input to create a new starting point
        for the next walkback (next step in the Gibbs chain). This generally makes walkbacks more
        effective by making the process more stochastic - more likely to find spurious modes in the
        model's representation.
    mrg : random
        A random number generator that is used when adding noise into the network and for sampling
        from the input. I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    tied_weights : bool
        DAE has two weight matrices - W from input -> hiddens and V from hiddens -> input. This
        boolean determines if V = W.T, which 'ties' V to W and reduces the number of parameters
        necessary during training.
    weights_init : str
        Determines the method for initializing model weights. See opendeep.utils.nnet for options.
    weights_interval : str or float
        If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        If Gaussian `weights_init`, the mean value to use.
    weights_std : float
        If Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
    cost_function : str or callable
        The function to use when calculating the reconstruction cost of the model. This should be
        appropriate for the type of input, i.e. use 'binary_crossentropy' for binary inputs, or 'mse'
        for real-valued inputs. See opendeep.utils.cost for options. You can also specify your own
        function, which needs to be callable.
    cost_args : dict
        Any additional named keyword arguments to pass to the specified `cost_function`.
    add_noise : bool
        Whether to add noise to (corrupt) the input before passing it through the computation graph
        during training. This should most likely be set to the default of True, because this is a
        *denoising* autoencoder after all.
    noiseless_h1 : bool
        Whether to leave the hidden layer uncorrupted (no noise added) during computation.
    hidden_noise : str
        What type of noise to use for corrupting the hidden layer (if not `noiseless_h1`). See
        opendeep.utils.noise for options. This should be appropriate for the hidden unit activation,
        i.e. Gaussian for tanh or other real-valued activations, etc.
    hidden_noise_level : float
        The amount of noise to use for the noise function specified by `hidden_noise`. This could be
        the standard deviation for gaussian noise, the interval for uniform noise, the dropout
        amount, etc.
    input_noise : str
        What type of noise to use for corrupting the input before computation (if `add_noise`). See
        opendeep.utils.noise for options. This should be appropriate for the input units, i.e.
        salt-and-pepper for binary units, etc.
    input_noise_level : float
        The amount of noise used to corrupt the input. This could be the masking probability for
        salt-and-pepper, standard deviation for Gaussian, interval for Uniform, etc.
    noise_decay : str or False
        Whether to use `input_noise` scheduling (decay `input_noise_level` during the course of
        training), and if so, the string input specifies what type of decay to use. See
        opendeep.utils.decay for options. Noise decay (known as noise scheduling) effectively helps
        the DAE learn larger variance features first, and then smaller ones later (almost as a kind
        of curriculum learning). May help it converge faster.
    noise_annealing : float
        The amount to reduce the `input_noise_level` after each training epoch based on the decay
        function specified in `noise_decay`.
    image_width : int
        If the input should be represented as an image, the width of the input image. If not
        specified, it will be close to the square factor of the `input_size`.
    image_height : int
        If the input should be represented as an image, the height of the input image. If not
        specified, it will be close to the square factor of the `input_size`.
    """
    # init Model to combine the defaults and config dictionaries with the initial parameters.
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(GSN, self).__init__(**initial_parameters)

    self.input_size = input_size
    # when the input should be thought of as an image, either use the specified width and height,
    # or try to make it as square as possible.
    if image_height is None and image_width is None:
        (_h, _w) = closest_to_square_factors(self.input_size)
        self.image_width = _w
        self.image_height = _h
    else:
        self.image_height = image_height
        self.image_width = image_width

    ############################
    # Theano variables and RNG #
    ############################
    if self.inputs_hook is None:
        self.X = T.matrix('X')
    else:
        # inputs_hook is a (shape, input) tuple
        self.X = self.inputs_hook[1]

    ##########################
    # Network specifications #
    ##########################
    # generally, walkbacks should be at least 2*layers
    if layers % 2 == 0:
        if walkbacks < 2 * layers:
            log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                        'Generally want 2x walkbacks to layers', str(layers), str(walkbacks))
    else:
        if walkbacks < 2 * layers - 1:
            log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                        'Generally want 2x walkbacks to layers', str(layers), str(walkbacks))

    self.add_noise = add_noise
    self.noise_annealing = as_floatX(noise_annealing)  # noise schedule parameter
    self.hidden_noise_level = sharedX(hidden_noise_level, dtype=theano.config.floatX)
    self.hidden_noise = get_noise(name=hidden_noise, noise_level=self.hidden_noise_level, mrg=mrg)
    self.input_noise_level = sharedX(input_noise_level, dtype=theano.config.floatX)
    self.input_noise = get_noise(name=input_noise, noise_level=self.input_noise_level, mrg=mrg)

    self.walkbacks = walkbacks
    self.tied_weights = tied_weights
    self.layers = layers
    self.noiseless_h1 = noiseless_h1
    self.input_sampling = input_sampling
    self.noise_decay = noise_decay

    # if there was a hiddens_hook, unpack the hidden layers in the tensor
    if self.hiddens_hook is not None:
        hidden_size = self.hiddens_hook[0]
        self.hiddens_flag = True
    else:
        self.hiddens_flag = False

    # determine the sizes of each layer in a list.
    # layer sizes, from h0 to hK (h0 is the visible layer)
    hidden_size = list(raise_to_list(hidden_size))
    if len(hidden_size) == 1:
        self.layer_sizes = [self.input_size] + hidden_size * self.layers
    else:
        assert len(hidden_size) == self.layers, \
            "Hidden sizes and number of hidden layers mismatch. " + \
            "Hiddens %d and layers %d" % (len(hidden_size), self.layers)
        self.layer_sizes = [self.input_size] + hidden_size

    if self.hiddens_hook is not None:
        self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

    #########################
    # Activation functions! #
    #########################
    # hidden unit activation
    self.hidden_activation = get_activation_function(hidden_activation)
    # visible layer activation
    self.visible_activation = get_activation_function(visible_activation)
    # make sure the sampling functions are appropriate for the activation functions.
    if is_binary(self.visible_activation):
        self.visible_sampling = mrg.binomial
    else:
        # TODO: implement non-binary activation
        log.error("Non-binary visible activation not supported yet!")
        raise NotImplementedError("Non-binary visible activation not supported yet!")

    # Cost function
    self.cost_function = get_cost_function(cost_function)
    self.cost_args = cost_args or dict()

    ###############
    # Parameters! #
    ###############
    # make sure to deal with params_hook!
    if self.params_hook is not None:
        # if tied weights, expect layers*2 + 1 params
        if self.tied_weights:
            assert len(self.params_hook) == 2 * layers + 1, \
                "Tied weights: expected {0!s} params, found {1!s}!".format(2 * layers + 1,
                                                                           len(self.params_hook))
            self.weights_list = self.params_hook[:layers]
            self.bias_list = self.params_hook[layers:]
        # if untied weights, expect layers*3 + 1 params
        else:
            assert len(self.params_hook) == 3 * layers + 1, \
                "Untied weights: expected {0!s} params, found {1!s}!".format(3 * layers + 1,
                                                                             len(self.params_hook))
            self.weights_list = self.params_hook[:2 * layers]
            self.bias_list = self.params_hook[2 * layers:]
    # otherwise, construct our params
    else:
        # initialize a list of weights and biases based on layer_sizes for the GSN
        self.weights_list = [get_weights(weights_init=weights_init,
                                         shape=(self.layer_sizes[i], self.layer_sizes[i + 1]),
                                         name="W_{0!s}_{1!s}".format(i, i + 1),
                                         rng=mrg,
                                         # if gaussian
                                         mean=weights_mean,
                                         std=weights_std,
                                         # if uniform
                                         interval=weights_interval)
                             for i in range(layers)]
        # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now)
        if not tied_weights:
            self.weights_list.extend(
                [get_weights(weights_init=weights_init,
                             shape=(self.layer_sizes[i + 1], self.layer_sizes[i]),
                             name="W_{0!s}_{1!s}".format(i + 1, i),
                             rng=mrg,
                             # if gaussian
                             mean=weights_mean,
                             std=weights_std,
                             # if uniform
                             interval=weights_interval)
                 for i in reversed(range(layers))]
            )
        # initialize each layer bias to 0's.
        self.bias_list = [get_bias(shape=(self.layer_sizes[i],),
                                   name='b_' + str(i),
                                   init_values=bias_init)
                          for i in range(layers + 1)]

    # build the params of the model into a list
    self.params = self.weights_list + self.bias_list
    log.debug("gsn params: %s", str(self.params))

    # using the properties, build the computational graph
    self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph()
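# A small runnable sketch of the layer-size bookkeeping above: a scalar hidden_size is repeated once
# per hidden layer, while a list must match `layers` exactly. The helper below mirrors the
# `layer_sizes` construction in __init__ and is an illustrative stand-in, not library API; the sizes
# in the asserts are made-up examples.
def gsn_layer_sizes(input_size, hidden_size, layers):
    """Build [h0, h1, ..., hK] sizes, where h0 is the visible layer."""
    hidden_size = hidden_size if isinstance(hidden_size, list) else [hidden_size]
    if len(hidden_size) == 1:
        # one size given: repeat it for every hidden layer
        return [input_size] + hidden_size * layers
    assert len(hidden_size) == layers, "Hidden sizes and number of hidden layers mismatch."
    return [input_size] + hidden_size

assert gsn_layer_sizes(784, 1000, 3) == [784, 1000, 1000, 1000]
assert gsn_layer_sizes(784, [512, 256], 2) == [784, 512, 256]
# tied weights then need `layers` weight matrices (W_0_1, ..., W_{K-1}_K) plus layers+1 biases;
# untied weights add the reversed matrices (W_K_{K-1}, ..., W_1_0) on top, matching the asserts
# in the params_hook branch above.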