def get_updates(self, gradients):
    """
    Build the symbolic AdaDelta update rules.

    For every parameter two zero-initialised shared accumulators are
    created: a decaying average of the squared gradient and a decaying
    average of the squared parameter step. The step applied to a parameter
    is its gradient rescaled by the ratio of the two root-mean-square values.

    Parameters
    ----------
    gradients : dict
        Maps each model parameter to its gradient expression.

    Returns
    -------
    updates : OrderedDict
        Maps every accumulator and every parameter to the expression for
        its value after one iteration of the learning rule.
    """
    log.debug('Setting up ADADELTA for optimizer...')
    updates = OrderedDict()

    for param in gradients.keys():
        grad = gradients[param]

        # Running averages E[g^2]_{t-1} and E[(\Delta x)^2]_{t-1}.
        avg_sq_grad = sharedX(param.get_value() * 0.)
        avg_sq_step = sharedX(param.get_value() * 0.)
        if param.name is not None:
            avg_sq_grad.name = 'mean_square_grad_' + param.name
            avg_sq_step.name = 'mean_square_dx_' + param.name

        # Fold the current squared gradient into its running average.
        next_avg_sq_grad = (
            self.decay * avg_sq_grad
            + (1 - self.decay) * T.sqr(grad)
        )

        # The (per-parameter scaled) learning rate is used as the damping
        # term inside both square roots.
        epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate
        rms_prev_step = T.sqrt(avg_sq_step + epsilon)
        rms_grad = T.sqrt(next_avg_sq_grad + epsilon)
        step = -(rms_prev_step / rms_grad) * grad

        # Fold the squared step into its running average.
        next_avg_sq_step = (
            self.decay * avg_sq_step
            + (1 - self.decay) * T.sqr(step)
        )

        updates[avg_sq_grad] = next_avg_sq_grad
        updates[avg_sq_step] = next_avg_sq_step
        updates[param] = param + step

    return updates
def get_updates(self, gradients):
    """
    Return the AdaDelta update expressions for all parameters.

    Parameters
    ----------
    gradients : dict
        Maps each model parameter to its gradient expression.

    Returns
    -------
    updates : OrderedDict
        For each parameter, three entries in this order: the squared-gradient
        accumulator, the squared-step accumulator, and the parameter itself,
        each mapped to its new value after one iteration.
    """
    log.debug('Setting up ADADELTA for optimizer...')
    updates = OrderedDict()

    for param, gradient in gradients.items():
        # Accumulators start at zero with the same shape as the parameter:
        # grad_acc tracks E[g^2], step_acc tracks E[(\Delta x)^2].
        grad_acc = sharedX(param.get_value() * 0.)
        step_acc = sharedX(param.get_value() * 0.)
        label = param.name
        if label is not None:
            grad_acc.name = 'mean_square_grad_' + label
            step_acc.name = 'mean_square_dx_' + label

        grad_acc_next = (self.decay * grad_acc
                         + (1 - self.decay) * T.sqr(gradient))

        # Damping constant: the learning rate times this parameter's scaler.
        epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate
        step_scale = T.sqrt(step_acc + epsilon) / T.sqrt(grad_acc_next + epsilon)
        delta = -step_scale * gradient

        step_acc_next = (self.decay * step_acc
                         + (1 - self.decay) * T.sqr(delta))

        updates[grad_acc] = grad_acc_next
        updates[step_acc] = step_acc_next
        updates[param] = param + delta

    return updates
def __init__(self, model, dataset, config=None, defaults=defaults,
             n_epoch=None, batch_size=None, minimum_batch_size=None, save_frequency=None,
             early_stop_threshold=None, early_stop_length=None, learning_rate=None,
             lr_decay=None, lr_factor=None, decay=None, gamma_clip=None, damping=None,
             grad_clip=None, start_var_reduction=None, delta_clip=None, use_adagrad=None,
             skip_nan_inf=None, upper_bound_tau=None, lower_bound_tau=None,
             use_corrected_grad=None):
    """
    Initialize AdaSecant by forwarding every setting to the parent optimizer.

    All hyperparameters are handed to the superclass constructor together
    with `config` and `defaults`; afterwards `self.decay` is validated to
    lie in [0, 1) and replaced by a shared variable named "decay".
    """
    # Collect the hyperparameters once so the superclass call stays readable.
    hyperparameters = dict(
        n_epoch=n_epoch,
        batch_size=batch_size,
        minimum_batch_size=minimum_batch_size,
        save_frequency=save_frequency,
        early_stop_length=early_stop_length,
        early_stop_threshold=early_stop_threshold,
        learning_rate=learning_rate,
        lr_decay=lr_decay,
        lr_factor=lr_factor,
        decay=decay,
        gamma_clip=gamma_clip,
        damping=damping,
        grad_clip=grad_clip,
        start_var_reduction=start_var_reduction,
        delta_clip=delta_clip,
        use_adagrad=use_adagrad,
        skip_nan_inf=skip_nan_inf,
        upper_bound_tau=upper_bound_tau,
        lower_bound_tau=lower_bound_tau,
        use_corrected_grad=use_corrected_grad,
    )
    super(AdaSecant, self).__init__(model, dataset, config=config, defaults=defaults,
                                    **hyperparameters)

    # Note on upper_bound_tau / lower_bound_tau: tau has to be bounded,
    # otherwise it can grow arbitrarily large, which causes numerical
    # instabilities for very deep networks (once tau becomes very large it
    # keeps increasing indefinitely).

    # self.decay is read back from the instance after the superclass call.
    assert self.decay >= 0., "Decay needs to be >=0."
    assert self.decay < 1., "Decay needs to be <1."
    self.decay = sharedX(self.decay, "decay")
def get_updates(self, gradients):
    """
    Build the symbolic (theano) RMSProp update rules.

    Parameters
    ----------
    gradients : dict
        Maps each model parameter to its gradient expression. Every
        parameter must have a name.

    Returns
    -------
    updates : OrderedDict
        Maps each squared-gradient accumulator and each parameter to its
        new value after a single iteration of the learning rule.

    Raises
    ------
    ValueError
        If a parameter has no name.

    Notes
    -----
    As a side effect the accumulator of every parameter is stored in
    `self.mean_square_grads` under the parameter's name, so it can be
    monitored. Calling this method again for the same parameter replaces
    that entry (a warning is logged), so call it once per instance.
    """
    log.debug('Setting up RMSProp for optimizer...')
    updates = OrderedDict()

    for param in gradients:
        grad = gradients[param]

        # Running average E[g^2]_{t-1}, initialised to zero.
        sq_grad_avg = sharedX(param.get_value() * 0.)

        name = param.name
        if name is None:
            raise ValueError("Model parameters must be named.")
        sq_grad_avg.name = 'mean_square_grad_' + name

        if name in self.mean_square_grads:
            log.warning("Calling get_updates more than once on the "
                        "gradients of `%s` may make monitored values "
                        "incorrect." % name)
        # Keep a handle on the accumulator for monitoring.
        self.mean_square_grads[name] = sq_grad_avg

        sq_grad_avg_next = (
            self.decay * sq_grad_avg
            + (1 - self.decay) * T.sqr(grad)
        )

        # Divide the scaled learning rate by the RMS gradient, with the
        # RMS floored at self.epsilon.
        scaled_lr = self.lr_scalers.get(param, 1.) * self.learning_rate
        rms = T.maximum(T.sqrt(sq_grad_avg_next), self.epsilon)
        step = -scaled_lr * grad / rms

        updates[sq_grad_avg] = sq_grad_avg_next
        updates[param] = param + step

    return updates
def __init__(self, model, dataset, config=None, defaults=defaults,
             n_epoch=None, batch_size=None, minimum_batch_size=None, save_frequency=None,
             early_stop_threshold=None, early_stop_length=None, learning_rate=None,
             lr_decay=None, lr_factor=None, momentum=None, momentum_decay=None,
             momentum_factor=None, nesterov_momentum=None):
    """
    Initialize SGD, optionally with (decaying) momentum.

    Every argument is forwarded to the superclass constructor. The code
    below then reads `self.momentum`, `self.momentum_decay` and
    `self.momentum_factor` back from the instance, so the superclass is
    expected to have stored the resolved settings as attributes.

    After construction:

    - `self.momentum` is a shared variable named 'momentum' when momentum
      is enabled, otherwise the constant 0 (plain gradient descent).
    - `self.momentum_decay` is the decay function returned by
      `get_decay_function`, or False when no decay applies.
    """
    # superclass init
    super(SGD, self).__init__(model, dataset, config=config, defaults=defaults,
                              n_epoch=n_epoch, batch_size=batch_size,
                              minimum_batch_size=minimum_batch_size,
                              save_frequency=save_frequency,
                              early_stop_length=early_stop_length,
                              early_stop_threshold=early_stop_threshold,
                              learning_rate=learning_rate, lr_decay=lr_decay,
                              lr_factor=lr_factor, momentum=momentum,
                              momentum_decay=momentum_decay,
                              momentum_factor=momentum_factor,
                              nesterov_momentum=nesterov_momentum)

    # Momentum - smoothing over the parameter changes (see Hinton)
    if self.momentum:
        self.momentum = sharedX(self.momentum, 'momentum')
        decay_requested = (self.momentum_decay is not None and
                           self.momentum_decay is not False and
                           self.momentum_factor is not None)
        if decay_requested:
            self.momentum_decay = get_decay_function(self.momentum_decay,
                                                     self.momentum,
                                                     self.momentum.get_value(),
                                                     self.momentum_factor)
        else:
            self.momentum_decay = False
    else:
        # Momentum disabled: the coefficient must be 0 so the velocity
        # update reduces to a plain gradient step. (A coefficient of 1
        # would accumulate every past gradient without any decay.)
        self.momentum = 0
        # Never leave the raw decay setting behind; there is nothing to decay.
        self.momentum_decay = False
def get_updates(self, grads):
    """
    Describe, symbolically, the RMSProp updates for one training step.

    Parameters
    ----------
    grads : dict
        Maps each model parameter to its gradient expression. Every
        parameter must have a name.

    Returns
    -------
    updates : OrderedDict
        Maps each squared-gradient accumulator and each parameter to its
        new value after a single iteration of the learning rule.

    Raises
    ------
    ValueError
        If a parameter has no name.

    Notes
    -----
    The per-parameter accumulators are recorded in `self.mean_square_grads`
    (keyed by parameter name) so monitoring channels can track them. A
    repeated call overwrites those entries and logs a warning, so this
    method should be called only once per instance.
    """
    log.debug('Setting up RMSProp for optimizer...')
    updates = OrderedDict()

    for param, gradient in grads.items():
        # E[g^2]_{t-1}, starting from zeros shaped like the parameter.
        accumulator = sharedX(param.get_value() * 0.)

        if param.name is None:
            raise ValueError("Model parameters must be named.")
        accumulator.name = 'mean_square_grad_' + param.name

        already_tracked = param.name in self.mean_square_grads
        if already_tracked:
            log.warning("Calling get_updates more than once on the "
                        "gradients of `%s` may make monitored values "
                        "incorrect." % param.name)

        # Expose the accumulator for monitoring.
        self.mean_square_grads[param.name] = accumulator

        # Exponential moving average of the squared gradient.
        new_accumulator = (self.decay * accumulator +
                           (1 - self.decay) * T.sqr(gradient))

        scaled_lr = self.lr_scalers.get(param, 1.) * self.learning_rate
        # Floor the RMS at self.epsilon before dividing by it.
        rms_gradient = T.sqrt(new_accumulator)
        rms_gradient = T.maximum(rms_gradient, self.epsilon)
        change = -scaled_lr * gradient / rms_gradient

        updates[accumulator] = new_accumulator
        updates[param] = param + change

    return updates
def __init__(self, train_X, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None):
    """
    Wrap in-memory matrices as shared variables.

    Parameters
    ----------
    train_X : array-like
        Training inputs (required). Converted with `numpy.array`; the
        resulting shape is kept in `self._train_shape`.
    train_Y, valid_X, valid_Y, test_X, test_Y : array-like, optional
        Remaining splits. A split is wrapped only when it is given and
        non-empty; otherwise the corresponding attribute is not assigned
        by this constructor.
    """
    log.info('Wrapping matrix from memory')
    # NOTE(review): super(self.__class__, self) recurses without end if this
    # class is ever subclassed; it should name the class explicitly. The
    # class name is not visible from this method, so it is left unchanged.
    super(self.__class__, self).__init__()

    def _provided(data):
        # Plain truth-testing raises ValueError for numpy arrays with more
        # than one element (and treats [0] as missing), so test explicitly
        # for "given and non-empty".
        return data is not None and numpy.size(data) > 0

    # make sure the inputs are arrays
    train_X = numpy.array(train_X)
    self._train_shape = train_X.shape
    self.train_X = sharedX(train_X)
    if _provided(train_Y):
        self.train_Y = sharedX(numpy.array(train_Y))

    if _provided(valid_X):
        valid_X = numpy.array(valid_X)
        self._valid_shape = valid_X.shape
        self.valid_X = sharedX(valid_X)
    if _provided(valid_Y):
        self.valid_Y = sharedX(numpy.array(valid_Y))

    if _provided(test_X):
        test_X = numpy.array(test_X)
        self._test_shape = test_X.shape
        self.test_X = sharedX(test_X)
    if _provided(test_Y):
        self.test_Y = sharedX(numpy.array(test_Y))
def get_updates(self, gradients):
    """
    Build the SGD-with-momentum update rules.

    Based on Pylearn2
    (https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/training_algorithms/learning_rule.py).

    Momentum follows Section 9 of "A Practical Guide to Training Restricted
    Boltzmann Machines" (Geoffrey Hinton):

        inc := momentum * inc - learning_rate * d cost / d param
        param := param + inc

    When `self.nesterov_momentum` is set, the increment applied to the
    parameter is momentum * inc - learning_rate * gradient, computed from
    the freshly updated velocity.

    Parameters
    ----------
    gradients : dict
        Maps each model parameter to its gradient expression.

    Returns
    -------
    updates : OrderedDict
        Maps each velocity accumulator and each parameter to its new value
        after a single iteration of the learning rule.
    """
    log.debug('Setting up Stochastic Gradient Descent with momentum for optimizer...')
    updates = OrderedDict()

    for param, grad in six.iteritems(gradients):
        # One zero-initialised velocity accumulator per parameter.
        vel = sharedX(param.get_value() * 0.)
        assert param.dtype == vel.dtype
        assert grad.dtype == param.dtype
        if param.name is not None:
            vel.name = 'vel_' + param.name

        lr = self.learning_rate * self.lr_scalers.get(param, 1.)

        next_vel = self.momentum * vel - lr * grad
        updates[vel] = next_vel

        if self.nesterov_momentum:
            log.debug('Using Nesterov momentum for parameter %s', str(param))
            step = self.momentum * next_vel - lr * grad
        else:
            step = next_vel
        assert step.dtype == vel.dtype

        updates[param] = param + step

    return updates
def __init__(self, model, dataset, decay=None, max_scaling=None,
             iterator_class=SequentialIterator, config=None, defaults=_defaults,
             rng=None, n_epoch=None, batch_size=None, minimum_batch_size=None,
             save_frequency=None, early_stop_threshold=None, early_stop_length=None,
             learning_rate=None, flag_para_load=None):
    """
    Initialize RMSProp.

    Parameters
    ----------
    decay : float, optional
        Decay of the squared-gradient moving average; must satisfy
        0 <= decay < 1. An explicit 0 is honoured. When None, the value is
        looked up in `config`, then `defaults`.
    max_scaling : float, optional
        Must be > 0; `self.epsilon` is set to 1 / max_scaling. When not
        given, the value is looked up in `config`, then `defaults`.

    The remaining arguments are forwarded unchanged to the superclass
    constructor.

    Raises
    ------
    AssertionError
        If `decay` or `max_scaling` cannot be resolved or is out of range.
    """
    def _lookup(key):
        # Resolve `key` from config first, then defaults. Fail loudly when
        # no source provides a value.
        value = None
        if config:
            value = config.get(key, (defaults or {}).get(key))
        elif defaults:
            value = defaults.get(key)
        if value is None:
            log.error("RMSProp missing '%s' parameter in config or defaults!", key)
            raise AssertionError
        return value

    # `is None` rather than truthiness: decay=0. is a legal explicit value
    # and must not be replaced by the configured default.
    if decay is None:
        decay = _lookup('decay')
    assert decay >= 0.
    assert decay < 1.
    self.decay = sharedX(decay)

    # 0 is never a valid max_scaling, so a falsy value is treated as
    # "not given" here, as before.
    if not max_scaling:
        max_scaling = _lookup('max_scaling')
    assert max_scaling > 0.
    self.epsilon = 1. / max_scaling

    self.mean_square_grads = OrderedDict()

    # need to call the SGD constructor after parameters are extracted
    # because the constructor calls get_updates()!
    super(RMSProp, self).__init__(model=model, dataset=dataset,
                                  iterator_class=iterator_class, config=config,
                                  defaults=defaults, rng=rng, n_epoch=n_epoch,
                                  batch_size=batch_size,
                                  minimum_batch_size=minimum_batch_size,
                                  save_frequency=save_frequency,
                                  early_stop_length=early_stop_length,
                                  early_stop_threshold=early_stop_threshold,
                                  learning_rate=learning_rate,
                                  flag_para_load=flag_para_load)
def get_updates(self, grads):
    """
    Build the SGD-with-momentum update rules.

    From Pylearn2
    (https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/training_algorithms/learning_rule.py).

    Implements momentum as described in Section 9 of "A Practical Guide to
    Training Restricted Boltzmann Machines", Geoffrey Hinton:

        inc := momentum * inc - learning_rate * d cost / d param
        param := param + inc

    With `self.nesterov_momentum` enabled, the parameter increment is
    instead momentum * inc - learning_rate * gradient, using the updated inc.

    Parameters
    ----------
    grads : dict
        (parameter, gradient) pairs for the model.

    Returns
    -------
    updates : OrderedDict
        The updates to perform at each training step: one entry per
        velocity accumulator and one per parameter.
    """
    log.debug(
        'Setting up Stochastic Gradient Descent with momentum for optimizer...'
    )
    updates = OrderedDict()

    for param, gradient in six.iteritems(grads):
        # Velocity starts at zero, shaped like the parameter.
        velocity = sharedX(param.get_value() * 0.)
        assert param.dtype == velocity.dtype
        assert gradient.dtype == param.dtype
        if param.name is not None:
            velocity.name = 'vel_' + param.name

        scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
        gradient_step = scaled_lr * gradient

        updates[velocity] = self.momentum * velocity - gradient_step
        increment = updates[velocity]
        if self.nesterov_momentum:
            log.debug('Using Nesterov momentum')
            increment = self.momentum * increment - scaled_lr * gradient

        assert increment.dtype == velocity.dtype
        updates[param] = param + increment

    return updates
def __init__(self, model, dataset, n_epoch=10, batch_size=100, minimum_batch_size=1,
             save_frequency=None, early_stop_threshold=None, early_stop_length=None,
             learning_rate=.1, lr_decay="exponential", lr_factor=.995,
             momentum=0.5, momentum_decay="linear", momentum_factor=0,
             nesterov_momentum=True):
    """
    Initialize SGD.

    Parameters
    ----------
    model : Model
        The Model to train.
    dataset : Dataset
        The Dataset to train on.
    n_epoch : int
        Number of passes over the training data.
    batch_size : int
        Number of training examples processed in parallel.
    minimum_batch_size : int
        Smallest number of examples needed at a time (greater than 1 for
        things like time series).
    save_frequency : int
        Number of epochs between saves of the Model's parameters.
    early_stop_threshold : float
        Factor by which the best validation score has to improve for
        early stopping purposes.
    early_stop_length : int
        Patience: epochs to wait once the threshold has been reached
        before stopping.
    learning_rate : float
        Multiplier applied to gradients when adjusting parameters.
    lr_decay : str
        Kind of decay function for the learning rate over epochs; see
        `opendeep.utils.decay`.
    lr_factor : float
        Amount passed to the learning-rate decay function; see
        `opendeep.utils.decay`.
    momentum : float
        Momentum coefficient for the gradient updates. A falsy value
        disables momentum.
    momentum_decay : str
        Kind of decay function for the momentum over epochs; see
        `opendeep.utils.decay`.
    momentum_factor : float
        Amount passed to the momentum decay function; see
        `opendeep.utils.decay`.
    nesterov_momentum : bool
        Whether to use Nesterov momentum.
    """
    # Forward every argument (except self) to the superclass. The snapshot
    # must be taken before any other local name is introduced.
    initial_parameters = dict(locals())
    del initial_parameters['self']
    super(SGD, self).__init__(**initial_parameters)

    self.nesterov_momentum = nesterov_momentum

    # Momentum - smoothing over the parameter changes (see Hinton)
    if not momentum:
        self.momentum = 0
        self.momentum_decay = False
        return

    self.momentum = sharedX(momentum, 'momentum')
    wants_decay = (momentum_decay is not None
                   and momentum_decay is not False
                   and momentum_factor is not None)
    if wants_decay:
        self.momentum_decay = get_decay_function(momentum_decay,
                                                 self.momentum,
                                                 self.momentum.get_value(),
                                                 momentum_factor)
    else:
        self.momentum_decay = False
def __init__(self, model, dataset, config=None, defaults=None,
             n_epoch=None, batch_size=None, minimum_batch_size=None, save_frequency=None,
             early_stop_threshold=None, early_stop_length=None, learning_rate=None,
             lr_decay=None, lr_factor=None, **kwargs):
    """
    Initialize the Optimizer from layered configuration sources.

    Settings are merged into `self.args` with this precedence, lowest to
    highest: built-in fallbacks < `defaults` < `config` < keyword
    arguments. Every merged entry is then also exposed as an attribute of
    the instance. Finally the learning rate is wrapped in a shared variable
    and, if `lr_decay` is truthy, a decay function is created for it.

    Parameters
    ----------
    model : Model
        The Model to train.
    dataset : Dataset
        The Dataset to train on.
    config, defaults
        Configuration sources combined via `combine_config_and_defaults`.
    n_epoch, batch_size, minimum_batch_size, save_frequency,
    early_stop_threshold, early_stop_length, learning_rate, lr_decay,
    lr_factor
        Explicit overrides; a value of None leaves the merged setting alone.
    **kwargs
        Additional settings. An entry overrides the merged value unless it
        is None and the key already exists. A nested 'kwargs' entry is
        flattened using the same rule.
    """
    # Built-in fallbacks for the core training parameters.
    fallbacks = {
        "n_epoch": 1000,
        "batch_size": 100,
        "minimum_batch_size": 1,
        "save_frequency": 10,
        "early_stop_threshold": .9995,
        "early_stop_length": 30,
        "learning_rate": 0.001,
        "lr_decay": "exponential",
        "lr_factor": 1,  # no learning rate decay by default
    }

    log.debug("Initializing optimizer %s", str(type(self)))

    assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
    self.model = model
    self.dataset = dataset
    assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"

    # Merge the subclass-provided config/defaults, then layer them over the fallbacks.
    subclass_args = combine_config_and_defaults(config, defaults)
    self.args = combine_config_and_defaults(subclass_args, fallbacks)
    if self.args is None:
        self.args = {}

    # Fold in everything that arrived through **kwargs.
    for name, value in kwargs.items():
        key = str(name)
        if key == 'kwargs':
            # A dict passed under the name 'kwargs' is flattened.
            for inner_name, inner_value in value.items():
                inner_key = str(inner_name)
                if inner_value is not None or inner_key not in self.args:
                    self.args[inner_key] = inner_value
            continue
        if value is not None or key not in self.args:
            self.args[key] = value

    # Explicit named arguments win whenever they were actually supplied.
    explicit = (
        ('n_epoch', n_epoch),
        ('batch_size', batch_size),
        ('minimum_batch_size', minimum_batch_size),
        ('save_frequency', save_frequency),
        ('early_stop_threshold', early_stop_threshold),
        ('early_stop_length', early_stop_length),
        ('learning_rate', learning_rate),
        ('lr_decay', lr_decay),
        ('lr_factor', lr_factor),
    )
    for key, value in explicit:
        if value is not None:
            self.args[key] = value

    # log the arguments
    log.debug("optimizer config args: %s", str(self.args))

    # Make every merged setting reachable as self.<name>.
    vars(self).update(self.args)

    # Learning rate - how drastic of a step do the parameters change
    self.learning_rate = sharedX(self.learning_rate, 'learning_rate')
    self.lr_scalers = self.model.get_lr_scalers()
    if not self.lr_decay:
        self.learning_rate_decay = False
    else:
        self.learning_rate_decay = get_decay_function(self.lr_decay,
                                                      self.learning_rate,
                                                      self.learning_rate.get_value(),
                                                      self.lr_factor)
def __init__(self, model, dataset, n_epoch=10, batch_size=100, minimum_batch_size=1,
             save_frequency=None, early_stop_threshold=None, early_stop_length=None,
             learning_rate=1e-6, lr_decay=None, lr_factor=None,
             decay=0.95, gamma_clip=1.8, damping=1e-7, grad_clip=None, start_var_reduction=0,
             delta_clip=None, use_adagrad=False, skip_nan_inf=False,
             upper_bound_tau=1e8, lower_bound_tau=1.5, use_corrected_grad=True):
    """
    Initialize AdaSecant.

    Parameters
    ----------
    model : Model
        The Model to train.
    dataset : Dataset
        The Dataset to train on.
    n_epoch : int
        Number of passes over the training data.
    batch_size : int
        Number of training examples processed in parallel.
    minimum_batch_size : int
        Smallest number of examples needed at a time (greater than 1 for
        things like time series).
    save_frequency : int
        Number of epochs between saves of the Model's parameters.
    early_stop_threshold : float
        Factor by which the best validation score has to improve for
        early stopping purposes.
    early_stop_length : int
        Patience: epochs to wait once the threshold has been reached
        before stopping.
    learning_rate : float
        Multiplier applied to gradients when adjusting parameters.
    lr_decay : str
        Kind of decay function for the learning rate over epochs; see
        `opendeep.utils.decay`.
    lr_factor : float
        Amount passed to the learning-rate decay function; see
        `opendeep.utils.decay`.
    decay : float, optional
        Decay rate (rho in Algorithm 1 of the AdaSecant paper); must be in
        [0, 1). 0.95 seems to work fine for several tasks.
    gamma_clip : float, optional
        Clipping threshold for gamma. 1.8 seems to work fine for several
        tasks.
    damping : float, optional
        Stored as `self.damping`.
    grad_clip : float, optional
        Gradient clipping for RNNs (not necessary for feedforward
        networks); a constraint on the per-layer gradient norm, so it must
        be in (0, 1] when set. Based on: Pascanu, Mikolov and Bengio, "On
        the difficulty of training recurrent neural networks", arXiv
        preprint arXiv:1211.5063 (2012).
    start_var_reduction : float, optional
        After how many updates variance reduction should start.
    delta_clip : float, optional
        Threshold after which the deltas are clipped.
    use_adagrad : bool, optional
        Whether to use clipped adagrad.
    skip_nan_inf : bool, optional
        Stored as `self.skip_nan_inf`.
    upper_bound_tau, lower_bound_tau : float, optional
        Bounds on tau. Without a bound tau can grow arbitrarily large,
        which causes numerical instabilities for very deep networks; once
        tau becomes very large it keeps increasing indefinitely.
    use_corrected_grad : bool, optional
        Whether to use the gradient correction (called variance reduction
        in the workshop paper).
    """
    # Hand every argument (except self) to the Optimizer class. The
    # snapshot must be taken before any other local name is introduced.
    initial_parameters = dict(locals())
    del initial_parameters['self']
    super(AdaSecant, self).__init__(**initial_parameters)

    assert decay >= 0., "Decay needs to be >=0."
    assert decay < 1., "Decay needs to be <1."
    self.decay = sharedX(decay, "decay")

    self.damping, self.skip_nan_inf = damping, skip_nan_inf

    if grad_clip:
        assert grad_clip > 0.
        assert grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
    self.grad_clip = grad_clip

    # Clipping thresholds.
    self.gamma_clip = gamma_clip
    self.delta_clip = delta_clip

    # Behaviour switches.
    self.use_adagrad = use_adagrad
    self.use_corrected_grad = use_corrected_grad
    self.start_var_reduction = start_var_reduction

    # Bounds that keep tau from growing without limit.
    self.lower_bound_tau = lower_bound_tau
    self.upper_bound_tau = upper_bound_tau
def get_updates(self, gradients):
    """
    Compute AdaSecant updates (see the paper for details).

    Gradients are optionally cleaned of inf/nan entries, block-normalised
    by their L2 norm, and optionally clipped. A set of shared accumulators
    is then allocated per parameter and the update expressions are built.

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their
        gradients. Parameters are expected to be named, since their names
        are used to label the accumulators.

    Returns
    -------
    OrderedDict
        A dictionary mapping from the old model parameters (and all
        accumulators, including the shared step counter) to their new
        values after a single iteration of the learning rule.

    Notes
    -----
    `self.taus_x_t` is overwritten on every loop iteration, so after the
    call it refers to the tau accumulator of the last parameter only.
    """
    updates = OrderedDict()
    eps = self.damping
    step = sharedX(0., name="step")
    # Initial value multiplier for tau.
    slow_constant = 2.1

    # The dictionaries below are rebuilt from (key, value) pairs so that
    # the parameter order of the input is preserved on every interpreter.
    if self.skip_nan_inf:
        # If the gradient of a parameter is inf or nan don't update that
        # parameter. That might be useful for RNNs.
        gradients = OrderedDict(
            (p, T.switch(T.or_(T.isinf(g), T.isnan(g)), 0, g))
            for p, g in gradients.items()
        )

    # Block-normalize gradients:
    gradients = OrderedDict(
        (p, g / (g.norm(2) + eps))
        for p, g in gradients.items()
    )
    nparams = len(gradients)

    # Apply the gradient clipping, this is only necessary for RNNs and
    # sometimes for very deep networks.
    if self.grad_clip:
        gnorm = sum([g.norm(2) for g in gradients.values()])
        # .items() instead of the Python-2-only .iteritems().
        gradients = OrderedDict(
            (p, T.switch(gnorm / nparams > self.grad_clip,
                         g * self.grad_clip * nparams / gnorm,
                         g))
            for p, g in gradients.items()
        )

    for param in gradients.keys():
        gradients[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps,
                            name="mean_grad_%s" % param.name)

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)

        # ---- Initialization of accumulators ----
        taus_x_t = sharedX((numpy.ones_like(param.get_value()) + eps) * slow_constant,
                           name="taus_x_t_" + param.name)
        self.taus_x_t = taus_x_t

        # Variance reduction parameters
        # Numerator of the gamma:
        gamma_nume_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)
        # Denominator of the gamma:
        gamma_deno_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)
        # For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(numpy.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)
        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)
        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

        # The previous corrected gradient is read by the gamma statistics
        # below regardless of use_corrected_grad, so it is always
        # allocated (allocating it conditionally caused a NameError when
        # the flag was off). It is only *updated* when the flag is on.
        old_grad = sharedX(param.get_value() * 0. + eps)

        # The uncorrected gradient of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # The gradient was already block-wise normalized above.
        norm_grad = gradients[param]

        # For the first time-step, assume that delta_x_t := norm_grad
        cond = T.eq(step, 0)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx

        # ---- Compute the new updated values ----
        # E[g_i^2]_t
        new_mean_squared_grad = (mean_square_grad * self.decay +
                                 T.sqr(norm_grad) * (1 - self.decay))
        new_mean_squared_grad.name = "msg_" + param.name
        # E[g_i]_t
        new_mean_grad = (mean_grad * self.decay +
                         norm_grad * (1 - self.decay))
        new_mean_grad.name = "nmg_" + param.name

        mg = new_mean_grad
        mgsq = new_mean_squared_grad

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (gamma_nume_sqr * (1 - 1 / taus_x_t) +
                              T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

        new_gamma_deno_sqr = (gamma_deno_sqr * (1 - 1 / taus_x_t) +
                              T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

        # gamma is computed from the accumulators of the previous step.
        gamma = T.sqrt(gamma_nume_sqr) / T.sqrt(gamma_deno_sqr + eps)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        # For starting the variance reduction. The correction is applied
        # only when use_corrected_grad is enabled; otherwise the plain
        # normalized gradient is used.
        if self.use_corrected_grad and self.start_var_reduction > -1:
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        new_sum_squared_grad = None
        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (sum_square_grad + T.sqr(g))
            rms_g_t = T.sqrt(new_sum_squared_grad)
            rms_g_t = T.maximum(rms_g_t, 1.0)

        # Use the gradients from the previous update
        # to compute the \nabla f(x_t) - \nabla f(x_{t-1})
        cur_curvature = norm_grad - old_plain_grad
        cur_curvature_sqr = T.sqr(cur_curvature)

        # Running average of the curvature
        new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                             (cur_curvature / taus_x_t))
        new_curvature_ave.name = "ncurve_ave_" + param.name
        nc_ave = new_curvature_ave

        # Running average of the squared curvature
        new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                 (cur_curvature_sqr / taus_x_t))
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name
        nc_sq_ave = new_curvature_sqr_ave

        # The scaled learning rate only acts as a damping term here; the
        # step size itself is scaled by the per-parameter scaler alone.
        epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate
        scaled_lr = self.lr_scalers.get(param, 1.) * sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)
        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        # This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t -
                                  cov_num_t / (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary for only RNNs
        # For feedforward networks this does not seem to be important.
        if self.delta_clip:
            log.info("Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            adagrad_off_message = "Clipped adagrad is disabled."
        else:
            log.info("Clipping will not be applied on the adaptive step size.")
            adagrad_off_message = "Clipped adagrad will not be used."

        if self.use_adagrad:
            delta_x_t = delta_x_t * corrected_grad / rms_g_t
        else:
            log.info(adagrad_off_message)
            delta_x_t = delta_x_t * corrected_grad

        new_taus_t = ((1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t +
                      sharedX(1 + eps, "stabilized"))

        # To compute the E[\Delta^2]_t
        new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                              (T.sqr(delta_x_t) / taus_x_t))

        # To compute the E[\Delta]_t
        new_mean_dx = (mean_dx * (1 - 1 / taus_x_t) +
                       (delta_x_t / (taus_x_t)))

        # Perform the outlier detection: tau is reset to 2.2 wherever the
        # gradient or the curvature deviates from its running mean by more
        # than two standard deviations.
        new_taus_t = T.switch(
            T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                  abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
            sharedX(2.2),
            new_taus_t)

        # Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

        new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                         (delta_x_t * cur_curvature) * (1 / taus_x_t))

        update_step = delta_x_t

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave
        updates[param] = param + update_step
        updates[step] = step + 1

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    return updates
def __init__(self, model, dataset,
             n_epoch=1000, batch_size=100, minimum_batch_size=1,
             save_frequency=10, early_stop_threshold=.9995, early_stop_length=30,
             learning_rate=1e-3, lr_decay='exponential', lr_factor=1,
             **kwargs):
    """
    Configure the optimizer with a model, a dataset and the training hyperparameters.

    Parameters
    ----------
    model : Model
        The Model to train.
    dataset : Dataset
        The Dataset to train the Model on.
    n_epoch : int
        Number of training iterations to make over the dataset.
    batch_size : int
        Number of training examples to process in parallel.
    minimum_batch_size : int
        Smallest number of examples required at a time (for things like time series, this would be > 1).
    save_frequency : int
        Number of epochs between saves of the Model's parameters. `None` is replaced by 1000000.
    early_stop_threshold : float
        Factor by which the best validation score has to improve, used to decide on early stopping.
        `None` is replaced by 1.
    early_stop_length : int
        Patience: number of epochs to wait once `early_stop_threshold` has been reached before stopping.
        `None` is replaced by 100.
    learning_rate : float
        Multiplicative amount by which parameters are adjusted based on their gradient values.
    lr_decay : str
        Type of decay function used to change the learning rate over epochs (see `opendeep.utils.decay`).
        A falsy value disables the decay (`self.learning_rate_decay` is then `False`).
    lr_factor : float
        Amount handed to the decay function (see `opendeep.utils.decay` for its effect per decay type).
    kwargs : dict
        Extra keyword arguments. They are handed, together with the named arguments, to
        `add_kwargs_to_dict` to build the recorded configuration `self.args`.
    """
    log.info("Initializing optimizer %s", str(type(self)))

    # Resolve the None fallbacks first so that the argument snapshot below records effective values.
    early_stop_threshold = 1. if early_stop_threshold is None else early_stop_threshold
    save_frequency = 1000000 if save_frequency is None else save_frequency
    early_stop_length = 100 if early_stop_length is None else early_stop_length

    # Snapshot the arguments. This has to happen before any other local name is bound,
    # otherwise that name would leak into the recorded configuration.
    self.args = locals().copy()
    del self.args['self']
    extra_kwargs = self.args.pop('kwargs')
    self.args = add_kwargs_to_dict(extra_kwargs, self.args)
    # log the arguments
    log.info("optimizer config args: %s", str(self.args))

    assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
    assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"

    self.model = model
    self.dataset = dataset

    # Learning rate - how drastic of a step do the parameters change.
    # It is kept in a shared variable so the decay function can be given a handle to it.
    self.learning_rate = sharedX(learning_rate, 'learning_rate')
    self.lr_scalers = self.model.get_lr_scalers()
    self.learning_rate_decay = (
        get_decay_function(lr_decay, self.learning_rate, self.learning_rate.get_value(), lr_factor)
        if lr_decay else False
    )

    self.noise_switches = raise_to_list(self.model.get_noise_switch())

    # Plain bookkeeping of the remaining training settings.
    self.batch_size = batch_size
    self.minimum_batch_size = minimum_batch_size
    self.n_epoch = n_epoch
    self.save_frequency = save_frequency
    self.early_stop_threshold = early_stop_threshold
    self.early_stop_length = early_stop_length
def __init__(self, config=None, defaults=_default,
             inputs_hook=None, params_hook=None,
             input_size=None, output_size=None,
             activation=None,
             cost=None, cost_args=None,
             weights_init=None, weights_mean=None, weights_std=None, weights_interval=None,
             bias_init=None,
             noise=None, noise_level=None, mrg=None,
             outdir=None,
             **kwargs):
    """
    Build a fully-connected layer computing ``activation(dot(input, W) + b)``.

    Optionally wraps the output in switchable noise, and defines the cost expression comparing the
    output with a symbolic target. All arguments are forwarded to the parent initializer; afterwards
    the values are read back from `self` (presumably after being merged with `config` and
    `defaults` by the parent -- confirm against the Model base class).

    Parameters
    ----------
    config, defaults
        Configuration sources, only forwarded to the parent initializer here.
    inputs_hook : tuple of (shape, input)
        If given, must have length 2. The first item (when truthy) overrides `input_size`, the
        second is used as the layer's input instead of a fresh symbolic matrix.
    params_hook : sequence
        If given, must contain exactly two items, used as the weights matrix `W` and the bias
        vector `b` instead of newly created parameters.
    input_size, output_size
        Layer dimensions. `output_size` falls back to `input_size` when falsy.
    activation
        Passed to `get_activation_function`.
    cost
        Passed to `get_cost_function`.
    cost_args : dict
        Extra keyword arguments for the cost function. A missing/empty value means no extras.
    weights_init, weights_mean, weights_std, weights_interval
        Passed to `get_weights` when `W` has to be created.
    bias_init
        Passed to `get_bias` as `init_values` when `b` has to be created.
    noise, noise_level, mrg
        If `noise` is truthy, passed to `get_noise` to build the noise function.
    outdir
        Only forwarded to the parent initializer here.
    """
    # init Model to combine the defaults and config dictionaries with the initial parameters.
    # Work on a copy: the mapping returned by locals() should not be modified in place.
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(BasicLayer, self).__init__(**initial_parameters)
    # all configuration parameters are now in self!

    ##################
    # specifications #
    ##################
    # grab info from the inputs_hook, or from parameters
    if self.inputs_hook is not None:
        # inputs_hook is a tuple of (Shape, Input)
        assert len(self.inputs_hook) == 2, 'Expected inputs_hook to be tuple!'
        self.input_size = self.inputs_hook[0] or self.input_size
        self.input = self.inputs_hook[1]
    else:
        # make the input a symbolic matrix
        self.input = T.fmatrix('X')

    # now that we have the input specs, define the output 'target' variable to be used in supervised training!
    self.target = T.fmatrix('Y')

    # either grab the output's desired size from the parameter directly, or copy n_in
    self.output_size = self.output_size or self.input_size

    # other specifications
    # activation function!
    activation_func = get_activation_function(self.activation)
    # cost function! (self.cost still holds the configured value here; it is replaced by the
    # cost expression at the end of this method)
    cost_func = get_cost_function(self.cost)

    ####################################################
    # parameters - make sure to deal with params_hook! #
    ####################################################
    if self.params_hook is not None:
        # make sure the params_hook has W (weights matrix) and b (bias vector)
        assert len(self.params_hook) == 2, \
            "Expected 2 params (W and b) for BasicLayer, found {0!s}!".format(len(self.params_hook))
        W, b = self.params_hook
    else:
        W = get_weights(weights_init=self.weights_init,
                        shape=(self.input_size, self.output_size),
                        name="W",
                        # if gaussian
                        mean=self.weights_mean,
                        std=self.weights_std,
                        # if uniform
                        interval=self.weights_interval)

        # grab the bias vector
        b = get_bias(shape=self.output_size, name="b", init_values=self.bias_init)

    # Finally have the two parameters - weights matrix W and bias vector b. That is all!
    self.params = [W, b]

    ###############
    # computation #
    ###############
    # Here is the meat of the computation transforming input -> output
    # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and then passing
    # the result through our activation function (normally something nonlinear such as: max(0, output))
    self.output = activation_func(T.dot(self.input, W) + b)

    # Now deal with noise if we added it:
    if self.noise:
        log.debug('Adding noise switch.')
        if self.noise_level is not None:
            noise_func = get_noise(self.noise, self.noise_level, self.mrg)
        else:
            # no explicit level: let get_noise use its own default
            noise_func = get_noise(self.noise, mrg=self.mrg)
        # apply the noise as a switch!
        # default to apply noise. this is for the cost and gradient functions to be computed later
        # (not sure if the above statement is accurate such that gradient depends on initial value of switch)
        self.switch = sharedX(value=1, name="basiclayer_noise_switch")
        self.output = T.switch(self.switch, noise_func(input=self.output), self.output)

    # now to define the cost of the model - use the cost function to compare our output with the target value.
    # cost_args defaults to None, which cannot be unpacked with **; treat it as "no extra arguments".
    cost_kwargs = self.cost_args or {}
    self.cost = cost_func(output=self.output, target=self.target, **cost_kwargs)

    log.debug("Initialized a basic fully-connected layer with shape %s and activation: %s",
              str((self.input_size, self.output_size)), str(self.activation))
def get_updates(self, grads):
    """
    Build the symbolic updates for the parameters and for this rule's accumulators.

    The gradients are pre-processed first: inf/nan entries are optionally zeroed
    (`self.skip_nan_inf`), every gradient is divided by its own L2 norm plus `self.damping`, and
    the gradients are optionally rescaled when their mean norm exceeds `self.grad_clip`.
    Then, for every parameter, shared accumulators are created (running averages of the gradient,
    of the difference between consecutive gradients, of the step, ...) and the expressions for
    their new values and for the new parameter value are recorded.

    Parameters
    ----------
    grads : dict
        A dictionary mapping from the model's parameters to their gradients.

    Returns
    -------
    updates : OrderedDict
        Maps every shared variable touched by the rule (accumulators, the step counter and the
        parameters themselves) to the expression of its new value.

    Notes
    -----
    `self.taus_x_t` is overwritten on each loop iteration, so after the call it refers to the
    accumulator of the last parameter only.
    """
    updates = OrderedDict()
    eps = self.damping
    step = sharedX(0., name="step")

    if self.skip_nan_inf:
        # Zero out the inf/nan entries of the gradients so they do not contribute to the update.
        # That might be useful for RNNs.
        # Built from (key, value) pairs so the parameter order of `grads` is kept.
        grads = OrderedDict(
            (p, T.switch(T.or_(T.isinf(g), T.isnan(g)), 0, g))
            for p, g in grads.items()
        )

    # Block-normalize gradients:
    grads = OrderedDict((p, g / (g.norm(2) + eps)) for p, g in grads.items())
    nparams = len(grads)

    # Apply the gradient clipping, this is only necessary for RNNs and sometimes for very deep
    # networks
    if self.grad_clip:
        assert self.grad_clip > 0.
        assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
        gnorm = sum([g.norm(2) for g in grads.values()])
        # `.items()` instead of the Python-2-only `.iteritems()`.
        grads = OrderedDict(
            (p, T.switch(gnorm / nparams > self.grad_clip,
                         g * self.grad_clip * nparams / gnorm,
                         g))
            for p, g in grads.items()
        )

    for param in grads:
        grads[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps, name="mean_grad_%s" % param.name)
        slow_constant = 2.1

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)

        # Initialization of accumulators
        taus_x_t = sharedX((numpy.ones_like(param.get_value()) + eps) * slow_constant,
                           name="taus_x_t_" + param.name)
        self.taus_x_t = taus_x_t

        # Variance reduction parameters
        # Numerator of the gamma:
        gamma_nume_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)

        # Denominator of the gamma:
        gamma_deno_sqr = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)

        # For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(numpy.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)

        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(numpy.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)

        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

        # NOTE(review): old_grad only exists when use_corrected_grad is set, yet it is read
        # unconditionally in the gamma accumulators below, so building the updates with
        # use_corrected_grad disabled fails on an unbound name -- confirm the intended behaviour.
        if self.use_corrected_grad:
            old_grad = sharedX(param.get_value() * 0. + eps)

        # The uncorrected gradient of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # Block-wise normalized gradient (normalization was applied above):
        norm_grad = grads[param]

        # For the first time-step, assume that delta_x_t := norm_grad
        cond = T.eq(step, 0)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx

        # Compute the new updated values.
        # E[g_i^2]_t
        new_mean_squared_grad = (mean_square_grad * self.decay + T.sqr(norm_grad) * (1 - self.decay))
        new_mean_squared_grad.name = "msg_" + param.name

        # E[g_i]_t
        new_mean_grad = (mean_grad * self.decay + norm_grad * (1 - self.decay))
        new_mean_grad.name = "nmg_" + param.name

        mg = new_mean_grad
        mgsq = new_mean_squared_grad

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (
            gamma_nume_sqr * (1 - 1 / taus_x_t) +
            T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t
        )
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

        new_gamma_deno_sqr = (
            gamma_deno_sqr * (1 - 1 / taus_x_t) +
            T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t
        )
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

        # gamma is computed from the accumulators' values before this step's update.
        gamma = T.sqrt(gamma_nume_sqr) / T.sqrt(gamma_deno_sqr + eps)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        # For starting the variance reduction.
        if self.start_var_reduction > -1:
            # use the corrected gradient only once `step` has reached start_var_reduction
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        new_sum_squared_grad = None
        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (sum_square_grad + T.sqr(g))
            rms_g_t = T.sqrt(new_sum_squared_grad)
            rms_g_t = T.maximum(rms_g_t, 1.0)

        # Use the gradients from the previous update
        # to compute the \nabla f(x_t) - \nabla f(x_{t-1})
        cur_curvature = norm_grad - old_plain_grad
        cur_curvature_sqr = T.sqr(cur_curvature)

        new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) + (cur_curvature / taus_x_t))
        new_curvature_ave.name = "ncurve_ave_" + param.name

        # Average average curvature
        nc_ave = new_curvature_ave

        new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) + (cur_curvature_sqr / taus_x_t))
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

        # Unbiased average squared curvature
        nc_sq_ave = new_curvature_sqr_ave

        # Here the (scaled) learning rate only acts as the stabilizing term inside the square
        # roots; the step itself is scaled by the per-parameter scaler alone.
        epsilon = self.lr_scalers.get(param, 1.) * self.learning_rate
        scaled_lr = self.lr_scalers.get(param, 1.) * sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)

        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        # This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t / (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary for only RNNs
        # For feedforward networks this does not seem to be important.
        if self.delta_clip:
            log.info("Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                log.info("Clipped adagrad is disabled.")
                delta_x_t = delta_x_t * corrected_grad
        else:
            log.info("Clipping will not be applied on the adaptive step size.")
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                log.info("Clipped adagrad will not be used.")
                delta_x_t = delta_x_t * corrected_grad

        new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

        # To compute the E[\Delta^2]_t
        new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) + (T.sqr(delta_x_t) / taus_x_t))

        # To compute the E[\Delta]_t
        new_mean_dx = (mean_dx * (1 - 1 / taus_x_t) + (delta_x_t / (taus_x_t)))

        # Perform the outlier detection:
        # This outlier detection is slightly different:
        # tau is reset to 2.2 wherever the gradient or the curvature deviates from its running
        # mean by more than twice the running standard deviation.
        new_taus_t = T.switch(T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                                    abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                              sharedX(2.2), new_taus_t)

        # Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

        new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) + (delta_x_t * cur_curvature) * (1 / taus_x_t))

        update_step = delta_x_t

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave
        updates[param] = param + update_step
        # the step counter is shared by all parameters; re-assigning the same entry is harmless
        updates[step] = step + 1

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    return updates
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gsn/',
             input_size=None, hidden_size=1000,
             layers=2, walkbacks=4,
             visible_activation='sigmoid', hidden_activation='tanh',
             input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1),
             tied_weights=True,
             weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
             bias_init=0.0,
             cost_function='binary_crossentropy', cost_args=None,
             add_noise=True, noiseless_h1=True,
             hidden_noise='gaussian', hidden_noise_level=2,
             input_noise='salt_and_pepper', input_noise_level=0.4,
             noise_decay='exponential', noise_annealing=1,
             image_width=None, image_height=None,
             **kwargs):
    """
    Initialize a GSN: record its configuration, create (or adopt) its parameters and build its graph.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Routing information so the model can take its input from elsewhere, which is how different
        models are linked together. For now the shape information (normally the input
        dimensionality, i.e. n_in) has to be included.
    hiddens_hook : Tuple of (shape, variable)
        Routing information so the model can take its hidden representation from elsewhere
        (e.g. from the output layer of an RNN to obtain a generative recurrent model). For now
        the shape information (normally the hidden dimensionality, i.e. n_hidden) has to be included.
    params_hook : List(theano shared variable)
        Model parameters (shared theano variables) to use instead of creating new ones. Useful for
        having two versions of a model that share parameters, such as a noisy training model and a
        noiseless testing model. Weights come first, then biases: `2*layers + 1` entries are
        expected with tied weights, `3*layers + 1` otherwise.
    outdir : str
        Directory in which outputs (parameters, images, etc.) are saved. If None, nothing is saved.
    input_size : int
        Dimensionality of the input. Optional when the shape is provided through `inputs_hook`.
        The model is unsupervised, so its output is a reconstruction of the input.
    hidden_size : int
        Dimensionality of the hidden layers. When this resolves to a single size it is used for
        every hidden layer; otherwise exactly `layers` sizes are expected.
    layers : int
        Number of hidden layers.
    walkbacks : int
        Number of walkbacks to perform (the variable K in Bengio's paper). A walkback is a Gibbs
        sample from the model: inputs are generated in sequence and each one is compared with the
        original input to form the reconstruction cost. When running the model, the last generated
        input of the chain is the output. A warning is logged when there are fewer walkbacks than
        roughly twice the number of layers.
    visible_activation : str or callable
        Activation applied after the dot product from hiddens -> visible layer. It should suit the
        input unit type, i.e. 'sigmoid' for binary inputs. See opendeep.utils.activation for the
        available functions, or pass your own callable.
    hidden_activation : str or callable
        Activation applied after the dot product from visible -> hiddens layer. See
        opendeep.utils.activation for the available functions, or pass your own callable.
    input_sampling : bool
        Whether to sample from the generated input during walkbacks to create the starting point
        of the next walkback, which makes the process more stochastic.
    mrg : random
        Random number generator used for adding noise and for sampling from the input
        (Theano's sandbox.rng_mrg.MRG_RandomStreams is recommended).
    tied_weights : bool
        Whether the hiddens -> input weights are the transpose of the input -> hiddens weights,
        which reduces the number of parameters.
    weights_init : str
        Method for initializing the weights. See opendeep.utils.nnet for options.
    weights_interval : str or float
        For uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        For Gaussian `weights_init`, the mean to use.
    weights_std : float
        For Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        Initial value of the bias parameters. Most often, the default of 0.0 is preferred.
    cost_function : str or callable
        Function computing the reconstruction cost. It should suit the input type, i.e.
        'binary_crossentropy' for binary inputs or 'mse' for real-valued inputs. See
        opendeep.utils.cost for options, or pass your own callable.
    cost_args : dict
        Additional named keyword arguments for `cost_function`. A falsy value becomes an empty dict.
    add_noise : bool
        Whether to corrupt the input before passing it through the computation graph during training.
    noiseless_h1 : bool
        Whether to leave the hidden layer uncorrupted during computation.
    hidden_noise : str
        Type of noise used to corrupt the hidden layer (if not `noiseless_h1`). It should suit the
        hidden activation, i.e. Gaussian for tanh or other real-valued activations. See
        opendeep.utils.noise for options.
    hidden_noise_level : float
        Amount of noise for `hidden_noise` (standard deviation for gaussian noise, interval for
        uniform noise, dropout amount, etc.).
    input_noise : str
        Type of noise used to corrupt the input (if `add_noise`). It should suit the input units,
        i.e. salt-and-pepper for binary units. See opendeep.utils.noise for options.
    input_noise_level : float
        Amount of noise used to corrupt the input (masking probability for salt-and-pepper,
        standard deviation for Gaussian, interval for Uniform, etc.).
    noise_decay : str or False
        Whether to schedule (decay) `input_noise_level` over the course of training, and if so
        with which type of decay. See opendeep.utils.decay for options.
    noise_annealing : float
        Amount by which `input_noise_level` is reduced after each training epoch, according to the
        decay function given by `noise_decay`.
    image_width : int
        Width to use when the input is represented as an image.
    image_height : int
        Height to use when the input is represented as an image. If neither dimension is given,
        both are derived from `input_size` so the image is as square as possible; otherwise both
        values are stored exactly as passed.

    Raises
    ------
    NotImplementedError
        If the visible activation is not binary.
    """
    # Hand every argument to the Model initializer. The snapshot must be taken before any other
    # local name is bound so that only the arguments are forwarded.
    init_args = locals().copy()
    init_args.pop('self')
    super(GSN, self).__init__(**init_args)

    # when the input should be thought of as an image, either use the specified width and height,
    # or try to make as square as possible.
    if image_height is not None or image_width is not None:
        self.image_height = image_height
        self.image_width = image_width
    else:
        self.image_height, self.image_width = closest_to_square_factors(self.input_size)

    ############################
    # Theano variables and RNG #
    ############################
    # inputs_hook is a (shape, input) tuple; without one, use a fresh symbolic matrix
    self.X = T.matrix('X') if self.inputs_hook is None else self.inputs_hook[1]

    ##########################
    # Network specifications #
    ##########################
    # generally, walkbacks should be at least 2*layers (one less for an odd number of layers)
    recommended_walkbacks = 2*layers if layers % 2 == 0 else 2*layers - 1
    if walkbacks < recommended_walkbacks:
        log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                    'Generaly want 2X walkbacks to layers',
                    str(layers), str(walkbacks))

    self.add_noise = add_noise
    self.noise_annealing = as_floatX(noise_annealing)  # noise schedule parameter
    # the noise levels live in shared variables, which the noise functions are built around
    self.hidden_noise_level = sharedX(hidden_noise_level, dtype=theano.config.floatX)
    self.hidden_noise = get_noise(name=hidden_noise, noise_level=self.hidden_noise_level, mrg=mrg)
    self.input_noise_level = sharedX(input_noise_level, dtype=theano.config.floatX)
    self.input_noise = get_noise(name=input_noise, noise_level=self.input_noise_level, mrg=mrg)

    self.walkbacks = walkbacks
    self.tied_weights = tied_weights
    self.layers = layers
    self.noiseless_h1 = noiseless_h1
    self.input_sampling = input_sampling
    self.noise_decay = noise_decay

    # if there was a hiddens_hook, its shape entry replaces the hidden_size argument
    self.hiddens_flag = self.hiddens_hook is not None
    if self.hiddens_flag:
        hidden_size = self.hiddens_hook[0]

    # determine the sizes of each layer in a list.
    # layer sizes, from h0 to hK (h0 is the visible layer)
    hidden_sizes = list(raise_to_list(hidden_size))
    if len(hidden_sizes) != 1:
        assert len(hidden_sizes) == self.layers, "Hiddens sizes and number of hidden layers mismatch." + \
                                                 "Hiddens %d and layers %d" % (len(hidden_sizes), self.layers)
        self.layer_sizes = [self.input_size] + hidden_sizes
    else:
        # a single size is repeated for every hidden layer
        self.layer_sizes = [self.input_size] + hidden_sizes * self.layers

    # unpack the hidden layers in the hooked tensor (done once the layer sizes are known)
    if self.hiddens_hook is not None:
        self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

    #########################
    # Activation functions! #
    #########################
    # hidden unit activation
    self.hidden_activation = get_activation_function(hidden_activation)
    # Visible layer activation
    self.visible_activation = get_activation_function(visible_activation)
    # make sure the sampling functions are appropriate for the activation functions.
    if not is_binary(self.visible_activation):
        # TODO: implement non-binary activation
        log.error("Non-binary visible activation not supported yet!")
        raise NotImplementedError("Non-binary visible activation not supported yet!")
    self.visible_sampling = mrg.binomial

    # Cost function
    self.cost_function = get_cost_function(cost_function)
    self.cost_args = cost_args or dict()

    ###############
    # Parameters! #
    ###############
    # make sure to deal with params_hook!
    if self.params_hook is not None:
        # weights come first: `layers` of them when tied, 2*layers when untied;
        # they are followed by layers + 1 biases
        if self.tied_weights:
            n_weights = layers
            mismatch_msg = "Tied weights: expected {0!s} params, found {1!s}!"
        else:
            n_weights = 2*layers
            mismatch_msg = "Untied weights: expected {0!s} params, found {1!s}!"
        n_expected = n_weights + layers + 1
        assert len(self.params_hook) == n_expected, \
            mismatch_msg.format(n_expected, len(self.params_hook))
        self.weights_list = self.params_hook[:n_weights]
        self.bias_list = self.params_hook[n_weights:]
    # otherwise, construct our params
    else:
        def _new_weights(src, dst):
            # weights matrix going from layer `src` to layer `dst`
            return get_weights(weights_init=weights_init,
                               shape=(self.layer_sizes[src], self.layer_sizes[dst]),
                               name="W_{0!s}_{1!s}".format(src, dst),
                               rng=mrg,
                               # if gaussian
                               mean=weights_mean,
                               std=weights_std,
                               # if uniform
                               interval=weights_interval)

        # one weights matrix per pair of adjacent layers, going upwards
        weights = []
        for lower in range(layers):
            weights.append(_new_weights(lower, lower + 1))
        # add more weights if we aren't tying weights between layers (need to add for higher-lower layers now)
        if not tied_weights:
            for lower in reversed(range(layers)):
                weights.append(_new_weights(lower + 1, lower))
        self.weights_list = weights

        # one bias per layer (visible layer included), initialized from bias_init
        self.bias_list = [get_bias(shape=(self.layer_sizes[i],), name='b_' + str(i), init_values=bias_init)
                          for i in range(layers+1)]

    # build the params of the model into a list
    self.params = self.weights_list + self.bias_list
    log.debug("gsn params: %s", str(self.params))

    # using the properties, build the computational graph
    self.cost, self.monitors, self.output, self.hiddens = self.build_computation_graph()
def __init__(self, inputs_hook=None, hiddens_hook=None, params_hook=None, outdir='outputs/gsn/',
             input_size=None, hidden_size=1000,
             layers=2, walkbacks=4,
             visible_activation='sigmoid', hidden_activation='tanh',
             input_sampling=True, mrg=RNG_MRG.MRG_RandomStreams(1),
             tied_weights=True,
             weights_init='uniform', weights_interval='montreal', weights_mean=0, weights_std=5e-3,
             bias_init=0.0,
             cost_function='binary_crossentropy', cost_args=None,
             add_noise=True, noiseless_h1=True,
             hidden_noise='gaussian', hidden_noise_level=2,
             input_noise='salt_and_pepper', input_noise_level=0.4,
             noise_decay='exponential', noise_annealing=1,
             image_width=None, image_height=None,
             **kwargs):
    """
    Set up a GSN: store its hyperparameters, create (or adopt) its weights and biases, and build
    the computation graph.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Lets the model take its input from another model instead of creating its own symbolic
        input. When given, element 1 of the tuple is used as the input variable `X`.
    hiddens_hook : Tuple of (shape, variable)
        Lets the model take its hidden representation from elsewhere. When given, element 0 replaces
        `hidden_size` and element 1 is unpacked into the hidden layers via `unpack_hiddens`.
    params_hook : List(theano shared variable)
        Existing parameters to use instead of creating new ones, ordered as all weights followed by
        all biases. With tied weights, `2*layers + 1` entries are expected; with untied weights,
        `3*layers + 1`.
    outdir : str
        Directory where outputs (parameters, images, etc.) are saved. If None, nothing will be saved.
    input_size : int
        Dimensionality of the visible layer. Optional if the shape is provided in `inputs_hook`.
    hidden_size : int or list of int
        Dimensionality of the hidden layers. A single value is repeated for every hidden layer;
        otherwise one size per hidden layer is required.
    layers : int
        The number of hidden layers to use.
    walkbacks : int
        The number of walkbacks (Gibbs sampling steps) to perform. A warning is logged when this is
        lower than `2*layers` (even `layers`) or `2*layers - 1` (odd `layers`).
    visible_activation : str or callable
        Activation applied going from the hiddens to the visible layer. It must be a binary
        activation (e.g. 'sigmoid'); anything else raises NotImplementedError.
        See opendeep.utils.activation for options.
    hidden_activation : str or callable
        Activation applied to the hidden layers. See opendeep.utils.activation for options.
    input_sampling : bool
        During walkbacks, whether to sample from the generated input to create the starting point
        for the next walkback.
    mrg : random
        Random number generator used for weight initialization, noise and input sampling
        (its `binomial` method is used as the visible sampling function).
    tied_weights : bool
        If True, only one weight matrix is created per pair of adjacent layers. If False, an
        additional matrix is created for each higher -> lower layer direction.
    weights_init : str
        Method for initializing model weights. See opendeep.utils.nnet for options.
    weights_interval : str or float
        If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        If Gaussian `weights_init`, the mean value to use.
    weights_std : float
        If Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        The initial value to use for the bias parameters.
    cost_function : str or callable
        The function to use when calculating the reconstruction cost of the model, e.g.
        'binary_crossentropy' for binary inputs. See opendeep.utils.cost for options.
    cost_args : dict
        Any additional named keyword arguments to pass to the specified `cost_function`.
        Defaults to an empty dict.
    add_noise : bool
        Whether to corrupt the input before passing it through the computation graph.
    noiseless_h1 : bool
        Whether to leave the first hidden layer uncorrupted during computation.
    hidden_noise : str
        Type of noise used for corrupting the hidden layers. See opendeep.utils.noise for options.
    hidden_noise_level : float
        The amount of noise for `hidden_noise` (stored in a shared variable).
    input_noise : str
        Type of noise used for corrupting the input. See opendeep.utils.noise for options.
    input_noise_level : float
        The amount of noise for `input_noise` (stored in a shared variable).
    noise_decay : str or False
        Which decay schedule, if any, to apply to `input_noise_level` during training.
        See opendeep.utils.decay for options.
    noise_annealing : float
        The amount by which the decay function reduces `input_noise_level`.
    image_width : int
        Width to use when the input is displayed as an image.
    image_height : int
        Height to use when the input is displayed as an image. When neither width nor height is
        given, both come from `closest_to_square_factors(input_size)`.

    Raises
    ------
    NotImplementedError
        If `visible_activation` is not a binary activation.
    """
    # Capture the constructor arguments for the Model base class. This must remain the very first
    # statement: locals() would otherwise also pick up variables defined later in this method.
    ctor_args = dict(locals())
    del ctor_args['self']
    super(GSN, self).__init__(**ctor_args)

    # image dimensions: use what was given, otherwise get as close to a square as possible.
    if image_height is not None or image_width is not None:
        self.image_height, self.image_width = image_height, image_width
    else:
        self.image_height, self.image_width = closest_to_square_factors(self.input_size)

    ############################
    # Theano variables and RNG #
    ############################
    # inputs_hook (when present) is a (shape, input) tuple
    self.X = T.matrix('X') if self.inputs_hook is None else self.inputs_hook[1]

    ##########################
    # Network specifications #
    ##########################
    # walkbacks should generally be at least 2*layers (one fewer when layers is odd)
    recommended_walkbacks = 2 * layers if layers % 2 == 0 else 2 * layers - 1
    if walkbacks < recommended_walkbacks:
        log.warning('Not enough walkbacks for the layers! Layers is %s and walkbacks is %s. '
                    'Generaly want 2X walkbacks to layers',
                    str(layers), str(walkbacks))

    self.add_noise = add_noise
    # noise schedule parameter
    self.noise_annealing = as_floatX(noise_annealing)

    # noise levels are shared variables so they can be changed after the graph is built
    self.hidden_noise_level = sharedX(hidden_noise_level, dtype=theano.config.floatX)
    self.hidden_noise = get_noise(name=hidden_noise, noise_level=self.hidden_noise_level, mrg=mrg)
    self.input_noise_level = sharedX(input_noise_level, dtype=theano.config.floatX)
    self.input_noise = get_noise(name=input_noise, noise_level=self.input_noise_level, mrg=mrg)

    self.walkbacks = walkbacks
    self.tied_weights = tied_weights
    self.layers = layers
    self.noiseless_h1 = noiseless_h1
    self.input_sampling = input_sampling
    self.noise_decay = noise_decay

    # a hiddens_hook overrides the hidden size with the shape it carries
    self.hiddens_flag = self.hiddens_hook is not None
    if self.hiddens_flag:
        hidden_size = self.hiddens_hook[0]

    # layer sizes from h0 to hK (h0 is the visible layer)
    hidden_size = list(raise_to_list(hidden_size))
    if len(hidden_size) != 1:
        assert len(hidden_size) == self.layers, "Hiddens sizes and number of hidden layers mismatch." + \
                                                "Hiddens %d and layers %d" % (len(hidden_size), self.layers)
        self.layer_sizes = [self.input_size] + hidden_size
    else:
        # a single size is shared by every hidden layer
        self.layer_sizes = [self.input_size] + hidden_size * self.layers

    if self.hiddens_hook is not None:
        self.hiddens = self.unpack_hiddens(self.hiddens_hook[1])

    #########################
    # Activation functions! #
    #########################
    self.hidden_activation = get_activation_function(hidden_activation)
    self.visible_activation = get_activation_function(visible_activation)

    # the sampling function has to match the visible activation; only binary is supported.
    if not is_binary(self.visible_activation):
        # TODO: implement non-binary activation
        log.error("Non-binary visible activation not supported yet!")
        raise NotImplementedError("Non-binary visible activation not supported yet!")
    self.visible_sampling = mrg.binomial

    # Cost function
    self.cost_function = get_cost_function(cost_function)
    self.cost_args = cost_args or dict()

    ###############
    # Parameters! #
    ###############
    if self.params_hook is None:
        # no params were provided, so construct them from layer_sizes
        def _new_weights(src, dst):
            # weight matrix mapping layer `src` to layer `dst`
            return get_weights(weights_init=weights_init,
                               shape=(self.layer_sizes[src], self.layer_sizes[dst]),
                               name="W_{0!s}_{1!s}".format(src, dst),
                               rng=mrg,
                               # if gaussian
                               mean=weights_mean,
                               std=weights_std,
                               # if uniform
                               interval=weights_interval)

        # lower -> higher weights
        self.weights_list = []
        for i in range(layers):
            self.weights_list.append(_new_weights(i, i + 1))

        # untied weights additionally need the higher -> lower matrices (top layer first)
        if not tied_weights:
            downward = []
            for i in reversed(range(layers)):
                downward.append(_new_weights(i + 1, i))
            self.weights_list.extend(downward)

        # one bias vector per layer (visible layer included)
        self.bias_list = []
        for i in range(layers + 1):
            self.bias_list.append(
                get_bias(shape=(self.layer_sizes[i],), name='b_' + str(i), init_values=bias_init)
            )
    elif self.tied_weights:
        # tied weights: expect layers*2 + 1 params
        assert len(self.params_hook) == 2*layers + 1, \
            "Tied weights: expected {0!s} params, found {1!s}!".format(2*layers+1, len(self.params_hook))
        self.weights_list = self.params_hook[:layers]
        self.bias_list = self.params_hook[layers:]
    else:
        # untied weights: expect layers*3 + 1 params
        assert len(self.params_hook) == 3*layers + 1, \
            "Untied weights: expected {0!s} params, found {1!s}!".format(3*layers+1, len(self.params_hook))
        self.weights_list = self.params_hook[:2*layers]
        self.bias_list = self.params_hook[2*layers:]

    # all parameters of the model: weights first, then biases
    self.params = self.weights_list + self.bias_list
    log.debug("gsn params: %s", str(self.params))

    # with everything in place, build the computational graph
    graph_outputs = self.build_computation_graph()
    self.cost, self.monitors, self.output, self.hiddens = graph_outputs
def __init__(self, inputs_hook=None, params_hook=None, outdir='outputs/basic',
             input_size=None, output_size=None,
             activation='rectifier',
             cost='mse', cost_args=None,
             weights_init='uniform', weights_mean=0, weights_std=5e-3, weights_interval='montreal',
             bias_init=0.0,
             noise=None, noise_level=None, mrg=RNG_MRG.MRG_RandomStreams(1),
             **kwargs):
    """
    Initialize a basic layer.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Routing information for the model to accept inputs from elsewhere. This is used for linking
        different models together. For now, it needs to include the shape information (normally the
        dimensionality of the input i.e. input_size).
    params_hook : List(theano shared variable)
        A list of model parameters (shared theano variables) that you should use when constructing
        this model (instead of initializing your own shared variables). It must contain exactly two
        entries, in the order (W, b). This parameter is useful when you want to have two versions of
        the model that use the same parameters - such as a training model with dropout applied to
        layers and one without for testing, where the parameters are shared between the two.
    outdir : str
        The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
        be saved.
    input_size : int
        The size (dimensionality) of the input to the layer. If shape is provided in `inputs_hook`,
        this is optional.
    output_size : int
        The size (dimensionality) of the output from the layer. If not provided, the input size is
        used for both the weights and the bias.
    activation : str or callable
        The activation function to use after the dot product going from input -> output. This can be
        a string representing an option from opendeep.utils.activation, or your own function as long
        as it is callable.
    cost : str or callable
        The cost function to use when training the layer. This should be appropriate for the output
        type, i.e. mse for real-valued outputs, binary cross-entropy for binary outputs, etc.
    cost_args : dict
        Any additional named keyword arguments to pass to the specified `cost` function.
    weights_init : str
        Determines the method for initializing input -> output weights. See opendeep.utils.nnet for
        options.
    weights_interval : str or float
        If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        If Gaussian `weights_init`, the mean value to use.
    weights_std : float
        If Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
    noise : str
        What type of noise to use for corrupting the output (if not None). See opendeep.utils.noise
        for options. This should be appropriate for the output activation, i.e. Gaussian for tanh or
        other real-valued activations, etc. Often, you will use 'dropout' here as a regularization
        in BasicLayers.
    noise_level : float
        The amount of noise to use for the noise function specified by `noise`. This could be the
        standard deviation for gaussian noise, the interval for uniform noise, the dropout amount,
        etc. If None, the noise function's own default is used.
    mrg : random
        A random number generator that is used when adding noise.
        I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    """
    # init Model to combine the defaults and config dictionaries with the initial parameters.
    # This must stay the first statement: locals() would otherwise also capture variables
    # defined further down in this method.
    initial_parameters = locals().copy()
    initial_parameters.pop('self')
    super(BasicLayer, self).__init__(**initial_parameters)

    ##################
    # specifications #
    ##################
    # grab info from the inputs_hook, or from parameters
    if inputs_hook is not None:
        # inputs_hook is a tuple of (Shape, Input)
        assert len(inputs_hook) == 2, 'Expected inputs_hook to be tuple!'
        self.input = inputs_hook[1]
    else:
        # make the input a symbolic matrix
        self.input = T.matrix('X')

    # now that we have the input specs, define the output 'target' variable to be used in
    # supervised training!
    self.target = T.matrix('Y')

    # either grab the output's desired size from the parameter directly, or copy input_size
    # (self.output_size / self.input_size are presumably populated by the Model base class
    # from the arguments above - confirm against Model.__init__)
    self.output_size = self.output_size or self.input_size

    # other specifications
    # activation function!
    activation_func = get_activation_function(activation)
    # cost function!
    cost_func = get_cost_function(cost)
    cost_args = cost_args or dict()

    ####################################################
    # parameters - make sure to deal with params_hook! #
    ####################################################
    if params_hook is not None:
        # make sure the params_hook has W (weights matrix) and b (bias vector)
        assert len(params_hook) == 2, \
            "Expected 2 params (W and b) for BasicLayer, found {0!s}!".format(len(params_hook))
        W, b = params_hook
    else:
        W = get_weights(weights_init=weights_init,
                        shape=(self.input_size, self.output_size),
                        name="W",
                        rng=mrg,
                        # if gaussian
                        mean=weights_mean,
                        std=weights_std,
                        # if uniform
                        interval=weights_interval)

        # grab the bias vector. Use the resolved self.output_size (not the raw `output_size`
        # argument, which may be None) so the bias always matches the second dimension of W.
        b = get_bias(shape=self.output_size, name="b", init_values=bias_init)

    # Finally have the two parameters - weights matrix W and bias vector b. That is all!
    self.params = [W, b]

    ###############
    # computation #
    ###############
    # Here is the meat of the computation transforming input -> output
    # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and
    # then passing the result through our activation function (normally something nonlinear such
    # as: max(0, output))
    self.output = activation_func(T.dot(self.input, W) + b)

    # Now deal with noise if we added it:
    if noise:
        log.debug('Adding noise switch.')
        if noise_level is not None:
            noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
        else:
            noise_func = get_noise(noise, mrg=mrg)
        # apply the noise as a switch!
        # default to apply noise. this is for the cost and gradient functions to be computed later
        # (not sure if the above statement is accurate such that gradient depends on initial value
        # of switch)
        self.switch = sharedX(value=1, name="basiclayer_noise_switch")
        self.output = T.switch(self.switch,
                               noise_func(input=self.output),
                               self.output)

    # now to define the cost of the model - use the cost function to compare our output with the
    # target value.
    self.cost = cost_func(output=self.output, target=self.target, **cost_args)

    log.debug("Initialized a basic fully-connected layer with shape %s and activation: %s",
              str((self.input_size, self.output_size)), str(activation))
def __init__(self, model, dataset, iterator_class=SequentialIterator, config=None, defaults=_defaults,
             rng=None, n_epoch=None, batch_size=None, minimum_batch_size=None,
             save_frequency=None, early_stop_threshold=None, early_stop_length=None,
             learning_rate=None, lr_decay=None, lr_factor=None,
             momentum=None, momentum_decay=None, momentum_factor=None, nesterov_momentum=None,
             flag_para_load=None):
    """
    Set up stochastic gradient descent for `model` on `dataset` and compile the training function.

    Every hyperparameter below is resolved as ``argument or self.args.get(name)``, where `self.args`
    is the combination of `config` and `defaults` produced by the superclass constructor.
    NOTE: because of the ``or``, an explicitly passed falsy value (0, 0.0, False) is treated like
    "not given" and the configured value is used instead.

    Parameters
    ----------
    model : Model
        The model to train. It has to provide `get_params`, `get_train_cost`, `get_updates`,
        `get_inputs`, `get_lr_scalers` and `get_monitors`.
    dataset : Dataset
        The dataset to train on (only stored here).
    iterator_class : class
        The iterator class used to walk over the dataset (only stored here).
    config : dict-like or path
        Configuration handed to the superclass constructor.
    defaults : dict-like or path
        Default configuration handed to the superclass constructor.
    rng : random number generator
        RNG for the random iterator. If None, the global `random` module is seeded with 123 and used.
    n_epoch : int
        Training epochs - how many times to iterate over the whole dataset.
    batch_size : int
        Number of examples in each calculation.
    minimum_batch_size : int
        Minimum batch size (only stored here).
    save_frequency : int
        Number of epochs between saving model parameters.
    early_stop_threshold : float
        By how much the cost has to improve for early stopping purposes.
    early_stop_length : int
        Over how many epochs the improvement is considered (patience).
    learning_rate : float
        Initial learning rate; stored in a shared variable.
    lr_decay : str
        Name of the decay function for the learning rate. When resolved to a truthy value,
        `self.learning_rate_decay` is created; otherwise that attribute is not set.
    lr_factor : float
        Factor passed to the learning rate decay function.
    momentum : float
        Initial momentum; stored in a shared variable.
    momentum_decay : str
        Name of the decay function for the momentum. When resolved to a truthy value,
        `self.momentum_decay` is created; otherwise that attribute is not set.
    momentum_factor : float
        Factor passed to the momentum decay function.
    nesterov_momentum : bool
        Whether Nesterov momentum is requested (only stored here).
    flag_para_load : bool
        Accepted for interface compatibility; not used by this constructor.

    Raises
    ------
    AssertionError
        If the compiled training function has a number of non-shared inputs other than
        1 (unsupervised) or 2 (supervised).
    """
    # superclass init
    super(SGD, self).__init__(config=config, defaults=defaults)
    # config and defaults are now combined in self.args! yay!

    self.model = model
    self.dataset = dataset
    self.iterator = iterator_class

    # Training epochs - how many times to iterate over the whole dataset
    self.n_epoch = n_epoch or self.args.get('n_epoch')

    # Dataset iteration batch sizes - number of examples in each calculation
    self.batch_size = batch_size or self.args.get('batch_size')
    self.minimum_batch_size = minimum_batch_size or self.args.get('minimum_batch_size')

    # Number of epochs between saving model parameters
    self.save_frequency = save_frequency or self.args.get('save_frequency')

    # Early stopping threshold and patience - by how much does the cost have to improve over a
    # number of epochs
    self.early_stop_threshold = early_stop_threshold or self.args.get('early_stop_threshold')
    self.early_stop_length = early_stop_length or self.args.get('early_stop_length')

    # Learning rate - how drastic of a step do the parameters change
    lr = learning_rate or self.args.get('learning_rate')
    self.learning_rate = sharedX(lr, 'learning_rate')
    self.lr_scalers = self.model.get_lr_scalers()
    lr_decay = lr_decay or self.args.get('lr_decay')
    if lr_decay:
        self.learning_rate_decay = get_decay_function(lr_decay,
                                                      self.learning_rate,
                                                      self.learning_rate.get_value(),
                                                      lr_factor or self.args.get('lr_factor'))

    # Momentum - smoothing over the parameter changes (see Hinton)
    self.momentum = sharedX(momentum or self.args.get('momentum'), 'momentum')
    # honor the `momentum_decay` argument as well as the config, mirroring the lr_decay handling
    momentum_decay = momentum_decay or self.args.get('momentum_decay')
    if momentum_decay:
        self.momentum_decay = get_decay_function(momentum_decay,
                                                 self.momentum,
                                                 self.momentum.get_value(),
                                                 momentum_factor or self.args.get('momentum_factor'))
    self.nesterov_momentum = nesterov_momentum or self.args.get('nesterov_momentum')

    # RNG for working on random iterator
    if rng is None:
        # NOTE: this seeds (and stores) the process-wide `random` module
        random.seed(123)
        self.rng = random
    else:
        self.rng = rng

    self.params = self.model.get_params()

    # Now create the training cost function for the model to use while training - update parameters
    log.info("%s params: %s", str(type(self.model)), str(self.params))
    # gradient!
    gradient = grad(self.model.get_train_cost(), self.params)
    grads = OrderedDict(zip(self.params, gradient))

    # Calculate the optimizer updates each run
    # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
    # It tells how to update the params each training epoch
    gradient_updates = self.get_updates(grads)

    # Combine the updates from the model also if applicable
    # (the optimizer's updates are merged into the mapping returned by the model)
    train_updates = model.get_updates()
    if train_updates:
        train_updates.update(gradient_updates)
    else:
        train_updates = gradient_updates

    # Compile the training function!
    log.info('Compiling f_learn function for model %s...', str(type(self.model)))
    t = time.time()
    self.f_learn = function(inputs=model.get_inputs(),
                            updates=train_updates,
                            outputs=self.model.get_train_cost(),
                            name='f_learn')
    log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

    # Determine if this function is unsupervised or not by looking at the number of inputs to the
    # f_learn function. If there is only one input, it is unsupervised, otherwise, it is supervised.
    # This workaround was provided by Pascal Lamblin on the theano-users google group
    num_inputs = len([i for i in self.f_learn.maker.inputs if not i.shared])
    if num_inputs == 1:
        log.debug("Model is unsupervised: 1 input to f_learn.")
        self.unsupervised = True
    elif num_inputs == 2:
        log.debug("Model is supervised: 2 inputs to f_learn.")
        self.unsupervised = False
    else:
        # format once with a tuple so both placeholders are filled, then log and raise the same text
        message = "Number of inputs to f_learn on model %s was %s. " \
                  "Needs to be 1 for unsupervised or 2 for supervised." % \
                  (str(type(self.model)), str(num_inputs))
        log.error(message)
        raise AssertionError(message)

    # grab the function(s) to use to monitor different model values during training
    self.monitors = self.model.get_monitors()
def __init__(
    self,
    inputs_hook=None,
    params_hook=None,
    outdir="outputs/basic",
    input_size=None,
    output_size=None,
    activation="rectifier",
    cost="mse",
    cost_args=None,
    weights_init="uniform",
    weights_mean=0,
    weights_std=5e-3,
    weights_interval="montreal",
    bias_init=0.0,
    noise=None,
    noise_level=None,
    mrg=RNG_MRG.MRG_RandomStreams(1),
    **kwargs
):
    """
    Initialize a basic layer.

    Parameters
    ----------
    inputs_hook : Tuple of (shape, variable)
        Routing information for the model to accept inputs from elsewhere. This is used for linking
        different models together. For now, it needs to include the shape information (normally the
        dimensionality of the input i.e. input_size).
    params_hook : List(theano shared variable)
        A list of model parameters (shared theano variables) that you should use when constructing
        this model (instead of initializing your own shared variables). It must contain exactly two
        entries, in the order (W, b). This parameter is useful when you want to have two versions of
        the model that use the same parameters - such as a training model with dropout applied to
        layers and one without for testing, where the parameters are shared between the two.
    outdir : str
        The directory you want outputs (parameters, images, etc.) to save to. If None, nothing will
        be saved.
    input_size : int
        The size (dimensionality) of the input to the layer. If shape is provided in `inputs_hook`,
        this is optional.
    output_size : int
        The size (dimensionality) of the output from the layer. If not provided, the input size is
        used for both the weights and the bias.
    activation : str or callable
        The activation function to use after the dot product going from input -> output. This can be
        a string representing an option from opendeep.utils.activation, or your own function as long
        as it is callable.
    cost : str or callable
        The cost function to use when training the layer. This should be appropriate for the output
        type, i.e. mse for real-valued outputs, binary cross-entropy for binary outputs, etc.
    cost_args : dict
        Any additional named keyword arguments to pass to the specified `cost` function.
    weights_init : str
        Determines the method for initializing input -> output weights. See opendeep.utils.nnet for
        options.
    weights_interval : str or float
        If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
    weights_mean : float
        If Gaussian `weights_init`, the mean value to use.
    weights_std : float
        If Gaussian `weights_init`, the standard deviation to use.
    bias_init : float
        The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
    noise : str
        What type of noise to use for corrupting the output (if not None). See opendeep.utils.noise
        for options. This should be appropriate for the output activation, i.e. Gaussian for tanh or
        other real-valued activations, etc. Often, you will use 'dropout' here as a regularization
        in BasicLayers.
    noise_level : float
        The amount of noise to use for the noise function specified by `noise`. This could be the
        standard deviation for gaussian noise, the interval for uniform noise, the dropout amount,
        etc. If None, the noise function's own default is used.
    mrg : random
        A random number generator that is used when adding noise.
        I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
    """
    # init Model to combine the defaults and config dictionaries with the initial parameters.
    # This must stay the first statement: locals() would otherwise also capture variables
    # defined further down in this method.
    initial_parameters = locals().copy()
    initial_parameters.pop("self")
    super(BasicLayer, self).__init__(**initial_parameters)

    ##################
    # specifications #
    ##################
    # grab info from the inputs_hook, or from parameters
    if inputs_hook is not None:
        # inputs_hook is a tuple of (Shape, Input)
        assert len(inputs_hook) == 2, "Expected inputs_hook to be tuple!"
        self.input = inputs_hook[1]
    else:
        # make the input a symbolic matrix
        self.input = T.matrix("X")

    # now that we have the input specs, define the output 'target' variable to be used in
    # supervised training!
    self.target = T.matrix("Y")

    # either grab the output's desired size from the parameter directly, or copy input_size
    # (self.output_size / self.input_size are presumably populated by the Model base class
    # from the arguments above - confirm against Model.__init__)
    self.output_size = self.output_size or self.input_size

    # other specifications
    # activation function!
    activation_func = get_activation_function(activation)
    # cost function!
    cost_func = get_cost_function(cost)
    cost_args = cost_args or dict()

    ####################################################
    # parameters - make sure to deal with params_hook! #
    ####################################################
    if params_hook is not None:
        # make sure the params_hook has W (weights matrix) and b (bias vector)
        assert len(params_hook) == 2, "Expected 2 params (W and b) for BasicLayer, found {0!s}!".format(
            len(params_hook)
        )
        W, b = params_hook
    else:
        W = get_weights(
            weights_init=weights_init,
            shape=(self.input_size, self.output_size),
            name="W",
            rng=mrg,
            # if gaussian
            mean=weights_mean,
            std=weights_std,
            # if uniform
            interval=weights_interval,
        )

        # grab the bias vector. Use the resolved self.output_size (not the raw `output_size`
        # argument, which may be None) so the bias always matches the second dimension of W.
        b = get_bias(shape=self.output_size, name="b", init_values=bias_init)

    # Finally have the two parameters - weights matrix W and bias vector b. That is all!
    self.params = [W, b]

    ###############
    # computation #
    ###############
    # Here is the meat of the computation transforming input -> output
    # It simply involves a matrix multiplication of inputs*weights, adding the bias vector, and
    # then passing the result through our activation function (normally something nonlinear such
    # as: max(0, output))
    self.output = activation_func(T.dot(self.input, W) + b)

    # Now deal with noise if we added it:
    if noise:
        log.debug("Adding noise switch.")
        if noise_level is not None:
            noise_func = get_noise(noise, noise_level=noise_level, mrg=mrg)
        else:
            noise_func = get_noise(noise, mrg=mrg)
        # apply the noise as a switch!
        # default to apply noise. this is for the cost and gradient functions to be computed later
        # (not sure if the above statement is accurate such that gradient depends on initial value
        # of switch)
        self.switch = sharedX(value=1, name="basiclayer_noise_switch")
        self.output = T.switch(self.switch, noise_func(input=self.output), self.output)

    # now to define the cost of the model - use the cost function to compare our output with the
    # target value.
    self.cost = cost_func(output=self.output, target=self.target, **cost_args)

    log.debug(
        "Initialized a basic fully-connected layer with shape %s and activation: %s",
        str((self.input_size, self.output_size)),
        str(activation),
    )