def main():
    """Exercise monitor channels whose values are written out by file services.

    A shared 88x100 zero matrix named ``W`` is updated on every call with the
    expression returned by ``add_uniform``. Its ``mean`` and ``var``
    statistics are wrapped in monitors, collapsed into
    (name, expression, service) triples, compiled into one Theano function
    for training and one for validation, and each function is run for ten
    epochs while the monitored values are written to their services.
    """
    # Named ``w`` so the shared weights are not rebound by the variance
    # statistic popped from ``stats`` below.
    w = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(w, add_uniform(input=w, noise_level=.02))]

    # Only the statistics that are actually monitored are extracted.
    stats = get_stats(w)
    mean = stats.pop('mean')
    variance = stats.pop('var')

    mean_monitor = Monitor('mean', mean, train=True, valid=True, out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', variance, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])
    monitors = [w_channel, stat_channel]

    # Each collapsed item is indexed as (name, expression, service).
    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])

    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    def run_epochs(fn, collapsed, services, subset):
        """Call ``fn`` ten times and hand every monitored value to its service."""
        for epoch in range(10):
            t = time.time()
            log.debug(epoch)
            vals = fn()
            m = OrderedDict(zip(collapsed.keys(), vals))
            for name, service in services.items():
                if name in m:
                    service.write(m[name], subset)
            log.debug('----- ' + make_time_units_string(time.time() - t))

    t1 = time.time()
    run_epochs(f, train_collapsed, train_services, "train")
    run_epochs(f2, valid_collapsed, valid_services, "valid")
    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
def main():
    """Exercise monitor channels whose values are written out by file services.

    A shared 88x100 zero matrix named ``W`` is updated on every call with the
    expression returned by ``add_uniform``. Its ``mean`` and ``var``
    statistics are wrapped in monitors, collapsed into
    (name, expression, service) triples, compiled into one Theano function
    for training and one for validation, and each function is run for ten
    epochs while the monitored values are written to their services under
    the ``TRAIN`` / ``VALID`` subset markers.
    """
    # Named ``w`` so the shared weights are not rebound by the variance
    # statistic popped from ``stats`` below.
    w = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(w, add_uniform(input=w, noise_level=.02))]

    # Only the statistics that are actually monitored are extracted.
    stats = get_stats(w)
    mean = stats.pop('mean')
    variance = stats.pop('var')

    mean_monitor = Monitor('mean', mean, train=True, valid=True, out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', variance, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])
    monitors = [w_channel, stat_channel]

    # Each collapsed item is indexed as (name, expression, service).
    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])

    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    # ``dict.values()`` is a view on Python 3; materialise it so the compiled
    # function receives a real list of output expressions.
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    def run_epochs(fn, collapsed, services, subset):
        """Call ``fn`` ten times and hand every monitored value to its service."""
        for epoch in range(10):
            t = time.time()
            log.debug(epoch)
            vals = fn()
            m = OrderedDict(zip(collapsed.keys(), vals))
            for name, service in services.items():
                if name in m:
                    service.write(m[name], subset)
            log.debug('----- ' + make_time_units_string(time.time() - t))

    t1 = time.time()
    run_epochs(f, train_collapsed, train_services, TRAIN)
    run_epochs(f2, valid_collapsed, valid_services, VALID)
    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
def main():
    """Exercise live plotting of monitor channels.

    A shared 88x100 zero matrix named ``W`` is updated on every call with the
    expression returned by ``add_uniform``. Its ``mean`` and ``max``
    statistics are wrapped in monitors, collapsed into named expressions,
    compiled into one Theano function for training and one for validation,
    and each function is run for 100 epochs while a ``Plot`` is updated with
    the monitored values.
    """
    w = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(w, add_uniform(input=w, noise_level=.02))]

    # Only the monitored statistics are extracted; local names avoid
    # shadowing the ``max`` builtin.
    stats = get_stats(w)
    mean_stat = stats.pop('mean')
    max_stat = stats.pop('max')

    mean_monitor = Monitor('mean', mean_stat, train=True, valid=True)
    stat_monitor = Monitor('max', max_stat)

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[stat_monitor])
    monitors = [w_channel, stat_channel]

    # Each collapsed item unpacks as (name, expression, <third field>); the
    # third field is not needed here.
    train_collapsed = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])

    valid_collapsed = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])

    plot = Plot(bokeh_doc_name='test_plots', monitor_channels=monitors, open_browser=True)

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    def run_epochs(fn, collapsed):
        """Call ``fn`` 100 times, pushing each result to the plot."""
        for epoch in range(100):
            t = time.time()
            log.debug(epoch)
            vals = fn()
            m = OrderedDict(zip(collapsed.keys(), vals))
            plot.update_plots(epoch, m)
            # Short pause between plot updates, as in the original loop.
            time.sleep(0.02)
            log.debug('----- ' + make_time_units_string(time.time() - t))

    t1 = time.time()
    run_epochs(f, train_collapsed)
    run_epochs(f2, valid_collapsed)
    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
def __init__(self, valid=None, invalid=None, valid_equivalent=None):
    """
    Check if variables can be expressed without using variables in invalid.

    Parameters
    ----------
    valid : iterable, optional
        Nodes that are allowed in the graph computing the outputs.
    invalid : iterable, optional
        Nodes that are NOT allowed in the graph computing the outputs.
    valid_equivalent : dict, optional
        Mapping from some invalid variables to valid ones that can be used
        instead. A copy is stored; its keys are added to the invalid set
        and its values to the valid set.
    """
    equivalents = OrderedDict() if valid_equivalent is None else valid_equivalent

    self.valid = set([] if valid is None else valid)
    self.invalid = set([] if invalid is None else invalid)

    # Keep a private copy so later changes by the caller do not leak in.
    self.valid_equivalent = equivalents.copy()

    # Every replacement is valid by definition, every replaced node invalid.
    self.valid.update(equivalents.values())
    self.invalid.update(equivalents.keys())
def get_lr_scalers(self):
    """
    Merge the learning-rate scalers reported by every layer.

    All entries of ``self.layers`` except the last are queried directly;
    the last entry is iterated and each of its elements is queried.

    Returns
    -------
    rval : OrderedDict
        Mapping from parameter to its float learning-rate multiplier.
    """
    params = self.get_params()
    rval = OrderedDict()

    def absorb(layer):
        # Fold one layer's scalers into ``rval`` after validating them.
        contrib = layer.get_lr_scalers()
        assert isinstance(contrib, OrderedDict)
        # No two layers can contend to scale a parameter
        assert not any(key in rval for key in contrib)
        # Don't try to scale anything that's not a parameter
        assert all(key in params for key in contrib)
        rval.update(contrib)

    for layer in self.layers[:-1]:
        absorb(layer)

    for layer in self.layers[-1]:
        absorb(layer)

    assert all(isinstance(val, float) for val in rval.values())

    return rval
def get_gradients(self, model, data, **kwargs):
    """
    Return the gradients of this cost with respect to the model parameters.

    When ``self.gradient_clipping`` is set, every gradient is multiplied by
    ``self.max_magnitude / norm`` whenever the global L2 norm of all
    gradients reaches ``self.max_magnitude``. If the squared norm is NaN or
    infinite, each gradient is replaced by ``0.1 * param`` instead.

    Parameters
    ----------
    model : object
        Model providing ``get_params()``.
    data : object
        Data forwarded to ``self.expr``.
    kwargs : dict
        Extra keyword arguments forwarded to ``self.expr``.

    Returns
    -------
    gradients : OrderedDict
        Mapping from each parameter to its (possibly clipped) gradient.
    updates : OrderedDict
        Always empty.
    """
    cost = self.expr(model=model, data=data, **kwargs)
    params = list(model.get_params())
    raw_grads = T.grad(cost, params, disconnected_inputs='ignore')
    gradients = OrderedDict(izip(params, raw_grads))

    if self.gradient_clipping:
        # Global squared norm, accumulated from a float start value.
        squared_norm = sum(((g ** 2).sum() for g in gradients.values()), 0.)
        not_finite = T.or_(T.isnan(squared_norm), T.isinf(squared_norm))
        norm = T.sqrt(squared_norm)
        # Shrink only when the norm reaches the allowed magnitude.
        scale = T.switch(T.ge(norm, self.max_magnitude),
                         self.max_magnitude / norm,
                         1.)
        gradients = OrderedDict(
            (param, T.switch(not_finite, .1 * param, grad * scale))
            for param, grad in gradients.items()
        )

    return gradients, OrderedDict()
def get_params(self):
    """
    Collect the theano shared variables that will be trained by the
    :class:`Optimizer`.

    The parameters of every :class:`Model` in ``self.models`` are gathered
    without duplication: a parameter already collected from an earlier
    model is skipped. Each kept parameter is stored under
    ``<model._classname>_<model index>_<original name>``.

    Returns
    -------
    dict(str: SharedVariable)
        Ordered dictionary of {string_name: theano shared variables} to be
        trained with an :class:`Optimizer`.
    """
    params = OrderedDict()
    # NOTE(review): the index is assumed to advance for every entry of
    # self.models, including entries that are not Model instances - confirm.
    for model_index, model in enumerate(self.models):
        if not isinstance(model, Model):
            continue
        prefix = model._classname + '_%d_' % model_index
        for name, param in model.get_params().items():
            # keep a parameter only the first time it is encountered
            if param in params.values():
                continue
            params[prefix + name] = param
    return params
def get_gradients(self, model, data, ** kwargs):
    """
    Differentiate this cost with respect to every model parameter.

    With ``self.gradient_clipping`` enabled, the global L2 norm of the
    gradients is compared against ``self.max_magnitude``: when it is at
    least that large, each gradient is scaled by
    ``self.max_magnitude / norm``. When the squared norm is NaN or
    infinite, each gradient is replaced by ``0.1 * param``.

    Parameters
    ----------
    model : object
        Model providing ``get_params()``.
    data : object
        Data forwarded to ``self.expr``.
    kwargs : dict
        Extra keyword arguments forwarded to ``self.expr``.

    Returns
    -------
    gradients : OrderedDict
        Parameter -> gradient expression (clipped when enabled).
    updates : OrderedDict
        An empty dictionary.
    """
    updates = OrderedDict()

    cost = self.expr(model=model, data=data, **kwargs)
    params = list(model.get_params())
    gradients = OrderedDict(
        izip(params, T.grad(cost, params, disconnected_inputs='ignore'))
    )

    if not self.gradient_clipping:
        return gradients, updates

    total = 0.
    for g in gradients.values():
        total += (g ** 2).sum()
    # Detect a blown-up norm before taking the square root.
    not_finite = T.or_(T.isnan(total), T.isinf(total))
    total = T.sqrt(total)
    factor = T.switch(T.ge(total, self.max_magnitude),
                      self.max_magnitude / total,
                      1.)

    for p in list(gradients.keys()):
        gradients[p] = T.switch(not_finite, .1 * p, gradients[p] * factor)

    return gradients, updates
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Provides the functions for learning with gradient descent + momentum.

    Parameters
    ----------
    learning_rate : theano variable
        Learning rate coefficient. It is used as the single input of the
        compiled update function, so it must be a symbolic variable.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Inputs of the compiled gradient function.
    cost : theano expression
        First output of the gradient function.
    errors : theano expression
        Second output of the gradient function.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier. Parameters that are absent use 1. Defaults to no
        scaling at all.

    Returns
    -------
    f_grad_shared : theano function
        Returns ``[cost, errors, gnorm, pnorm]`` and stores the current
        gradients in shared variables.
    f_update : theano function
        Takes the learning rate and applies the momentum update using the
        stored gradients.
    """
    if lr_scalers is None:
        lr_scalers = {}

    # Built from (key, value) pairs so the shared gradients follow the
    # iteration order of ``grads``; routing them through a plain dict
    # comprehension does not guarantee that order on every interpreter.
    gshared = OrderedDict(
        (p, sharedX(p.get_value() * 0., name='%s_grad' % p.name))
        for p in grads
    )

    # Pair each shared gradient with the expression of the same parameter.
    gsup = [(gshared[p], g) for p, g in grads.items()]

    def get_norms(tensors):
        # L2 norm over a collection of tensors.
        return T.sqrt(sum((t ** 2).sum() for t in tensors))

    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    updates = OrderedDict()
    # items(), not keys(): every parameter is needed with its stored gradient.
    for param, grad in gshared.items():
        vel = sharedX(param.get_value() * 0.)
        assert param.dtype == vel.dtype
        assert grad.dtype == param.dtype
        if param.name is not None:
            vel.name = 'vel_' + param.name

        scaled_lr = learning_rate * lr_scalers.get(param, 1.)
        updates[vel] = self.momentum * vel - scaled_lr * grad

        inc = updates[vel]
        if self.nesterov_momentum:
            inc = self.momentum * inc - scaled_lr * grad

        assert inc.dtype == vel.dtype
        updates[param] = param + inc

    f_update = theano.function([learning_rate],
                               [],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Provides the functions for learning with gradient descent + momentum.

    Parameters
    ----------
    learning_rate : theano variable
        Learning rate coefficient. It is used as the single input of the
        compiled update function, so it must be a symbolic variable.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Inputs of the compiled gradient function.
    cost : theano expression
        First output of the gradient function.
    errors : theano expression
        Second output of the gradient function.
    lr_scalers : dict, optional
        A dictionary mapping from the model's parameters to a learning
        rate multiplier. Parameters that are absent use 1. Defaults to no
        scaling at all.

    Returns
    -------
    f_grad_shared : theano function
        Returns ``[cost, errors, gnorm, pnorm]`` and stores the current
        gradients in shared variables.
    f_update : theano function
        Takes the learning rate and applies the momentum update using the
        stored gradients.
    """
    if lr_scalers is None:
        lr_scalers = {}

    # Built from (key, value) pairs so the shared gradients follow the
    # iteration order of ``grads``; routing them through a plain dict
    # comprehension does not guarantee that order on every interpreter.
    gshared = OrderedDict()
    for p in grads:
        gshared[p] = sharedX(p.get_value() * 0., name='%s_grad' % p.name)

    # Pair each shared gradient with the expression of the same parameter.
    gsup = [(gshared[p], g) for p, g in grads.items()]

    def get_norms(tensors):
        # L2 norm over a collection of tensors.
        return T.sqrt(sum((t ** 2).sum() for t in tensors))

    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)

    updates = OrderedDict()
    # items(), not keys(): every parameter is needed with its stored gradient.
    for param, grad in gshared.items():
        vel = sharedX(param.get_value() * 0.)
        assert param.dtype == vel.dtype
        assert grad.dtype == param.dtype
        if param.name is not None:
            vel.name = 'vel_' + param.name

        scaled_lr = learning_rate * lr_scalers.get(param, 1.)
        updates[vel] = self.momentum * vel - scaled_lr * grad

        inc = updates[vel]
        if self.nesterov_momentum:
            inc = self.momentum * inc - scaled_lr * grad

        assert inc.dtype == vel.dtype
        updates[param] = param + inc

    f_update = theano.function([learning_rate],
                               [],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def get_updates(self, grads):
    """
    Build Adam-style updates from running first and second gradient moments.

    For every parameter a first-moment accumulator ``m`` (decay
    ``self.b1 * self.lambd ** cnt``) and a second-moment accumulator ``v``
    (decay ``self.b2``) are kept. Both are bias-corrected with the shared
    step counter ``cnt`` before forming the step
    ``m_hat / (sqrt(v_hat) + self.e)``.

    When ``self.post_clip`` is set, the steps are additionally multiplied
    by ``self.scaler / max(self.scaler, norm)``, where ``norm`` is the
    global L2 norm of the steps divided by ``self.batch_size``.

    Parameters
    ----------
    grads : dict
        Mapping from parameters (shared variables) to their gradients.

    Returns
    -------
    updates : OrderedDict
        Updates for the moment accumulators, the parameters and the step
        counter.
    """
    updates = OrderedDict()
    g_tt = OrderedDict()
    cnt = sharedX(0, 'counter')

    for p, g in grads.items():
        # Scalers are looked up by the string form of the parameter.
        lr_scaler = self.lr_scalers.get(str(p), 1.)
        m = sharedX(p.get_value() * 0.)
        v = sharedX(p.get_value() * 0.)
        b1 = self.b1 * self.lambd**cnt
        m_t = b1 * m + (1 - b1) * g
        v_t = self.b2 * v + (1 - self.b2) * g**2
        # Bias correction of both moment estimates.
        m_t_hat = m_t / (1. - self.b1**(cnt + 1))
        v_t_hat = v_t / (1. - self.b2**(cnt + 1))
        g_t = m_t_hat / (T.sqrt(v_t_hat) + self.e)
        g_tt[p] = g_t
        updates[m] = m_t
        updates[v] = v_t
        updates[p] = p - lr_scaler * self.lr * g_t

    if self.post_clip:
        g_norm = T.sqrt(sum([T.sqr(x / self.batch_size).sum()
                             for x in g_tt.values()]))
        # At most 1, shrinking the steps once the norm exceeds self.scaler.
        scaler = self.scaler / T.maximum(self.scaler, g_norm)
        for p, g in g_tt.items():
            lr_scaler = self.lr_scalers.get(str(p), 1.)
            # Replaces the unclipped parameter update registered above.
            updates[p] = p - lr_scaler * self.lr * g * scaler

    updates[cnt] = cnt + 1
    return updates
class Optimizer(object):
    '''
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process.
    '''
    def __init__(self, model, dataset, config=None, defaults=None,
                 n_epoch=None, batch_size=None, minimum_batch_size=None,
                 save_frequency=None, early_stop_threshold=None, early_stop_length=None,
                 learning_rate=None, lr_decay=None, lr_factor=None,
                 **kwargs):
        """
        Gather the training hyperparameters and prepare the learning rate.

        Hyperparameters are resolved with increasing priority:
        _defaults < defaults < config < kwargs / explicit keyword arguments.
        The resolved values are stored in ``self.args`` and mirrored as attributes on ``self``.

        :param model: the model to train
        :type model: Model
        :param dataset: the dataset to train on
        :type dataset: Dataset
        :param config: configuration overriding ``defaults``
        :param defaults: default configuration supplied by a subclass
        :param n_epoch: maximum number of epochs (1000 if unset everywhere)
        :param batch_size: examples per minibatch (100 if unset everywhere)
        :param minimum_batch_size: smallest leftover batch that is still used (1 if unset everywhere)
        :param save_frequency: save the parameters every this many epochs (10 if unset everywhere)
        :param early_stop_threshold: factor the best cost is multiplied by before comparison
            (.9995 if unset everywhere)
        :param early_stop_length: epochs without improvement before stopping (30 if unset everywhere)
        :param learning_rate: initial learning rate (0.001 if unset everywhere)
        :param lr_decay: decay type passed to get_decay_function ("exponential" if unset everywhere)
        :param lr_factor: decay factor passed to get_decay_function (1 if unset everywhere)
        :param kwargs: any further parameters to store; a nested 'kwargs' entry is flattened
        """
        # Default values to use for some training parameters
        _defaults = {"n_epoch": 1000,
                     "batch_size": 100,
                     "minimum_batch_size": 1,
                     "save_frequency": 10,
                     "early_stop_threshold": .9995,
                     "early_stop_length": 30,
                     "learning_rate": 0.001,
                     "lr_decay": "exponential",
                     "lr_factor": 1,  # no learning rate decay by default
                     }

        log.debug("Initializing optimizer %s", str(type(self)))

        assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"
        self.model = model
        self.dataset = dataset

        # set self.args to be the combination of the defaults and the config dictionaries from the subclass
        in_args = combine_config_and_defaults(config, defaults)
        self.args = combine_config_and_defaults(in_args, _defaults)

        # if the args are none, make it a blank dictionary
        if self.args is None:
            self.args = {}

        # now that our required variables are out of the way, do the same thing for everything else passed
        # via kwargs
        for arg, val in kwargs.items():
            if (val is not None or str(arg) not in self.args) and str(arg) != 'kwargs':
                self.args[str(arg)] = val
            # flatten kwargs if it was passed as a variable
            elif str(arg) == 'kwargs':
                inner_kwargs = kwargs['kwargs']
                for key, item in inner_kwargs.items():
                    if item is not None or str(key) not in self.args:
                        self.args[str(key)] = item

        # now take care of overriding explicits passed in
        explicits = (
            ('n_epoch', n_epoch),
            ('batch_size', batch_size),
            ('minimum_batch_size', minimum_batch_size),
            ('save_frequency', save_frequency),
            ('early_stop_threshold', early_stop_threshold),
            ('early_stop_length', early_stop_length),
            ('learning_rate', learning_rate),
            ('lr_decay', lr_decay),
            ('lr_factor', lr_factor),
        )
        for key, value in explicits:
            if value is not None:
                self.args[key] = value

        # Magic! Now self.args contains the combination of all the initialization variables, overridden like so:
        # _defaults < defaults < config < kwargs (explicits passed to model's __init__)

        # log the arguments
        log.debug("optimizer config args: %s", str(self.args))

        # Finally, to make things really easy, update the class 'self' with everything in self.args to make
        # all the parameters accessible via self.<param>
        self.__dict__.update(self.args)

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(self.learning_rate, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if self.lr_decay:
            self.learning_rate_decay = get_decay_function(self.lr_decay,
                                                          self.learning_rate,
                                                          self.learning_rate.get_value(),
                                                          self.lr_factor)
        else:
            self.learning_rate_decay = False

    def get_batch_indices(self, data_lengths):
        """
        Compute the (start, end) index pairs of the minibatches covering the given data lengths.

        Every length is cut into consecutive batches of ``self.batch_size``. The leftover examples form one
        extra batch when there are at least ``self.minimum_batch_size`` of them; otherwise they are skipped.
        Indices keep counting across successive lengths.

        :param data_lengths: a length or a list of lengths
        :type data_lengths: int or list(int)
        :return: the (start_idx, end_idx) tuples of all batches
        :rtype: list(tuple)
        """
        batch_indices = []
        start_idx = 0
        for length in raise_to_list(data_lengths):
            # integer division to determine number of whole batches for this length
            # (floor division so the count stays an int on Python 3 as well)
            n_batches = length // int(self.batch_size)
            # add the (start_idx, end_idx) tuple to the list
            for _ in range(n_batches):
                end_idx = start_idx + self.batch_size
                batch_indices.append((start_idx, end_idx))
                start_idx = end_idx
            # remainder to find number of leftover examples
            remainder = numpy.remainder(length, self.batch_size)
            end_idx = start_idx + remainder
            # check if it is bigger than the minimum allowed size
            if remainder >= self.minimum_batch_size:
                batch_indices.append((start_idx, end_idx))
            # move past the leftover examples whether or not they were used
            start_idx = end_idx
        return batch_indices

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed)
        learning rate.

        :param gradients: mapping from each parameter to update to its gradient
        :type gradients: dict
        :return: the updates
        :rtype: OrderedDict
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in six.iteritems(gradients):
            # per-parameter learning rate multiplier, 1 when none is registered
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    @staticmethod
    def _make_givens(model_inputs, model_targets, data, labels, batch_slice):
        """Map the model's input (and target) variables onto one batch slice of a data subset."""
        givens = OrderedDict(zip(model_inputs, [data[batch_slice]]))
        if model_targets is not None and len(model_targets) > 0:
            givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]])))
        return givens

    def _compile_monitor_function(self, name, givens, data_idx, data_end_idx):
        """Compile the function evaluating every monitor expression on one batch of a data subset."""
        return function(
            inputs=[data_idx, data_end_idx],
            updates=self.model.get_updates(),
            # materialise the view so a real list of expressions is passed on Python 3
            outputs=list(self.monitors.values()),
            givens=givens,
            name=name
        )

    def train(self, continue_training=False):
        """
        This method performs the training!!!

        It compiles one training function per training cost of the model plus the monitor functions, then
        runs epochs for every training function until ``_perform_one_epoch`` signals a stop or the user
        interrupts. Afterwards the best parameters seen (if any) are restored and the parameters are saved.

        :param continue_training: when False, the learning rate decay and the model's decay parameters are
            reset before each training function is run
        :type continue_training: bool
        :return: None
        """
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        ###############################################
        # theano index variable to use on the dataset #
        ###############################################
        # index to a [mini]batch - both start and end
        data_idx = T.iscalar('data_index')
        data_end_idx = T.iscalar('data_end_index')
        batch_slice = slice(data_idx, data_end_idx)

        # compute number of minibatches for training, validation and testing
        # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
        # could be a list of shared variables (like multiple sequences from files)
        train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
        valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
        test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

        # train_batches is going to be lists of tuples that contain the start and end indices for train data
        train_data_lens = [shape[0] for shape in train_data_shapes]
        self.train_batches = self.get_batch_indices(train_data_lens)

        if valid_data_shapes is not None:
            valid_data_lens = [shape[0] for shape in valid_data_shapes]
            self.valid_batches = self.get_batch_indices(valid_data_lens)
        else:
            self.valid_batches = None
        if test_data_shapes is not None:
            test_data_lens = [shape[0] for shape in test_data_shapes]
            self.test_batches = self.get_batch_indices(test_data_lens)
        else:
            self.test_batches = None

        # translate the data_idx into the givens for the model
        model_inputs = raise_to_list(self.model.get_inputs())
        model_targets = raise_to_list(self.model.get_targets())

        train_data, train_labels = self.dataset.getSubset(TRAIN)
        train_givens = self._make_givens(model_inputs, model_targets, train_data, train_labels, batch_slice)

        valid_data, valid_labels = self.dataset.getSubset(VALID)
        valid_givens = self._make_givens(model_inputs, model_targets, valid_data, valid_labels, batch_slice)

        test_data, test_labels = self.dataset.getSubset(TEST)
        test_givens = self._make_givens(model_inputs, model_targets, test_data, test_labels, batch_slice)

        # Now time to create the training cost functions for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        self.train_functions = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            gradients, _ = self.model.get_gradient(cost=train_cost)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            train_updates = self.model.get_updates()
            if train_updates:
                train_updates.update(gradient_updates)
            else:
                train_updates = gradient_updates

            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...',
                     i + 1, len(train_costs), str(type(self.model)))
            t = time.time()

            f_learn = function(inputs=[data_idx, data_end_idx],
                               updates=train_updates,
                               outputs=train_cost,
                               givens=train_givens,
                               name='f_learn_%d' % i)

            log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
            self.train_functions.append(f_learn)

        # grab the expression(s) to use to monitor different model values during training
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        self.monitors = OrderedDict(self.model.get_monitors())
        # a real list, because call_monitors pairs names with outputs by position
        self.monitor_names = list(self.monitors.keys())
        # stay None when the model exposes no monitors, so the epoch loop can skip them
        self.train_monitor_function = None
        self.valid_monitor_function = None
        self.test_monitor_function = None
        if len(self.monitor_names) > 0:
            self.train_monitor_function = self._compile_monitor_function(
                "train_monitor_function", train_givens, data_idx, data_end_idx
            )
            self.valid_monitor_function = self._compile_monitor_function(
                "valid_monitor_function", valid_givens, data_idx, data_end_idx
            )
            self.test_monitor_function = self._compile_monitor_function(
                "test_monitor_function", test_givens, data_idx, data_end_idx
            )
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        self.noise_switches = raise_to_list(self.model.get_noise_switch())

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was created above, one per training cost
        start_time = time.time()
        for func_i, train_function in enumerate(self.train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                     str(type(self.model)), func_i + 1, len(self.train_functions), self.n_epoch,
                     str(continue_training))

            log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
            if self.dataset.hasSubset(VALID):
                log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
            if self.dataset.hasSubset(TEST):
                log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

            self.STOP = False
            self.epoch_counter = 0
            if not continue_training:
                # reset the learning rate
                if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                    self.learning_rate_decay.reset()
                # reset the other model decaying functions
                for decay_param in self.model.get_decay_params():
                    decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))

    def _perform_one_epoch(self, f_learn):
        """
        Run one training epoch, evaluate the monitors and decide whether training should stop.

        :param f_learn: compiled training function taking (batch_start, batch_end) and returning the cost
        :type f_learn: function
        :return: True when the maximum number of epochs or the early-stopping patience was reached
        :rtype: bool
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        has_monitors = len(self.monitors.keys()) > 0

        # set the noise switches on for training function! (this is where things like dropout happen)
        switch_vals = []
        if len(self.noise_switches) > 0:
            log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
            # remember the current values so they can be restored at the end of the epoch
            switch_vals = [switch.get_value() for switch in self.noise_switches]
            for switch in self.noise_switches:
                switch.set_value(1.)

        # train
        train_costs = []
        train_monitors = {key: [] for key in self.monitors.keys()}
        for batch_start, batch_end in self.train_batches:
            train_costs.append(f_learn(batch_start, batch_end))
            # the monitor function only exists when the model exposes monitors
            if has_monitors:
                self.call_monitors(monitor_function=self.train_monitor_function,
                                   monitors_dict=train_monitors,
                                   inputs=[batch_start, batch_end])
        log.info('Train cost: %s', trunc(numpy.mean(train_costs, 0)))
        if has_monitors:
            log.info('Train monitors: %s',
                     str({key: numpy.mean(value, 0) for key, value in train_monitors.items()}))

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if len(self.noise_switches) > 0:
            log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
            for switch in self.noise_switches:
                switch.set_value(0.)

        # valid
        if self.dataset.hasSubset(VALID) and has_monitors:
            valid_monitors = {key: [] for key in self.monitors.keys()}
            for batch_start, batch_end in self.valid_batches:
                self.call_monitors(monitor_function=self.valid_monitor_function,
                                   monitors_dict=valid_monitors,
                                   inputs=[batch_start, batch_end])
            log.info('Valid monitors: %s',
                     str({key: numpy.mean(value, 0) for key, value in valid_monitors.items()}))

        # test
        if self.dataset.hasSubset(TEST) and has_monitors:
            test_monitors = {key: [] for key in self.monitors.keys()}
            for batch_start, batch_end in self.test_batches:
                self.call_monitors(monitor_function=self.test_monitor_function,
                                   monitors_dict=test_monitors,
                                   inputs=[batch_start, batch_end])
            log.info('Test monitors: %s',
                     str({key: numpy.mean(value, 0) for key, value in test_monitors.items()}))

        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = get_shared_values(self.params)
        else:
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.info('remaining time: ' +
                 make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            # save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        # ANNEAL!
        if not stop:
            if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                self.learning_rate_decay.decay()
            if hasattr(self, 'momentum_decay') and self.momentum_decay:
                self.momentum_decay.decay()
            for decay_param in self.model.get_decay_params():
                decay_param.decay()

        # reset the switches
        if len(self.noise_switches) > 0:
            for switch, val in zip(self.noise_switches, switch_vals):
                switch.set_value(val)

        # return whether or not to stop this epoch
        return stop

    def call_monitors(self, monitors_dict, monitor_function, inputs):
        """
        Evaluate a monitor function on one batch and append each output to its monitor's list.

        :param monitors_dict: maps every monitor name to the list collecting its values
        :type monitors_dict: dict
        :param monitor_function: compiled function returning one output per monitor, in monitor order
        :type monitor_function: function
        :param inputs: positional arguments for the monitor function
        :type inputs: list
        """
        outs = monitor_function(*inputs)
        for i, out in enumerate(outs):
            monitors_dict[self.monitor_names[i]].append(out)
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of
    examples the model has trained, as well as any number of "channels"
    that track quantities of interest (examples: the objective
    function, measures of hidden unit activity, reconstruction error,
    sum of squared second derivatives, average norm of the weight
    vectors, etc.)
    """

    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        # The five lists below are kept parallel: entry k of each one
        # describes the k-th monitoring dataset (see `add_dataset`).
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        # Attributes dropped by `__getstate__` before pickling.
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        # Determine whether the model should use topological or vector form
        # of examples. If the model acts on a space with more than the batch
        # index and channel dimension, the model has topological dimensions,
        # so the topological view of the data should be used.
        vector = model.get_input_space().make_theano_batch(
            name='monitoring_input')
        if isinstance(vector.type, theano.sparse.SparseType):
            self.topo = False
        else:
            self.topo = len(vector.type.broadcastable) > 2

        self.require_label = False
        self.theano_function_mode = None

    def set_theano_function_mode(self, mode):
        """
        Sets the mode used when compiling the monitor's theano functions.

        Marks the monitor dirty (forcing a recompile on the next call)
        only when `mode` differs from the current one.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self, dataset, mode='sequential', batch_size=None,
                    num_batches=None, seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object (or a list of them).
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size will
            be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        seed : int, list or tuple, optional
            Passed to the dataset iterator as `rng`. Must be given (and
            must not be a random number generator) when the resulting
            iterator is stochastic, and must be None otherwise.

        Raises
        ------
        ValueError
            If the per-dataset argument lists have mismatched lengths or
            the iterator rejects the iteration parameters.
        TypeError
            If a stochastic iterator is requested without a usable seed.
        """
        # The user can omit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        if any([len(l) != len(dataset) for l in [batch_size, seed]]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            # Building the iterator here is only a validation step; the
            # iterators actually used are created in `__call__`.
            try:
                it = d.iterator(mode=m, batch_size=b,
                                num_batches=n,
                                topo=self.topo,
                                targets=self.require_label,
                                rng=sd)
            except ValueError as exc:
                raise ValueError("invalid iteration parameters in "
                                 "Monitor.add_dataset: " + str(exc))
            if it.stochastic:
                # must be a seed, not a random number generator
                # if it were a random number generator, different iterators
                # using it would update its state, so we would not get the
                # same iterator each time
                # Also, must not be None, because this makes the iterator
                # pick a seed based on the clock
                if sd is None:
                    raise TypeError(
                        "Monitor requires a seed when using stochastic iteration modes."
                    )
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError(
                        "Monitor requires a seed (not a random number generator) when using stochastic iteration modes."
                    )
            else:
                # the iterator should catch this, but let's double-check
                assert sd is None

            if not d in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.
        """

        # If the channels have changed at all, we need to recompile the
        # theano functions used to compute them
        if self._dirty:
            self.redo_theano()

        model = self.model
        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        for d, i, b, n, a, sd, ne in safe_izip(datasets,
                                               self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches,
                                               self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()
                # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    topo=self.topo,
                                    targets=self.require_label,
                                    rng=sd)

            actual_ne = 0
            for X in myiterator:
                if self.require_label:
                    X, y = X
                    self.run_prereqs(X, y, d)
                    a(X, y)
                else:
                    self.run_prereqs(X, None, d)
                    a(X)
                if X.ndim == 2:
                    actual_batch_size = X.shape[0]
                else:
                    actual_batch_size = X.shape[d.get_topo_batch_axis()]
                actual_ne += actual_batch_size
            # end for X
            if actual_ne != ne:
                raise RuntimeError(
                    "At compile time, your iterator said it had " + str(ne) +
                    " examples total, but at runtime it gave us " +
                    str(actual_ne) + ".")
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, X, y, dataset):
        """
        Calls every prereq registered for `dataset` as `prereq(X, y)`.

        Does nothing if no prereqs are registered for `dataset`.
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(X, y)

    def get_batches_seen(self):
        """
        Returns the number of batches the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch
        correctly)
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        Returns the number of times `report_epoch` has been called.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        Returns the number of examples the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch
        correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Call this whenever the model has learned on another batch of
        examples. Report how many examples were learned on.
        """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Call this whenever the model has completed another epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is needed so that if new channels are added, Theano's
        optimizations make sure (to the extent that they can) that the
        new channels and old channels don't have any redundant
        calculations.

        It is also needed to regenerate Theano functions after pickling
        and unpickling, since Theano functions should not be pickled.

        Raises
        ------
        ValueError
            If a channel's dataset iterator reports zero examples.
        TypeError
            If a channel's shared variable and the expression driving it
            have different dtypes.
        """
        self._dirty = False

        # Snapshot attribute names so everything created below can be
        # registered for deletion before pickling.
        init_names = dir(self)
        mode = self.theano_function_mode
        has_record = mode is not None and hasattr(mode, 'record')

        # Collect, per dataset, the de-duplicated list of prereqs.
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        # Function that resets every channel accumulator to zero.
        reset_updates = OrderedDict()
        for channel in self.channels.values():
            reset_updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=reset_updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')

        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        X = self.model.get_input_space().make_theano_batch(
            name="monitoring_X")
        if config.compute_test_value != 'off':
            m = self.model.get_test_batch_size()
            test_value = self.model.get_input_space().get_origin_batch(m)
            X.tag.test_value = np.cast[X.type.dtype](test_value)
        if self.require_label:
            Y = self.model.get_output_space().make_theano_batch(
                name="monitoring_Y")

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            if has_record:
                mode.record.handle_line(
                    'compiling monitor including channel ' + key + '\n')
            log.info('\t%s' % key)

        iterators = [
            ds.iterator(mode=it_mode, num_batches=nb, batch_size=bs,
                        topo=self.topo)
            for ds, it_mode, nb, bs in safe_izip(self._datasets,
                                                 self._iteration_mode,
                                                 self._num_batches,
                                                 self._batch_size)
        ]
        self.num_examples = [
            np.cast[config.floatX](float(iterator.num_examples))
            for iterator in iterators
        ]

        # One givens/updates dictionary per dataset.
        givens = [OrderedDict() for _ in self._datasets]
        updates = [OrderedDict() for _ in self._datasets]
        for channel in self.channels.values():
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]
            if isinstance(channel.graph_input, (list, tuple)):
                channel_X, channel_Y = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                assert channel_Y not in g or g[channel_Y] is Y
                g[channel_X] = X
                g[channel_Y] = Y
            else:
                channel_X = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                g[channel_X] = X
            # The accumulated value is divided by the example count of
            # this channel's dataset, so that is the quantity to guard.
            # (The previous check read `n`, a loop variable leaked from a
            # list comprehension, i.e. the last dataset's num_batches.)
            if cur_num_examples == 0:
                raise ValueError(
                    "Iterating over 0 examples results in divide by 0")
            if self.topo:
                batch_index = d.get_topo_batch_axis()
            else:
                batch_index = 0
            # Weight each batch's value by its share of the examples.
            val = (channel.val *
                   T.cast(X.shape[batch_index], config.floatX) /
                   cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError(
                            'Monitoring channel shared variable ' +
                            key.name + ' has dtype ' + key.dtype +
                            ' but is driven by an expression with type ' +
                            up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                if has_record:
                    for elem in g:
                        mode.record.handle_line(
                            'g key ' + var_descriptor(elem) + '\n')
                        mode.record.handle_line(
                            'g val ' + var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line(
                            'u key ' + var_descriptor(elem) + '\n')
                        mode.record.handle_line(
                            'u val ' + var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if self.require_label:
                    if has_record:
                        mode.record.handle_line(
                            'compiling supervised accum\n')
                    # Some channels may not depend on the data, ie, they
                    # might just monitor the model parameters, or some
                    # shared variable updated by the training algorithm, so
                    # we need to ignore the unused input error
                    self.accum.append(
                        function([X, Y],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
                else:
                    if has_record:
                        mode.record.handle_line(
                            'compiling unsupervised accum\n')
                    self.accum.append(
                        function([X],
                                 givens=g,
                                 updates=u,
                                 mode=self.theano_function_mode,
                                 name=function_name))
            for a in self.accum:
                if has_record:
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line(
                            'accum output ' + var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a
        monitor is saved, the __getstate__ method replaces the dataset
        field with the dataset's yaml source. This is not a perfect
        solution because it won't work with job resuming, which would
        require saving the state of the dataset's random number
        generator.

        Like in the Model class, we also need to avoid saving any Theano
        functions, so we delete everything that can be regenerated with
        `redo_theano` by deleting the fields in `self.names_to_del`
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn(
                            'Trained model saved without indicating yaml_src')
        d = copy.copy(self.__dict__)
        # Restore the live dataset objects on this instance; only the
        # returned copy carries the yaml sources.
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Restores the monitor from a pickled state dictionary, converting
        the old single `_dataset` field into the `_datasets` list.
        """
        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None):
        """
        Asks the monitor to start tracking a new value. Can be called
        even after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a (features,targets) list/tuple containing two symbolic
            tensors)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on. May be omitted
            when the monitor has exactly one dataset.
        prereqs : list of callables that take two numpy tensors
            (X and y, where y will be None if no labels are used)
            each prereq must be called exactly once per each new batch
            of data drawn *from dataset* before the channel value is
            computed if two channels provide a prereq with exactly the
            same id, that prereq will only be called once

        Raises
        ------
        ValueError
            If `val` depends on an input not listed in `ipt`, if the
            dataset is missing, ambiguous or unknown to the monitor, if
            the channel name is already taken, or if targets are
            requested from a dataset without targets.
        """
        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if not isinstance(ipt, (list, tuple)):
            tmp = [ipt]
        else:
            tmp = ipt
        # Every non-shared, non-constant input of `val` must be one of
        # the variables the monitor will clamp to data.
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and not isinstance(
                    elem, theano.gof.graph.Constant):
                if elem not in tmp:
                    raise ValueError("Unspecified input: " + str(elem))

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel ' + name + '\n')
            if isinstance(ipt, (list, tuple)):
                for elem in ipt:
                    mode.record.handle_line(
                        'Includes input var ' + var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(
                    name + ' input var is ' + var_descriptor(ipt) + '\n')
            mode.record.handle_line(
                'channel ' + name + ' is ' + var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            raise ValueError("The dataset specified is not " +
                             "one of the monitor's datasets")

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)
        if isinstance(ipt, (list, tuple)):
            if dataset is not None:
                if not dataset.has_targets():
                    raise ValueError("Tried to create a channel ("+name \
                        +") that uses targets, but monitoring dataset has no targets")
            self.require_label = True
            assert len(ipt) == 2
        self.channels[name] = MonitorChannel(ipt, val, name, dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a
        mangled state. I've added some calls to _sanity_check to try to
        catch when that happens. Not sure what to do for a long term
        fix. I think it requires making theano graphs serializable
        first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor
        yet, installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified in
            `pylearn2.models`.
        """

        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if monitor.foo below are used anywhere, remove if not.
    @property
    def batch_size(self):
        """The list of batch sizes, one entry per monitoring dataset."""
        return self._batch_size

    @property
    def num_batches(self):
        """The list of batch counts, one entry per monitoring dataset."""
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches=None,
              extra_costs=None, mode='sequential'):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called 'objective'
        defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : Dataset or dict
            A Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated with
            that dataset's name followed by an underscore as the prefix.
            For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset}
            you will get channels called 'train_misclass' and
            'valid_misclass'. If None, this method does nothing.
        cost : Cost
            The main cost; its value is tracked as 'objective'.
        batch_size : int
            Forwarded to `add_dataset`.
        num_batches : int, optional
            Forwarded to `add_dataset`.
        extra_costs : dict, optional
            Additional costs keyed by name; the key '' is reserved for
            `cost`. The dictionary is not modified.
        mode : str or object, optional
            Iteration mode, forwarded to `add_dataset`.
        """
        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            # Work on a shallow copy: inserting the '' entry below must
            # not leak into the caller's dictionary (a second call with
            # the same dict would otherwise fail the assertion).
            costs = copy.copy(extra_costs)
        assert '' not in costs
        costs[''] = cost

        supervised = any(cost.supervised for cost in costs.values())
        model = self.model

        X_space = model.get_input_space()
        X = X_space.make_theano_batch(name='monitor_X')

        if config.compute_test_value != 'off':
            X.tag.test_value = X_space.get_origin_batch(batch_size).astype(
                X.dtype)

        if supervised:
            Y_space = model.get_output_space()
            Y = Y_space.make_theano_batch(name='monitor_Y')

            if config.compute_test_value != 'off':
                Y.tag.test_value = Y_space.get_origin_batch(
                    batch_size).astype(Y.dtype)

            ipt = (X, Y)
        else:
            Y = None
            ipt = X
        custom_channels = {}
        for cost_name in costs:
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            raw_channels = cost.get_monitoring_channels(model, X, Y)
            channels = {}
            for name in raw_channels:
                channels[prefix + name] = raw_channels[name]
            custom_channels.update(channels)
        model_channels = model.get_monitoring_channels(X, Y)
        custom_channels.update(model_channels)
        if is_stochastic(mode):
            # Fixed seed; written without a leading zero so the literal
            # is valid on both Python 2 and Python 3.
            seed = [[2013, 2, 22]]
        else:
            seed = None
        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for cost_name in costs:
                cost = costs[cost_name]
                cost_value = cost(model, X, Y)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                    else:
                        name = dprefix + cost_name
                    self.add_channel(name=name,
                                     ipt=ipt,
                                     val=cost_value,
                                     dataset=cur_dataset)
            for key in custom_channels:
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=custom_channels[key],
                                 dataset=cur_dataset)
class Optimizer(object):
    """
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process. The base framework for performing
    stochastic gradient descent.
    """
    def __init__(self, dataset, loss=None, model=None,
                 epochs=1000, batch_size=100, min_batch_size=1,
                 save_freq=10, stop_threshold=None, stop_patience=50,
                 learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
                 grad_clip=None, hard_clip=False,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` to use when training the Model.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
        model : Model
            The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
            Model's .train() method.
        epochs : int
            How many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int, optional
            How many epochs to train between each new save of the Model's parameters.
        stop_threshold : float, optional
            The factor by how much the best validation training score needs to improve to determine early stopping.
        stop_patience : int, optional
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for classes of decay and documentation.
        lr_decay_factor : float
            The amount of decay to use for the ``lr_decay`` type of decay.
        grad_clip : float, optional
            Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff or rescaling.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        """
        log.info("Initializing optimizer %s", str(self.__class__.__name__))

        # Deal with early stopping None initializations (no early stopping).
        if not stop_threshold:
            stop_threshold = numpy.inf
        if not save_freq:
            save_freq = 1000000
        if not stop_patience:
            stop_patience = 1

        # Put all init parameters in self.args so we can log the initial configuration.
        # NOTE: locals() is snapshotted here on purpose - do not introduce new local
        # variables above this line or they will end up in the logged configuration.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)

        # log the arguments
        log.info("Optimizer config args: %s", str(self.args))

        # if the optimizer wasn't initialized with a Model (train() being called from the model class itself),
        # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there
        # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
        # the best option because other methods besides stochastic ones can exist for optimizers in the future.
        # TODO: fix this up - feels like a hack just to make model.train() work...
        if not model:
            return
        # Otherwise, things are proceeding as normal. Carry on...

        assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                         "Found %s" % str(model.__class__.__name__)
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                             "Found %s" % str(dataset.__class__.__name__)

        # deal with loss expression/targets
        if loss is not None:
            assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                           "Found %s" % str(loss.__class__.__name__)
        if isinstance(loss, Loss):
            self.loss_targets = loss.get_targets()
            self.loss_expression = loss.get_loss()
        else:
            assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented."
            if isinstance(model.get_loss(), tuple):
                self.loss_targets = raise_to_list(model.get_loss()[0])
                self.loss_expression = model.get_loss()[1]
            else:
                self.loss_targets = None
                self.loss_expression = model.get_loss()

        model_inputs = raise_to_list(model.get_inputs())
        n_model_inputs = len(model_inputs)

        # NOTE(review): when self.loss_targets is a non-empty list, `model_targets` is that same
        # list object, so the removals below also strip the model inputs out of self.loss_targets.
        # train() later appends self.loss_targets to the compiled function's inputs, so this
        # aliasing looks load-bearing (it avoids listing an input twice) - keep it, and confirm
        # before ever replacing this with a copy.
        model_targets = self.loss_targets or []
        for input in model_inputs:
            if input in model_targets:
                model_targets.remove(input)
        n_model_targets = len(model_targets)
        # value comparison - `is 0` only worked through an interpreter implementation detail
        self.unsupervised = (n_model_targets == 0)

        # make sure the number of inputs/targets matches up with the dataset properties
        # train
        assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \
            "Dataset has %d train inputs, while model expects %d" % \
            (len(raise_to_list(dataset.train_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \
                "Dataset has %d train targets, while model expects %d" % \
                (len(raise_to_list(dataset.train_targets) or []), n_model_targets)
        # valid
        if dataset.valid_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \
                "Dataset has %d valid inputs, while model expects %d" % \
                (len(raise_to_list(dataset.valid_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \
                    "Dataset has %d valid targets, while model expects %d" % \
                    (len(raise_to_list(dataset.valid_targets) or []), n_model_targets)
        # test
        if dataset.test_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \
                "Dataset has %d test inputs, while model expects %d" % \
                (len(raise_to_list(dataset.test_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \
                    "Dataset has %d test targets, while model expects %d" % \
                    (len(raise_to_list(dataset.test_targets) or []), n_model_targets)

        # now we are happy, we can add them to `self`
        self.model = model
        self.dataset = dataset
        self.loss = loss

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        # whether to scale individual model parameters' learning rates.
        self.lr_scalers = self.model.get_lr_scalers()
        # whether to decay
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          learning_rate,
                                                          lr_decay_factor)
        else:
            self.learning_rate_decay = False

        # rest of initial parameters needed for training.
        self.batch_size = batch_size
        self.min_batch_size = min_batch_size
        self.n_epoch = epochs
        self.save_frequency = save_freq
        self.early_stop_threshold = stop_threshold
        self.early_stop_length = stop_patience
        self.grad_clip = grad_clip
        self.hard_clip = hard_clip

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed) learning rate.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their gradients.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in six.iteritems(gradients):
            # per-parameter scaling factor; parameters without one use 1.
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    def train(self, monitor_channels=None, train_outservice=None, plot=None):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).

        Raises
        ------
        AssertionError
            If the Optimizer has no model to train.
        """
        # __init__ returns before setting self.model when it is constructed without a model, so
        # look the attribute up defensively: the intended error below is raised instead of an
        # AttributeError.
        if not getattr(self, 'model', None):
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model param "
                                 "and calling optimizer.train().")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()

        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        gradients = grad(cost=self.loss_expression, wrt=list(self.params.values()))
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(list(self.params.values()), gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels

        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}

        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}

        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])

        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        # Copy before extending: raise_to_list is not visible here and may hand back the very
        # list object owned by the model, which an in-place `+=` would silently grow.
        function_input = list(raise_to_list(self.model.get_inputs()))
        if self.loss_targets is not None:
            function_input.extend(self.loss_targets)
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()

        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')

        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------",
                 self.model._classname, self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            for best_param, param_value in self.best_params.items():
                self.params[best_param].set_value(param_value, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function!
(this is where things like dropout happen) if not self.model.switches_on: self.model.turn_on_switches() ######### # train # ######### train_costs = [] train_monitors = {key: [] for key in self.train_monitors_dict.keys()} train_data = [ minibatch(input_data, self.batch_size, self.min_batch_size) for input_data in raise_to_list(self.dataset.train_inputs) ] if self.dataset.train_targets is not None and not self.unsupervised: train_data += [ minibatch(target, self.batch_size, self.min_batch_size) for target in raise_to_list(self.dataset.train_targets) ] for batch in min_normalized_izip(*train_data): _outs = raise_to_list(f_learn(*batch)) train_costs.append(_outs[0]) # handle any user defined monitors if len(train_monitors) > 0: current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:]) for name, val in current_monitors: val = numpy.asarray(val) train_monitors[name].append(val) # get the mean values for the batches mean_train = numpy.mean(train_costs, 0) current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()} # log the mean values! log.info('Train cost: %s', trunc(mean_train)) if len(current_mean_monitors) > 0: log.info('Train monitors: %s', str(current_mean_monitors)) # send the values to their outservices if self.train_outservice: self.train_outservice.write(mean_train, "train") for name, service in self.train_monitors_outservice_dict.items(): if name in current_mean_monitors and service: service.write(current_mean_monitors[name], "train") # if there is a plot, also send them over! if plot: current_mean_monitors.update({TRAIN_COST_KEY: mean_train}) plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors) # set the noise switches off for valid and test sets! 
we assume unseen data is noisy anyway :) if self.model.switches_on: self.model.turn_off_switches() ######### # valid # ######### self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets, self.valid_monitors_dict, self.valid_monitor_function, self.valid_monitors_outservice_dict, plot) ######## # test # ######## self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets, self.test_monitors_dict, self.test_monitor_function, self.test_monitors_outservice_dict, plot) ########### # cleanup # ########### # check for early stopping on train costs cost = numpy.sum(train_costs) # if the cost improved, reset the patience and record the best cost. if cost < self.best_cost * self.early_stop_threshold: self.patience = 0 self.best_cost = cost # save the parameters that made it the best self.best_params = {key: param.get_value(borrow=False) for key, param in self.params.items()} elif not numpy.isnan(cost): self.patience += 1 # check for stopping either from n_epochs or from threshold/patience stop = False if self.epoch_counter >= self.n_epoch: log.info("Stopping (reached max number of epochs)...") stop = True if self.patience >= self.early_stop_length: log.info("Stopping early (reached stop threshold)...") stop = True timing = time.time() - t self.times.append(timing) log.info('time: ' + make_time_units_string(timing)) log.debug('remaining time: ' + make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times))) if (self.epoch_counter % self.save_frequency) == 0: #save params self.model.save_params('trained_epoch_' + str(self.epoch_counter)) # ANNEAL! 
if not stop: # perform the appropriate decay on the decay functions/parameters for this optimizer and model for decay_param in self.get_decay_params(): decay_param.decay() # return whether or not to stop this epoch return stop def _compute_over_subset(self, subset, inputs, targets, monitors_dict, monitor_function, monitors_outservice_dict, plot): inputs = raise_to_list(inputs) targets = raise_to_list(targets) if inputs is not None and len(monitors_dict) > 0: monitors = {key: [] for key in monitors_dict.keys()} data = [minibatch(input, self.batch_size, self.min_batch_size) for input in inputs] if targets is not None and not self.unsupervised: data += [minibatch(target, self.batch_size, self.min_batch_size) for target in targets] for batch in min_normalized_izip(*data): _outs = raise_to_list(monitor_function(*batch)) current_monitors = zip(monitors_dict.keys(), _outs) for name, val in current_monitors: val = numpy.asarray(val) monitors[name].append(val) # get the mean values for the batches current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in monitors.items()} # log the mean values! log.info('%s monitors: %s', subset, str(current_mean_monitors)) # send the values to their outservices for name, service in monitors_outservice_dict.items(): if name in current_mean_monitors and service: service.write(current_mean_monitors[name], subset) # if there is a plot, also send them over! if plot: plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors) def get_decay_params(self): """ Returns a list of all the Decay objects to decay during training. Returns ------- list List of Decay objects to use after each training epoch - in this case the learning rate decay. """ decay_params = self.model.get_decay_params() if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay: decay_params.append(self.learning_rate_decay) return decay_params
def main():
    """
    Smoke-test the Plot service with two monitor channels.

    A zero-initialised 88x100 shared matrix receives uniform noise every time
    one of the compiled functions runs. Its mean (flagged for train and valid)
    and its variance (default Monitor flags) are wrapped in monitor channels,
    collapsed into (name, expression) pairs, compiled into a train function and
    a valid function, and each function is called 100 times with the resulting
    values pushed to the plot.
    """
    n_epochs = 100

    # the shared variable being perturbed; kept under its own name so that it
    # is not rebound to the variance expression below
    weights = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(weights, add_uniform(input=weights, noise_level=.02))]

    # only the mean and the variance are monitored; the remaining statistics
    # returned by get_stats are not needed here
    stats = get_stats(weights)
    mean = stats['mean']
    variance = stats['var']

    mean_monitor = Monitor('mean', mean, train=True, valid=True)
    var_monitor = Monitor('var', variance)

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])
    monitors = [w_channel, stat_channel]

    # collapse the channels into name -> expression mappings per subset
    train_exprs = OrderedDict(
        [(name, expression) for name, expression, _ in collapse_channels(monitors, train=True)]
    )
    valid_exprs = OrderedDict(
        [(name, expression) for name, expression, _ in collapse_channels(monitors, valid=True)]
    )

    plot = Plot(bokeh_doc_name='test_plots', monitor_channels=monitors, open_browser=True)

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_exprs.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_exprs.values()), updates=updates)
    log.debug('done')

    def _run_epochs(step_fn, names):
        # call the compiled function once per epoch and forward the named values to the plot
        for epoch in range(n_epochs):
            epoch_start = time.time()
            log.debug(epoch)
            values = step_fn()
            plot.update_plots(epoch, OrderedDict(zip(names, values)))
            log.debug('----- ' + make_time_units_string(time.time() - epoch_start))

    total_start = time.time()
    _run_epochs(f, list(train_exprs.keys()))
    _run_epochs(f2, list(valid_exprs.keys()))
    log.debug("TOTAL TIME " + make_time_units_string(time.time() - total_start))
class AlexNet(Model):
    """
    This is the base model for AlexNet, Alex Krizhevsky's efficient deep convolutional net described in:
    'ImageNet Classification with Deep Convolutional Neural Networks'
    Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton
    http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf

    Most of the code here is adapted from the authors listed in the license above, from the paper:
    'Theano-based large-scale visual recognition with multiple GPUs'
    Weiguang Ding & Ruoyan Wang, Fei Mao, Graham Taylor
    http://arxiv.org/pdf/1412.2302.pdf

    Copyright (c) 2014, Weiguang Ding, Ruoyan Wang, Fei Mao and Graham Taylor
    All rights reserved.

    Redistribution and use in source and binary forms, with or without modification, are permitted provided that
    the following conditions are met:

    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
       following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
       following disclaimer in the documentation and/or other materials provided with the distribution.

    3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or
       promote products derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
    WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
    DAMAGE.
    """
    defaults = {
        # data stuff
        "use_data_layer": False,
        "rand_crop": True,
        "batch_size": 256,  # convolutional nets are particular about the batch size
        "output_path": '/outputs/alexnet/'
    }

    def __init__(self, config=None, defaults=defaults,
                 inputs_hook=None, hiddens_hook=None, params_hook=None,
                 use_data_layer=None, rand_crop=None, batch_size=None):
        """
        Combine the configuration sources and build the computation graph.

        ------------------

        :param config: configuration passed through to the Model initializer.
        :param defaults: default configuration passed through to the Model initializer.
        :param inputs_hook: not supported; a truthy value raises NotImplementedError.
        :param hiddens_hook: not supported; a truthy value raises NotImplementedError.
        :param params_hook: not supported; a truthy value raises NotImplementedError.
        :param use_data_layer: when not None, overrides the 'use_data_layer' entry of self.args.
        :param rand_crop: when not None, overrides the 'rand_crop' entry of self.args.
        :param batch_size: when truthy, overrides the 'batch_size' entry of self.args.
        """
        # init Model to combine the defaults and config dictionaries.
        super(AlexNet, self).__init__(config, defaults)
        # all configuration parameters are now in self.args

        if inputs_hook or hiddens_hook or params_hook:
            log.critical(
                "Inputs_hook, hiddens_hook, and params_hook not implemented yet for AlexNet!"
            )
            raise NotImplementedError()

        # The boolean flags are compared against None so that an explicit False is honoured.
        # (With `flag or default`, rand_crop=False silently fell back to the default of True.)
        if use_data_layer is not None:
            self.flag_datalayer = use_data_layer
        else:
            self.flag_datalayer = self.args.get('use_data_layer')

        if rand_crop is not None:
            self.rand_crop = rand_crop
        else:
            self.rand_crop = self.args.get('rand_crop')

        # a batch size of 0/None still falls back to the configured value, as before
        self.batch_size = batch_size or self.args.get('batch_size')

        ####################
        # Theano variables #
        ####################
        # allocate symbolic variables for the data
        # 'rand' is a random array used for random cropping/mirroring of data
        self.x = T.ftensor4('x')
        self.y = T.lvector('y')
        self.rand = T.fvector('rand')

        ##########
        # params #
        ##########
        self.params = []

        # make the network!
        self.build_computation_graph()

    def build_computation_graph(self):
        """
        Build the symbolic network, the cost/monitor expressions, and compile f_predict and f_monitors.

        Layers that follow a dropout step are created twice: once on the clean activations (used for
        self.output, f_predict and the 'cost'/'errors' monitors) and once on the dropped-out activations
        with the same parameters via params_hook (used for self.train_cost and 'dropout_errors').
        """
        ###################### BUILD NETWORK ##########################
        # whether or not to mirror the input images before feeding them into the network
        if self.flag_datalayer:
            layer_1_input = mirror_images(
                input=self.x,
                image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                cropsize=227,
                rand=self.rand,
                flag_rand=self.rand_crop
            )
        else:
            layer_1_input = self.x  # 4D tensor (going to be in c01b format)

        # Start with 5 convolutional pooling layers
        log.debug("convpool layer 1...")
        convpool_layer1 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 3, 227, 227), layer_1_input),
            filter_shape=(96, 3, 11, 11),
            convstride=4,
            padsize=0,
            group=1,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            local_response_normalization=True
        )
        # Add this layer's parameters!
        self.params += convpool_layer1.get_params()

        log.debug("convpool layer 2...")
        convpool_layer2 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 96, 27, 27), convpool_layer1.get_outputs()),
            filter_shape=(256, 96, 5, 5),
            convstride=1,
            padsize=2,
            group=2,
            poolsize=3,
            poolstride=2,
            bias_init=0.1,
            local_response_normalization=True
        )
        # Add this layer's parameters!
        self.params += convpool_layer2.get_params()

        log.debug("convpool layer 3...")
        convpool_layer3 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 256, 13, 13), convpool_layer2.get_outputs()),
            filter_shape=(384, 256, 3, 3),
            convstride=1,
            padsize=1,
            group=1,
            poolsize=1,
            poolstride=0,
            bias_init=0.0,
            local_response_normalization=False
        )
        # Add this layer's parameters!
        self.params += convpool_layer3.get_params()

        log.debug("convpool layer 4...")
        convpool_layer4 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer3.get_outputs()),
            filter_shape=(384, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=1,
            poolstride=0,
            bias_init=0.1,
            local_response_normalization=False
        )
        # Add this layer's parameters!
        self.params += convpool_layer4.get_params()

        log.debug("convpool layer 5...")
        convpool_layer5 = ConvPoolLayer(
            inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer4.get_outputs()),
            filter_shape=(256, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            local_response_normalization=False
        )
        # Add this layer's parameters!
        self.params += convpool_layer5.get_params()

        # Now onto the fully-connected layers!
        fc_config = {
            'activation': 'rectifier',  # type of activation function to use for output
            'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
            'weights_mean': 0.0,  # mean for gaussian weights init
            'weights_std': 0.005,  # standard deviation for gaussian weights init
            'bias_init': 0.0  # how to initialize the bias parameter
        }

        log.debug("fully connected layer 1 (model layer 6)...")
        # we want to have dropout applied to the training version, but not the test version.
        fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
        fc_layer6 = BasicLayer(inputs_hook=(9216, fc_layer6_input), output_size=4096, config=fc_config)
        # Add this layer's parameters!
        self.params += fc_layer6.get_params()

        # now apply dropout to the output for training
        dropout_layer6 = dropout(fc_layer6.get_outputs(), corruption_level=0.5)

        log.debug("fully connected layer 2 (model layer 7)...")
        fc_layer7 = BasicLayer(inputs_hook=(4096, fc_layer6.get_outputs()), output_size=4096, config=fc_config)
        # the training twin reuses fc_layer7's parameters through params_hook
        fc_layer7_train = BasicLayer(inputs_hook=(4096, dropout_layer6),
                                     output_size=4096,
                                     params_hook=fc_layer7.get_params(),
                                     config=fc_config)
        # Add this layer's parameters!
        self.params += fc_layer7_train.get_params()

        # apply dropout again for training
        dropout_layer7 = dropout(fc_layer7_train.get_outputs(), corruption_level=0.5)

        # last layer is a softmax prediction output layer
        softmax_config = {
            'weights_init': 'gaussian',
            'weights_mean': 0.0,
            'weights_std': 0.005,
            'bias_init': 0.0
        }
        log.debug("softmax classification layer (model layer 8)...")
        softmax_layer8 = SoftmaxLayer(inputs_hook=(4096, fc_layer7.get_outputs()),
                                      output_size=1000,
                                      config=softmax_config)
        # the training twin reuses softmax_layer8's parameters through params_hook
        softmax_layer8_train = SoftmaxLayer(inputs_hook=(4096, dropout_layer7),
                                            output_size=1000,
                                            params_hook=softmax_layer8.get_params(),
                                            config=softmax_config)
        # Add this layer's parameters!
        self.params += softmax_layer8.get_params()

        # finally the softmax output from the whole thing!
        self.output = softmax_layer8.get_outputs()

        #####################
        # Cost and monitors #
        #####################
        self.train_cost = softmax_layer8_train.negative_log_likelihood(self.y)
        cost = softmax_layer8.negative_log_likelihood(self.y)
        errors = softmax_layer8.errors(self.y)
        train_errors = softmax_layer8_train.errors(self.y)

        self.monitors = OrderedDict([('cost', cost), ('errors', errors), ('dropout_errors', train_errors)])

        #########################
        # Compile the functions #
        #########################
        log.debug("Compiling functions!")
        t = time.time()
        log.debug("f_predict...")
        # use the actual argmax from the classification
        self.f_predict = function(inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
        log.debug("f_monitors")
        # materialize the dict view into a list (on Python 3 .values() is a view, not a list)
        self.f_monitors = function(inputs=[self.x, self.y], outputs=list(self.monitors.values()))
        log.debug("compilation took %s", make_time_units_string(time.time() - t))

    def get_inputs(self):
        """
        This should return the input(s) to the model's computation graph. This is called by the Optimizer when
        creating the theano train function on the cost expression returned by get_train_cost().

        This should normally return the same theano variable list that is used in the inputs= argument to the
        f_predict function.
        ------------------

        :return: Theano variables representing the input(s) to the training function.
        :rtype: List(theano variable)
        """
        return [self.x]

    def get_outputs(self):
        """
        This method will return the model's output variable expression from the computational graph
        (the output of the non-dropout softmax layer).

        This will be used for creating hooks to link models together, where these outputs can be strung as the
        inputs or hiddens to another model :)
        ------------------

        :return: theano expression of the outputs from this model's computation
        :rtype: theano tensor (expression)
        """
        return self.output

    def predict(self, input):
        """
        This method will return the model's output (run through the compiled f_predict function), given an input.
        ------------------

        :param input: the argument(s) for f_predict. It is unpacked positionally (``*input``) and f_predict is
            compiled with a single input (self.x), so this should be a one-element sequence holding the tensor.
            NOTE(review): passing a bare tensor would be unpacked along its first axis - confirm callers wrap it.
        :type input: sequence of tensor

        :return: the argmax class prediction computed by f_predict.
        :rtype: tensor

        :raises NotImplementedError: if f_predict has not been compiled.
        """
        if not hasattr(self, 'f_predict'):
            log.error(
                "Missing self.f_predict - make sure you ran self.build_computation_graph()! "
                "This should have run during initialization....")
            raise NotImplementedError()
        return self.f_predict(*input)

    def get_train_cost(self):
        """
        This returns the expression that represents the cost given an input, which is used for the Optimizer
        during training. Here it is the negative log likelihood of the dropout (training) softmax layer.

        The reason we can't just compile a f_train theano function is because updates need to be calculated
        for the parameters during gradient descent - and these updates are created in the Optimizer object.
        ------------------

        :return: theano expression of the model's training cost, from which parameter gradients will be computed.
        :rtype: theano tensor
        """
        return self.train_cost

    def get_monitors(self):
        """
        This returns a single-entry dictionary whose key is the comma-separated names of all monitors
        (in self.monitors order) and whose value is the one compiled function (f_monitors) that computes
        all of them from (x, y).
        ------------------

        :return: Dictionary of {"name1, name2, ...": theano_function}.
        :rtype: Dictionary

        :raises NotImplementedError: if f_monitors has not been compiled.
        """
        if not hasattr(self, 'f_monitors'):
            log.error(
                "Missing self.f_monitors - make sure you ran self.build_computation_graph()! "
                "This should have run during initialization....")
            raise NotImplementedError()
        names = ', '.join(self.monitors.keys())
        return {names: self.f_monitors}

    def get_params(self):
        """
        This returns the list of theano shared variables that will be trained by the Optimizer.
        These parameters are used in the gradient.
        ------------------

        :return: flattened list of theano shared variables to be trained
        :rtype: List(shared_variables)
        """
        return self.params
class RMSProp(LearningRule):
    """
    Implements the RMSProp learning rule as described by Hinton in `lecture 6
    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`
    of the Coursera Neural Networks for Machine Learning course.

    In short, Hinton suggests "[the] magnitude of the gradient can be very
    different for different weights and can change during learning. This
    makes it hard to choose a global learning rate." RMSProp solves this
    problem by "[dividing] the learning rate for a weight by a running
    average of the magnitudes of recent gradients for that weight."

    Parameters
    ----------
    decay : float, optional
        Decay constant similar to that used in AdaDelta and Momentum methods.
        Must satisfy 0 <= decay < 1.
    max_scaling: float, optional
        Restrict the RMSProp gradient scaling coefficient to values
        below `max_scaling`. Must be positive.

    Notes
    -----
    An instance of this LearningRule should only be used with one
    TrainingAlgorithm, and its get_updates method should be called
    only once. This is required in order to make the monitoring
    channels correctly report the moving averages.
    """

    def __init__(self, decay=0.9, max_scaling=1e5):
        assert 0. <= decay < 1.
        assert max_scaling > 0
        self.decay = sharedX(decay, 'decay')
        # lower bound applied to the RMS of the gradient, so that the
        # effective scaling 1 / rms never exceeds max_scaling
        self.epsilon = 1. / max_scaling
        # parameter name -> shared moving average of the squared gradient
        self.mean_square_grads = OrderedDict()

    @wraps(LearningRule.add_channels_to_monitor)
    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        The channels added are the min, mean, and max of the
        mean_square_grad of each parameter.
        """
        channel_mapping = {
            '_min': T.min,
            '_max': T.max,
            '_mean': T.mean
        }

        for mean_square_grad in self.mean_square_grads.values():
            for suffix, op in channel_mapping.items():
                monitor.add_channel(
                    name=(mean_square_grad.name + suffix),
                    ipt=None,
                    val=op(mean_square_grad),
                    data_specs=(NullSpace(), ''),
                    dataset=monitoring_dataset)
        return

    @wraps(LearningRule.get_updates)
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Build the RMSProp updates for every parameter in `grads`.

        Parameters
        ----------
        learning_rate : float
            Global learning rate.
        grads : dict
            Maps each (named) parameter to its gradient expression.
        lr_scalers : dict, optional
            Maps parameters to a learning rate multiplier; parameters
            without an entry (or all of them when this is None) use 1.

        Returns
        -------
        updates : OrderedDict
            Updates for each moving average and each parameter.

        Raises
        ------
        ValueError
            If a parameter has no name.

        Notes
        -----
        This method has the side effect of storing the moving average
        of the square gradient in `self.mean_square_grads`. This is
        necessary in order for the monitoring channels to be able
        to track the value of these moving averages.
        Therefore, this method should only get called once for each
        instance of RMSProp.
        """
        # The signature advertises lr_scalers=None; treat that as "no
        # per-parameter scaling" instead of failing on None.get(...).
        if lr_scalers is None:
            lr_scalers = {}

        updates = OrderedDict()
        for param in grads:

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)

            if param.name is None:
                raise ValueError("Model parameters must be named.")
            mean_square_grad.name = 'mean_square_grad_' + param.name

            if param.name in self.mean_square_grads:
                warnings.warn("Calling get_updates more than once on the "
                              "gradients of `%s` may make monitored values "
                              "incorrect." % param.name)
            # Store variable in self.mean_square_grads for monitoring.
            self.mean_square_grads[param.name] = mean_square_grad

            # Accumulate gradient
            new_mean_squared_grad = (self.decay * mean_square_grad +
                                     (1 - self.decay) * T.sqr(grads[param]))

            # Compute update
            scaled_lr = lr_scalers.get(param, 1.) * learning_rate
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
            delta_x_t = - scaled_lr * grads[param] / rms_grad_t

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[param] = param + delta_x_t

        return updates
def get_updates(self, learning_rate, grads, lr_scalers=None):
    """
    Build the per-parameter updates of this adaptive learning rule.

    For every parameter a set of shared accumulators is allocated (running
    means of the gradient, squared gradient, curvature, squared curvature,
    step, squared step, a per-element memory size tau, ...). The step is
    computed from those accumulators; the learning rate is fixed to 1.

    Parameters
    ----------
    learning_rate : float
        Learning rate coefficient. Learning rate is not being used but,
        pylearn2 requires a learning rate to be defined.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients. When `self.grad_clip` is set and `self.skip_nan_inf`
        is not, the entries of this dictionary are replaced in place; the
        gradient variables are also renamed to "grad_<param name>".
    lr_scalers : dict
        A dictionary mapping from the model's parameters to a learning
        rate multiplier. Not used by this rule.

    Returns
    -------
    updates : OrderedDict
        Updates for all accumulators, the shared step counter and (only if
        `self.perform_update` is truthy) the parameters themselves.
    tot_norm_up : theano expression
        Sum over parameters of the L2 norm of the update step.
    tot_param_norm : theano expression
        Sum over parameters of the L2 norm of the parameter.
    """
    updates = OrderedDict({})
    eps = self.damping
    step = sharedX(0., name="step")

    # loop-invariant constants
    slow_constant = 2.1  # initial scale of the per-element memory size tau
    epsilon = 1e-7  # stabilizer inside the square roots of the step-size terms

    if self.skip_nan_inf:
        #If norm of the gradients of a parameter is inf or nan don't update that parameter
        #That might be useful for RNNs.
        # built from an ordered generator so the iteration order of `grads` is kept
        grads = OrderedDict(
            (p, T.switch(T.or_(T.isinf(g), T.isnan(g)), 0, g))
            for p, g in grads.items()
        )

    #Block-normalize gradients:
    nparams = len(grads.keys())

    #Apply the gradient clipping, this is only sometimes
    #necessary for RNNs and sometimes for very deep networks
    if self.grad_clip:
        assert self.grad_clip > 0.
        assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."

        gnorm = sum([g.norm(2) for g in grads.values()])
        notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

        # .items() works on Python 2 and 3 (.iteritems() is Python 2 only); the
        # snapshot list keeps the iteration independent of the assignments below.
        for p, g in list(grads.items()):
            tmpg = T.switch(gnorm / nparams > self.grad_clip,
                            g * self.grad_clip * nparams / gnorm, g)
            grads[p] = T.switch(notfinite, as_floatX(0.1) * p, tmpg)

    tot_norm_up = 0
    tot_param_norm = 0

    fix_decay = self.slow_decay**(step + 1)

    for param in grads.keys():
        grads[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps,
                            name="mean_grad_%s" % param.name)
        gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)

        prod_taus = sharedX((np.ones_like(param.get_value()) - 2 * eps),
                            name="prod_taus_x_t_" + param.name)

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)

        # Initialization of accumulators
        taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                           name="taus_x_t_" + param.name)
        # overwritten on every iteration, so this ends up holding the
        # accumulator of the last parameter only
        self.taus_x_t = taus_x_t

        #Variance reduction parameters
        #Numerator of the gamma:
        gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)

        #Denominator of the gamma:
        gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)

        #For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)

        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)

        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)

        # NOTE(review): old_grad is only allocated when use_corrected_grad is
        # truthy, but it is read unconditionally in the gamma accumulators
        # below, so a falsy use_corrected_grad raises NameError there - confirm
        # the intended behaviour before relying on that setting.
        if self.use_corrected_grad:
            old_grad = sharedX(param.get_value() * 0. + eps)

        #The uncorrected gradient of previous of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # Block-wise normalize the gradient:
        norm_grad = grads[param]

        #For the first time-step, assume that delta_x_t := norm_grad
        gnorm = T.sqr(norm_grad).sum()

        cond = T.eq(step, 0)
        gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
        gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

        norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx

        new_prod_taus = (prod_taus * (1 - 1 / taus_x_t))

        # Compute the new updated values.
        # E[g_i^2]_t
        new_mean_squared_grad = (mean_square_grad * (1 - 1 / taus_x_t) +
                                 T.sqr(norm_grad) / (taus_x_t))
        new_mean_squared_grad.name = "msg_" + param.name

        # E[g_i]_t
        new_mean_grad = (mean_grad * (1 - 1 / taus_x_t) +
                         norm_grad / taus_x_t)
        new_mean_grad.name = "nmg_" + param.name

        mg = new_mean_grad / (1 - new_prod_taus)
        mgsq = new_mean_squared_grad / (1 - new_prod_taus)

        new_gnorm_sqr = (gnorm_sqr_o * self.slow_decay +
                         T.sqr(norm_grad).sum() * (1 - self.slow_decay))

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (gamma_nume_sqr * (1 - 1 / taus_x_t) +
                              T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

        new_gamma_deno_sqr = (gamma_deno_sqr * (1 - 1 / taus_x_t) +
                              T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t)
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

        # gamma is computed from the accumulators of the previous step
        gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) +
                                          self.gamma_reg)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip and self.gamma_clip > -1:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        #For starting the variance reduction.
        if self.start_var_reduction > -1:
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (sum_square_grad + T.sqr(g))
            rms_g_t = T.sqrt(new_sum_squared_grad)
            rms_g_t = T.maximum(rms_g_t, 1.0)

        #Use the gradients from the previous update
        #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
        cur_curvature = norm_grad - old_plain_grad
        #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
        cur_curvature_sqr = T.sqr(cur_curvature)

        new_curvature_ave = (mean_curvature * (1 - 1 / taus_x_t) +
                             (cur_curvature / taus_x_t))
        new_curvature_ave.name = "ncurve_ave_" + param.name

        #Average average curvature
        nc_ave = new_curvature_ave / (1 - new_prod_taus)

        new_curvature_sqr_ave = (mean_curvature_sqr * (1 - 1 / taus_x_t) +
                                 (cur_curvature_sqr / taus_x_t))
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

        #Unbiased average squared curvature
        nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

        #lr_scalers.get(param, 1.) * learning_rate
        scaled_lr = sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)

        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        #This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t -
                                  cov_num_t / (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary for only RNNs
        # For feedforward networks this does not seem to be important.
        if self.delta_clip:
            logger.info("Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad is disabled.")
                delta_x_t = delta_x_t * corrected_grad
        else:
            logger.info("Clipping will not be applied on the adaptive step size.")
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad will not be used.")
                delta_x_t = delta_x_t * corrected_grad

        new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

        #To compute the E[\Delta^2]_t
        new_mean_square_dx = (msdx * (1 - 1 / taus_x_t) +
                              (T.sqr(delta_x_t) / taus_x_t))

        #To compute the E[\Delta]_t
        new_mean_dx = (mdx * (1 - 1 / taus_x_t) +
                       (delta_x_t / (taus_x_t)))

        #Perform the outlier detection:
        #This outlier detection is slightly different:
        # when the gradient or the curvature is more than two standard
        # deviations from its running mean, tau is reset/bumped
        new_taus_t = T.switch(
            T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                  abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
            T.switch(new_taus_t > 2.5, sharedX(2.5), new_taus_t + sharedX(1.0) + eps),
            new_taus_t)

        #Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

        new_cov_num_t = (cov_num_t * (1 - 1 / taus_x_t) +
                         (delta_x_t * cur_curvature) * (1 / taus_x_t))

        update_step = delta_x_t

        tot_norm_up += update_step.norm(2)
        tot_param_norm += param.norm(2)

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gnorm_sqr] = new_gnorm_sqr
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave

        if self.perform_update:
            updates[param] = param + update_step

        # the step counter is shared by all parameters; re-assigning the
        # same key on every iteration leaves a single `step + 1` update
        updates[step] = step + 1
        updates[prod_taus] = new_prod_taus

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    return updates, tot_norm_up, tot_param_norm
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of
    examples the model has trained, as well as any number of "channels"
    that track quantities of interest (examples: the objective
    function, measures of hidden unit activity, reconstruction error,
    sum of squared second derivatives, average norm of the weight
    vectors, etc.)

    Parameters
    ----------
    model : `pylearn2.models.model.Model`
        The model to monitor. It is queried through
        `get_monitoring_data_specs` to find out which data it needs.
    """

    def __init__(self, model):
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        # The following lists are parallel: entry k of each one describes
        # the k-th monitoring dataset registered through `add_dataset`.
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        # True whenever the compiled theano functions are out of date.
        self._dirty = True
        self._rng_seed = []
        # Attributes that are dropped by `__getstate__` before pickling.
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        self.theano_function_mode = None

        # Initialize self._nested_data_specs, self._data_specs_mapping,
        # and self._flat_data_specs
        self._build_data_specs()

    def _build_data_specs(self):
        """
        Computes a nested data_specs for input and all channels

        Also computes the mapping to flatten it. This function is
        called from redo_theano.
        """
        # Ask the model what it needs
        m_space, m_source = self.model.get_monitoring_data_specs()
        input_spaces = [m_space]
        input_sources = [m_source]
        for channel in self.channels.values():
            space = channel.data_specs[0]
            assert isinstance(space, Space)
            input_spaces.append(space)
            input_sources.append(channel.data_specs[1])

        # Entry 0 is the model's specs, entry k + 1 the k-th channel's.
        nested_space = CompositeSpace(input_spaces)
        nested_source = tuple(input_sources)

        self._nested_data_specs = (nested_space, nested_source)
        self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs)

        flat_space = self._data_specs_mapping.flatten(nested_space,
                                                      return_tuple=True)
        flat_source = self._data_specs_mapping.flatten(nested_source,
                                                       return_tuple=True)
        self._flat_data_specs = (CompositeSpace(flat_space), flat_source)

    def set_theano_function_mode(self, mode):
        """
        Sets the mode used to compile the monitoring functions.

        If the mode differs from the current one, the monitor is marked
        dirty so that its functions are recompiled on the next call.

        Parameters
        ----------
        mode : theano.compile.Mode
            Theano functions for the monitoring channels will be
            compiled and run using this mode.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self, dataset, mode='sequential', batch_size=None,
                    num_batches=None, seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Every argument may be given either as a single value (when only
        one dataset is added) or as a list with one entry per dataset.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object (or a list of them).
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        seed : int, optional
            Optional. The seed to be used for random iteration modes.

        Raises
        ------
        ValueError
            If the per-dataset argument lists do not all have the same
            length as `dataset`, or if the iterator rejects the
            iteration parameters.
        TypeError
            If a stochastic iteration mode is requested without a seed,
            or with something that is not a seed.
        """
        # The user can omit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        # num_batches is validated together with batch_size and seed so
        # that a mismatch is reported with the message below rather than
        # surfacing later from the joint iteration.
        if any(len(l) != len(dataset)
               for l in [batch_size, num_batches, seed]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            try:
                it = d.iterator(mode=m,
                                batch_size=b,
                                num_batches=n,
                                data_specs=self._flat_data_specs,
                                return_tuple=True,
                                rng=sd)
            except ValueError as exc:
                reraise_as(ValueError("invalid iteration parameters in " +
                                      "Monitor.add_dataset: " + str(exc)))
            if it.stochastic:
                # Must be a seed, not a random number generator. If it were a
                # random number generator, different iterators using it would
                # update its state, so we would not get the same iterator
                # each time. Also, must not be None, because this makes the
                # iterator pick a seed based on the clock
                if sd is None:
                    raise TypeError("Monitor requires a seed when using " +
                                    "stochastic iteration modes.")
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError("Monitor requires a seed (not a random " +
                                    "number generator) when using " +
                                    "stochastic iteration modes.")
            else:
                # The iterator should catch this, but let's double-check
                assert sd is None

            # A dataset that is already registered keeps its original
            # iteration settings.
            if d not in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.

        Raises
        ------
        RuntimeError
            If an iterator yields a different number of examples than
            it announced when the theano functions were compiled.
        """
        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        for d, i, b, n, a, sd, ne in safe_izip(datasets,
                                               self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches,
                                               self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()
                # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    data_specs=self._flat_data_specs,
                                    return_tuple=True,
                                    rng=sd)

            # If self._flat_data_specs is empty, no channel needs data,
            # so we do not need to call the iterator in order to average
            # the monitored values across different batches, we only
            # have to call them once.
            if len(self._flat_data_specs[1]) == 0:
                X = ()
                self.run_prereqs(X, d)
                a(*X)

            else:
                actual_ne = 0
                for X in myiterator:
                    # X is a flat (not nested) tuple
                    self.run_prereqs(X, d)
                    a(*X)
                    actual_ne += self._flat_data_specs[0].np_batch_size(X)
                # end for X
                if actual_ne != ne:
                    raise RuntimeError("At compile time, your iterator said "
                                       "it had %d examples total, but at "
                                       "runtime it gave us %d."
                                       % (ne, actual_ne))
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, data, dataset):
        """
        Runs all "prerequisite functions" on a batch of data. Always
        called right before computing the monitoring channels on that
        batch.

        Parameters
        ----------
        data : tuple or Variable
            a member of the Space used as input to the monitoring
            functions
        dataset : Dataset
            the Dataset the data was drawn from
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(*data)

    def get_batches_seen(self):
        """
        Returns the number of batches the model has learned on
        (assuming that the learning code has been calling
        Monitor.report_batch correctly).
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        Returns the number of epochs reported through `report_epoch`.

        Returns
        -------
        epochs_seen : int
            The number of epochs the model has been trained on.
            One "epoch" is one pass through Dataset.iterator.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        Returns the number of examples reported through `report_batch`.

        Returns
        -------
        examples_seen : int
            The number of examples the model has learned on (assuming
            that the learning code has been calling Monitor.report_batch
            correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Call this whenever the model has learned on another batch of
        examples. Report how many examples were learned on.

        Parameters
        ----------
        num_examples : int
            The number of examples learned on in this minibatch.
        """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Call this whenever the model has completed another "epoch" of
        learning. We regard one pass through Dataset.iterator as one
        epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and
        the channel definitions have changed since last we called it,
        or if the theano functions are unavailable for any other reason
        (first time they are needed after construction or
        deserialization, etc.)

        All channels are compiled as part of the same theano function
        so that the theano optimizations can eliminate subexpressions
        that are shared between multiple channels.

        Raises
        ------
        ValueError
            If a channel's dataset iterator reports zero examples, since
            the channel average would divide by zero.
        TypeError
            If a channel's shared variable and the expression driving it
            have different dtypes.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        # Every attribute created from here on is recorded at the end so
        # that it is not pickled.
        init_names = dir(self)
        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        # Function resetting every channel accumulator to zero.
        reset_updates = OrderedDict()
        for channel in self.channels.values():
            reset_updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=reset_updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry'
            )

        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
        theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args,)
        assert len(nested_theano_args) == (len(self.channels) + 1)

        # The mode does not change while this method runs, so it is read
        # once instead of inside every loop.
        mode = self.theano_function_mode

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)
        iterators = [ds.iterator(mode=it_mode,
                                 num_batches=nb,
                                 batch_size=bs,
                                 data_specs=self._flat_data_specs,
                                 return_tuple=True)
                     for ds, it_mode, nb, bs in safe_izip(
                         self._datasets,
                         self._iteration_mode,
                         self._num_batches,
                         self._batch_size)]
        self.num_examples = [float(it.num_examples) for it in iterators]

        # One givens / updates dictionary per dataset.
        givens = [OrderedDict() for _ in self._datasets]
        updates = [OrderedDict() for _ in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            g = givens[index]
            u = updates[index]

            # Check the example count of this channel's own dataset before
            # dividing by it, so that an empty dataset is reported with
            # an explicit message.
            cur_num_examples = self.num_examples[index]
            if cur_num_examples == 0:
                raise ValueError("Iterating over 0 examples results in " +
                                 "divide by 0")
            inv_cur_num_examples = as_floatX(1. / cur_num_examples)

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel does need any data, so there is not need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                # Weight each batch by its share of the dataset so that
                # the accumulated value is the mean over all examples.
                val = (channel.val * T.cast(batch_size, config.floatX)
                       * inv_cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if mode is not None and hasattr(mode, 'record'):
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                self.accum.append(function(theano_args,
                                           givens=g,
                                           updates=u,
                                           mode=self.theano_function_mode,
                                           name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del([name for name in final_names
                                    if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a
        monitor is saved, the __getstate__ method replaces the dataset
        field with the dataset's yaml source. This is not a perfect
        solution because it won't work with job resuming, which would
        require saving the state of the dataset's random number
        generator.

        Like in the Model class, we also need to avoid saving any
        Theano functions, so we delete everything that can be
        regenerated with `redo_theano` by deleting the fields in
        `self.names_to_del`

        Returns
        -------
        d : dict
            A shallow copy of the instance dictionary in which the
            datasets are replaced by their yaml sources and the fields
            listed in `self.names_to_del` are removed.
        """

        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        # NOTE: such a dataset is left out of the pickled
                        # list altogether.
                        warnings.warn('Trained model saved without ' +
                                      'indicating yaml_src')
        d = copy.copy(self.__dict__)
        # Restore the live datasets; only the copy holds the yaml sources.
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Sets the object to have the state described by `d`.

        Parameters
        ----------
        d : dict
            A dictionary mapping string names of fields to values for
            these fields.
        """
        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None,
                    data_specs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called
        even after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a list/tuple containing symbolic tensors, following the
            data_specs)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on. May be omitted
            when the monitor has exactly one dataset.
        prereqs : list of callables that take a list of numpy tensors
            Each prereq must be called exactly once per each new batch
            of data drawn *from dataset* before the channel value is
            computed if two channels provide a prereq with exactly the
            same id, that prereq will only be called once
        data_specs : (space, source) pair
            Identifies the order, format and semantics of ipt

        Raises
        ------
        ValueError
            If `data_specs` is omitted and no default can be inferred,
            if `val` depends on an input that is not part of `ipt`, if
            the dataset is missing, ambiguous or unknown to the monitor,
            or if a channel called `name` already exists.
        """
        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if data_specs is None:
            warnings.warn("parameter 'data_specs' should be provided when " +
                          "calling add_channel. We will build a default one.",
                          stacklevel=2)
            if isinstance(ipt, list):
                ipt = tuple(ipt)
            if ipt is not None and not isinstance(ipt, tuple):
                ipt = (ipt,)

            if ipt is None:
                data_specs = (NullSpace(), '')
            elif len(ipt) == 0:
                data_specs = (CompositeSpace([]), ())
            elif hasattr(dataset, 'get_data_specs'):
                dataset_space, dataset_source = dataset.get_data_specs()
                if (len(ipt) == 1 and
                        dataset_source is not None and
                        (not isinstance(dataset_source, tuple) or
                         len(dataset_source) == 1) and
                        'features' in dataset_source):
                    data_specs = (dataset_space, dataset_source)
                elif (len(ipt) == 2 and
                        dataset_source == ('features', 'targets')):
                    data_specs = (dataset_space, dataset_source)
                else:
                    raise ValueError("Cannot infer default data_specs for " +
                                     "the following input points and " +
                                     "dataset: ipt = %s, dataset = %s"
                                     % (ipt, dataset))
            else:
                # Without this branch data_specs would stay None and the
                # validate() call below would fail with an opaque
                # TypeError instead of explaining what is missing.
                raise ValueError("Cannot infer default data_specs for "
                                 "ipt = %s: dataset %s does not provide "
                                 "get_data_specs(). Please pass "
                                 "data_specs explicitly."
                                 % (ipt, dataset))

        data_specs[0].validate(ipt)

        mapping = DataSpecsMapping(data_specs)
        flat_ipt = mapping.flatten(ipt)
        if not isinstance(flat_ipt, tuple):
            flat_ipt = (flat_ipt,)
        # Every non-shared, non-constant input of the graph computing
        # `val` must be one of the declared inputs.
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and \
               not isinstance(elem, theano.gof.graph.Constant):
                if elem not in flat_ipt:
                    raise ValueError("Unspecified input: " + str(elem) +
                                     ". This may be due to an incorrect " +
                                     "implementation of a cost's " +
                                     "get_data_specs() method, or of a " +
                                     "model's get_monitoring_data_specs() " +
                                     "method.")

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel '+name+'\n')
            assert isinstance(flat_ipt, tuple)
            if len(flat_ipt) != 1:
                for elem in flat_ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(flat_ipt[0]) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            reraise_as(ValueError("The dataset specified is not one of the " +
                                  "monitor's datasets"))

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)

        self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                             dataset, prereqs)
        # The compiled functions no longer cover every channel.
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a
        mangled state. I've added some calls to _sanity_check to try to
        catch when that happens. Not sure what to do for a long term
        fix. I think it requires making theano graphs serializable
        first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor
        yet, installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified
            in `pylearn2.models`.

        Returns
        -------
        rval : Monitor
            The monitor stored in `model.monitor`.
        """
        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def batch_size(self):
        """
        The batch sizes used for monitoring.

        Returns
        -------
        batch_size : list
            The batch size registered for each monitoring dataset, in
            the order the datasets were added.
        """
        return self._batch_size

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def num_batches(self):
        """
        The numbers of batches used for monitoring.

        Returns
        -------
        num_batches : list
            The number of batches registered for each monitoring
            dataset, in the order the datasets were added.
        """
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches=None,
              extra_costs=None, mode='sequential', obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called
        'objective' defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated
            with that dataset's name followed by an underscore as the
            prefix. For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset},
            you will get channels called 'train_misclass' and
            'valid_misclass'. If None, nothing is set up.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the cost
            will appear as the `objective` channel. Its
            `get_monitoring_channels` method will also be used to
            supply other channels.
        batch_size : int
            Passed to `add_dataset` for every dataset.
        num_batches : int, optional
            Passed to `add_dataset` for every dataset.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            Their value will appear as the specified channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method. The dictionary itself is
            not modified.
        mode : str or object, optional
            Iteration mode passed to `add_dataset`.
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective`
            channel.
        cost_monitoring_args : dict
            Dictionary of kwargs that will be passed to
            `cost.get_monitoring_channels()`
            (but not for the extra_costs).
            NOTE: this method currently accepts the argument but does
            not forward it to `get_monitoring_channels`.
        """
        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            assert isinstance(extra_costs, (OrderedDict, dict))
            # Work on a copy: the main cost is inserted under the ''
            # key below, and writing it into the caller's dictionary
            # would make a second call with the same `extra_costs` fail
            # the assertion that follows.
            costs = dict(extra_costs)
        assert '' not in costs
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)

        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(space.make_theano_batch(name='monitor_%s' % source,
                                            batch_size=None)
                    for (space, source) in safe_zip(space_tuple,
                                                    source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name],
                                           cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a a (channel, prereqs) pair, this is not supported.
            channels[name] = (model_channels[name],
                              nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        if is_stochastic(mode):
            # Fixed seed. Written without a leading zero: `02` is an
            # octal literal on Python 2 and a syntax error on Python 3.
            seed = [[2013, 2, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)
class Optimizer(object): """ Default interface for an optimizer implementation - this provides the necessary parameter updates when training a model on a dataset using an online stochastic process. The base framework for performing stochastic gradient descent. """ def __init__(self, dataset, loss=None, model=None, epochs=1000, batch_size=100, min_batch_size=1, save_freq=10, stop_threshold=None, stop_patience=50, learning_rate=1e-3, lr_decay=None, lr_decay_factor=None, grad_clip=None, hard_clip=False, **kwargs): """ Initialize the Optimizer. Parameters ---------- dataset : Dataset The :class:`opendeep.data.Dataset` to use when training the Model. loss : Loss The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result. model : Model The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a Model's .train() method. epochs : int How many training iterations over the dataset to go. batch_size : int How many examples from the training dataset to use in parallel. min_batch_size : int The minimum number of examples required at a time (for things like time series, this would be > 1). save_freq : int, optional How many epochs to train between each new save of the Model's parameters. stop_threshold : float, optional The factor by how much the best validation training score needs to improve to determine early stopping. stop_patience : int, optional The patience or number of epochs to wait after the stop_threshold has been reached before stopping. learning_rate : float The multiplicative amount to adjust parameters based on their gradient values. lr_decay : str The decay function to use for changing the learning rate over epochs. See `opendeep.utils.decay` for classes of decay and documentation. lr_decay_factor : float The amount of decay to use for the ``lr_decay`` type of decay. grad_clip : float, optional Whether to clip gradients. 
This will clip the norm of the gradients either with a hard cutoff or rescaling. hard_clip : bool Whether to use a hard cutoff or rescaling for clipping gradients. """ log.info("Initializing optimizer %s", str(self.__class__.__name__)) # Deal with early stopping None initializations (no early stopping). if not stop_threshold: stop_threshold = numpy.inf if not save_freq: save_freq = 1000000 if not stop_patience: stop_patience = 1 # Put all init parameters in self.args so we can log the initial configuration. self.args = locals().copy() self.args.pop('self') kwargs = self.args.pop('kwargs') self.args = add_kwargs_to_dict(kwargs, self.args) # log the arguments log.info("Optimizer config args: %s", str(self.args)) # if the optimizer wasn't initialized with a Model (train() being called from the model class itself), # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't # the best option because other methods besides stochastic ones can exist for optimizers in the future. # TODO: fix this up - feels like a hack just to make model.train() work... if not model: return # Otherwise, things are proceeding as normal. Carry on... assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \ "Found %s" % str(model.__class__.__name__) assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \ "Found %s" % str(dataset.__class__.__name__) # deal with loss expression/targets if loss is not None: assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \ "Found %s" % str(loss.__class__.__name__) if isinstance(loss, Loss): self.loss_targets = loss.get_targets() self.loss_expression = loss.get_loss() else: assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented." 
if isinstance(model.get_loss(), tuple): self.loss_targets = raise_to_list(model.get_loss()[0]) self.loss_expression = model.get_loss()[1] else: self.loss_targets = None self.loss_expression = model.get_loss() model_inputs = raise_to_list(model.get_inputs()) n_model_inputs = len(model_inputs) model_targets = self.loss_targets or [] for input in model_inputs: if input in model_targets: model_targets.remove(input) n_model_targets = len(model_targets) self.unsupervised = (n_model_targets is 0) # make sure the number of inputs/targets matches up with the dataset properties # train assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \ "Dataset has %d train inputs, while model expects %d" % \ (len(raise_to_list(dataset.train_inputs)), n_model_inputs) if not self.unsupervised: assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \ "Dataset has %d train targets, while model expects %d" % \ (len(raise_to_list(dataset.train_targets) or []), n_model_targets) # valid if dataset.valid_inputs is not None: assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \ "Dataset has %d valid inputs, while model expects %d" % \ (len(raise_to_list(dataset.valid_inputs)), n_model_inputs) if not self.unsupervised: assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \ "Dataset has %d valid targets, while model expects %d" % \ (len(raise_to_list(dataset.valid_targets) or []), n_model_targets) # test if dataset.test_inputs is not None: assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \ "Dataset has %d test inputs, while model expects %d" % \ (len(raise_to_list(dataset.test_inputs)), n_model_inputs) if not self.unsupervised: assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \ "Dataset has %d test targets, while model expects %d" % \ (len(raise_to_list(dataset.test_targets) or []), n_model_targets) # now we are happy, we can add them to `self` self.model = model self.dataset = dataset 
self.loss = loss # Learning rate - how drastic of a step do the parameters change self.learning_rate = sharedX(learning_rate, 'learning_rate') # whether to scale individual model parameters' learning rates. self.lr_scalers = self.model.get_lr_scalers() # whether to decay if lr_decay: self.learning_rate_decay = get_decay_function(lr_decay, self.learning_rate, learning_rate, lr_decay_factor) else: self.learning_rate_decay = False # rest of initial parameters needed for training. self.batch_size = batch_size self.min_batch_size = min_batch_size self.n_epoch = epochs self.save_frequency = save_freq self.early_stop_threshold = stop_threshold self.early_stop_length = stop_patience self.grad_clip = grad_clip self.hard_clip = hard_clip def get_updates(self, gradients): """ This returns the parameter updates to use during training. It defaults to only using (annealed) learning rate. Parameters ---------- gradients : dict A dictionary mapping from the model's parameters to their gradients. Returns ------- updates : OrderdDict A dictionary mapping from the old model parameters, to their new values after a single iteration of the learning rule. """ log.debug('Setting up Stochastic Gradient Descent for optimizer...') updates = OrderedDict() for (param, gradient) in iteritems(gradients): scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.) updates[param] = param - scaled_lr * gradient return updates def train(self, monitor_channels=None, plot=None): """ This method performs the training!!! It is an online training method that goes over minibatches from the dataset for a number of epochs, updating parameters after each minibatch. You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully. Parameters ---------- monitor_channels : list(MonitorsChannel or Monitor), optional The list of channels or monitors containing monitor expressions/variables to compile and evaluate on the data. 
plot : Plot, optional The Plot object to use if we want to graph the outputs (uses bokeh server). """ if not self.model: log.error("No self.model for the Optimizer!") raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() " "was called from the Model. Try initializing the Optimizer with the model param " "and calling optimizer.train().") ######################### # gradients and updates # ######################### # grab the model parameters to use during training self.params = self.model.get_params() # Now create the training cost function for the model to use while training - update parameters # gradient! # First find the basic variables that will be updated params = set() for param in self.params.values(): params.update(base_variables(param)) params = list(params) gradients = grad(cost=self.loss_expression, wrt=params) # now create the dictionary mapping the parameter with its gradient gradients = OrderedDict( [(param, g) for param, g in zip(params, gradients)] ) # clip gradients if we want. gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip) # Calculate the optimizer updates each run # This is where the magic happens for a lot of sub-implementations of SGD! 
# It tells how to update the params each training epoch gradient_updates = self.get_updates(gradients) # Combine the updates from the model also if applicable updates = self.model.get_updates() if updates: updates.update(gradient_updates) else: updates = gradient_updates log.info("%s params: %s", self.model._classname, str(list(self.params.keys()))) ############ # monitors # ############ # deal with the monitor channels if they were given (or take them from the plot) if monitor_channels is None and plot is not None and len(plot.channels) > 0: monitor_channels = plot.channels self.train_monitors_dict = {} self.valid_monitors_dict = {} self.test_monitors_dict = {} self.train_monitors_outservice_dict = {} self.valid_monitors_outservice_dict = {} self.test_monitors_outservice_dict = {} if monitor_channels: # collapse the appropriate monitors into their (name, expression, out_service) tuples train_collapsed = collapse_channels(monitor_channels, train=True) valid_collapsed = collapse_channels(monitor_channels, valid=True) test_collapsed = collapse_channels(monitor_channels, test=True) # get name: expression dictionary self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed]) self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed]) self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed]) # get name: outservice dictionary self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed]) self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed]) self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed]) ####################################### # compile train and monitor functions # ####################################### function_input = raise_to_list(self.model.get_inputs()) if self.loss_targets is not None: function_input 
+= self.loss_targets # Compile the training function! log.info('Compiling f_learn function for model %s...', self.model._classname) t = time.time() f_learn = function(inputs=function_input, updates=updates, outputs=[self.loss_expression] + list(self.train_monitors_dict.values()), name='f_learn') log.info('f_learn compilation took %s', make_time_units_string(time.time() - t)) # figure out if we want valid and test (monitors) self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0) self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0) # Now compile the monitor functions! log.debug("Compiling monitor functions...") monitor_t = time.time() # valid monitors if self.valid_flag: self.valid_monitor_function = function( inputs=function_input, updates=self.model.get_updates(), outputs=list(self.valid_monitors_dict.values()), name='valid_monitor_function' ) else: self.valid_monitor_function = None # test monitors if self.test_flag: self.test_monitor_function = function( inputs=function_input, updates=self.model.get_updates(), outputs=list(self.test_monitors_dict.values()), name='test_monitor_function' ) else: self.test_monitor_function = None log.debug("Compilation done. 
Took %s", make_time_units_string(time.time() - monitor_t)) ################## # start training # ################## log.info("-----------TRAINING %s FOR %d EPOCHS-----------", self.model._classname, self.n_epoch) self.STOP = False self.epoch_counter = 0 # reset any decay params for decay_param in self.get_decay_params(): decay_param.reset() self.times = [] self.best_cost = numpy.inf self.best_params = None self.patience = 0 t = time.time() while not self.STOP: try: self.STOP = self._perform_one_epoch(f_learn, plot) except KeyboardInterrupt: log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT") self.STOP = True # save params if self.best_params is not None: log.debug("Restoring best model parameters...") self.model.set_param_values(self.best_params, borrow=False) log.debug("Saving model parameters...") self.model.save_params('trained_epoch_' + str(self.epoch_counter)) log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t)) def _perform_one_epoch(self, f_learn, plot=None): """ Performs a single training iteration with the given learn function. """ self.epoch_counter += 1 t = time.time() log.info('EPOCH %s', str(self.epoch_counter)) # set the noise switches on for training function! 
(this is where things like dropout happen) if not self.model.switches_on: self.model.turn_on_switches() ######### # train # ######### train_costs = [] train_monitors = {key: [] for key in self.train_monitors_dict.keys()} train_data = [ minibatch(input_data, self.batch_size, self.min_batch_size) for input_data in raise_to_list(self.dataset.train_inputs) ] if self.dataset.train_targets is not None and not self.unsupervised: train_data += [ minibatch(target, self.batch_size, self.min_batch_size) for target in raise_to_list(self.dataset.train_targets) ] for batch in min_normalized_izip(*train_data): _outs = raise_to_list(f_learn(*batch)) train_costs.append(_outs[0]) # handle any user defined monitors (if different from the train cost) if len(train_monitors) > 0: current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:]) for name, val in current_monitors: val = numpy.asarray(val) train_monitors[name].append(val) # get the mean values for the batches mean_train = numpy.mean(train_costs, 0) current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()} # log the mean values! log.info('Train cost: %s', trunc(mean_train)) if len(current_mean_monitors) > 0: log.info('Train monitors: %s', str(current_mean_monitors)) # send the values to their outservices for name, service in self.train_monitors_outservice_dict.items(): if name in current_mean_monitors and service: service.write(current_mean_monitors[name], "train") # if there is a plot, also send them over! if plot: plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors) # set the noise switches off for valid and test sets! 
we assume unseen data is noisy anyway :) if self.model.switches_on: self.model.turn_off_switches() ######### # valid # ######### self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets, self.valid_monitors_dict, self.valid_monitor_function, self.valid_monitors_outservice_dict, plot) ######## # test # ######## self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets, self.test_monitors_dict, self.test_monitor_function, self.test_monitors_outservice_dict, plot) ########### # cleanup # ########### # check for early stopping on train costs cost = numpy.sum(train_costs) # if the cost improved, reset the patience and record the best cost. if cost < self.best_cost * self.early_stop_threshold: self.patience = 0 self.best_cost = cost # save the parameters that made it the best self.best_params = self.model.get_param_values(borrow=False) elif not numpy.isnan(cost): self.patience += 1 # check for stopping either from n_epochs or from threshold/patience stop = False if self.epoch_counter >= self.n_epoch: log.info("Stopping (reached max number of epochs)...") stop = True if self.patience >= self.early_stop_length: log.info("Stopping early (reached stop threshold)...") stop = True timing = time.time() - t self.times.append(timing) log.info('time: ' + make_time_units_string(timing)) log.debug('remaining time: ' + make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times))) if (self.epoch_counter % self.save_frequency) == 0: #save params self.model.save_params('trained_epoch_' + str(self.epoch_counter)) # ANNEAL! 
if not stop: # perform the appropriate decay on the decay functions/parameters for this optimizer and model for decay_param in self.get_decay_params(): decay_param.decay() # return whether or not to stop this epoch return stop def _compute_over_subset(self, subset, inputs, targets, monitors_dict, monitor_function, monitors_outservice_dict, plot): inputs = raise_to_list(inputs) targets = raise_to_list(targets) if inputs is not None and len(monitors_dict) > 0: monitors = {key: [] for key in monitors_dict.keys()} data = [minibatch(input, self.batch_size, self.min_batch_size) for input in inputs] if targets is not None and not self.unsupervised: data += [minibatch(target, self.batch_size, self.min_batch_size) for target in targets] for batch in min_normalized_izip(*data): _outs = raise_to_list(monitor_function(*batch)) current_monitors = zip(monitors_dict.keys(), _outs) for name, val in current_monitors: val = numpy.asarray(val) monitors[name].append(val) # get the mean values for the batches current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in monitors.items()} # log the mean values! log.info('%s monitors: %s', subset, str(current_mean_monitors)) # send the values to their outservices for name, service in monitors_outservice_dict.items(): if name in current_mean_monitors and service: service.write(current_mean_monitors[name], subset) # if there is a plot, also send them over! if plot: plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors) def get_decay_params(self): """ Returns a list of all the Decay objects to decay during training. Returns ------- list List of Decay objects to use after each training epoch - in this case the learning rate decay. """ decay_params = self.model.get_decay_params() if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay: decay_params.append(self.learning_rate_decay) return decay_params
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of examples
    the model has trained, as well as any number of "channels" that track
    quantities of interest (examples: the objective function, measures of
    hidden unit activity, reconstruction error, sum of squared second
    derivatives, average norm of the weight vectors, etc.)
    """

    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        # Determine whether the model should use topological or vector form of
        # examples. If the model acts on a space with more than the batch index
        # and channel dimension, the model has topological dimensions, so the
        # topological view of the data should be used.
        vector = model.get_input_space().make_theano_batch(name='monitoring_input')
        if isinstance(vector.type, theano.sparse.SparseType):
            self.topo = False
        else:
            self.topo = len(vector.type.broadcastable) > 2

        self.require_label = False
        self.theano_function_mode = None

    def set_theano_function_mode(self, mode):
        """
        Sets the mode used to compile the monitoring functions.

        Marks the monitor dirty when the mode changes, so the functions are
        recompiled the next time the monitor is called.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self, dataset, mode='sequential', batch_size=None,
                    num_batches=None, seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object (or a list of them).
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        seed : int, list or tuple, optional
            The seed for stochastic iteration modes. Required (and must not
            be a random number generator) when the iterator is stochastic;
            must be None otherwise.

        Raises
        ------
        ValueError
            If the per-dataset arguments do not all have one entry per
            dataset, or the dataset rejects the iteration parameters.
        TypeError
            If a stochastic iteration mode is used without a proper seed.
        """
        # The user can ommit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) + " dataset but " +
                             str(len(mode)) + " modes.")
        # num_batches is checked as well: the message below promises it, and
        # the lists are iterated jointly afterwards.
        if any([len(l) != len(dataset) for l in [batch_size, num_batches, seed]]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size, num_batches, seed):
            try:
                it = d.iterator(mode=m, batch_size=b,
                                num_batches=n,
                                topo=self.topo,
                                targets=self.require_label,
                                rng=sd)
            except ValueError as exc:
                raise ValueError("invalid iteration parameters in "
                                 "Monitor.add_dataset: " + str(exc))
            if it.stochastic:
                # must be a seed, not a random number generator
                # if it were a random number generator, different iterators using
                # it would update its state, so we would not get the same iterator
                # each time
                # Also, must not be None, because this makes the iterator pick
                # a seed based on the clock
                if sd is None:
                    raise TypeError("Monitor requires a seed when using stochastic iteration modes.")
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError("Monitor requires a seed (not a random number generator) when using stochastic iteration modes.")
            else:
                assert sd is None  # the iterator should catch this, but let's double-check

            if d not in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.
        """
        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        for d, i, b, n, a, sd, ne in safe_izip(datasets, self._iteration_mode, self._batch_size,
                                               self._num_batches, self.accum, self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                d = yaml_parse.load(d)
                raise NotImplementedError()
                # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    topo=self.topo,
                                    targets=self.require_label,
                                    rng=sd)

            actual_ne = 0
            for X in myiterator:
                if self.require_label:
                    X, y = X
                    self.run_prereqs(X, y, d)
                    a(X, y)
                else:
                    self.run_prereqs(X, None, d)
                    a(X)
                if X.ndim == 2:
                    actual_batch_size = X.shape[0]
                else:
                    actual_batch_size = X.shape[d.get_topo_batch_axis()]
                actual_ne += actual_batch_size
            # end for X
            if actual_ne != ne:
                raise RuntimeError("At compile time, your iterator said it had " + str(ne) +
                                   " examples total, but at runtime it gave us " + str(actual_ne) + ".")
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(), key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, X, y, dataset):
        """
        Calls every prerequisite registered for `dataset` with the batch
        `(X, y)`; does nothing if the dataset has no prerequisites.
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(X, y)

    def get_batches_seen(self):
        """
        Returns the number of batches the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch correctly)
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        Returns the number of epochs reported through Monitor.report_epoch.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        Returns the number of examples the model has learned on (assuming
        that the learning code has been calling Monitor.report_batch correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Call this whenever the model has learned on another batch of examples.
        Report how many examples were learned on.
        """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Call this whenever the model has completed another epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is needed so that if new channels are added, Theano's
        optimizations make sure (to the extent that they can) that the new
        channels and old channels don't have any redundant calculations.

        It is also needed to regenerate Theano functions after pickling and
        unpickling, since Theano functions should not be pickled.

        Raises
        ------
        ValueError
            If a channel's dataset iterates over 0 examples.
        TypeError
            If a channel's shared variable and the expression driving it
            have different dtypes.
        """
        self._dirty = False

        # Attributes created below are recorded at the end so that
        # __getstate__ can drop them before pickling.
        init_names = dir(self)
        mode = self.theano_function_mode

        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        # begin_record_entry resets every channel accumulator to zero
        reset_updates = OrderedDict()
        for channel in self.channels.values():
            reset_updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(inputs=[], updates=reset_updates,
                                               mode=self.theano_function_mode,
                                               name='Monitor.begin_record_entry')

        # Get the appropriate kind of theano variable to represent the data the model
        # acts on
        X = self.model.get_input_space().make_theano_batch(name="monitoring_X")
        if config.compute_test_value != 'off':
            m = self.model.get_test_batch_size()
            test_value = self.model.get_input_space().get_origin_batch(m)
            X.tag.test_value = np.cast[X.type.dtype](test_value)
        if self.require_label:
            Y = self.model.get_output_space().make_theano_batch(name="monitoring_Y")

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            if mode is not None and hasattr(mode, 'record'):
                mode.record.handle_line('compiling monitor including channel ' + key + '\n')
            log.info('\t%s' % key)
        iterators = [ds.iterator(mode=it_mode, num_batches=nb, batch_size=bs, topo=self.topo)
                     for ds, it_mode, nb, bs in safe_izip(self._datasets, self._iteration_mode,
                                                          self._num_batches, self._batch_size)]
        self.num_examples = [np.cast[config.floatX](float(it.num_examples)) for it in iterators]
        givens = [OrderedDict() for _ in self._datasets]
        updates = [OrderedDict() for _ in self._datasets]
        for channel in self.channels.values():
            index = self._datasets.index(channel.dataset)
            d = self._datasets[index]
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]
            if isinstance(channel.graph_input, (list, tuple)):
                channel_X, channel_Y = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                assert channel_Y not in g or g[channel_Y] is Y
                g[channel_X] = X
                g[channel_Y] = Y
            else:
                channel_X = channel.graph_input
                assert channel_X not in g or g[channel_X] is X
                g[channel_X] = X
            # Guard the divisor actually used below. The previous test read a
            # variable leaked from the list comprehension above (the last
            # dataset's num_batches), not this channel's example count.
            if cur_num_examples == 0:
                raise ValueError("Iterating over 0 examples results in divide by 0")
            if self.topo:
                batch_index = d.get_topo_batch_axis()
            else:
                batch_index = 0
            # each batch contributes its share of the dataset-wide mean
            val = channel.val * T.cast(X.shape[batch_index], config.floatX) / cur_num_examples
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable '
                                        + key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression with type ' +
                                        up[key].dtype)

            self.accum = []
            for idx, packed in enumerate(safe_izip(givens, updates)):
                g, u = packed
                if mode is not None and hasattr(mode, 'record'):
                    for elem in g:
                        mode.record.handle_line('g key ' + var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' + var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' + var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' + var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if self.require_label:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line('compiling supervised accum\n')
                    # Some channels may not depend on the data, ie, they might just monitor the model
                    # parameters, or some shared variable updated by the training algorithm, so we
                    # need to ignore the unused input error
                    self.accum.append(function([X, Y], givens=g, updates=u,
                                               mode=self.theano_function_mode,
                                               name=function_name))
                else:
                    if mode is not None and hasattr(mode, 'record'):
                        mode.record.handle_line('compiling unsupervised accum\n')
                    self.accum.append(function([X], givens=g, updates=u,
                                               mode=self.theano_function_mode,
                                               name=function_name))
            for a in self.accum:
                if mode is not None and hasattr(mode, 'record'):
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' + var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del([name for name in final_names if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a monitor
        is saved, the __getstate__ method replaces the dataset field with the
        dataset's yaml source. This is not a perfect solution because it won't
        work with job resuming, which would require saving the state of the
        dataset's random number generator.

        Like in the Model class, we also need to avoid saving any Theano
        functions, so we delete everything that can be regenerated with
        `redo_theano` by deleting the fields in `self.names_to_del`
        """
        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn('Trained model saved without indicating yaml_src')
        d = copy.copy(self.__dict__)
        # restore the live datasets on the instance; only the copy is pickled
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Restores the monitor from a pickled state, converting the old
        single-dataset `_dataset` field to the `_datasets` list.
        """
        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None):
        """
        Asks the monitor to start tracking a new value. Can be called even
        after the monitor is already in use.

        Parameters
        ----------
        name: str
            The display name in the monitor.
        ipt: tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a (features,targets) list/tuple containing two symbolic tensors)
        val: tensor_like
            The value (function of `ipt`) to be tracked.
        dataset:
            A Dataset instance specifying which dataset to compute
            this channel on.
        prereqs: list of callables that take two numpy tensors
            (X and y, where y will be None if no labels are used)
            each prereq must be called exactly once per each new
            batch of data drawn *from dataset* before the channel
            value is computed
            if two channels provide a prereq with exactly the same
            id, that prereq will only be called once

        Raises
        ------
        ValueError
            If `val` depends on an input not listed in `ipt`, if no dataset
            can be determined, if the dataset is not one of the monitor's
            datasets, if the channel name is already taken, or if targets
            are requested from a dataset that has none.
        """
        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if not isinstance(ipt, (list, tuple)):
            tmp = [ipt]
        else:
            tmp = ipt
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and not isinstance(elem, theano.gof.graph.Constant):
                if elem not in tmp:
                    raise ValueError("Unspecified input: " + str(elem))

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel ' + name + '\n')
            if isinstance(ipt, (list, tuple)):
                for elem in ipt:
                    mode.record.handle_line('Includes input var ' + var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' + var_descriptor(ipt) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' + var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            raise ValueError("The dataset specified is not " +
                             "one of the monitor's datasets")

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" % name)
        if isinstance(ipt, (list, tuple)):
            if dataset is not None:
                if not dataset.has_targets():
                    raise ValueError("Tried to create a channel (" + name
                                     + ") that uses targets, but monitoring dataset has no targets")
            self.require_label = True
            assert len(ipt) == 2
        self.channels[name] = MonitorChannel(ipt, val, name, dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a mangled
        state. I've added some calls to _sanity_check to try to catch when
        that happens. Not sure what to do for a long term fix. I think it
        requires making theano graphs serializable first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor yet,
        installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified in
            `pylearn2.models`.
        """
        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if monitor.foo below are used anywhere, remove if not.
    @property
    def batch_size(self):
        return self._batch_size

    @property
    def num_batches(self):
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches=None, extra_costs=None,
              mode='sequential'):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called 'objective'
        defined by the costs' __call__ method.

        dataset: a Dataset or dictionary mapping string names to Datasets
                    If string names are used, then for every dataset,
                    each channel defined by the model or cost will be
                    replicated with that dataset's name followed by an
                    underscore as the prefix.
                    For example, if your cost defines a channel called
                    'misclass', and datasets is {'train' : train_dataset,
                    'valid' : valid_dataset} you will get channels called
                    'train_misclass' and 'valid_misclass'.

        cost: a Cost

        batch_size, num_batches, mode: forwarded to `add_dataset` for every
                    dataset.

        extra_costs: optional dictionary mapping names to additional Costs;
                    must not contain the empty string as a key. It is not
                    modified.
        """
        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)
        if extra_costs is None:
            costs = {}
        else:
            # Work on a copy: inserting the main cost under '' directly into
            # the caller's dict would make a second setup() call with the
            # same dict fail the assertion below.
            costs = copy.copy(extra_costs)
        assert '' not in costs
        costs[''] = cost

        supervised = any(cur_cost.supervised for cur_cost in costs.values())
        model = self.model

        X_space = model.get_input_space()
        X = X_space.make_theano_batch(name='monitor_X')

        if config.compute_test_value != 'off':
            X.tag.test_value = X_space.get_origin_batch(batch_size).astype(X.dtype)
        if supervised:
            Y_space = model.get_output_space()
            Y = Y_space.make_theano_batch(name='monitor_Y')

            if config.compute_test_value != 'off':
                Y.tag.test_value = Y_space.get_origin_batch(batch_size).astype(Y.dtype)

            ipt = (X, Y)
        else:
            Y = None
            ipt = X
        custom_channels = {}
        for cost_name in costs:
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'
            cur_cost = costs[cost_name]
            raw_channels = cur_cost.get_monitoring_channels(model, X, Y)
            channels = {}
            for name in raw_channels:
                channels[prefix + name] = raw_channels[name]
            custom_channels.update(channels)
        model_channels = model.get_monitoring_channels(X, Y)
        custom_channels.update(model_channels)
        if is_stochastic(mode):
            # written without leading zeros: `02` is an old-style octal
            # literal and a syntax error on Python 3
            seed = [[2013, 2, 22]]
        else:
            seed = None
        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks that respond to the
            # values in the monitor use the name to find it.
            for cost_name in costs:
                cur_cost = costs[cost_name]
                cost_value = cur_cost(model, X, Y)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                    else:
                        name = dprefix + cost_name
                    self.add_channel(name=name, ipt=ipt,
                                     val=cost_value, dataset=cur_dataset)
            for key in custom_channels:
                self.add_channel(name=dprefix + key, ipt=ipt,
                                 val=custom_channels[key], dataset=cur_dataset)
class RMSProp(LearningRule):
    """
    Implements the RMSProp learning rule as described by Hinton in `lecture 6
    <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`
    of the Coursera Neural Networks for Machine Learning course.

    In short, Hinton suggests "[the] magnitude of the gradient can be very
    different for different weights and can change during learning.  This
    makes it hard to choose a global learning rate." RMSProp solves this
    problem by "[dividing] the learning rate for a weight by a running
    average of the magnitudes of recent gradients for that weight."

    Parameters
    ----------
    decay : float, optional
        Decay constant similar to that used in AdaDelta and Momentum methods.
    max_scaling: float, optional
        Restrict the RMSProp gradient scaling coefficient to values
        below `max_scaling`.

    Notes
    -----
    An instance of this LearningRule should only be used with one
    TrainingAlgorithm, and its get_updates method should be called
    only once. This is required in order to make the monitoring
    channels correctly report the moving averages.
    """

    def __init__(self, decay=0.9, max_scaling=1e5):
        assert 0. <= decay < 1.
        assert max_scaling > 0
        self.decay = sharedX(decay, 'decay')
        # lower bound on the RMS divisor, so gradients are scaled up by at
        # most `max_scaling`
        self.epsilon = 1. / max_scaling
        self.mean_square_grads = OrderedDict()

    @wraps(LearningRule.add_channels_to_monitor)
    def add_channels_to_monitor(self, monitor, monitoring_dataset):
        """
        The channels added are the min, mean, and max of the
        mean_square_grad of each parameter.
        """
        channel_mapping = {
            '_min': T.min,
            '_max': T.max,
            '_mean': T.mean
        }

        for mean_square_grad in self.mean_square_grads.values():
            for suffix, op in channel_mapping.items():
                monitor.add_channel(
                    name=(mean_square_grad.name + suffix),
                    ipt=None,
                    val=op(mean_square_grad),
                    data_specs=(NullSpace(), ''),
                    dataset=monitoring_dataset)
        return

    @wraps(LearningRule.get_updates)
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Builds the RMSProp updates for every parameter in `grads`.

        Parameters
        ----------
        learning_rate : float or symbolic scalar
            The global learning rate.
        grads : dict
            Mapping from each (named) parameter to its gradient.
        lr_scalers : dict, optional
            Mapping from parameters to a multiplier on the learning rate.
            Parameters that are missing use a multiplier of 1; None means
            no parameter is scaled.

        Returns
        -------
        updates : OrderedDict
            Mapping from each parameter and from each moving average of the
            squared gradient to its new value.

        Raises
        ------
        ValueError
            If a parameter has no name.

        Notes
        -----
        This method has the side effect of storing the moving average
        of the square gradient in `self.mean_square_grads`. This is
        necessary in order for the monitoring channels to be able
        to track the value of these moving averages.
        Therefore, this method should only get called once for each
        instance of RMSProp.
        """
        # The signature defaults lr_scalers to None; without this the
        # `.get` lookup below fails for callers relying on the default.
        if lr_scalers is None:
            lr_scalers = {}

        updates = OrderedDict()
        for param in grads:
            # validate before allocating the accumulator
            if param.name is None:
                raise ValueError("Model parameters must be named.")

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)
            mean_square_grad.name = 'mean_square_grad_' + param.name

            if param.name in self.mean_square_grads:
                warnings.warn("Calling get_updates more than once on the "
                              "gradients of `%s` may make monitored values "
                              "incorrect." % param.name)
            # Store variable in self.mean_square_grads for monitoring.
            self.mean_square_grads[param.name] = mean_square_grad

            # Accumulate gradient
            new_mean_squared_grad = (self.decay * mean_square_grad +
                                     (1 - self.decay) * T.sqr(grads[param]))

            # Compute update
            scaled_lr = lr_scalers.get(param, 1.) * learning_rate
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
            delta_x_t = -scaled_lr * grads[param] / rms_grad_t

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[param] = param + delta_x_t

        return updates
class Monitor(object):
    """
    A class for monitoring Models while they are being trained.

    A monitor object records the number of minibatches and number of
    examples the model has trained, as well as any number of "channels"
    that track quantities of interest (examples: the objective
    function, measures of hidden unit activity, reconstruction error,
    sum of squared second derivatives, average norm of the weight
    vectors, etc.)
    """

    def __init__(self, model):
        """
        Makes a monitor for `model`. Assumes the model has not been
        trained at all yet.

        Parameters
        ----------
        model : pylearn2.models.model.Model instance
        """
        self.training_succeeded = False
        self.model = model
        self.channels = OrderedDict()
        self._num_batches_seen = 0
        self._examples_seen = 0
        self._epochs_seen = 0
        # The five lists below are parallel: entry k of each describes how
        # to iterate over dataset k.
        self._datasets = []
        self._iteration_mode = []
        self._batch_size = []
        self._num_batches = []
        self._dirty = True
        self._rng_seed = []
        self.names_to_del = ['theano_function_mode']
        self.t0 = time.time()
        self.theano_function_mode = None

        # Initialize self._nested_data_specs, self._data_specs_mapping,
        # and self._flat_data_specs
        self._build_data_specs()

    def _build_data_specs(self):
        """
        Computes a nested data_specs for input and all channels

        Also computes the mapping to flatten it. This function is
        called from redo_theano.
        """
        # Ask the model what it needs
        m_space, m_source = self.model.get_monitoring_data_specs()
        input_spaces = [m_space]
        input_sources = [m_source]
        for channel in self.channels.values():
            space = channel.data_specs[0]
            assert isinstance(space, Space)
            input_spaces.append(space)
            input_sources.append(channel.data_specs[1])

        nested_space = CompositeSpace(input_spaces)
        nested_source = tuple(input_sources)

        self._nested_data_specs = (nested_space, nested_source)
        self._data_specs_mapping = DataSpecsMapping(self._nested_data_specs)

        flat_space = self._data_specs_mapping.flatten(nested_space,
                                                      return_tuple=True)
        flat_source = self._data_specs_mapping.flatten(nested_source,
                                                       return_tuple=True)
        self._flat_data_specs = (CompositeSpace(flat_space), flat_source)

    def set_theano_function_mode(self, mode):
        """
        Parameters
        ----------
        mode : theano.compile.Mode
            Theano functions for the monitoring channels will be
            compiled and run using this mode.
        """
        if self.theano_function_mode != mode:
            self._dirty = True
            self.theano_function_mode = mode

    def add_dataset(self, dataset, mode='sequential', batch_size=None,
                    num_batches=None, seed=None):
        """
        Determines the data used to calculate the values of each channel.

        Every argument may be given either as a single value (one dataset)
        or as a list with one entry per dataset.

        Parameters
        ----------
        dataset : object
            A `pylearn2.datasets.Dataset` object.
        mode : str or object, optional
            Iteration mode; see the docstring of the `iterator` method
            on `pylearn2.datasets.Dataset` for details.
        batch_size : int, optional
            The size of an individual batch. Optional if `mode` is
            'sequential' and `num_batches` is specified (batch size
            will be calculated based on full dataset size).
        num_batches : int, optional
            The total number of batches. Unnecessary if `mode` is
            'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
        seed : int, optional
            Optional. The seed to be used for random iteration modes.

        Raises
        ------
        ValueError
            If the per-dataset arguments have inconsistent lengths, or the
            dataset rejects the iteration parameters.
        TypeError
            If a stochastic iteration mode is used without a seed, or with
            something that is not a seed.
        """
        # The user can ommit using lists if only one dataset is set
        if not isinstance(dataset, list):
            dataset = [dataset]
        if not isinstance(mode, list):
            mode = [mode]
        if not isinstance(batch_size, list):
            batch_size = [batch_size]
        if not isinstance(num_batches, list):
            num_batches = [num_batches]
        if seed is None:
            seed = [None] * len(dataset)
        if not isinstance(seed, list):
            seed = [seed]
        if len(mode) != len(dataset):
            raise ValueError("Received " + str(len(dataset)) +
                             " dataset but " + str(len(mode)) + " modes.")
        # num_batches is checked here too, so that a mismatch yields the
        # error below rather than a failure inside safe_izip.
        if any([len(l) != len(dataset)
                for l in [batch_size, num_batches, seed]]):
            raise ValueError("make sure each dataset has its iteration " +
                             "batch size and number of batches.")
        for (d, m, b, n, sd) in safe_izip(dataset, mode, batch_size,
                                          num_batches, seed):
            # Build an iterator only to validate the parameters and to find
            # out whether the mode is stochastic.
            try:
                it = d.iterator(mode=m,
                                batch_size=b,
                                num_batches=n,
                                data_specs=self._flat_data_specs,
                                return_tuple=True,
                                rng=sd)
            except ValueError as exc:
                raise ValueError("invalid iteration parameters in " +
                                 "Monitor.add_dataset: " + str(exc))
            if it.stochastic:
                # Must be a seed, not a random number generator. If it were a
                # random number generator, different iterators using it would
                # update its state, so we would not get the same iterator
                # each time. Also, must not be None, because this makes the
                # iterator pick a seed based on the clock
                if sd is None:
                    raise TypeError("Monitor requires a seed when using " +
                                    "stochastic iteration modes.")
                if not isinstance(sd, (list, tuple, int)):
                    raise TypeError("Monitor requires a seed (not a random " +
                                    "number generator) when using " +
                                    "stochastic iteration modes.")
            else:
                # The iterator should catch this, but let's double-check
                assert sd is None

            if d not in self._datasets:
                self._datasets.append(d)
                self._iteration_mode.append(m)
                self._batch_size.append(b)
                self._num_batches.append(n)
                self._rng_seed.append(sd)

    def __call__(self):
        """
        Runs the model on the monitoring dataset in order to add one
        data point to each of the channels.
        """
        # If the channels have changed at all, we need to recompile the theano
        # functions used to compute them
        if self._dirty:
            self.redo_theano()

        datasets = self._datasets

        # Set all channels' val_shared to 0
        self.begin_record_entry()
        for d, i, b, n, a, sd, ne in safe_izip(datasets,
                                               self._iteration_mode,
                                               self._batch_size,
                                               self._num_batches,
                                               self.accum,
                                               self._rng_seed,
                                               self.num_examples):
            if isinstance(d, basestring):
                # Datasets restored from a pickle are stored as yaml source;
                # re-loading them here is not supported yet.
                d = yaml_parse.load(d)
                raise NotImplementedError()
                # need to put d back into self._datasets
            myiterator = d.iterator(mode=i,
                                    batch_size=b,
                                    num_batches=n,
                                    data_specs=self._flat_data_specs,
                                    return_tuple=True,
                                    rng=sd)

            # If self._flat_data_specs is empty, no channel needs data,
            # so we do not need to call the iterator in order to average
            # the monitored values across different batches, we only
            # have to call them once.
            if len(self._flat_data_specs[1]) == 0:
                X = ()
                self.run_prereqs(X, d)
                a(*X)
            else:
                actual_ne = 0
                for X in myiterator:
                    # X is a flat (not nested) tuple
                    self.run_prereqs(X, d)
                    a(*X)
                    actual_ne += self._flat_data_specs[0].np_batch_size(X)
                # end for X
                if actual_ne != ne:
                    raise RuntimeError("At compile time, your iterator said "
                                       "it had %d examples total, but at "
                                       "runtime it gave us %d."
                                       % (ne, actual_ne))
        # end for d

        log.info("Monitoring step:")
        log.info("\tEpochs seen: %d" % self._epochs_seen)
        log.info("\tBatches seen: %d" % self._num_batches_seen)
        log.info("\tExamples seen: %d" % self._examples_seen)
        t = time.time() - self.t0
        for channel_name in sorted(self.channels.keys(),
                                   key=number_aware_alphabetical_key):
            channel = self.channels[channel_name]
            channel.time_record.append(t)
            channel.batch_record.append(self._num_batches_seen)
            channel.example_record.append(self._examples_seen)
            channel.epoch_record.append(self._epochs_seen)
            val = channel.val_shared.get_value()
            channel.val_record.append(val)
            # TODO: use logging infrastructure so that user can configure
            # formatting
            if abs(val) < 1e4:
                val_str = str(val)
            else:
                val_str = '%.3e' % val

            log.info("\t%s: %s" % (channel_name, val_str))

    def run_prereqs(self, data, dataset):
        """
        Runs all "prerequistie functions" on a batch of data. Always
        called right before computing the monitoring channels on that
        batch.

        Parameters
        ----------
        data : tuple or Variable
            a member of the Space used as input to the monitoring
            functions
        dataset : Dataset
            the Dataset the data was drawn from
        """
        if dataset not in self.prereqs:
            return
        for prereq in self.prereqs[dataset]:
            prereq(*data)

    def get_batches_seen(self):
        """
        Returns the number of batches the model has learned on
        (assuming that the learning code has been calling
        Monitor.report_batch correctly).
        """
        return self._num_batches_seen

    def get_epochs_seen(self):
        """
        Returns
        -------
        epochs_seen : int
            The number of epochs the model has been trained on.
            One "epoch" is one pass through Dataset.iterator.
        """
        return self._epochs_seen

    def get_examples_seen(self):
        """
        Returns
        -------
        examples_seen : int
            The number of examples the model has learned on (assuming
            that the learning code has been calling Monitor.report_batch
            correctly)
        """
        return self._examples_seen

    def report_batch(self, num_examples):
        """
        Call this whenever the model has learned on another batch of
        examples. Report how many examples were learned on.

        Parameters
        ----------
        num_examples : int
            The number of examples learned on in this minibatch.
        """
        self._examples_seen += num_examples
        self._num_batches_seen += 1

    def report_epoch(self):
        """
        Call this whenever the model has completed another "epoch" of
        learning. We regard one pass through Dataset.iterator as one
        epoch.
        """
        self._epochs_seen += 1

    def redo_theano(self):
        """
        Recompiles Theano functions used by this monitor.

        This is called any time we need to evaluate the channels and
        the channel definitions have changed since last we called it,
        or if the theano functions are unavailable for any other reason
        (first time they are needed after construction or
        deserialization, etc.)

        All channels are compiled as part of the same theano function
        so that the theano optimizations can eliminate subexpressions
        that are shared between multiple channels.

        Raises
        ------
        ValueError
            If a channel needs data but its dataset iterator reports zero
            examples (the per-batch weighting would divide by zero).
        TypeError
            If a channel's value has a different dtype than its shared
            accumulator.
        """
        self._dirty = False

        # Recompute the data specs, since the channels may have changed.
        self._build_data_specs()

        # Everything added to self below is derived state; it is registered
        # at the end so that __getstate__ drops it.
        init_names = dir(self)
        mode = self.theano_function_mode
        record_mode = mode is not None and hasattr(mode, 'record')

        self.prereqs = OrderedDict()
        for channel in self.channels.values():
            if channel.prereqs is not None:
                dataset = channel.dataset
                if dataset not in self.prereqs:
                    self.prereqs[dataset] = []
                prereqs = self.prereqs[dataset]
                for prereq in channel.prereqs:
                    if prereq not in prereqs:
                        prereqs.append(prereq)

        # begin_record_entry resets every channel accumulator to zero.
        reset_updates = OrderedDict()
        for channel in self.channels.values():
            reset_updates[channel.val_shared] = np.cast[config.floatX](0.0)
        with log_timing(log, "compiling begin_record_entry"):
            self.begin_record_entry = function(
                inputs=[],
                updates=reset_updates,
                mode=self.theano_function_mode,
                name='Monitor.begin_record_entry')

        # Get the appropriate kind of theano variable to represent the data
        # the model acts on
        batch_names = ['monitoring_%s' % s for s in self._flat_data_specs[1]]
        theano_args = self._flat_data_specs[0].make_theano_batch(batch_names)

        # Get a symbolic expression of the batch size
        # We do it here, rather than for each channel, because channels with an
        # empty data_specs do not use data, and are unable to extract the batch
        # size. The case where the whole data specs is empty is not supported.
        batch_size = self._flat_data_specs[0].batch_size(theano_args)

        # Also get a nested representation, for joint iteration
        # with each of channel.graph_input
        nested_theano_args = self._data_specs_mapping.nest(theano_args)
        if not isinstance(nested_theano_args, tuple):
            nested_theano_args = (nested_theano_args, )
        # Slot 0 belongs to the model, slots 1.. to the channels, in order.
        assert len(nested_theano_args) == (len(self.channels) + 1)

        log.info('Monitored channels: ')
        for key in sorted(self.channels.keys()):
            if record_mode:
                mode.record.handle_line('compiling monitor including ' +
                                        'channel ' + key + '\n')
            log.info('\t%s' % key)

        iterators = [
            ds.iterator(mode=it_mode,
                        num_batches=n_batches,
                        batch_size=b_size,
                        data_specs=self._flat_data_specs,
                        return_tuple=True)
            for ds, it_mode, n_batches, b_size in safe_izip(
                self._datasets, self._iteration_mode,
                self._num_batches, self._batch_size)
        ]
        self.num_examples = [
            np.cast[config.floatX](float(iterator.num_examples))
            for iterator in iterators
        ]

        # One (givens, updates) pair per dataset.
        givens = [OrderedDict() for _ in self._datasets]
        updates = [OrderedDict() for _ in self._datasets]
        for i, channel in enumerate(self.channels.values()):
            index = self._datasets.index(channel.dataset)
            g = givens[index]
            cur_num_examples = self.num_examples[index]
            u = updates[index]

            # Flatten channel.graph_input and the appropriate part of
            # nested_theano_args, to iterate jointly over them.
            c_mapping = DataSpecsMapping(channel.data_specs)
            channel_inputs = c_mapping.flatten(channel.graph_input,
                                               return_tuple=True)
            inputs = c_mapping.flatten(nested_theano_args[i + 1],
                                       return_tuple=True)

            for (channel_X, X) in safe_izip(channel_inputs, inputs):
                assert channel_X not in g or g[channel_X] is X
                assert channel_X.type == X.type, (channel_X.type, X.type)
                g[channel_X] = X

            if batch_size == 0:
                # No channel does need any data, so there is not need to
                # average results, and we will call the accum functions only
                # once.
                # TODO: better handling of channels not needing data when
                # some other channels need data.
                assert len(self._flat_data_specs[1]) == 0
                val = channel.val
            else:
                # Guard the actual divisor for this channel's dataset.
                if cur_num_examples == 0:
                    raise ValueError("Iterating over 0 examples results in " +
                                     "divide by 0")
                # Weight each batch by its share of the dataset so that the
                # accumulated value is the mean over examples.
                val = (channel.val * T.cast(batch_size, config.floatX)
                       / cur_num_examples)
            u[channel.val_shared] = channel.val_shared + val

        with log_timing(log, "Compiling accum"):
            # Check type of update expressions
            for up in updates:
                for key in up:
                    if key.dtype != up[key].dtype:
                        raise TypeError('Monitoring channel shared variable ' +
                                        key.name + ' has dtype ' + key.dtype +
                                        ' but is driven by an expression ' +
                                        'with type ' + up[key].dtype)

            self.accum = []
            for idx, (g, u) in enumerate(safe_izip(givens, updates)):
                if record_mode:
                    for elem in g:
                        mode.record.handle_line('g key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('g val ' +
                                                var_descriptor(g[elem]) + '\n')
                    for elem in u:
                        mode.record.handle_line('u key ' +
                                                var_descriptor(elem) + '\n')
                        mode.record.handle_line('u val ' +
                                                var_descriptor(u[elem]) + '\n')
                function_name = 'Monitor.accum[%d]' % idx
                if record_mode:
                    mode.record.handle_line('compiling supervised accum\n')
                # Some channels may not depend on the data, ie, they might just
                # monitor the model parameters, or some shared variable updated
                # by the training algorithm, so we need to ignore the unused
                # input error
                # NOTE(review): no on_unused_input argument is passed here;
                # presumably the `function` wrapper in scope handles it --
                # confirm.
                self.accum.append(function(theano_args,
                                           givens=g,
                                           updates=u,
                                           mode=self.theano_function_mode,
                                           name=function_name))
            for a in self.accum:
                if record_mode:
                    for elem in a.maker.fgraph.outputs:
                        mode.record.handle_line('accum output ' +
                                                var_descriptor(elem) + '\n')
                log.info("graph size: %d" % len(a.maker.fgraph.toposort()))
        final_names = dir(self)
        self.register_names_to_del(
            [name for name in final_names if name not in init_names])

    def register_names_to_del(self, names):
        """
        Register names of fields that should be deleted before pickling.

        Parameters
        ----------
        names : list
            A list of attribute names as strings.
        """
        for name in names:
            if name not in self.names_to_del:
                self.names_to_del.append(name)

    def __getstate__(self):
        """
        In order to avoid pickling a copy of the dataset whenever a
        monitor is saved, the __getstate__ method replaces the dataset
        field with the dataset's yaml source. This is not a perfect
        solution because it won't work with job resuming, which would
        require saving the state of the dataset's random number
        generator.

        Like in the Model class, we also need to avoid saving any
        Theano functions, so we delete everything that can be
        regenerated with `redo_theano` by deleting the fields in
        `self.names_to_del`
        """
        # Patch old pickled monitors
        if not hasattr(self, '_datasets'):
            self._datasets = [self._dataset]
            del self._dataset

        # Temporarily swap the datasets for their yaml sources, snapshot
        # __dict__, then restore the live datasets.
        temp = self._datasets

        if self._datasets:
            self._datasets = []
            for dataset in temp:
                if isinstance(dataset, basestring):
                    self._datasets.append(dataset)
                else:
                    try:
                        self._datasets.append(dataset.yaml_src)
                    except AttributeError:
                        warnings.warn('Trained model saved without ' +
                                      'indicating yaml_src')
        d = copy.copy(self.__dict__)
        self._datasets = temp
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        """
        Sets the object to have the state described by `d`.

        Parameters
        ----------
        d : dict
            A dictionary mapping string names of fields to values for
            these fields.
        """
        # patch old pkl files
        if '_dataset' in d:
            d['_datasets'] = [d['_dataset']]
            del d['_dataset']

        self.__dict__.update(d)

    def add_channel(self, name, ipt, val, dataset=None, prereqs=None,
                    data_specs=None):
        """
        Asks the monitor to start tracking a new value.  Can be called
        even after the monitor is already in use.

        Parameters
        ----------
        name : str
            The display name in the monitor.
        ipt : tensor_like
            The symbolic tensor which should be clamped to the data.
            (or a list/tuple containing symbolic tensors, following the
            data_specs)
        val : tensor_like
            The value (function of `ipt`) to be tracked.
        dataset : pylearn2.datasets.Dataset
            Which dataset to compute this channel on
        prereqs : list of callables that take a list of numpy tensors
            Each prereq must be called exactly once per each new batch
            of data drawn *from dataset* before the channel value is
            computed if two channels provide a prereq with exactly the
            same id, that prereq will only be called once
        data_specs : (space, source) pair
            Identifies the order, format and semantics of ipt

        Raises
        ------
        ValueError
            If a default `data_specs` cannot be inferred, if `val` depends
            on an input that is not part of `ipt`, if the dataset is
            missing, ambiguous or unknown to the monitor, or if a channel
            called `name` already exists.
        """
        if isinstance(val, (float, int, long)):
            val = np.cast[theano.config.floatX](val)

        val = T.as_tensor_variable(val)

        if data_specs is None:
            warnings.warn("parameter 'data_specs' should be provided when " +
                          "calling add_channel. We will build a default one.",
                          stacklevel=2)
            if isinstance(ipt, list):
                ipt = tuple(ipt)
            if ipt is not None and not isinstance(ipt, tuple):
                ipt = (ipt, )

            if ipt is None:
                data_specs = (NullSpace(), '')
            elif len(ipt) == 0:
                data_specs = (CompositeSpace([]), ())
            elif hasattr(dataset, 'get_data_specs'):
                dataset_space, dataset_source = dataset.get_data_specs()
                if (len(ipt) == 1 and
                        dataset_source is not None and
                        (not isinstance(dataset_source, tuple) or
                         len(dataset_source) == 1) and
                        'features' in dataset_source):
                    data_specs = (dataset_space, dataset_source)
                elif (len(ipt) == 2 and
                        dataset_source == ('features', 'targets')):
                    data_specs = (dataset_space, dataset_source)
                else:
                    raise ValueError("Cannot infer default data_specs for " +
                                     "the following input points and " +
                                     "dataset: ipt = %s, dataset = %s"
                                     % (ipt, dataset))
            else:
                # Without this branch data_specs would still be None and the
                # validate call below would fail with an opaque TypeError.
                raise ValueError("Cannot infer default data_specs: dataset "
                                 "%s does not provide get_data_specs(); "
                                 "pass data_specs explicitly."
                                 % (dataset,))

        data_specs[0].validate(ipt)

        mapping = DataSpecsMapping(data_specs)
        flat_ipt = mapping.flatten(ipt)
        if not isinstance(flat_ipt, tuple):
            flat_ipt = (flat_ipt, )
        # Every non-shared, non-constant leaf of val's graph must be one of
        # the declared inputs, otherwise it could never be fed.
        inputs = theano.gof.graph.inputs([val])
        for elem in inputs:
            if not hasattr(elem, 'get_value') and \
               not isinstance(elem, theano.gof.graph.Constant):
                if elem not in flat_ipt:
                    raise ValueError("Unspecified input: " + str(elem) +
                                     ". This may be due to an incorrect " +
                                     "implementation of a cost's " +
                                     "get_data_specs() method, or of a " +
                                     "model's get_monitoring_data_specs() " +
                                     "method.")

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            mode.record.handle_line('Adding monitor channel ' + name + '\n')
            assert isinstance(flat_ipt, tuple)
            if len(flat_ipt) != 1:
                for elem in flat_ipt:
                    mode.record.handle_line('Includes input var ' +
                                            var_descriptor(elem) + '\n')
            else:
                mode.record.handle_line(name + ' input var is ' +
                                        var_descriptor(flat_ipt[0]) + '\n')
            mode.record.handle_line('channel ' + name + ' is ' +
                                    var_descriptor(val) + '\n')

        if dataset is None:
            if len(self._datasets) == 1:
                dataset = self._datasets[0]
            elif len(self._datasets) == 0:
                raise ValueError(_err_no_data)
            else:
                raise ValueError(_err_ambig_data)

        try:
            self._datasets.index(dataset)
        except ValueError:
            raise ValueError("The dataset specified is not one of the " +
                             "monitor's datasets")

        if name in self.channels:
            raise ValueError("Tried to create the same channel twice (%s)" %
                             name)

        self.channels[name] = MonitorChannel(ipt, val, name, data_specs,
                                             dataset, prereqs)
        self._dirty = True

    def _sanity_check(self):
        """
        Sometimes we serialize models and then load them somewhere else
        but still try to use their Monitor, and the Monitor is in a
        mangled state. I've added some calls to _sanity_check to try to
        catch when that happens. Not sure what to do for a long term
        fix. I think it requires making theano graphs serializable
        first.
        """
        for name in self.channels:
            channel = self.channels[name]
            assert hasattr(channel, 'prereqs')

    @classmethod
    def get_monitor(cls, model):
        """
        Returns a model's monitor. If the model doesn't have a monitor
        yet, installs one and returns that.

        Parameters
        ----------
        model : object
            An object that implements the `Model` interface specified
            in `pylearn2.models`.
        """
        if hasattr(model, 'monitor'):
            rval = model.monitor
            rval._sanity_check()
        else:
            rval = Monitor(model)
            model.monitor = rval

        return rval

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def batch_size(self):
        """
        Returns
        -------
        batch_size : int
            The size of the batches used for monitoring
        """
        return self._batch_size

    # TODO: find out if this method is used anywhere, remove if not.
    @property
    def num_batches(self):
        """
        Returns
        -------
        num_batches : int
            The number of batches used for monitoring
        """
        return self._num_batches

    def setup(self, dataset, cost, batch_size, num_batches=None,
              extra_costs=None, mode='sequential', obj_prereqs=None,
              cost_monitoring_args=None):
        """
        Sets up the monitor for a cost minimization problem.
        Adds channels defined by both the model and the cost for
        the specified dataset(s), as well as a channel called
        'objective' defined by the costs' __call__ method.

        Parameters
        ----------
        dataset : pylearn2.datasets.Dataset
            Dataset or dictionary mapping string names to Datasets.
            If string names are used, then for every dataset, each
            channel defined by the model or cost will be replicated
            with that dataset's name followed by an underscore as the
            prefix. For example, if your cost defines a channel called
            'misclass', and datasets is
            {'train' : train_dataset, 'valid' : valid_dataset}
            you will get channels called 'train_misclass' and
            'valid_misclass'. If None, this method does nothing.
        cost : pylearn2.costs.Cost
            The cost being optimized by training. The value of the cost
            will appear as the `objective` channel. Its
            `get_monitoring_channels` method will also be used to
            supply other channels.
        batch_size : int
            Batch size forwarded to `add_dataset`.
        num_batches : int, optional
            Number of batches forwarded to `add_dataset`.
        extra_costs : OrderedDict, optional
            A dictionary mapping channel names to Cost objects.
            Their value will appear as the specified channel name.
            They will also provide more monitoring channels via their
            `get_monitoring_channels` method. Must not contain the key
            ''. The mapping is not modified.
        mode : str or object, optional
            Iteration mode forwarded to `add_dataset`.
        obj_prereqs : None, or list of functions
            Functions to pass as prerequisites to the `objective` channel.
        cost_monitoring_args : dict
            Dictionary of kwargs intended for
            `cost.get_monitoring_channels()` (but not for the
            extra_costs). NOTE(review): this method accepts the argument
            but does not currently forward it anywhere -- confirm whether
            that is intended.
        """
        if dataset is None:
            return
        if isinstance(dataset, Dataset):
            dataset = {'': dataset}
        else:
            assert isinstance(dataset, dict)
            assert all(isinstance(key, str) for key in dataset)
            assert all(isinstance(dataset[key], Dataset) for key in dataset)

        if extra_costs is None:
            costs = {}
        else:
            # Work on a copy: the '' entry added below must not leak into
            # the caller's mapping.
            costs = OrderedDict(extra_costs)
        assert '' not in costs
        costs[''] = cost

        if cost_monitoring_args is None:
            cost_monitoring_args = {}

        model = self.model

        # Build a composite data_specs containing the specs for all costs,
        # then the specs of the model
        cost_names = sorted(costs.keys())
        spaces = []
        sources = []
        for c in cost_names:
            c_space, c_source = costs[c].get_data_specs(model)
            spaces.append(c_space)
            sources.append(c_source)

        # Ask the model for the data_specs needed
        m_space, m_source = model.get_monitoring_data_specs()
        spaces.append(m_space)
        sources.append(m_source)

        nested_space = CompositeSpace(spaces)
        nested_sources = tuple(sources)

        # Flatten this data_specs, so we build only one symbolic Theano
        # variable for each of the unique (space, source) pairs.
        mapping = DataSpecsMapping((nested_space, nested_sources))
        space_tuple = mapping.flatten(nested_space, return_tuple=True)
        source_tuple = mapping.flatten(nested_sources, return_tuple=True)
        ipt = tuple(
            space.make_theano_batch(name='monitor_%s' % source,
                                    batch_size=None)
            for (space, source) in safe_zip(space_tuple, source_tuple))

        # Build a nested tuple from ipt, to dispatch the appropriate parts
        # of the ipt batch to each cost
        nested_ipt = mapping.nest(ipt)

        custom_channels = {}
        for i, cost_name in enumerate(cost_names):
            if cost_name == '':
                prefix = ''
            else:
                prefix = cost_name + '_'

            cost = costs[cost_name]
            cost_ipt = nested_ipt[i]
            raw_channels = cost.get_monitoring_channels(model, cost_ipt)
            channels = {}
            for name in raw_channels:
                # We need three things: the value itself (raw_channels[name]),
                # the input variables (cost_ipt), and the data_specs for
                # these input variables ((spaces[i], sources[i]))
                channels[prefix + name] = (raw_channels[name],
                                           cost_ipt,
                                           (spaces[i], sources[i]))
            custom_channels.update(channels)

        # Use the last inputs from nested_ipt for the model
        model_channels = model.get_monitoring_channels(nested_ipt[-1])
        channels = {}
        for name in model_channels:
            # Note: some code used to consider that model_channels[name]
            # could be a a (channel, prereqs) pair, this is not supported.
            channels[name] = (model_channels[name],
                              nested_ipt[-1],
                              (spaces[-1], sources[-1]))
        custom_channels.update(channels)

        if is_stochastic(mode):
            # Fixed seed (one entry, for the single dataset passed to each
            # add_dataset call below). Written without a leading zero, which
            # is a syntax error on Python 3.
            seed = [[2013, 2, 22]]
        else:
            seed = None

        for dataset_name in dataset:
            cur_dataset = dataset[dataset_name]
            self.add_dataset(dataset=cur_dataset,
                             mode=mode,
                             batch_size=batch_size,
                             num_batches=num_batches,
                             seed=seed)
            if dataset_name == '':
                dprefix = ''
            else:
                dprefix = dataset_name + '_'
            # These channel name 'objective' must not vary, since callbacks
            # that respond to the values in the monitor use the name to find
            # it.
            for i, cost_name in enumerate(cost_names):
                cost = costs[cost_name]
                cost_ipt = nested_ipt[i]
                cost_value = cost.expr(model, cost_ipt)
                if cost_value is not None:
                    if cost_name == '':
                        name = dprefix + 'objective'
                        prereqs = obj_prereqs
                    else:
                        name = dprefix + cost_name
                        prereqs = None

                    cost.get_data_specs(model)[0].validate(cost_ipt)
                    self.add_channel(name=name,
                                     ipt=cost_ipt,
                                     val=cost_value,
                                     data_specs=cost.get_data_specs(model),
                                     dataset=cur_dataset,
                                     prereqs=prereqs)

            for key in custom_channels:
                val, ipt, data_specs = custom_channels[key]
                data_specs[0].validate(ipt)
                self.add_channel(name=dprefix + key,
                                 ipt=ipt,
                                 val=val,
                                 data_specs=data_specs,
                                 dataset=cur_dataset)
def get_funcs(self, learning_rate, grads, inp, cost, errors, lr_scalers=None):
    """
    Build the two Theano functions that drive this optimizer.

    The first function evaluates the cost and stores the gradients in
    per-parameter shared buffers; the second reads those buffers and applies
    the adaptive per-parameter updates built below.

    Parameters
    ----------
    learning_rate : float
        Learning rate coefficient. Learning rate is not being used but,
        pylearn2 requires a learning rate to be defined. It is only passed
        as the input list of `f_update`, which is compiled with
        on_unused_input='ignore'; the step scale itself is the constant
        `scaled_lr` below.
        NOTE(review): since it is handed to theano.function as an input,
        this presumably has to be a symbolic variable rather than a plain
        float -- confirm against the caller.
    grads : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    inp : list
        Inputs of the compiled `f_grad_shared` function.
    cost : symbolic expression
        First output of `f_grad_shared`.
    errors : symbolic expression
        Second output of `f_grad_shared`.
    lr_scalers : dict
        A dictionary mapping from the model's parameters to a learning
        rate multiplier. Not read anywhere in this method.

    Returns
    -------
    f_grad_shared : theano function
        Takes `inp`; returns [cost, errors, gnorm, pnorm] and, as a side
        effect, writes each gradient into its shared buffer.
    f_update : theano function
        Takes `learning_rate`; applies all accumulator (and, if
        `self.perform_update` is set, parameter) updates and returns the
        summed L2 norm of the per-parameter update steps.
    """
    updates = OrderedDict({})
    eps = self.damping
    # Counts calls of f_update; used for bias correction and warm-up.
    step = sharedX(0., name="step")
    if self.skip_nan_inf:
        #If norm of the gradients of a parameter is inf or nan don't update that parameter
        #That might be useful for RNNs.
        # The replacement is element-wise: only the inf/nan entries are
        # zeroed.
        grads = OrderedDict({p: T.switch(T.or_(T.isinf(grads[p]),
            T.isnan(grads[p])), 0, grads[p]) for
            p in grads.keys()})

    # Block-normalize gradients:
    nparams = len(grads.keys())

    # Apply the gradient clipping, this is only sometimes
    # necessary for RNNs and sometimes for very deep networks
    if self.grad_clip:
        assert self.grad_clip > 0.
        assert self.grad_clip <= 1., "Norm of the gradients per layer can not be larger than 1."
        # Sum of the per-parameter L2 norms (not the norm of the
        # concatenated gradient).
        gnorm = sum([g.norm(2) for g in grads.values()])
        notfinite = T.or_(T.isnan(gnorm), T.isinf(gnorm))

        for p, g in grads.iteritems():
            # Rescale when the average per-parameter norm exceeds the
            # threshold.
            tmpg = T.switch(gnorm / nparams > self.grad_clip,
                            g * self.grad_clip * nparams / gnorm , g)
            # If the total norm is not finite, fall back to 0.1 * p as the
            # gradient.
            grads[p] = T.switch(notfinite, as_floatX(0.1)*p, tmpg)

    tot_norm_up = 0
    # Shared buffers that carry the gradients from f_grad_shared to
    # f_update.
    gshared = OrderedDict({p: sharedX(p.get_value() * 0.,
                           name='%s_grad' % p.name)
                           for p, g in grads.iteritems()})

    gsup = [(gshared[p], g) for p, g in grads.iteritems()]
    get_norms = lambda x: T.sqrt(sum(map(lambda y: (y**2).sum(), x)))
    gnorm = get_norms(grads.values())
    pnorm = get_norms(grads.keys())
    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup)
    fix_decay = self.slow_decay**(step + 1)

    for param in gshared.keys():
        gshared[param].name = "grad_%s" % param.name
        mean_grad = sharedX(param.get_value() * 0. + eps, name="mean_grad_%s" % param.name)
        gnorm_sqr = sharedX(0.0 + eps, name="gnorm_%s" % param.name)

        prod_taus = sharedX((np.ones_like(param.get_value()) - 2*eps),
                            name="prod_taus_x_t_" + param.name)
        slow_constant = 2.1

        if self.use_adagrad:
            # sum_square_grad := \sum_i g_i^2
            sum_square_grad = sharedX(param.get_value(borrow=True) * 0.,
                                      name="sum_square_grad_%s" % param.name)

        """
           Initialization of accumulators
        """
        taus_x_t = sharedX((np.ones_like(param.get_value()) + eps) * slow_constant,
                           name="taus_x_t_" + param.name)
        # Overwritten on every iteration, so after the loop this refers to
        # the accumulator of the last parameter only.
        self.taus_x_t = taus_x_t

        #Variance reduction parameters
        #Numerator of the gamma:
        gamma_nume_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_nume_sqr_" + param.name)

        #Denominator of the gamma:
        gamma_deno_sqr = sharedX(np.zeros_like(param.get_value()) + eps,
                                 name="gamma_deno_sqr_" + param.name)

        #For the covariance parameter := E[\gamma \alpha]_{t-1}
        cov_num_t = sharedX(np.zeros_like(param.get_value()) + eps,
                            name="cov_num_t_" + param.name)

        # mean_squared_grad := E[g^2]_{t-1}
        mean_square_grad = sharedX(np.zeros_like(param.get_value()) + eps,
                                   name="msg_" + param.name)

        # mean_square_dx := E[(\Delta x)^2]_{t-1}
        mean_square_dx = sharedX(param.get_value() * 0., name="msd_" + param.name)
        # NOTE(review): old_grad is only defined when
        # self.use_corrected_grad is truthy, yet it is read unconditionally
        # in new_gamma_nume_sqr / new_gamma_deno_sqr below -- this looks
        # like a NameError when the flag is off; confirm.
        if self.use_corrected_grad:
            old_grad = sharedX(param.get_value() * 0. + eps)

        #The uncorrected gradient of previous of the previous update:
        old_plain_grad = sharedX(param.get_value() * 0. + eps)
        mean_curvature = sharedX(param.get_value() * 0. + eps)
        mean_curvature_sqr = sharedX(param.get_value() * 0. + eps)

        # Initialize the E[\Delta]_{t-1}
        mean_dx = sharedX(param.get_value() * 0.)

        # Block-wise normalize the gradient:
        norm_grad = gshared[param]

        #For the first time-step, assume that delta_x_t := norm_grad
        gnorm = T.sqr(norm_grad).sum()

        # cond is 1 on the very first update and 0 afterwards; it selects
        # between bootstrap values and the stored accumulators.
        cond = T.eq(step, 0)
        gnorm_sqr_o = cond * gnorm + (1 - cond) * gnorm_sqr
        # Bias-correct the slow moving average of the squared norm.
        gnorm_sqr_b = gnorm_sqr_o / (1 - fix_decay)

        norm_grad = norm_grad / (T.sqrt(gnorm_sqr_b) + eps)
        msdx = cond * norm_grad**2 + (1 - cond) * mean_square_dx
        mdx = cond * norm_grad + (1 - cond) * mean_dx

        new_prod_taus = (
            prod_taus * (1 - 1 / taus_x_t)
        )

        """
            Compute the new updated values.
        """
        # E[g_i^2]_t
        new_mean_squared_grad = (
            mean_square_grad * (1 - 1 / taus_x_t) +
            T.sqr(norm_grad) / (taus_x_t)
        )
        new_mean_squared_grad.name = "msg_" + param.name

        # E[g_i]_t
        new_mean_grad = (
            mean_grad * (1 - 1 / taus_x_t) +
            norm_grad / taus_x_t
        )

        new_mean_grad.name = "nmg_" + param.name

        # Divide the running averages by (1 - new_prod_taus).
        mg = new_mean_grad / (1 - new_prod_taus)
        mgsq = new_mean_squared_grad / (1 - new_prod_taus)

        new_gnorm_sqr = (
            gnorm_sqr_o * self.slow_decay +
            T.sqr(norm_grad).sum() * (1 - self.slow_decay)
        )

        # Keep the rms for numerator and denominator of gamma.
        new_gamma_nume_sqr = (
            gamma_nume_sqr * (1 - 1 / taus_x_t) +
            T.sqr((norm_grad - old_grad) * (old_grad - mg)) / taus_x_t
        )
        new_gamma_nume_sqr.name = "ngammasqr_num_" + param.name

        new_gamma_deno_sqr = (
            gamma_deno_sqr * (1 - 1 / taus_x_t) +
            T.sqr((mg - norm_grad) * (old_grad - mg)) / taus_x_t
        )
        new_gamma_deno_sqr.name = "ngammasqr_den_" + param.name

        # gamma is computed from the stored (previous-step) accumulators,
        # not from the new_* expressions just built.
        gamma = T.sqrt(gamma_nume_sqr) / (T.sqrt(gamma_deno_sqr + eps) + \
                self.gamma_reg)
        gamma.name = "gamma_" + param.name

        if self.gamma_clip and self.gamma_clip > -1:
            gamma = T.minimum(gamma, self.gamma_clip)

        momentum_step = gamma * mg
        corrected_grad_cand = (norm_grad + momentum_step) / (1 + gamma)

        #For starting the variance reduction.
        if self.start_var_reduction > -1:
            # Use the plain normalized gradient until `step` reaches
            # start_var_reduction.
            cond = T.le(self.start_var_reduction, step)
            corrected_grad = cond * corrected_grad_cand + (1 - cond) * norm_grad
        else:
            corrected_grad = norm_grad

        if self.use_adagrad:
            g = corrected_grad
            # Accumulate gradient
            new_sum_squared_grad = (
                sum_square_grad + T.sqr(g)
            )
            rms_g_t = T.sqrt(new_sum_squared_grad)
            # Floor at 1.0 so the division below never enlarges the step.
            rms_g_t = T.maximum(rms_g_t, 1.0)

        #Use the gradients from the previous update
        #to compute the \nabla f(x_t) - \nabla f(x_{t-1})
        cur_curvature = norm_grad - old_plain_grad
        #cur_curvature = theano.printing.Print("Curvature: ")(cur_curvature)
        cur_curvature_sqr = T.sqr(cur_curvature)

        new_curvature_ave = (
            mean_curvature * (1 - 1 / taus_x_t) +
            (cur_curvature / taus_x_t)
        )
        new_curvature_ave.name = "ncurve_ave_" + param.name

        #Average average curvature
        nc_ave = new_curvature_ave / (1 - new_prod_taus)

        new_curvature_sqr_ave = (
            mean_curvature_sqr * (1 - 1 / taus_x_t) +
            (cur_curvature_sqr / taus_x_t)
        )
        new_curvature_sqr_ave.name = "ncurve_sqr_ave_" + param.name

        #Unbiased average squared curvature
        nc_sq_ave = new_curvature_sqr_ave / (1 - new_prod_taus)

        epsilon = 1e-7
        #lr_scalers.get(param, 1.) * learning_rate
        # The step scale is fixed at 1; learning_rate / lr_scalers are not
        # consulted.
        scaled_lr = sharedX(1.0)
        rms_dx_tm1 = T.sqrt(msdx + epsilon)

        rms_curve_t = T.sqrt(new_curvature_sqr_ave + epsilon)

        #This is where the update step is being defined
        delta_x_t = -scaled_lr * (rms_dx_tm1 / rms_curve_t - cov_num_t / (new_curvature_sqr_ave + epsilon))
        delta_x_t.name = "delta_x_t_" + param.name

        # This part seems to be necessary for only RNNs
        # For feedforward networks this does not seem to be important.
        if self.delta_clip:
            logger.info("Clipping will be applied on the adaptive step size.")
            delta_x_t = delta_x_t.clip(-self.delta_clip, self.delta_clip)
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad is disabled.")
                delta_x_t = delta_x_t * corrected_grad
        else:
            logger.info("Clipping will not be applied on the adaptive step size.")
            if self.use_adagrad:
                delta_x_t = delta_x_t * corrected_grad / rms_g_t
            else:
                logger.info("Clipped adagrad will not be used.")
                delta_x_t = delta_x_t * corrected_grad

        new_taus_t = (1 - T.sqr(mdx) / (msdx + eps)) * taus_x_t + sharedX(1 + eps, "stabilized")

        #To compute the E[\Delta^2]_t
        new_mean_square_dx = (
            msdx * (1 - 1 / taus_x_t) +
            (T.sqr(delta_x_t) / taus_x_t)
        )

        #To compute the E[\Delta]_t
        new_mean_dx = (
            mdx * (1 - 1 / taus_x_t) +
            (delta_x_t / (taus_x_t))
        )

        #Perform the outlier detection:
        #This outlier detection is slightly different:
        # When either the gradient or the curvature deviates from its
        # running mean by more than two standard deviations, tau is set to
        # 2.5 if it exceeded 2.5 and is otherwise incremented by 1 + eps.
        new_taus_t = T.switch(T.or_(abs(norm_grad - mg) > (2 * T.sqrt(mgsq - mg**2)),
                                    abs(cur_curvature - nc_ave) > (2 * T.sqrt(nc_sq_ave - nc_ave**2))),
                              T.switch(new_taus_t > 2.5, sharedX(2.5), new_taus_t + sharedX(1.0) + eps), new_taus_t)

        #Apply the bound constraints on tau:
        new_taus_t = T.maximum(self.lower_bound_tau, new_taus_t)
        new_taus_t = T.minimum(self.upper_bound_tau, new_taus_t)

        new_cov_num_t = (
            cov_num_t * (1 - 1 / taus_x_t) +
            (delta_x_t * cur_curvature) * (1 / taus_x_t)
        )

        update_step = delta_x_t
        tot_norm_up += update_step.norm(2)

        # Apply updates
        updates[mean_square_grad] = new_mean_squared_grad
        updates[mean_square_dx] = new_mean_square_dx
        updates[mean_dx] = new_mean_dx
        updates[gnorm_sqr] = new_gnorm_sqr
        updates[gamma_nume_sqr] = new_gamma_nume_sqr
        updates[gamma_deno_sqr] = new_gamma_deno_sqr
        updates[taus_x_t] = new_taus_t
        updates[cov_num_t] = new_cov_num_t
        updates[mean_grad] = new_mean_grad
        updates[old_plain_grad] = norm_grad
        updates[mean_curvature] = new_curvature_ave
        updates[mean_curvature_sqr] = new_curvature_sqr_ave

        # With perform_update off, only the statistics are tracked and the
        # parameters are left untouched.
        if self.perform_update:
            updates[param] = param + update_step

        # Same key on every iteration, so step advances once per f_update
        # call.
        updates[step] = step + 1
        updates[prod_taus] = new_prod_taus

        if self.use_adagrad:
            updates[sum_square_grad] = new_sum_squared_grad

        if self.use_corrected_grad:
            updates[old_grad] = corrected_grad

    f_update = theano.function([learning_rate],
                               [tot_norm_up],
                               updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
class Optimizer(object):
    """
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process.
    """
    def __init__(self, model, dataset,
                 n_epoch=1000, batch_size=100, minimum_batch_size=1,
                 save_frequency=10, early_stop_threshold=.9995, early_stop_length=30,
                 learning_rate=1e-3, lr_decay='exponential', lr_factor=1,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        model : Model
            The Model to train.
        dataset : Dataset
            The Dataset to use when training the Model.
        n_epoch : int
            How many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        minimum_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_frequency : int
            How many epochs to train between each new save of the Model's parameters.
            None is replaced by 1000000.
        early_stop_threshold : float
            The factor by how much the best validation training score needs to improve to determine early stopping.
            None is replaced by 1.
        early_stop_length : int
            The patience or number of epochs to wait after the early_stop_threshold has been reached before stopping.
            None is replaced by 100.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The type of decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for options. A falsy value disables learning rate decay.
        lr_factor : float
            The amount to use for the decay function when changing the learning rate over epochs. See
            `opendeep.utils.decay` for its effect for given decay functions.
        """
        log.info("Initializing optimizer %s", str(type(self)))

        if early_stop_threshold is None:
            early_stop_threshold = 1.
        if save_frequency is None:
            save_frequency = 1000000
        if early_stop_length is None:
            early_stop_length = 100

        # snapshot the configuration arguments - no new local variables may be introduced above this line,
        # otherwise they would leak into the recorded args.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("optimizer config args: %s", str(self.args))

        assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!"
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!"
        self.model = model
        self.dataset = dataset

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.lr_scalers = self.model.get_lr_scalers()
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          self.learning_rate.get_value(),
                                                          lr_factor)
        else:
            self.learning_rate_decay = False

        self.noise_switches = raise_to_list(self.model.get_noise_switch())
        self.batch_size = batch_size
        self.minimum_batch_size = minimum_batch_size
        self.n_epoch = n_epoch
        self.save_frequency = save_frequency
        self.early_stop_threshold = early_stop_threshold
        self.early_stop_length = early_stop_length

    @staticmethod
    def _batch_slices(lengths, batch_size, minimum_batch_size):
        """
        Computes (start, end) batch slices over the concatenation of sequences with the given lengths.

        Parameters
        ----------
        lengths : iterable(int)
            Number of examples in each sequence, in concatenation order.
        batch_size : int
            Number of examples in a full batch.
        minimum_batch_size : int
            Smallest leftover (partial) batch that is still emitted.

        Returns
        -------
        list((int, int))
            Batch slices; no slice ever spans two sequences.
        """
        batch_indices = []
        start_idx = 0
        for length in lengths:
            # floor division: number of whole batches in this sequence, plus the leftover examples
            n_batches, remainder = divmod(int(length), batch_size)
            for _ in range(n_batches):
                end_idx = start_idx + batch_size
                batch_indices.append((start_idx, end_idx))
                start_idx = end_idx
            end_idx = start_idx + remainder
            # only keep the leftover examples if there are enough of them
            if remainder >= minimum_batch_size:
                batch_indices.append((start_idx, end_idx))
            # always advance past the leftovers so the next sequence starts at its own boundary
            start_idx = end_idx
        return batch_indices

    def _get_batch_indices(self, data_lengths):
        """
        Computes the tuples of (start_index, end_index) that represent the appropriate slices of the concatenated
        dataset with regards to the given data_lengths. This allows for lists of data lengths to represent sequences,
        so that the concatenated batches returned do not overstep the start of a new sequence.

        Parameters
        ----------
        data_lengths : list(int) or int
            List of num_examples for each dataset (the length of the datasets - this is a list in the case of
            sequences).

        Returns
        -------
        list((int, int))
            List of tuples (start, end) representing the batch slices for the total dataset if it were concatenated.
        """
        return self._batch_slices(raise_to_list(data_lengths),
                                  int(self.batch_size),
                                  self.minimum_batch_size)

    def _get_givens_subset(self, subset, batch_slice):
        """
        This translates a batch slice of start and end indices into the actual data from the given subset.

        Parameters
        ----------
        subset : int
            The subset to use - determined in opendeep.data.datasets as TRAIN, VALID, or TEST attributes.
        batch_slice : symbolic slice
            The symbolic slice to grab from the data.

        Returns
        -------
        OrderedDict
            The givens to provide to a function where it sets the input variable to the actual batch representation
            of data from the dataset: (input_variable: data[batch]). None if the dataset has no such subset.

        Raises
        ------
        AssertionError
            If the model requires targets but the subset has no labels.
        """
        # translate the data_idx into the givens for the model
        # first get the lists of input variables the model requires - inputs and targets
        model_inputs = raise_to_list(self.model.get_inputs())
        model_targets = raise_to_list(self.model.get_targets())
        subset_data = self.dataset.getSubset(subset)
        if subset_data[0] is None:
            log.warning("Dataset doesn't have subset %s", get_subset_strings(subset))
            return None

        # grab the data and labels
        data, labels = subset_data
        # create the givens for the input function as pairs of (input_variable: sliced_data)
        givens = OrderedDict(zip(model_inputs, [data[batch_slice]]))
        # include labels as well if they are required by the model
        if model_targets:
            if labels is None:
                log.error("No labels in the dataset!")
                raise AssertionError("No lables in the dataset!")
            givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]])))
        return givens

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed) learning rate.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their gradients.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters, to their new values after a single iteration of the
            learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in six.iteritems(gradients):
            # per-parameter learning rate scaling (defaults to 1 when the model gives no scaler)
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    def get_lr_monitor(self):
        """
        Returns a monitor dictionary to the Optimizer's learning rate.

        Returns
        -------
        dict
            Mapping 'learning_rate' to `self.learning_rate` shared variable.
        """
        return {'learning_rate': self.learning_rate}

    def train(self, monitor_channels=None, train_outservice=None, plot=None, continue_training=False):
        """
        This method performs the training!!!
        It is an online training method that goes over minibatches from the dataset for a number of epochs,
        updating parameters after each minibatch.

        You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just outputs
            to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        continue_training : bool
            Whether to continue training from a previous point.
        """
        ###############################################
        # theano index variable to use on the dataset #
        ###############################################
        # index to a [mini]batch - both start and end
        data_idx = T.iscalar('data_index')
        data_end_idx = T.iscalar('data_end_index')
        function_input = [data_idx, data_end_idx]
        batch_slice = slice(data_idx, data_end_idx)

        # compute number of minibatches for training, validation and testing
        # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
        # could be a list of shared variables (like multiple sequences from files)
        train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
        valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
        test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

        # train_batches is going to be lists of tuples that contain the start and end indices for train data.
        # this is more useful in the case of datasets that are lists of sequences, so that the start and end
        # indices can make sure a batch does not cross the sequence boundary on the concatenated data
        train_data_lens = [shape[0] for shape in train_data_shapes]
        self.train_batches = self._get_batch_indices(train_data_lens)

        if valid_data_shapes is not None:
            valid_data_lens = [shape[0] for shape in valid_data_shapes]
            self.valid_batches = self._get_batch_indices(valid_data_lens)
        else:
            self.valid_batches = None
        if test_data_shapes is not None:
            test_data_lens = [shape[0] for shape in test_data_shapes]
            self.test_batches = self._get_batch_indices(test_data_lens)
        else:
            self.test_batches = None

        # create the givens for the input function as pairs of (input_variable: sliced_data)
        train_givens = self._get_givens_subset(TRAIN, batch_slice)
        valid_givens = self._get_givens_subset(VALID, batch_slice)
        test_givens = self._get_givens_subset(TEST, batch_slice)

        # Now time to create the gradient updates for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        train_updates = []
        self.gradients = []
        for train_cost in train_costs:
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            gradients, _ = self.model.get_gradient(cost=train_cost)
            self.gradients.append(gradients)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD!
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            updates = self.model.get_updates()
            if updates:
                updates.update(gradient_updates)
            else:
                updates = gradient_updates
            train_updates.append(updates)

        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels

        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            redundant_files = (self.train_outservice.valid_filename, self.train_outservice.test_filename)
            for redundant_file in redundant_files:
                # the files may already be gone (e.g. train() called again with the same service)
                if os.path.exists(redundant_file):
                    os.remove(redundant_file)

        #######################################
        # compile train and monitor functions #
        #######################################
        train_functions = []
        for i, (updates, train_cost) in enumerate(zip(train_updates, train_costs)):
            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...',
                     i + 1, len(train_updates), str(type(self.model)))
            t = time.time()

            # list(...) is required: on Python 3 dict.values() is a view and cannot be added to a list
            f_learn = function(inputs=function_input,
                               updates=updates,
                               outputs=[train_cost] + list(self.train_monitors_dict.values()),
                               givens=train_givens,
                               name='f_learn_%d' % i)

            log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
            train_functions.append(f_learn)

        # figure out if we want valid and test
        self.valid_flag = (self.dataset.getSubset(VALID)[0] is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.getSubset(TEST)[0] is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                givens=valid_givens,
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                givens=test_givens,
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
        # this list of training functions was compiled above, one per training cost
        start_time = time.time()
        for func_i, train_function in enumerate(train_functions):
            log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                     str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch, str(continue_training))

            log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
            if self.dataset.getSubset(VALID)[0] is not None:
                log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
            if self.dataset.getSubset(TEST)[0] is not None:
                log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

            self.STOP = False
            self.epoch_counter = 0
            if not continue_training:
                # reset any decay params
                for decay_param in self.get_decay_params():
                    decay_param.reset()

            self.times = []
            self.best_cost = numpy.inf
            self.best_params = None
            self.patience = 0

            t = time.time()

            while not self.STOP:
                try:
                    self.STOP = self._perform_one_epoch(train_function, plot)
                except KeyboardInterrupt:
                    log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                    self.STOP = True

            # save params
            if self.best_params is not None:
                log.debug("Restoring best model parameters...")
                set_shared_values(self.params, self.best_params)
            log.debug("Saving model parameters...")
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

        log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
                 str(type(self.model)), make_time_units_string(time.time() - start_time))

    def _evaluate_monitors(self, monitor_function, batches, monitors_dict, outservices, subset, label, plot=None):
        """
        Runs a compiled monitor function over every batch of a subset and reports the per-monitor mean values
        to the log, the monitors' outservices, and the plot (if given).

        Parameters
        ----------
        monitor_function : function
            Compiled function taking (batch_start, batch_end) and returning the monitor values.
        batches : list((int, int))
            The (start, end) batch slices to evaluate.
        monitors_dict : OrderedDict
            Mapping of monitor name to expression; its key order matches the function's outputs.
        outservices : dict
            Mapping of monitor name to the outservice that should receive its mean value.
        subset : int
            The subset identifier handed to the outservices (VALID or TEST).
        label : str
            Human-readable subset name used in the log message.
        plot : Plot, optional
            The Plot object to send the mean values to.
        """
        names = list(monitors_dict.keys())
        collected = {name: [] for name in names}
        for batch_start, batch_end in batches:
            outs = raise_to_list(monitor_function(batch_start, batch_end))
            for name, val in zip(names, outs):
                collected[name].append(val)

        # get the mean values for the batches
        mean_monitors = {name: numpy.mean(vals, 0) for name, vals in collected.items()}
        # log the mean values!
        log.info('%s monitors: %s', label, str(mean_monitors))
        # send the values to their outservices
        for name, service in outservices.items():
            if name in mean_monitors and service:
                service.write(mean_monitors[name], subset)
        # if there is a plot, also send them over!
        if plot:
            plot.update_plots(epoch=self.epoch_counter, monitors=mean_monitors)

    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.

        Parameters
        ----------
        f_learn : function
            The compiled training function taking (batch_start, batch_end).
        plot : Plot, optional
            The Plot object to send monitor values to.

        Returns
        -------
        bool
            Whether training should stop after this epoch (max epochs reached or patience exhausted).
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # remember the switch values so they can be restored no matter how the epoch ends
        switch_vals = [switch.get_value() for switch in self.noise_switches]
        try:
            # set the noise switches on for training function! (this is where things like dropout happen)
            # this happens every epoch, because the switches are restored to their old values at the end
            # of each epoch.
            if len(self.noise_switches) > 0:
                log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
                for switch in self.noise_switches:
                    switch.set_value(1.)

            # train
            monitor_names = list(self.train_monitors_dict.keys())
            train_costs = []
            train_monitors = {name: [] for name in monitor_names}
            for batch_start, batch_end in self.train_batches:
                _outs = raise_to_list(f_learn(batch_start, batch_end))
                train_costs.append(_outs[0])
                # handle any user defined monitors (outputs after the cost, in monitor order)
                for name, val in zip(monitor_names, _outs[1:]):
                    train_monitors[name].append(val)

            # get the mean values for the batches
            mean_train = numpy.mean(train_costs, 0)
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
            # log the mean values!
            log.info('Train cost: %s', trunc(mean_train))
            if len(current_mean_monitors) > 0:
                log.info('Train monitors: %s', str(current_mean_monitors))
            # send the values to their outservices
            if self.train_outservice:
                self.train_outservice.write(mean_train, TRAIN)
            for name, service in self.train_monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], TRAIN)
            # if there is a plot, also send them over!
            if plot:
                current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

            # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
            if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag):
                log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
                for switch in self.noise_switches:
                    switch.set_value(0.)

            # valid
            if self.valid_flag:
                self._evaluate_monitors(self.valid_monitor_function, self.valid_batches,
                                        self.valid_monitors_dict, self.valid_monitors_outservice_dict,
                                        VALID, 'Valid', plot)

            # test
            if self.test_flag:
                self._evaluate_monitors(self.test_monitor_function, self.test_batches,
                                        self.test_monitors_dict, self.test_monitors_outservice_dict,
                                        TEST, 'Test', plot)

            # check for early stopping on train costs
            cost = numpy.sum(train_costs)
            if cost < self.best_cost * self.early_stop_threshold:
                self.patience = 0
                self.best_cost = cost
                # save the parameters that made it the best
                self.best_params = get_shared_values(self.params)
            else:
                self.patience += 1

            # check for stopping either from n_epochs or from threshold/patience
            stop = False
            if self.epoch_counter >= self.n_epoch:
                log.info("Stopping (reached max number of epochs)...")
                stop = True
            if self.patience >= self.early_stop_length:
                log.info("Stopping early (reached stop threshold)...")
                stop = True

            timing = time.time() - t
            self.times.append(timing)

            log.info('time: ' + make_time_units_string(timing))
            log.debug('remaining time: ' +
                      make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

            if (self.epoch_counter % self.save_frequency) == 0:
                # save params
                self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

            # ANNEAL!
            if not stop:
                # perform the appropriate decay on the decay functions/parameters for this optimizer and model
                for decay_param in self.get_decay_params():
                    decay_param.decay()
        finally:
            # reset the switches (also when the epoch is interrupted part-way through)
            for switch, val in zip(self.noise_switches, switch_vals):
                switch.set_value(val)

        # return whether or not to stop this epoch
        return stop

    def get_decay_params(self):
        """
        Returns a list of all the Decay objects to decay during training.

        Returns
        -------
        list
            List of Decay objects to use after each training epoch - the model's decay parameters followed by
            the learning rate decay (when one is configured).
        """
        # copy so that appending below never mutates a list owned by the model
        decay_params = list(self.model.get_decay_params())
        learning_rate_decay = getattr(self, 'learning_rate_decay', None)
        if learning_rate_decay:
            decay_params.append(learning_rate_decay)
        return decay_params