def main():
    var = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True,
                           out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])
    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=train_collapsed.values(), updates=updates)
    f2 = theano.function(inputs=[], outputs=valid_collapsed.values(), updates=updates)
    log.debug('done')

    t1 = time.time()

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], TRAIN)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], VALID)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
def get_gradients(self, model, data, **kwargs):
    cost = self.expr(model=model, data=data, **kwargs)
    params = list(model.get_params())
    grads = T.grad(cost, params, disconnected_inputs='ignore')
    gradients = OrderedDict(izip(params, grads))
    if self.gradient_clipping:
        # global L2 norm across all gradients
        norm_gs = 0.
        for grad in gradients.values():
            norm_gs += (grad ** 2).sum()
        not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
        norm_gs = T.sqrt(norm_gs)
        # rescaling factor: shrink only when the norm exceeds max_magnitude
        norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                           self.max_magnitude / norm_gs,
                           1.)
        for param, grad in gradients.items():
            # fall back to decaying the parameter toward zero if the norm is not finite
            gradients[param] = T.switch(not_finite, .1 * param, grad * norm_gs)
    updates = OrderedDict()
    return gradients, updates
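# The clipping above rescales the whole gradient set once its global L2 norm
# reaches `max_magnitude`, and substitutes a gradient that decays parameters
# toward zero when the norm is not finite. A minimal numpy sketch of the same
# rule (illustrative names, not library code):
import numpy as np

def clip_by_global_norm(grads, params, max_magnitude):
    norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    if not np.isfinite(norm):
        # non-finite gradients: return gradients that pull parameters toward zero
        return [0.1 * p for p in params]
    if norm >= max_magnitude:
        # rescale so the global norm equals max_magnitude
        return [g * (max_magnitude / norm) for g in grads]
    return grads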
def orderings(self):
    """
    Return dict d s.t. d[node] is a list of nodes that must be
    evaluated before node itself can be evaluated.

    This is used primarily by the destroy_handler feature to ensure that
    all clients of any destroyed inputs have already computed their outputs.

    :note: This only calls the orderings() function on all features. It does
        not take care of computing the dependencies by itself.
    """
    ords = OrderedDict()
    assert isinstance(self._features, list)
    for feature in self._features:
        if hasattr(feature, 'orderings'):
            orderings = feature.orderings(self)
            if not isinstance(orderings, OrderedDict):
                raise TypeError("Non-deterministic return value from " +
                                str(feature.orderings) +
                                ". Nondeterministic object is " +
                                str(orderings))
            for node, prereqs in orderings.items():
                if not isinstance(prereqs, (list, OrderedSet)):
                    raise TypeError(
                        "prereqs must be a type with a "
                        "deterministic iteration order, or toposort "
                        "will be non-deterministic.")
                ords.setdefault(node, []).extend(prereqs)
    # eliminate duplicate prereqs while keeping a deterministic order
    for (node, prereqs) in ords.items():
        ords[node] = list(OrderedSet(prereqs))
    return ords
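# Usage sketch: a feature only needs an `orderings(fgraph)` method returning an
# OrderedDict of node -> prereq list to participate above. Hypothetical
# example, not part of the library:
from collections import OrderedDict

class SequencedFeature(object):
    """Forces node_b to be evaluated only after node_a."""
    def __init__(self, node_a, node_b):
        self.node_a, self.node_b = node_a, node_b

    def orderings(self, fgraph):
        return OrderedDict([(self.node_b, [self.node_a])])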
class StemCell(NonlinCell):
    """
    WRITEME

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 name,
                 parent=[],
                 parent_dim=[],
                 nout=None,
                 init_W=InitCell('randn'),
                 init_b=InitCell('zeros'),
                 cons=0.,
                 use_bias=1,
                 lr_scaler=None,
                 **kwargs):
        super(StemCell, self).__init__(**kwargs)
        if name is None:
            name = self.__class__.__name__.lower()
        self.name = name
        self.nout = nout
        self.init_W = init_W
        self.init_b = init_b
        self.cons = cons
        self.parent = OrderedDict()
        parent_dim = tolist(parent_dim)
        for i, par in enumerate(tolist(parent)):
            if len(parent_dim) != 0 and len(parent) != 0:
                if len(parent) != len(parent_dim):
                    raise AssertionError("The number of parents and parent "
                                         "dimensions should match, otherwise "
                                         "the model will contain a bug.")
                self.parent[par] = parent_dim[i]
            else:
                self.parent[par] = None
        self.lr_scaler = lr_scaler
        self.use_bias = use_bias

    def fprop(self):
        raise NotImplementedError(
            str(type(self)) + " does not implement Layer.fprop.")

    def initialize(self):
        params = OrderedDict()
        for parname, parout in self.parent.items():
            W_shape = (parout, self.nout)
            W_name = 'W_' + parname + '__' + self.name
            params[W_name] = self.init_W.get(W_shape)
        if self.use_bias:
            params['b_' + self.name] = self.init_b.get(self.nout)
        return params
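# Parameter naming sketch for the initialize() convention above: a layer named
# 'h1' with parents 'x' (dim 784) and 'h0' (dim 512) and nout=256 would get
# (hypothetical dims for illustration):
#   params['W_x__h1']   # shape (784, 256)
#   params['W_h0__h1']  # shape (512, 256)
#   params['b_h1']      # shape (256,)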
class RecurrentLayer(StemCell):
    """
    Abstract class for recurrent layers

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 recurrent=[],
                 recurrent_dim=[],
                 self_recurrent=1,
                 clip_gradient=True,
                 clip_bound=5,
                 init_U=InitCell('ortho'),
                 **kwargs):
        super(RecurrentLayer, self).__init__(**kwargs)
        self.recurrent = OrderedDict()
        if self_recurrent:
            self.recurrent[self.name] = self.nout
        recurrent_dim = tolist(recurrent_dim)
        for i, rec in enumerate(tolist(recurrent)):
            if len(recurrent_dim) != 0:
                self.recurrent[rec] = recurrent_dim[i]
            else:
                self.recurrent[rec] = None
        self.clip_gradient = clip_gradient
        self.clip_bound = clip_bound
        self.init_U = init_U

    def get_init_state(self, batch_size):
        state = T.zeros((batch_size, self.nout), dtype=theano.config.floatX)
        state = T.unbroadcast(state, *range(state.ndim))
        return state

    def initialize(self):
        self.params = super(RecurrentLayer, self).initialize()
        for recname, recout in self.recurrent.items():
            U_shape = (recout, self.nout)
            U_name = 'U_' + recname + '__' + self.name
            self.alloc(self.init_U.get(U_shape, U_name))
        return self.params
class RecurrentLayer(StemCell):
    """
    Abstract class for recurrent layers

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 recurrent=[],
                 recurrent_dim=[],
                 self_recurrent=1,
                 init_U=InitCell('ortho'),
                 **kwargs):
        super(RecurrentLayer, self).__init__(**kwargs)
        self.recurrent = OrderedDict()
        if self_recurrent:
            self.recurrent[self.name] = self.nout
        recurrent_dim = tolist(recurrent_dim)
        for i, rec in enumerate(tolist(recurrent)):
            if len(recurrent_dim) != 0:
                self.recurrent[rec] = recurrent_dim[i]
            else:
                self.recurrent[rec] = None
        self.init_U = init_U

    def get_init_state(self, batch_size):
        state = T.zeros((batch_size, self.nout), dtype=theano.config.floatX)
        # make every axis non-broadcastable so Theano raises an error instead
        # of silently broadcasting a dimension of length 1
        state = T.unbroadcast(state, *range(state.ndim))
        return state

    def initialize(self):
        params = super(RecurrentLayer, self).initialize()
        for recname, recout in self.recurrent.items():
            U_shape = (recout, self.nout)
            U_name = 'U_' + recname + '__' + self.name
            params[U_name] = self.init_U.get(U_shape)
        return params
class RecurrentLayer(StemCell):
    """
    Abstract class for recurrent layers

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 recurrent=[],
                 recurrent_dim=[],
                 skip_list=[],
                 use_fast_fprop=0,
                 self_recurrent=1,
                 init_state_cons=0.,
                 init_U=InitCell('ortho'),
                 **kwargs):
        super(RecurrentLayer, self).__init__(**kwargs)
        self.recurrent = OrderedDict()
        if self_recurrent:
            self.recurrent[self.name] = self.nout
        recurrent_dim = tolist(recurrent_dim)
        for i, rec in enumerate(tolist(recurrent)):
            if len(recurrent_dim) != 0:
                self.recurrent[rec] = recurrent_dim[i]
            else:
                self.recurrent[rec] = None
        self.init_U = init_U
        self.init_states = OrderedDict()
        self.init_state_cons = init_state_cons
        self.use_fast_fprop = use_fast_fprop
        self.skip_list = tolist(skip_list)
        if len(self.skip_list) > 0:
            if len(self.skip_list) != len(self.parent):
                raise ValueError("length of parents and skip list should match")

    def get_init_state(self, batch_size):
        state = T.zeros((batch_size, self.nout), dtype=theano.config.floatX) + self.init_state_cons
        state = T.unbroadcast(state, *range(state.ndim))
        return state

    def initialize(self):
        super(RecurrentLayer, self).initialize()
        for recname, recout in self.recurrent.items():
            U_shape = (recout, self.nout)
            U_name = 'U_' + recname + '__' + self.name
            self.alloc(self.init_U.get(U_shape, U_name))
class RecurrentLayer(StemCell):
    """
    Abstract class for recurrent layers

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 batch_size,
                 recurrent=[],
                 recurrent_dim=[],
                 self_recurrent=1,
                 init_state_cons=0.,
                 init_U=InitCell('ortho'),
                 **kwargs):
        super(RecurrentLayer, self).__init__(**kwargs)
        self.recurrent = OrderedDict()
        if self_recurrent:
            self.recurrent[self.name] = self.nout
        recurrent_dim = tolist(recurrent_dim)
        for i, rec in enumerate(tolist(recurrent)):
            if len(recurrent_dim) != 0:
                self.recurrent[rec] = recurrent_dim[i]
            else:
                self.recurrent[rec] = None
        self.batch_size = batch_size
        self.init_U = init_U
        self.init_states = OrderedDict()
        self.init_state_cons = init_state_cons

    def get_init_state(self, batch_size=None):
        if batch_size is None:
            batch_size = self.batch_size
        state = T.zeros((batch_size, self.nout)) + self.init_state_cons
        state = T.unbroadcast(state, *range(state.ndim))
        return state

    def initialize(self):
        super(RecurrentLayer, self).initialize()
        for recname, recout in self.recurrent.items():
            U_shape = (recout, self.nout)
            U_name = 'U_' + recname + '__' + self.name
            self.alloc(self.init_U.get(U_shape, U_name))
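# All four RecurrentLayer variants default to InitCell('ortho') for the
# recurrent weight U. A common recipe for a square orthogonal init, shown as a
# hedged numpy sketch rather than the actual InitCell internals:
import numpy as np

def ortho_init(dim, rng=np.random):
    # QR decomposition of a Gaussian matrix yields an orthogonal Q
    q, r = np.linalg.qr(rng.randn(dim, dim))
    # sign correction makes Q uniformly distributed over orthogonal matrices
    q *= np.sign(np.diag(r))
    return q.astype('float32')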
def get_updates(self, grads):
    """
    Adam parameter updates, with the lambda**t decay schedule on beta1 and
    optional post-update gradient norm clipping.
    """
    updates = OrderedDict()
    g_tt = OrderedDict()
    cnt = sharedX(0, 'counter')
    for p, g in grads.items():
        lr_scaler = self.lr_scalers.get(str(p), 1.)
        # running estimates of the first and second moments of the gradient
        m = sharedX(p.get_value() * 0.)
        v = sharedX(p.get_value() * 0.)
        b1 = self.b1 * self.lambd**cnt
        m_t = b1 * m + (1 - b1) * g
        v_t = self.b2 * v + (1 - self.b2) * g**2
        # bias-corrected moment estimates
        m_t_hat = m_t / (1. - self.b1**(cnt + 1))
        v_t_hat = v_t / (1. - self.b2**(cnt + 1))
        g_t = m_t_hat / (T.sqrt(v_t_hat) + self.e)
        p_t = p - lr_scaler * self.lr * g_t
        g_tt[p] = g_t
        updates[m] = m_t
        updates[v] = v_t
        updates[p] = p_t
    if self.post_clip:
        # clip the norm of the Adam-scaled gradients and redo the parameter step
        g_norm = sum([T.sqr(x / self.batch_size).sum() for x in g_tt.values()])
        not_finite = T.or_(T.isnan(g_norm), T.isinf(g_norm))
        g_norm = T.sqrt(g_norm)
        scaler = self.scaler / T.maximum(self.scaler, g_norm)
        for p, g in g_tt.items():
            lr_scaler = self.lr_scalers.get(str(p), 1.)
            p_t = p - lr_scaler * self.lr * g * scaler
            updates[p] = p_t
    updates[cnt] = cnt + 1
    return updates
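# This is the Adam update rule (Kingma & Ba, 2014) with the beta1 decay
# schedule lambda**t from the paper. A numeric single-parameter sketch of the
# core rule (clipping branch omitted, illustrative only):
import numpy as np

def adam_step(p, g, m, v, t, lr=1e-3, b1=0.9, b2=0.999, lambd=1. - 1e-8, e=1e-8):
    b1_t = b1 * lambd ** t              # decayed beta1 for step t (t starts at 0)
    m = b1_t * m + (1 - b1_t) * g       # first-moment (mean) estimate
    v = b2 * v + (1 - b2) * g ** 2      # second-moment (uncentered variance) estimate
    m_hat = m / (1. - b1 ** (t + 1))    # bias corrections
    v_hat = v / (1. - b2 ** (t + 1))
    p = p - lr * m_hat / (np.sqrt(v_hat) + e)
    return p, m, v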
class Optimizer(object):
    """
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process. The base framework for performing
    stochastic gradient descent.
    """
    def __init__(self, dataset, loss=None, model=None,
                 epochs=1000, batch_size=100, min_batch_size=1,
                 save_freq=10, stop_threshold=None, stop_patience=50,
                 learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
                 grad_clip=None, hard_clip=False,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` to use when training the Model.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
        model : Model
            The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
            Model's .train() method.
        epochs : int
            How many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int, optional
            How many epochs to train between each new save of the Model's parameters.
        stop_threshold : float, optional
            The factor by how much the best validation training score needs to improve to determine early stopping.
        stop_patience : int, optional
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for classes of decay and documentation.
        lr_decay_factor : float
            The amount of decay to use for the ``lr_decay`` type of decay.
        grad_clip : float, optional
            Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff
            or rescaling.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        """
        log.info("Initializing optimizer %s", str(self.__class__.__name__))

        # Deal with early stopping None initializations (no early stopping).
        if not stop_threshold:
            stop_threshold = numpy.inf
        if not save_freq:
            save_freq = 1000000
        if not stop_patience:
            stop_patience = 1

        # Put all init parameters in self.args so we can log the initial configuration.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("Optimizer config args: %s", str(self.args))

        # if the optimizer wasn't initialized with a Model (train() being called from the model class itself),
        # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there
        # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
        # the best option because other methods besides stochastic ones can exist for optimizers in the future.
        # TODO: fix this up - feels like a hack just to make model.train() work...
        if not model:
            return
        # Otherwise, things are proceeding as normal. Carry on...

        assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                         "Found %s" % str(model.__class__.__name__)
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                             "Found %s" % str(dataset.__class__.__name__)

        # deal with loss expression/targets
        if loss is not None:
            assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                           "Found %s" % str(loss.__class__.__name__)
        if isinstance(loss, Loss):
            self.loss_targets = loss.get_targets()
            self.loss_expression = loss.get_loss()
        else:
            assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented."
            if isinstance(model.get_loss(), tuple):
                self.loss_targets = raise_to_list(model.get_loss()[0])
                self.loss_expression = model.get_loss()[1]
            else:
                self.loss_targets = None
                self.loss_expression = model.get_loss()

        model_inputs = raise_to_list(model.get_inputs())
        n_model_inputs = len(model_inputs)
        model_targets = self.loss_targets or []
        for input in model_inputs:
            if input in model_targets:
                model_targets.remove(input)
        n_model_targets = len(model_targets)
        self.unsupervised = (n_model_targets == 0)

        # make sure the number of inputs/targets matches up with the dataset properties
        # train
        assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \
            "Dataset has %d train inputs, while model expects %d" % \
            (len(raise_to_list(dataset.train_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \
                "Dataset has %d train targets, while model expects %d" % \
                (len(raise_to_list(dataset.train_targets) or []), n_model_targets)
        # valid
        if dataset.valid_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \
                "Dataset has %d valid inputs, while model expects %d" % \
                (len(raise_to_list(dataset.valid_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \
                    "Dataset has %d valid targets, while model expects %d" % \
                    (len(raise_to_list(dataset.valid_targets) or []), n_model_targets)
        # test
        if dataset.test_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \
                "Dataset has %d test inputs, while model expects %d" % \
                (len(raise_to_list(dataset.test_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \
                    "Dataset has %d test targets, while model expects %d" % \
                    (len(raise_to_list(dataset.test_targets) or []), n_model_targets)

        # now we are happy, we can add them to `self`
        self.model = model
        self.dataset = dataset
        self.loss = loss

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        # whether to scale individual model parameters' learning rates.
        self.lr_scalers = self.model.get_lr_scalers()
        # whether to decay
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          learning_rate,
                                                          lr_decay_factor)
        else:
            self.learning_rate_decay = False

        # rest of initial parameters needed for training.
        self.batch_size = batch_size
        self.min_batch_size = min_batch_size
        self.n_epoch = epochs
        self.save_frequency = save_freq
        self.early_stop_threshold = stop_threshold
        self.early_stop_length = stop_patience
        self.grad_clip = grad_clip
        self.hard_clip = hard_clip

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed)
        learning rate.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their gradients.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in six.iteritems(gradients):
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    def train(self, monitor_channels=None, train_outservice=None, plot=None):
        """
        This method performs the training!!! It is an online training method that goes over minibatches
        from the dataset for a number of epochs, updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just
            outputs to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model "
                                 "param and calling optimizer.train().)")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        gradients = grad(cost=self.loss_expression, wrt=list(self.params.values()))
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(list(self.params.values()), gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])
        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()
        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')
        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------", self.model._classname, self.n_epoch)
        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()
        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()
        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            for best_param, param_value in self.best_params.items():
                self.params[best_param].set_value(param_value, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if not self.model.switches_on:
            self.model.turn_on_switches()

        #########
        # train #
        #########
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        train_data = [
            minibatch(input_data, self.batch_size, self.min_batch_size)
            for input_data in raise_to_list(self.dataset.train_inputs)
        ]
        if self.dataset.train_targets is not None and not self.unsupervised:
            train_data += [
                minibatch(target, self.batch_size, self.min_batch_size)
                for target in raise_to_list(self.dataset.train_targets)
            ]

        for batch in min_normalized_izip(*train_data):
            _outs = raise_to_list(f_learn(*batch))
            train_costs.append(_outs[0])
            # handle any user defined monitors
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        if self.train_outservice:
            self.train_outservice.write(mean_train, "train")
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], "train")
        # if there is a plot, also send them over!
        if plot:
            current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if self.model.switches_on:
            self.model.turn_off_switches()

        #########
        # valid #
        #########
        self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                                  self.valid_monitors_dict, self.valid_monitor_function,
                                  self.valid_monitors_outservice_dict, plot)

        ########
        # test #
        ########
        self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                                  self.test_monitors_dict, self.test_monitor_function,
                                  self.test_monitors_outservice_dict, plot)

        ###########
        # cleanup #
        ###########
        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        # if the cost improved, reset the patience and record the best cost.
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = {key: param.get_value(borrow=False) for key, param in self.params.items()}
        elif not numpy.isnan(cost):
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))
        log.debug('remaining time: ' +
                  make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            # save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # return whether or not to stop this epoch
        return stop

    def _compute_over_subset(self, subset, inputs, targets,
                             monitors_dict, monitor_function, monitors_outservice_dict,
                             plot):
        inputs = raise_to_list(inputs)
        targets = raise_to_list(targets)
        if inputs is not None and len(monitors_dict) > 0:
            monitors = {key: [] for key in monitors_dict.keys()}
            data = [minibatch(input, self.batch_size, self.min_batch_size) for input in inputs]
            if targets is not None and not self.unsupervised:
                data += [minibatch(target, self.batch_size, self.min_batch_size) for target in targets]

            for batch in min_normalized_izip(*data):
                _outs = raise_to_list(monitor_function(*batch))
                current_monitors = zip(monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in monitors.items()}
            # log the mean values!
            log.info('%s monitors: %s', subset, str(current_mean_monitors))
            # send the values to their outservices
            for name, service in monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], subset)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    def get_decay_params(self):
        """
        Returns a list of all the Decay objects to decay during training.

        Returns
        -------
        list
            List of Decay objects to use after each training epoch - in this case the learning rate decay.
        """
        decay_params = self.model.get_decay_params()
        if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
            decay_params.append(self.learning_rate_decay)
        return decay_params
class StemCell(NonlinCell):
    """
    WRITEME

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 name,
                 parent=[],
                 parent_dim=[],
                 nout=None,
                 init_W=InitCell('randn'),
                 init_b=InitCell('zeros'),
                 cons=0.,
                 use_bias=1,
                 lr_scaler=1.,
                 x_as_index=0,
                 **kwargs):
        super(StemCell, self).__init__(**kwargs)
        if name is None:
            name = self.__class__.__name__.lower()
        self.name = name
        self.nout = nout
        self.init_W = init_W
        self.init_b = init_b
        self.cons = cons
        self.x_as_index = x_as_index
        self.parent = OrderedDict()
        parent_dim = tolist(parent_dim)
        for i, par in enumerate(tolist(parent)):
            if len(parent_dim) != 0 and len(parent) != 0:
                if len(parent) != len(parent_dim):
                    raise AssertionError("The number of parents and parent "
                                         "dimensions should match, otherwise "
                                         "the model will contain a bug.")
                self.parent[par] = parent_dim[i]
            else:
                self.parent[par] = None
        self.lr_scaler = lr_scaler
        self.use_bias = use_bias

    def fprop(self):
        raise NotImplementedError(
            str(type(self)) + " does not implement Layer.fprop.")

    def initialize(self):
        params = OrderedDict()
        for parname, parout in self.parent.items():
            W_shape = (parout, self.nout)
            W_name = 'W_' + parname + '__' + self.name
            params[W_name] = self.init_W.get(W_shape)
        if self.use_bias:
            params['b_' + self.name] = self.init_b.get(self.nout)
        return params
class Optimizer(object):
    """
    Default interface for an optimizer implementation - this provides the necessary parameter updates when
    training a model on a dataset using an online stochastic process. The base framework for performing
    stochastic gradient descent.
    """
    def __init__(self, dataset, loss=None, model=None,
                 epochs=1000, batch_size=100, min_batch_size=1,
                 save_freq=10, stop_threshold=None, stop_patience=50,
                 learning_rate=1e-3, lr_decay=None, lr_decay_factor=None,
                 grad_clip=None, hard_clip=False,
                 **kwargs):
        """
        Initialize the Optimizer.

        Parameters
        ----------
        dataset : Dataset
            The :class:`opendeep.data.Dataset` to use when training the Model.
        loss : Loss
            The :class:`opendeep.optimization.loss.Loss` function to compare the model to a 'target' result.
        model : Model
            The :class:`opendeep.models.Model` to train. Needed if the Optimizer isn't being passed to a
            Model's .train() method.
        epochs : int
            How many training iterations over the dataset to go.
        batch_size : int
            How many examples from the training dataset to use in parallel.
        min_batch_size : int
            The minimum number of examples required at a time (for things like time series, this would be > 1).
        save_freq : int, optional
            How many epochs to train between each new save of the Model's parameters.
        stop_threshold : float, optional
            The factor by how much the best validation training score needs to improve to determine early stopping.
        stop_patience : int, optional
            The patience or number of epochs to wait after the stop_threshold has been reached before stopping.
        learning_rate : float
            The multiplicative amount to adjust parameters based on their gradient values.
        lr_decay : str
            The decay function to use for changing the learning rate over epochs. See
            `opendeep.utils.decay` for classes of decay and documentation.
        lr_decay_factor : float
            The amount of decay to use for the ``lr_decay`` type of decay.
        grad_clip : float, optional
            Whether to clip gradients. This will clip the norm of the gradients either with a hard cutoff
            or rescaling.
        hard_clip : bool
            Whether to use a hard cutoff or rescaling for clipping gradients.
        """
        log.info("Initializing optimizer %s", str(self.__class__.__name__))

        # Deal with early stopping None initializations (no early stopping).
        if not stop_threshold:
            stop_threshold = numpy.inf
        if not save_freq:
            save_freq = 1000000
        if not stop_patience:
            stop_patience = 1

        # Put all init parameters in self.args so we can log the initial configuration.
        self.args = locals().copy()
        self.args.pop('self')
        kwargs = self.args.pop('kwargs')
        self.args = add_kwargs_to_dict(kwargs, self.args)
        # log the arguments
        log.info("Optimizer config args: %s", str(self.args))

        # if the optimizer wasn't initialized with a Model (train() being called from the model class itself),
        # just return. (This seems kinda hacky but hey, people wanted .train() to happen from Model and there
        # wasn't really a better way unless the epoch looping logic was in that method for Model. That wasn't
        # the best option because other methods besides stochastic ones can exist for optimizers in the future.
        # TODO: fix this up - feels like a hack just to make model.train() work...
        if not model:
            return
        # Otherwise, things are proceeding as normal. Carry on...

        assert isinstance(model, Model), "Optimizer input model needs to be a Model class! " \
                                         "Found %s" % str(model.__class__.__name__)
        assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be a Dataset class! " \
                                             "Found %s" % str(dataset.__class__.__name__)

        # deal with loss expression/targets
        if loss is not None:
            assert isinstance(loss, Loss), "Optimizer input loss needs to be a Loss class! " \
                                           "Found %s" % str(loss.__class__.__name__)
        if isinstance(loss, Loss):
            self.loss_targets = loss.get_targets()
            self.loss_expression = loss.get_loss()
        else:
            assert model.get_loss() is not None, "No Loss specified, and the model does not have one implemented."
            if isinstance(model.get_loss(), tuple):
                self.loss_targets = raise_to_list(model.get_loss()[0])
                self.loss_expression = model.get_loss()[1]
            else:
                self.loss_targets = None
                self.loss_expression = model.get_loss()

        model_inputs = raise_to_list(model.get_inputs())
        n_model_inputs = len(model_inputs)
        model_targets = self.loss_targets or []
        for input in model_inputs:
            if input in model_targets:
                model_targets.remove(input)
        n_model_targets = len(model_targets)
        self.unsupervised = (n_model_targets == 0)

        # make sure the number of inputs/targets matches up with the dataset properties
        # train
        assert n_model_inputs == len(raise_to_list(dataset.train_inputs)), \
            "Dataset has %d train inputs, while model expects %d" % \
            (len(raise_to_list(dataset.train_inputs)), n_model_inputs)
        if not self.unsupervised:
            assert n_model_targets == len(raise_to_list(dataset.train_targets) or []), \
                "Dataset has %d train targets, while model expects %d" % \
                (len(raise_to_list(dataset.train_targets) or []), n_model_targets)
        # valid
        if dataset.valid_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.valid_inputs)), \
                "Dataset has %d valid inputs, while model expects %d" % \
                (len(raise_to_list(dataset.valid_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.valid_targets) or []), \
                    "Dataset has %d valid targets, while model expects %d" % \
                    (len(raise_to_list(dataset.valid_targets) or []), n_model_targets)
        # test
        if dataset.test_inputs is not None:
            assert n_model_inputs == len(raise_to_list(dataset.test_inputs)), \
                "Dataset has %d test inputs, while model expects %d" % \
                (len(raise_to_list(dataset.test_inputs)), n_model_inputs)
            if not self.unsupervised:
                assert n_model_targets == len(raise_to_list(dataset.test_targets) or []), \
                    "Dataset has %d test targets, while model expects %d" % \
                    (len(raise_to_list(dataset.test_targets) or []), n_model_targets)

        # now we are happy, we can add them to `self`
        self.model = model
        self.dataset = dataset
        self.loss = loss

        # Learning rate - how drastic of a step do the parameters change
        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        # whether to scale individual model parameters' learning rates.
        self.lr_scalers = self.model.get_lr_scalers()
        # whether to decay
        if lr_decay:
            self.learning_rate_decay = get_decay_function(lr_decay,
                                                          self.learning_rate,
                                                          learning_rate,
                                                          lr_decay_factor)
        else:
            self.learning_rate_decay = False

        # rest of initial parameters needed for training.
        self.batch_size = batch_size
        self.min_batch_size = min_batch_size
        self.n_epoch = epochs
        self.save_frequency = save_freq
        self.early_stop_threshold = stop_threshold
        self.early_stop_length = stop_patience
        self.grad_clip = grad_clip
        self.hard_clip = hard_clip

    def get_updates(self, gradients):
        """
        This returns the parameter updates to use during training. It defaults to only using (annealed)
        learning rate.

        Parameters
        ----------
        gradients : dict
            A dictionary mapping from the model's parameters to their gradients.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters, to their new
            values after a single iteration of the learning rule.
        """
        log.debug('Setting up Stochastic Gradient Descent for optimizer...')
        updates = OrderedDict()
        for (param, gradient) in iteritems(gradients):
            scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
            updates[param] = param - scaled_lr * gradient
        return updates

    def train(self, monitor_channels=None, plot=None):
        """
        This method performs the training!!! It is an online training method that goes over minibatches
        from the dataset for a number of epochs, updating parameters after each minibatch.

        You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        """
        if not self.model:
            log.error("No self.model for the Optimizer!")
            raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                                 "was called from the Model. Try initializing the Optimizer with the model "
                                 "param and calling optimizer.train().)")

        #########################
        # gradients and updates #
        #########################
        # grab the model parameters to use during training
        self.params = self.model.get_params()
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        # First find the basic variables that will be updated
        params = set()
        for param in self.params.values():
            params.update(base_variables(param))
        params = list(params)
        gradients = grad(cost=self.loss_expression, wrt=params)
        # now create the dictionary mapping the parameter with its gradient
        gradients = OrderedDict(
            [(param, g) for param, g in zip(params, gradients)]
        )
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates

        log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

        ############
        # monitors #
        ############
        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])

        #######################################
        # compile train and monitor functions #
        #######################################
        function_input = raise_to_list(self.model.get_inputs())
        if self.loss_targets is not None:
            function_input += self.loss_targets
        # Compile the training function!
        log.info('Compiling f_learn function for model %s...', self.model._classname)
        t = time.time()
        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                           name='f_learn')
        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

        # figure out if we want valid and test (monitors)
        self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
        log.debug("Compiling monitor functions...")
        monitor_t = time.time()
        # valid monitors
        if self.valid_flag:
            self.valid_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.valid_monitors_dict.values()),
                name='valid_monitor_function'
            )
        else:
            self.valid_monitor_function = None

        # test monitors
        if self.test_flag:
            self.test_monitor_function = function(
                inputs=function_input,
                updates=self.model.get_updates(),
                outputs=list(self.test_monitors_dict.values()),
                name='test_monitor_function'
            )
        else:
            self.test_monitor_function = None

        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

        ##################
        # start training #
        ##################
        log.info("-----------TRAINING %s FOR %d EPOCHS-----------", self.model._classname, self.n_epoch)
        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()
        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()
        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(f_learn, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            self.model.set_param_values(self.best_params, borrow=False)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        if not self.model.switches_on:
            self.model.turn_on_switches()

        #########
        # train #
        #########
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        train_data = [
            minibatch(input_data, self.batch_size, self.min_batch_size)
            for input_data in raise_to_list(self.dataset.train_inputs)
        ]
        if self.dataset.train_targets is not None and not self.unsupervised:
            train_data += [
                minibatch(target, self.batch_size, self.min_batch_size)
                for target in raise_to_list(self.dataset.train_targets)
            ]

        for batch in min_normalized_izip(*train_data):
            _outs = raise_to_list(f_learn(*batch))
            train_costs.append(_outs[0])
            # handle any user defined monitors (if different from the train cost)
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], "train")
        # if there is a plot, also send them over!
        if plot:
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if self.model.switches_on:
            self.model.turn_off_switches()

        #########
        # valid #
        #########
        self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                                  self.valid_monitors_dict, self.valid_monitor_function,
                                  self.valid_monitors_outservice_dict, plot)

        ########
        # test #
        ########
        self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                                  self.test_monitors_dict, self.test_monitor_function,
                                  self.test_monitors_outservice_dict, plot)

        ###########
        # cleanup #
        ###########
        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        # if the cost improved, reset the patience and record the best cost.
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = self.model.get_param_values(borrow=False)
        elif not numpy.isnan(cost):
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached stop threshold)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))
        log.debug('remaining time: ' +
                  make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            # save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # return whether or not to stop this epoch
        return stop

    def _compute_over_subset(self, subset, inputs, targets,
                             monitors_dict, monitor_function, monitors_outservice_dict,
                             plot):
        inputs = raise_to_list(inputs)
        targets = raise_to_list(targets)
        if inputs is not None and len(monitors_dict) > 0:
            monitors = {key: [] for key in monitors_dict.keys()}
            data = [minibatch(input, self.batch_size, self.min_batch_size) for input in inputs]
            if targets is not None and not self.unsupervised:
                data += [minibatch(target, self.batch_size, self.min_batch_size) for target in targets]

            for batch in min_normalized_izip(*data):
                _outs = raise_to_list(monitor_function(*batch))
                current_monitors = zip(monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    val = numpy.asarray(val)
                    monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in monitors.items()}
            # log the mean values!
            log.info('%s monitors: %s', subset, str(current_mean_monitors))
            # send the values to their outservices
            for name, service in monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], subset)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    def get_decay_params(self):
        """
        Returns a list of all the Decay objects to decay during training.

        Returns
        -------
        list
            List of Decay objects to use after each training epoch - in this case the learning rate decay.
        """
        decay_params = self.model.get_decay_params()
        if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
            decay_params.append(self.learning_rate_decay)
        return decay_params
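# Both Optimizer variants share the same default learning rule in get_updates:
# plain SGD with an optional per-parameter scale, p <- p - lr * scale * grad.
# Illustrative list-based sketch over named parameters:
def sgd_updates(named_params, grads, lr, lr_scalers=None):
    lr_scalers = lr_scalers or {}
    return [p - lr * lr_scalers.get(name, 1.) * g
            for (name, p), g in zip(named_params.items(), grads)]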
class StemCell(NonlinCell):
    """
    WRITEME

    Parameters
    ----------
    .. todo::
    """
    def __init__(self,
                 parent=[],
                 parent_dim=[],
                 nout=None,
                 init_W=InitCell('randn'),
                 init_b=InitCell('zeros'),
                 cons=0.,
                 name=None,
                 lr_scaler=None,
                 **kwargs):
        super(StemCell, self).__init__(**kwargs)
        if name is None:
            name = self.__class__.__name__.lower()
        self.name = name
        self.nout = nout
        self.init_W = init_W
        self.init_b = init_b
        self.cons = cons
        self.parent = OrderedDict()
        parent_dim = tolist(parent_dim)
        for i, par in enumerate(tolist(parent)):
            if len(parent_dim) != 0 and len(parent) != 0:
                if len(parent) != len(parent_dim):
                    raise AssertionError("The number of parents and parent "
                                         "dimensions should match, otherwise "
                                         "the model will contain a bug.")
                self.parent[par] = parent_dim[i]
            else:
                self.parent[par] = None
        self.params = OrderedDict()
        self.lr_scaler = lr_scaler

    def get_params(self):
        return self.params

    def fprop(self, x=None):
        raise NotImplementedError(
            str(type(self)) + " does not implement Layer.fprop.")

    def alloc(self, x):
        self.params[x.name] = x

    def initialize(self):
        for parname, parout in self.parent.items():
            W_shape = (parout, self.nout)
            W_name = 'W_' + parname + '__' + self.name
            self.alloc(self.init_W.get(W_shape, W_name))
        self.alloc(self.init_b.get(self.nout, 'b_' + self.name))

    def add_noisy_params(self, key=['W'], weight_noise=0.075):
        self.noisy_params = OrderedDict()
        for param in self.params.items():
            # only perturb parameters whose name prefix is in `key` (e.g. 'W')
            if param[0].split('_')[0] in key:
                self.noisy_params[param[0]] = add_noise(
                    param[1], weight_noise, self.theano_rng)

    def del_noisy_params(self):
        del self.noisy_params
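# add_noisy_params above swaps selected weight matrices (keyed by the 'W' name
# prefix) for noise-perturbed copies. A hedged numpy sketch of the additive
# Gaussian weight noise this implies (the library's add_noise uses a Theano
# RNG instead):
import numpy as np

def add_noise(param, weight_noise=0.075, rng=np.random):
    return param + rng.normal(loc=0., scale=weight_noise, size=param.shape)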
class Optimizer(object): """ Default interface for an optimizer implementation - this provides the necessary parameter updates when training a model on a dataset using an online stochastic process. """ def __init__(self, model, dataset, n_epoch=1000, batch_size=100, minimum_batch_size=1, save_frequency=10, early_stop_threshold=.9995, early_stop_length=30, learning_rate=1e-3, lr_decay='exponential', lr_factor=1, **kwargs): """ Initialize the Optimizer. Parameters ---------- model : Model The Model to train. dataset : Dataset The Dataset to use when training the Model. n_epoch : int how many training iterations over the dataset to go. batch_size : int How many examples from the training dataset to use in parallel. minimum_batch_size : int The minimum number of examples required at a time (for things like time series, this would be > 1). save_frequency : int How many epochs to train between each new save of the Model's parameters. early_stop_threshold : float The factor by how much the best validation training score needs to improve to determine early stopping. early_stop_length : int The patience or number of epochs to wait after the early_stop_threshold has been reached before stopping. learning_rate : float The multiplicative amount to adjust parameters based on their gradient values. lr_decay : str The type of decay function to use for changing the learning rate over epochs. See `opendeep.utils.decay` for options. lr_factor : float The amount to use for the decay function when changing the learning rate over epochs. See `opendeep.utils.decay` for its effect for given decay functions. """ log.info("Initializing optimizer %s", str(type(self))) if early_stop_threshold is None: early_stop_threshold = 1. if save_frequency is None: save_frequency = 1000000 if early_stop_length is None: early_stop_length = 100 self.args = locals().copy() self.args.pop('self') kwargs = self.args.pop('kwargs') self.args = add_kwargs_to_dict(kwargs, self.args) # log the arguments log.info("optimizer config args: %s", str(self.args)) assert isinstance(model, Model), "Optimizer input model needs to be an opendeep Model class!" assert isinstance(dataset, Dataset), "Optimizer input dataset needs to be an opendeep Dataset class!" self.model = model self.dataset = dataset # Learning rate - how drastic of a step do the parameters change self.learning_rate = sharedX(learning_rate, 'learning_rate') self.lr_scalers = self.model.get_lr_scalers() if lr_decay: self.learning_rate_decay = get_decay_function(lr_decay, self.learning_rate, self.learning_rate.get_value(), lr_factor) else: self.learning_rate_decay = False self.noise_switches = raise_to_list(self.model.get_noise_switch()) self.batch_size = batch_size self.minimum_batch_size = minimum_batch_size self.n_epoch = n_epoch self.save_frequency = save_frequency self.early_stop_threshold = early_stop_threshold self.early_stop_length = early_stop_length def _get_batch_indices(self, data_lengths): """ Computes the tuples of (start_index, end_index) that represent the appropriate slices of the concatenated dataset with regards to the given data_lengths. This allows for lists of data lengths to represent sequences, so that the concatenated batches returned do not overstep the start of a new sequence. Parameters ---------- data_lengths : list(int) or int List of num_examples for each dataset (the length of the datasets - this is a list in the case of sequences). 
Returns ------- list((int, int)) List of tuples (start, end) representing the batch slices for the total dataset if it were concatenated. """ batch_indices = [] start_idx = 0 for len in raise_to_list(data_lengths): # integer division to determine number of whole batches for this length n_batches = len / int(self.batch_size) # add the (start_idx, end_idx) tuple to the list for i in range(n_batches): end_idx = start_idx + self.batch_size batch_indices.append((start_idx, end_idx)) start_idx = end_idx # remainder to find number of leftover examples remainder = numpy.remainder(len, self.batch_size) end_idx = start_idx + remainder # check if it is bigger than the minimum allowed size if remainder >= self.minimum_batch_size: batch_indices.append((start_idx, end_idx)) start_idx = end_idx return batch_indices def _get_givens_subset(self, subset, batch_slice): """ This translates a batch slice of start and end indices into the actual data from the given subset. Parameters ---------- subset : int The subset to use - determined in opendeep.data.datasets as TRAIN, VALID, or TEST attributes. batch_slice : symbolic slice The symbolic slice to grab from the data. Returns ------- OrderedDict The givens to provide to a function where it sets the input variable to the actual batch representation of data from the dataset: (input_variable: data[batch]) """ # translate the data_idx into the givens for the model # first get the lists of input variables the model requires - inputs and targets model_inputs = raise_to_list(self.model.get_inputs()) model_targets = raise_to_list(self.model.get_targets()) givens = None if self.dataset.getSubset(subset)[0] is not None: # grab the data and labels data, labels = self.dataset.getSubset(subset) # create the givens for the input function as pairs of (input_variable: sliced_data) givens = OrderedDict(zip(model_inputs, [data[batch_slice]])) # include labels as well if they are required by the model if model_targets is not None and len(model_targets) > 0: if labels is None: log.error("No labels in the dataset!") raise AssertionError, "No lables in the dataset!" givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]]))) else: log.warning("Dataset doesn't have subset %s" % get_subset_strings(subset)) return givens def get_updates(self, gradients): """ This returns the parameter updates to use during training. It defaults to only using (annealed) learning rate. Parameters ---------- gradients : dict A dictionary mapping from the model's parameters to their gradients. Returns ------- updates : OrderdDict A dictionary mapping from the old model parameters, to their new values after a single iteration of the learning rule. """ log.debug('Setting up Stochastic Gradient Descent for optimizer...') updates = OrderedDict() for (param, gradient) in six.iteritems(gradients): scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.) updates[param] = param - scaled_lr * gradient return updates def get_lr_monitor(self): """ Returns a monitor dictionary to the Optimizer's learning rate. Returns ------- dict Mapping 'learning_rate' to `self.learning_rate` shared variable. """ return {'learning_rate': self.learning_rate} def train(self, monitor_channels=None, train_outservice=None, plot=None, continue_training=False): """ This method performs the training!!! It is an online training method that goes over minibatches from the dataset for a number of epochs, updating parameters after each minibatch. 
    def train(self, monitor_channels=None, train_outservice=None, plot=None, continue_training=False):
        """
        This method performs the training! It is an online training method that goes over minibatches from
        the dataset for a number of epochs, updating parameters after each minibatch.

        You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

        Parameters
        ----------
        monitor_channels : list(MonitorsChannel or Monitor), optional
            The list of channels or monitors containing monitor expressions/variables to compile and evaluate
            on the data.
        train_outservice : OutService, optional
            The OutService to use for the automatically created train_cost monitor. Default of None just
            outputs to logs.
        plot : Plot, optional
            The Plot object to use if we want to graph the outputs (uses bokeh server).
        continue_training : bool
            Whether to continue training from a previous point.
        """
        ###############################################
        # theano index variable to use on the dataset #
        ###############################################
        # index to a [mini]batch - both start and end
        data_idx = T.iscalar('data_index')
        data_end_idx = T.iscalar('data_end_index')
        function_input = [data_idx, data_end_idx]
        batch_slice = slice(data_idx, data_end_idx)

        # compute number of minibatches for training, validation and testing
        # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
        # could be a list of shared variables (like multiple sequences from files)
        train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
        valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
        test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

        # train_batches is going to be lists of tuples that contain the start and end indices for train data.
        # this is more useful in the case of datasets that are lists of sequences, so that the start and end
        # indices can make sure a batch does not cross the sequence boundary on the concatenated data
        train_data_lens = [shape[0] for shape in train_data_shapes]
        self.train_batches = self._get_batch_indices(train_data_lens)

        if valid_data_shapes is not None:
            valid_data_lens = [shape[0] for shape in valid_data_shapes]
            self.valid_batches = self._get_batch_indices(valid_data_lens)
        else:
            self.valid_batches = None
        if test_data_shapes is not None:
            test_data_lens = [shape[0] for shape in test_data_shapes]
            self.test_batches = self._get_batch_indices(test_data_lens)
        else:
            self.test_batches = None

        # create the givens for the input function as pairs of (input_variable: sliced_data)
        train_givens = self._get_givens_subset(TRAIN, batch_slice)
        valid_givens = self._get_givens_subset(VALID, batch_slice)
        test_givens = self._get_givens_subset(TEST, batch_slice)

        # Now time to create the gradient updates for the model - make sure to handle the possible
        # list of costs used for pretraining of certain parts of the model.
        train_costs = raise_to_list(self.model.get_train_cost())
        train_updates = []
        self.gradients = []
        for i, train_cost in enumerate(train_costs):
            # Now create the training cost function for the model to use while training - update parameters
            # gradient!
            gradients, _ = self.model.get_gradient(cost=train_cost)
            self.gradients.append(gradients)

            # Calculate the optimizer updates each run
            # This is where the magic happens for a lot of sub-implementations of SGD!
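            # (Descriptive note) For this base class, get_updates() is plain SGD:
            # each parameter p with gradient g moves to p - learning_rate * lr_scaler * g.
            # Subclasses (e.g. a hypothetical momentum or adadelta optimizer) would
            # override get_updates() to return a richer rule while reusing the rest
            # of this training loop unchanged.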
            # It tells how to update the params each training epoch
            gradient_updates = self.get_updates(gradients)

            # Combine the updates from the model also if applicable
            updates = self.model.get_updates()
            if updates:
                updates.update(gradient_updates)
            else:
                updates = gradient_updates
            train_updates.append(updates)

        # grab the model parameters to use during training
        self.params = self.model.get_params()
        log.info("%s params: %s", str(type(self.model)), str(self.params))

        # deal with the monitor channels if they were given (or take them from the plot)
        if monitor_channels is None and plot is not None and len(plot.channels) > 0:
            monitor_channels = plot.channels
        self.train_monitors_dict = {}
        self.valid_monitors_dict = {}
        self.test_monitors_dict = {}
        self.train_monitors_outservice_dict = {}
        self.valid_monitors_outservice_dict = {}
        self.test_monitors_outservice_dict = {}
        if monitor_channels:
            # collapse the appropriate monitors into their (name, expression, out_service) tuples
            train_collapsed = collapse_channels(monitor_channels, train=True)
            valid_collapsed = collapse_channels(monitor_channels, valid=True)
            test_collapsed = collapse_channels(monitor_channels, test=True)
            # get name: expression dictionary
            self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
            self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
            self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
            # get name: outservice dictionary
            self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
            self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
            self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])

        # finally deal with an outservice provided to monitor training cost
        self.train_outservice = train_outservice
        # remove redundant files made by the fileservice for the train monitor.
        # TODO: THIS FEELS LIKE A HACK. I don't like it.
        if isinstance(self.train_outservice, FileService):
            os.remove(self.train_outservice.valid_filename)
            os.remove(self.train_outservice.test_filename)

        #######################################
        # compile train and monitor functions #
        #######################################
        train_functions = []
        for i in range(len(train_costs)):
            updates = train_updates[i]
            train_cost = train_costs[i]
            # Compile the training function!
            log.info('Compiling f_learn %d/%d function for model %s...',
                     i + 1, len(train_updates), str(type(self.model)))
            t = time.time()
            f_learn = function(inputs=function_input,
                               updates=updates,
                               outputs=[train_cost] + self.train_monitors_dict.values(),
                               givens=train_givens,
                               name='f_learn_%d' % i)
            log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
            train_functions.append(f_learn)

        # figure out if we want valid and test
        self.valid_flag = (self.dataset.getSubset(VALID)[0] is not None) and (len(self.valid_monitors_dict) > 0)
        self.test_flag = (self.dataset.getSubset(TEST)[0] is not None) and (len(self.test_monitors_dict) > 0)
        # Now compile the monitor functions!
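        # (Descriptive note) The monitor functions below deliberately pass only
        # self.model.get_updates() (e.g. recurrent bookkeeping), never the gradient
        # updates - evaluating validation/test monitors must not change parameters.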
log.debug("Compiling monitor functions...") monitor_t = time.time() # valid monitors if self.valid_flag: self.valid_monitor_function = function( inputs=function_input, updates=self.model.get_updates(), outputs=self.valid_monitors_dict.values(), givens=valid_givens, name='valid_monitor_function' ) else: self.valid_monitor_function = None # test monitors if self.test_flag: self.test_monitor_function = function( inputs=function_input, updates=self.model.get_updates(), outputs=self.test_monitors_dict.values(), givens=test_givens, name='test_monitor_function' ) else: self.test_monitor_function = None log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t)) ################## # start training # ################## # make sure to deal with a list of train_cost functions - for layer-wise pretraining! # this list of training functions was created during __init__() start_time = time.time() for func_i, train_function in enumerate(train_functions): log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------", str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch, str(continue_training)) log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN)) if self.dataset.getSubset(VALID)[0] is not None: log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID)) if self.dataset.getSubset(TEST)[0] is not None: log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST)) self.STOP = False self.epoch_counter = 0 if not continue_training: # reset any decay params for decay_param in self.get_decay_params(): decay_param.reset() self.times = [] self.best_cost = numpy.inf self.best_params = None self.patience = 0 t = time.time() while not self.STOP: try: self.STOP = self._perform_one_epoch(train_function, plot) except KeyboardInterrupt: log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT") self.STOP = True # save params if self.best_params is not None: log.debug("Restoring best model parameters...") set_shared_values(self.params, self.best_params) log.debug("Saving model parameters...") self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl') log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t)) log.info("------------TOTAL %s TRAIN TIME TOOK %s---------", str(type(self.model)), make_time_units_string(time.time() - start_time)) def _perform_one_epoch(self, f_learn, plot=None): """ Performs a single training iteration with the given learn function. """ self.epoch_counter += 1 t = time.time() log.info('EPOCH %s', str(self.epoch_counter)) # set the noise switches on for training function! (this is where things like dropout happen) switch_vals = [] if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag or self.epoch_counter == 1): log.debug("Turning on %s noise switches", str(len(self.noise_switches))) switch_vals = [switch.get_value() for switch in self.noise_switches] [switch.set_value(1.) 
    def _perform_one_epoch(self, f_learn, plot=None):
        """
        Performs a single training iteration with the given learn function.
        """
        self.epoch_counter += 1
        t = time.time()
        log.info('EPOCH %s', str(self.epoch_counter))

        # set the noise switches on for training function! (this is where things like dropout happen)
        switch_vals = []
        if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag or self.epoch_counter == 1):
            log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
            switch_vals = [switch.get_value() for switch in self.noise_switches]
            [switch.set_value(1.) for switch in self.noise_switches]

        # train
        train_costs = []
        train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
        for batch_start, batch_end in self.train_batches:
            _outs = raise_to_list(f_learn(batch_start, batch_end))
            train_costs.append(_outs[0])
            # handle any user-defined monitors
            if len(train_monitors) > 0:
                current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
                for name, val in current_monitors:
                    train_monitors[name].append(val)

        # get the mean values for the batches
        mean_train = numpy.mean(train_costs, 0)
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
        # log the mean values!
        log.info('Train cost: %s', trunc(mean_train))
        if len(current_mean_monitors) > 0:
            log.info('Train monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        if self.train_outservice:
            self.train_outservice.write(mean_train, TRAIN)
        for name, service in self.train_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], TRAIN)
        # if there is a plot, also send them over!
        if plot:
            current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
        if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag):
            log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
            [switch.set_value(0.) for switch in self.noise_switches]

        # valid
        if self.valid_flag:
            valid_monitors = {key: [] for key in self.valid_monitors_dict.keys()}
            for batch_start, batch_end in self.valid_batches:
                _outs = raise_to_list(self.valid_monitor_function(batch_start, batch_end))
                current_monitors = zip(self.valid_monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    valid_monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in valid_monitors.items()}
            # log the mean values!
            log.info('Valid monitors: %s', str(current_mean_monitors))
            # send the values to their outservices
            for name, service in self.valid_monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], VALID)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

        # test
        if self.test_flag:
            test_monitors = {key: [] for key in self.test_monitors_dict.keys()}
            for batch_start, batch_end in self.test_batches:
                _outs = raise_to_list(self.test_monitor_function(batch_start, batch_end))
                current_monitors = zip(self.test_monitors_dict.keys(), _outs)
                for name, val in current_monitors:
                    test_monitors[name].append(val)

            # get the mean values for the batches
            current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in test_monitors.items()}
            # log the mean values!
            log.info('Test monitors: %s', str(current_mean_monitors))
            # send the values to their outservices
            for name, service in self.test_monitors_outservice_dict.items():
                if name in current_mean_monitors and service:
                    service.write(current_mean_monitors[name], TEST)
            # if there is a plot, also send them over!
            if plot:
                plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)
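        # (Descriptive note) Early stopping below: with the default
        # early_stop_threshold=.9995, this epoch's summed train cost must drop
        # below best_cost * .9995 (roughly a 0.05% improvement) to reset the
        # patience; otherwise patience counts up toward early_stop_length, at
        # which point training stops.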
        # check for early stopping on train costs
        cost = numpy.sum(train_costs)
        if cost < self.best_cost * self.early_stop_threshold:
            self.patience = 0
            self.best_cost = cost
            # save the parameters that made it the best
            self.best_params = get_shared_values(self.params)
        else:
            self.patience += 1

        # check for stopping either from n_epochs or from threshold/patience
        stop = False
        if self.epoch_counter >= self.n_epoch:
            log.info("Stopping (reached max number of epochs)...")
            stop = True
        if self.patience >= self.early_stop_length:
            log.info("Stopping early (reached early stop patience)...")
            stop = True

        timing = time.time() - t
        self.times.append(timing)

        log.info('time: ' + make_time_units_string(timing))

        log.debug('remaining time: ' +
                  make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

        if (self.epoch_counter % self.save_frequency) == 0:
            # save params
            self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        # ANNEAL!
        if not stop:
            # perform the appropriate decay on the decay functions/parameters for this optimizer and model
            for decay_param in self.get_decay_params():
                decay_param.decay()

        # reset the noise switches to their pre-training values
        if len(self.noise_switches) > 0:
            [switch.set_value(val) for switch, val in zip(self.noise_switches, switch_vals)]

        # return whether or not to stop this epoch
        return stop

    def get_decay_params(self):
        """
        Returns a list of all the Decay objects to decay during training.

        Returns
        -------
        list
            List of Decay objects to use after each training epoch - in this case the learning rate decay.
        """
        decay_params = self.model.get_decay_params()
        if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
            decay_params.append(self.learning_rate_decay)
        return decay_params
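# ----------------------------------------------------------------------
# Sketch (illustrative, not part of the library): how a subclass might
# override get_updates() to implement momentum SGD. `MomentumOptimizer`
# and its `momentum` argument are hypothetical names; the sketch relies
# only on attributes defined by Optimizer.__init__ (learning_rate,
# lr_scalers) and on the module's sharedX/six/OrderedDict helpers.
#
#     class MomentumOptimizer(Optimizer):
#         def __init__(self, model, dataset, momentum=.9, **kwargs):
#             super(MomentumOptimizer, self).__init__(model, dataset, **kwargs)
#             self.momentum = momentum
#
#         def get_updates(self, gradients):
#             updates = OrderedDict()
#             for (param, gradient) in six.iteritems(gradients):
#                 scaled_lr = self.learning_rate * self.lr_scalers.get(param, 1.)
#                 # one velocity accumulator per parameter, initialized to zeros
#                 velocity = sharedX(param.get_value() * 0.)
#                 updates[velocity] = self.momentum * velocity - scaled_lr * gradient
#                 updates[param] = param + updates[velocity]
#             return updates
# ----------------------------------------------------------------------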