def main():
    # shared weight matrix; named w so it isn't shadowed by the 'var' statistic popped below
    w = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(w, add_uniform(input=w, noise_level=.02))]

    stats = get_stats(w)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True,
                           out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    t1 = time.time()

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], "train")
        log.debug('----- ' + make_time_units_string(time.time() - t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], "valid")
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
def main():
    # shared weight matrix; named w so it isn't shadowed by the 'var' statistic popped below
    w = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(w, add_uniform(input=w, noise_level=.02))]

    stats = get_stats(w)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True,
                           out_service=FileService('outs/mean.txt'))
    var_monitor = Monitor('var', var, out_service=FileService('outs/var.txt'))

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed_raw = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(item[0], item[1]) for item in train_collapsed_raw])
    train_services = OrderedDict([(item[0], item[2]) for item in train_collapsed_raw])
    valid_collapsed_raw = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(item[0], item[1]) for item in valid_collapsed_raw])
    valid_services = OrderedDict([(item[0], item[2]) for item in valid_collapsed_raw])

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    t1 = time.time()

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        for name, service in train_services.items():
            if name in m:
                service.write(m[name], TRAIN)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    for epoch in range(10):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        for name, service in valid_services.items():
            if name in m:
                service.write(m[name], VALID)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
def main():
    w = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(w, add_uniform(input=w, noise_level=.02))]

    stats = get_stats(w)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True)
    stat_monitor = Monitor('max', max)

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[stat_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
    valid_collapsed = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])

    plot = Plot(bokeh_doc_name='test_plots', monitor_channels=monitors, open_browser=True)

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    t1 = time.time()

    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        time.sleep(0.02)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        time.sleep(0.02)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))
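# The three main() variants above exercise the same collapse pattern: each collapsed
# channel is a (name, expression, out_service) tuple that is split into a
# name->expression OrderedDict (for compiling the theano function) and a name->service
# OrderedDict (for writing results). A minimal standalone sketch of that bookkeeping,
# with plain values standing in for theano expressions and FileService objects
# (the tuple values here are made up):
from collections import OrderedDict

collapsed = [('W_mean', 'mean_expr', 'mean_service'), ('stats_var', 'var_expr', 'var_service')]
expressions = OrderedDict([(name, expr) for name, expr, _ in collapsed])
services = OrderedDict([(name, svc) for name, _, svc in collapsed])

vals = [0.49, 0.11]  # pretend outputs of the compiled f()
m = OrderedDict(zip(expressions.keys(), vals))
for name, svc in services.items():
    if name in m:
        print('%s -> %s : %s' % (name, svc, m[name]))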
def compile_run_fn(self):
    """
    This is a helper function to compile the f_run function for computing the model's outputs given inputs.
    Compile and set the f_run function used for `run()`.

    It sets the `self.f_run` attribute to the f_run function.

    .. note::
        The run function defaults like so::

            self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                                  outputs=raise_to_list(self.get_outputs()),
                                  updates=self.get_updates(),
                                  name='f_run')
    """
    if not hasattr(self, 'f_run'):
        log.debug("Compiling f_run...")
        t = time.time()
        outputs = raise_to_list(self.get_outputs())
        if outputs is not None and len(outputs) == 1:
            outputs = outputs[0]
        self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                              outputs=outputs,
                              updates=self.get_updates(),
                              name='f_run')
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
    else:
        log.warn('f_run already exists!')
def compile_run_fn(self):
    """
    This is a helper function to compile the f_run function for computing the model's outputs given inputs.
    Compile and set the f_run function used for `run()`.

    It sets the `self.f_run` attribute to the f_run function.

    .. note::
        The run function defaults like so::

            self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                                  outputs=self.get_outputs(),
                                  updates=self.get_updates(),
                                  name='f_run')

    Returns
    -------
    Theano function
        The compiled theano function for running the model.
    """
    if not getattr(self, 'f_run', None):
        log.debug("Compiling f_run...")
        t = time.time()
        self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                              outputs=self.get_outputs(),
                              updates=self.get_updates(),
                              name='f_run')
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
    else:
        log.debug('f_run already exists!')
    return self.f_run
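# A hedged usage sketch for the compile_run_fn variants above: the point of the
# hasattr/getattr guard is that theano compilation is expensive, so the compiled
# f_run is cached on the instance. TinyModel below is a stand-in for the Model API
# assumed by the snippets (hypothetical, not the real class):
import time

class TinyModel(object):
    def compile_run_fn(self):
        if not getattr(self, 'f_run', None):
            t = time.time()
            self.f_run = lambda x: x * 2  # pretend this is a slow theano compile
            print('compiled in %.6fs' % (time.time() - t))
        else:
            print('f_run already exists!')
        return self.f_run

m = TinyModel()
m.compile_run_fn()  # compiles and caches
m.compile_run_fn()  # hits the cache, no recompilation
print(m.f_run(21))  # -> 42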
def compile_vocab(self, iters):
    """
    Creates a dictionary mapping tokens (words or characters) to integers given the level and preprocessing.

    Parameters
    ----------
    iters : iterable
        The iterable to go through when creating the vocabulary dictionary.

    Returns
    -------
    vocab
        The dictionary mapping token: integer for all tokens in the `iters` iterable.
    """
    log.debug("Creating vocabulary...")
    t = time.time()
    vocab = {self.unk_token: 0}
    i = 1
    for token in iters:
        if token not in vocab:
            vocab[token] = i
            i += 1
    log.debug("Vocab took %s to create." % make_time_units_string(time.time() - t))
    return vocab
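# compile_vocab has no theano dependency, so its behavior is easy to verify
# standalone; this sketch inlines the same loop with unk_token passed explicitly
# (the snippet reads it from self):
def build_vocab(tokens, unk_token='<UNK>'):
    vocab = {unk_token: 0}
    i = 1
    for token in tokens:
        if token not in vocab:
            vocab[token] = i
            i += 1
    return vocab

print(build_vocab("the cat sat on the mat".split()))
# {'<UNK>': 0, 'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5}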
def compute_CSL_with_minibatches_one_chain(fn, minibatches):
    """
    Computes the CSL over minibatches with a single chain (no parallel chains to average computation over).

    Parameters
    ----------
    fn : theano function
        The CSL function to use.
    minibatches : tensor
        The minibatches of data as a 3D tensor with shape (num_minibatches, batch_size, input_dimensionality).

    Returns
    -------
    float
        The mean LL value over minibatches.
    """
    LLs = []
    t = time.time()
    mean = None
    for i, minibatch in enumerate(minibatches):
        # loop through one minibatch
        LL = fn(minibatch)
        LLs.append(LL)
        mean = numpy.mean(LLs)
        log.info('%d / %d batches, LL mean so far %.4f' % (i + 1, minibatches.shape[0], mean))
    log.info('mean LL %s' % mean)
    log.info('--- took %s ---' % make_time_units_string(time.time() - t))
    return mean
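# The running mean above recomputes numpy.mean over the whole LL list on every
# iteration - O(n^2) overall, but simple and cheap next to the theano call. A
# standalone numpy sketch of the same accumulation, with a dummy fn standing in
# for the compiled CSL function (shapes and fn are made up):
import numpy

minibatches = numpy.random.rand(5, 4, 10)          # (num_minibatches, batch_size, dim)
fn = lambda mb: numpy.log(mb.mean(axis=1) + 1e-8)  # dummy per-example LL

LLs = []
for i, minibatch in enumerate(minibatches):
    LLs.append(fn(minibatch))
    print('%d / %d batches, LL mean so far %.4f' % (i + 1, minibatches.shape[0], numpy.mean(LLs)))
print('mean LL %s' % numpy.mean(LLs))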
def run(self, input):
    """
    This method will return the Prototype's output (run through the `f_run` function), given an input. The
    input comes from all unique inputs to the models in the Prototype as calculated from `get_inputs()` and
    the outputs computed similarly from `get_outputs`.

    Try to avoid re-compiling the theano function created for run - check a `hasattr(self, 'f_run')` or
    something similar first.

    Parameters
    ----------
    input : array_like
        Theano/numpy tensor-like object that is the input into the model's computation graph.

    Returns
    -------
    array_like
        Theano/numpy tensor-like object that is the output of the model's computation graph.
    """
    # make sure the input is raised to a list - we are going to splat it!
    input = raise_to_list(input)
    # first check if we already made an f_run function
    if hasattr(self, 'f_run'):
        return self.f_run(*input)
    # otherwise, compile it!
    else:
        inputs = self.get_inputs()
        outputs = self.get_outputs()
        updates = self.get_updates()
        t = time.time()
        log.info("Compiling f_run...")
        self.f_run = function(inputs=inputs, outputs=outputs, updates=updates, name="f_run")
        log.info("Compilation done! Took %s", make_time_units_string(time.time() - t))
        return self.f_run(*input)
def __init__(self, dataset, subset=datasets.TRAIN, batch_size=1, minimum_batch_size=1, rng=None):
    _t = time.time()
    log.debug('Initializing a %s sequential iterator over %s',
              str(type(dataset)), datasets.get_subset_strings(subset))
    super(self.__class__, self).__init__(dataset, subset, batch_size, minimum_batch_size, rng)
    log.debug('iterator took %s to make' % make_time_units_string(time.time() - _t))
def run(self, input):
    """
    This method will return the Prototype's output (run through the `f_run` function), given an input. The
    input comes from all unique inputs to the models in the Prototype as calculated from `get_inputs()` and
    the outputs computed similarly from `get_outputs`.

    Try to avoid re-compiling the theano function created for run - check a `hasattr(self, 'f_run')` or
    something similar first.

    Parameters
    ----------
    input : array_like
        Theano/numpy tensor-like object that is the input into the model's computation graph.

    Returns
    -------
    array_like
        Theano/numpy tensor-like object that is the output of the model's computation graph.
    """
    # set the noise switches off for running! we assume unseen data is noisy anyway :)
    old_switch_vals = []
    if len(self.get_switches()) > 0:
        log.debug("Turning off %s noise switches, resetting them after run!", str(len(self.get_switches())))
        old_switch_vals = [switch.get_value() for switch in self.get_switches()]
        [switch.set_value(0.) for switch in self.get_switches()]

    # make sure the input is raised to a list - we are going to splat it!
    input = raise_to_list(input)
    # first check if we already made an f_run function
    if hasattr(self, 'f_run'):
        output = self.f_run(*input)
    # otherwise, compile it!
    else:
        inputs = raise_to_list(self.get_inputs())
        outputs = raise_to_list(self.get_outputs())
        if outputs is not None and len(outputs) == 1:
            outputs = outputs[0]
        updates = self.get_updates()
        t = time.time()
        log.info("Compiling f_run...")
        self.f_run = function(inputs=inputs, outputs=outputs, updates=updates, name="f_run")
        log.info("Compilation done! Took %s", make_time_units_string(time.time() - t))
        output = self.f_run(*input)

    # reset any switches to how they were!
    if len(self.get_switches()) > 0:
        [switch.set_value(val) for switch, val in zip(self.get_switches(), old_switch_vals)]

    return output
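# Both run() variants above wrap the forward pass in the same save/zero/restore
# dance for noise switches (dropout and friends must be off at inference). A
# standalone sketch of that pattern, with a small Cell class standing in for
# theano shared variables (get_value/set_value names mirror the snippets):
class Cell(object):
    def __init__(self, v):
        self.v = v
    def get_value(self):
        return self.v
    def set_value(self, v):
        self.v = v

switches = [Cell(1.), Cell(0.5)]
old_vals = [s.get_value() for s in switches]
for s in switches:
    s.set_value(0.)  # noise off for the forward pass
# ... the compiled f_run would be called here ...
for s, v in zip(switches, old_vals):
    s.set_value(v)   # restore training-time noise levels
print([s.get_value() for s in switches])  # [1.0, 0.5]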
def train(self, continue_training=False):
    log.info("-----------TRAINING %s FOR %s EPOCHS (continue_training=%s)-----------",
             str(type(self.model)), str(self.n_epoch), str(continue_training))
    log.debug("Train dataset size is: %s", self.dataset.getDataShape(datasets.TRAIN))
    if self.dataset.hasSubset(datasets.VALID):
        log.debug("Valid dataset size is: %s", self.dataset.getDataShape(datasets.VALID))
    if self.dataset.hasSubset(datasets.TEST):
        log.debug("Test dataset size is: %s", self.dataset.getDataShape(datasets.TEST))

    self.STOP = False
    self.epoch_counter = 0
    if not continue_training:
        # reset the learning rate
        if hasattr(self, 'learning_rate_decay'):
            self.learning_rate_decay.reset()
        # reset the other model decaying functions
        for decay_param in self.model.get_decay_params():
            decay_param.reset()

    self.times = []
    self.best_cost = float('inf')
    self.best_params = None
    self.patience = 0

    start_time = time.time()

    while not self.STOP:
        try:
            self.STOP = self._perform_one_epoch()
        except KeyboardInterrupt:
            log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
            self.STOP = True

    # save params
    if self.best_params is not None:
        log.debug("Restoring best model parameters...")
        set_shared_values(self.params, self.best_params)
    log.debug("Saving model parameters...")
    self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
def sample_some_numbers(n_samples):
    # The network's initial state
    init_vis = initial
    noisy_init_vis = self.f_noise(init_vis)

    network_state = [[noisy_init_vis] +
                     [numpy.zeros(shape=(initial.shape[0], self.layer_sizes[i + 1]),
                                  dtype=theano.config.floatX)
                      for i in range(len(self.bias_list[1:]))]]

    visible_chain = [init_vis]
    noisy_h0_chain = [noisy_init_vis]
    sampled_h = []

    times = []
    for i in xrange(n_samples - 1):
        _t = time.time()

        # feed the last state into the network, run new state, and obtain visible units expectation chain
        net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1])

        # append to the visible chain
        visible_chain += vis_pX_chain

        # append state output to the network state chain
        network_state.append(net_state_out)
        noisy_h0_chain.append(net_state_out[0])

        if i % k == 0:
            sampled_h.append(T.stack(net_state_out[1:]))
            if i == k:
                log.debug("About " + make_time_units_string(numpy.mean(times) * (n_samples - 1 - i)) +
                          " remaining...")

        times.append(time.time() - _t)

    log.debug("Sampling done.")
    return numpy.vstack(visible_chain), sampled_h
def run(self, input):
    """
    This method will return the model's output (run through the function), given an input. In the case that
    input_hooks or hidden_hooks are used, the function should use them appropriately and assume they are the
    input.

    Try to avoid re-compiling the theano function created for run - check a hasattr(self, 'f_run') or
    something similar first. I recommend creating your theano f_run in a create_computation_graph method
    to be called after the class initializes.
    ------------------

    :param input: Theano/numpy tensor-like object that is the input into the model's computation graph.
    :type input: tensor

    :return: Theano/numpy tensor-like object that is the output of the model's computation graph.
    :rtype: tensor
    """
    # set any noise switches to zero
    if len(self.get_noise_switch()) > 0:
        vals = [switch.get_value() for switch in self.get_noise_switch()]
        [switch.set_value(0.) for switch in self.get_noise_switch()]
    # check if the run function is already compiled, otherwise compile it!
    if not hasattr(self, 'f_run'):
        log.debug("Compiling f_run...")
        t = time.time()
        self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                              outputs=self.get_outputs(),
                              updates=self.get_updates())
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
    # because we use the splat to account for multiple inputs to the function, make sure input is a list.
    input = raise_to_list(input)
    # return the results of the run function!
    output = self.f_run(*input)
    # reset the noise switches
    if len(self.get_noise_switch()) > 0:
        [switch.set_value(val) for switch, val in zip(self.get_noise_switch(), vals)]
    return output
def compute_CSL_with_minibatches(fn, minibatches, chains):
    """
    Computes CSL over parallel chains of minibatches (the input `chains` is a 4D tensor of minibatches,
    where there are N chains, each chain having K samples of dimension D).

    Parameters
    ----------
    fn : theano function
        The CSL function to use.
    minibatches : tensor
        The minibatches of data as a 3D tensor with shape (num_minibatches, batch_size, input_dimensionality).
    chains : tensor
        The chains of data as a 4D tensor with shape (n_minibatches, n_chains, batch_size, input_dimensionality).

    Returns
    -------
    float
        The mean LL value over minibatches.
    """
    # fn is the compiled theano fn
    LLs = []
    t = time.time()
    for i, minibatch in enumerate(minibatches):
        # loop through one minibatch
        LL_minibatch_all_chains = []
        for chain_minibatch in chains:
            # loop through a minibatch of chains
            LL = fn(minibatch, chain_minibatch)
            LL_minibatch_all_chains.append(LL)
        LL_minibatch_all_chains = numpy.concatenate(LL_minibatch_all_chains, axis=1)
        LLs.append(LL_minibatch_all_chains)
        mean = numpy.mean(LLs)
        log.info('%d / %d batches, LL mean so far %.4f' % (i + 1, minibatches.shape[0], mean))
    LLs = numpy.concatenate(LLs, axis=0)
    mean_LLs = LLs.mean()
    log.info('mean LL %s' % str(mean_LLs))
    log.info('--- took %s ---' % make_time_units_string(time.time() - t))
    return mean_LLs
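# In the parallel-chain version, per-chain LLs are concatenated on axis=1 within
# a minibatch, then all minibatch blocks are concatenated on axis=0 before the
# final mean; a numpy sketch of just that array bookkeeping (shapes assumed from
# the docstring):
import numpy

n_minibatches, n_chains, batch_size = 3, 4, 5
LLs = []
for _ in range(n_minibatches):
    per_chain = [numpy.random.rand(batch_size, 1) for _ in range(n_chains)]
    LLs.append(numpy.concatenate(per_chain, axis=1))  # (batch_size, n_chains)
LLs = numpy.concatenate(LLs, axis=0)                  # (n_minibatches * batch_size, n_chains)
print('%s %s' % (LLs.shape, LLs.mean()))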
def __init__(self, dataset, subset=datasets.TRAIN, batch_size=1, minimum_batch_size=1, rng=None):
    # initialize a numpy rng if one is not provided
    if rng is None:
        self.rng = numpy.random.RandomState(123)
    else:
        self.rng = rng

    _t = time.time()
    log.debug('Initializing a %s random iterator over %s',
              str(type(dataset)), datasets.get_subset_strings(subset))
    super(self.__class__, self).__init__(dataset, subset, batch_size, minimum_batch_size)

    # randomize the indices to access
    self.indices = numpy.arange(self.data_len)
    self.rng.shuffle(self.indices)

    log.debug('iterator took %s to make' % make_time_units_string(time.time() - _t))
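# The random iterator differs from the sequential one above only in the shuffled
# index array; a standalone sketch of walking shuffled indices in minibatches
# (data_len and batch_size are made-up values):
import numpy

rng = numpy.random.RandomState(123)
data_len, batch_size = 10, 3
indices = numpy.arange(data_len)
rng.shuffle(indices)
for start in range(0, data_len, batch_size):
    print(indices[start:start + batch_size])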
def train(self, continue_training=False):
    """
    This method performs the training!!!

    :param continue_training: whether to continue training from a previous point (skips resetting the
        learning rate and other decay parameters).
    :type continue_training: bool
    """
    # grab the model parameters to use during training
    self.params = self.model.get_params()
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    ###############################################
    # theano index variable to use on the dataset #
    ###############################################
    # index to a [mini]batch - both start and end
    data_idx = T.iscalar('data_index')
    data_end_idx = T.iscalar('data_end_index')
    batch_slice = slice(data_idx, data_end_idx)

    # compute number of minibatches for training, validation and testing
    # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each dataset
    # could be a list of shared variables (like multiple sequences from files)
    train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
    valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
    test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

    # train_batches is going to be lists of tuples that contain the start and end indices for train data
    train_data_lens = [shape[0] for shape in train_data_shapes]
    self.train_batches = self.get_batch_indices(train_data_lens)

    if valid_data_shapes is not None:
        valid_data_lens = [shape[0] for shape in valid_data_shapes]
        self.valid_batches = self.get_batch_indices(valid_data_lens)
    else:
        self.valid_batches = None
    if test_data_shapes is not None:
        test_data_lens = [shape[0] for shape in test_data_shapes]
        self.test_batches = self.get_batch_indices(test_data_lens)
    else:
        self.test_batches = None

    # translate the data_idx into the givens for the model
    model_inputs = raise_to_list(self.model.get_inputs())
    model_targets = raise_to_list(self.model.get_targets())

    train_data, train_labels = self.dataset.getSubset(TRAIN)
    train_givens = OrderedDict(zip(model_inputs, [train_data[batch_slice]]))
    if model_targets is not None and len(model_targets) > 0:
        train_givens.update(OrderedDict(zip(model_targets, [train_labels[batch_slice]])))

    valid_data, valid_labels = self.dataset.getSubset(VALID)
    valid_givens = OrderedDict(zip(model_inputs, [valid_data[batch_slice]]))
    if model_targets is not None and len(model_targets) > 0:
        valid_givens.update(OrderedDict(zip(model_targets, [valid_labels[batch_slice]])))

    test_data, test_labels = self.dataset.getSubset(TEST)
    test_givens = OrderedDict(zip(model_inputs, [test_data[batch_slice]]))
    if model_targets is not None and len(model_targets) > 0:
        test_givens.update(OrderedDict(zip(model_targets, [test_labels[batch_slice]])))

    # Now time to create the training cost functions for the model - make sure to handle the possible
    # list of costs used for pretraining of certain parts of the model.
    train_costs = raise_to_list(self.model.get_train_cost())
    self.train_functions = []
    for i, train_cost in enumerate(train_costs):
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        gradients, _ = self.model.get_gradient(cost=train_cost)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        train_updates = self.model.get_updates()
        if train_updates:
            train_updates.update(gradient_updates)
        else:
            train_updates = gradient_updates

        # Compile the training function!
        log.info('Compiling f_learn %d/%d function for model %s...',
                 i + 1, len(train_costs), str(type(self.model)))
        t = time.time()
        f_learn = function(inputs=[data_idx, data_end_idx],
                           updates=train_updates,
                           outputs=train_cost,
                           givens=train_givens,
                           name='f_learn_%d' % i)
        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
        self.train_functions.append(f_learn)

    # grab the expression(s) to use to monitor different model values during training
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    self.monitors = OrderedDict(self.model.get_monitors())
    self.monitor_names = list(self.monitors.keys())
    if len(self.monitors) > 0:
        self.train_monitor_function = function(
            inputs=[data_idx, data_end_idx],
            updates=self.model.get_updates(),
            outputs=list(self.monitors.values()),
            givens=train_givens,
            name="train_monitor_function"
        )
        self.valid_monitor_function = function(
            inputs=[data_idx, data_end_idx],
            updates=self.model.get_updates(),
            outputs=list(self.monitors.values()),
            givens=valid_givens,
            name="valid_monitor_function"
        )
        self.test_monitor_function = function(
            inputs=[data_idx, data_end_idx],
            updates=self.model.get_updates(),
            outputs=list(self.monitors.values()),
            givens=test_givens,
            name="test_monitor_function"
        )
    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    self.noise_switches = raise_to_list(self.model.get_noise_switch())

    ##################
    # start training #
    ##################
    # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
    # this list of training functions was created during __init__()
    start_time = time.time()
    for func_i, train_function in enumerate(self.train_functions):
        log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                 str(type(self.model)), func_i + 1, len(self.train_functions), self.n_epoch,
                 str(continue_training))
        log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
        if self.dataset.hasSubset(VALID):
            log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
        if self.dataset.hasSubset(TEST):
            log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

        self.STOP = False
        self.epoch_counter = 0
        if not continue_training:
            # reset the learning rate
            if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                self.learning_rate_decay.reset()
            # reset the other model decaying functions
            for decay_param in self.model.get_decay_params():
                decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(train_function)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
def _perform_one_epoch(self, f_learn, plot=None):
    """
    Performs a single training iteration with the given learn function.
    """
    self.epoch_counter += 1
    t = time.time()
    log.info('EPOCH %s', str(self.epoch_counter))

    # set the noise switches on for training function! (this is where things like dropout happen)
    if not self.model.switches_on:
        self.model.turn_on_switches()

    #########
    # train #
    #########
    train_costs = []
    train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
    train_data = [minibatch(input_data, self.batch_size, self.min_batch_size)
                  for input_data in raise_to_list(self.dataset.train_inputs)]
    if self.dataset.train_targets is not None and not self.unsupervised:
        train_data += [minibatch(target, self.batch_size, self.min_batch_size)
                       for target in raise_to_list(self.dataset.train_targets)]

    for batch in min_normalized_izip(*train_data):
        _outs = raise_to_list(f_learn(*batch))
        train_costs.append(_outs[0])
        # handle any user defined monitors
        if len(train_monitors) > 0:
            current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
            for name, val in current_monitors:
                val = numpy.asarray(val)
                train_monitors[name].append(val)

    # get the mean values for the batches
    mean_train = numpy.mean(train_costs, 0)
    current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
    # log the mean values!
    log.info('Train cost: %s', trunc(mean_train))
    if len(current_mean_monitors) > 0:
        log.info('Train monitors: %s', str(current_mean_monitors))
    # send the values to their outservices
    if self.train_outservice:
        self.train_outservice.write(mean_train, "train")
    for name, service in self.train_monitors_outservice_dict.items():
        if name in current_mean_monitors and service:
            service.write(current_mean_monitors[name], "train")
    # if there is a plot, also send them over!
    if plot:
        current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
        plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
    if self.model.switches_on:
        self.model.turn_off_switches()

    #########
    # valid #
    #########
    self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                              self.valid_monitors_dict, self.valid_monitor_function,
                              self.valid_monitors_outservice_dict, plot)

    ########
    # test #
    ########
    self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                              self.test_monitors_dict, self.test_monitor_function,
                              self.test_monitors_outservice_dict, plot)

    ###########
    # cleanup #
    ###########
    # check for early stopping on train costs
    cost = numpy.sum(train_costs)
    # if the cost improved, reset the patience and record the best cost.
    if cost < self.best_cost * self.early_stop_threshold:
        self.patience = 0
        self.best_cost = cost
        # save the parameters that made it the best
        self.best_params = {key: param.get_value(borrow=False) for key, param in self.params.items()}
    elif not numpy.isnan(cost):
        self.patience += 1

    # check for stopping either from n_epochs or from threshold/patience
    stop = False
    if self.epoch_counter >= self.n_epoch:
        log.info("Stopping (reached max number of epochs)...")
        stop = True
    if self.patience >= self.early_stop_length:
        log.info("Stopping early (reached stop threshold)...")
        stop = True

    timing = time.time() - t
    self.times.append(timing)

    log.info('time: ' + make_time_units_string(timing))
    log.debug('remaining time: ' +
              make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

    if (self.epoch_counter % self.save_frequency) == 0:
        # save params
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

    # ANNEAL!
    if not stop:
        # perform the appropriate decay on the decay functions/parameters for this optimizer and model
        for decay_param in self.get_decay_params():
            decay_param.decay()

    # return whether or not to stop this epoch
    return stop
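# The early-stopping core shared by the epoch loops in this section: a cost must
# beat best_cost * early_stop_threshold to reset patience, otherwise patience
# grows until early_stop_length is hit. A standalone sketch with made-up costs:
def should_stop(costs, threshold=0.99, stop_length=3):
    best, patience = float('inf'), 0
    for epoch, cost in enumerate(costs, 1):
        if cost < best * threshold:
            patience, best = 0, cost
        else:
            patience += 1
        if patience >= stop_length:
            return epoch
    return None

print(should_stop([10.0, 9.0, 8.9, 8.89, 8.88, 8.87]))  # -> 6, stops once gains stall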
def train(self, monitor_channels=None, train_outservice=None, plot=None):
    """
    This method performs the training!!! It is an online training method that goes over minibatches from
    the dataset for a number of epochs, updating parameters after each minibatch.

    You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and evaluate
        on the data.
    train_outservice : OutService, optional
        The OutService to use for the automatically created train_cost monitor. Default of None just
        outputs to logs.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).
    """
    if not self.model:
        log.error("No self.model for the Optimizer!")
        raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                             "was called from the Model. Try initializing the Optimizer with the model "
                             "param and calling optimizer.train().)")

    #########################
    # gradients and updates #
    #########################
    # grab the model parameters to use during training
    self.params = self.model.get_params()
    # Now create the training cost function for the model to use while training - update parameters
    # gradient!
    gradients = grad(cost=self.loss_expression, wrt=list(self.params.values()))
    # now create the dictionary mapping the parameter with its gradient
    gradients = OrderedDict(
        [(param, g) for param, g in zip(list(self.params.values()), gradients)]
    )
    # clip gradients if we want.
    gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

    # Calculate the optimizer updates each run
    # This is where the magic happens for a lot of sub-implementations of SGD!
    # It tells how to update the params each training epoch
    gradient_updates = self.get_updates(gradients)

    # Combine the updates from the model also if applicable
    updates = self.model.get_updates()
    if updates:
        updates.update(gradient_updates)
    else:
        updates = gradient_updates

    log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

    ############
    # monitors #
    ############
    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])
    # finally deal with an outservice provided to monitor training cost
    self.train_outservice = train_outservice
    # remove redundant files made by the fileservice for the train monitor.
    # TODO: THIS FEELS LIKE A HACK. I don't like it.
    if isinstance(self.train_outservice, FileService):
        os.remove(self.train_outservice.valid_filename)
        os.remove(self.train_outservice.test_filename)

    #######################################
    # compile train and monitor functions #
    #######################################
    function_input = raise_to_list(self.model.get_inputs())
    if self.loss_targets is not None:
        function_input += self.loss_targets
    # Compile the training function!
    log.info('Compiling f_learn function for model %s...', self.model._classname)
    t = time.time()
    f_learn = function(inputs=function_input,
                       updates=updates,
                       outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                       name='f_learn')
    log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

    # figure out if we want valid and test (monitors)
    self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None

    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    log.info("-----------TRAINING %s FOR %d EPOCHS-----------", self.model._classname, self.n_epoch)

    self.STOP = False
    self.epoch_counter = 0
    # reset any decay params
    for decay_param in self.get_decay_params():
        decay_param.reset()

    self.times = []
    self.best_cost = numpy.inf
    self.best_params = None
    self.patience = 0

    t = time.time()

    while not self.STOP:
        try:
            self.STOP = self._perform_one_epoch(f_learn, plot)
        except KeyboardInterrupt:
            log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
            self.STOP = True

    # save params
    if self.best_params is not None:
        log.debug("Restoring best model parameters...")
        for best_param, param_value in self.best_params.items():
            self.params[best_param].set_value(param_value, borrow=False)
    log.debug("Saving model parameters...")
    self.model.save_params('trained_epoch_' + str(self.epoch_counter))

    log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
def _perform_one_epoch(self, f_learn, plot=None):
    """
    Performs a single training iteration with the given learn function.
    """
    self.epoch_counter += 1
    t = time.time()
    log.info('EPOCH %s', str(self.epoch_counter))

    # set the noise switches on for training function! (this is where things like dropout happen)
    switch_vals = []
    if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag or self.epoch_counter == 1):
        log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
        switch_vals = [switch.get_value() for switch in self.noise_switches]
        [switch.set_value(1.) for switch in self.noise_switches]

    # train
    train_costs = []
    train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
    for batch_start, batch_end in self.train_batches:
        _outs = raise_to_list(f_learn(batch_start, batch_end))
        train_costs.append(_outs[0])
        # handle any user defined monitors
        if len(train_monitors) > 0:
            current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
            for name, val in current_monitors:
                train_monitors[name].append(val)

    # get the mean values for the batches
    mean_train = numpy.mean(train_costs, 0)
    current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
    # log the mean values!
    log.info('Train cost: %s', trunc(mean_train))
    if len(current_mean_monitors) > 0:
        log.info('Train monitors: %s', str(current_mean_monitors))
    # send the values to their outservices
    if self.train_outservice:
        self.train_outservice.write(mean_train, TRAIN)
    for name, service in self.train_monitors_outservice_dict.items():
        if name in current_mean_monitors and service:
            service.write(current_mean_monitors[name], TRAIN)
    # if there is a plot, also send them over!
    if plot:
        current_mean_monitors.update({TRAIN_COST_KEY: mean_train})
        plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
    if len(self.noise_switches) > 0 and (self.valid_flag or self.test_flag):
        log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
        [switch.set_value(0.) for switch in self.noise_switches]

    # valid
    if self.valid_flag:
        valid_monitors = {key: [] for key in self.valid_monitors_dict.keys()}
        for batch_start, batch_end in self.valid_batches:
            _outs = raise_to_list(self.valid_monitor_function(batch_start, batch_end))
            current_monitors = zip(self.valid_monitors_dict.keys(), _outs)
            for name, val in current_monitors:
                valid_monitors[name].append(val)

        # get the mean values for the batches
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in valid_monitors.items()}
        # log the mean values!
        log.info('Valid monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        for name, service in self.valid_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], VALID)
        # if there is a plot, also send them over!
        if plot:
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    # test
    if self.test_flag:
        test_monitors = {key: [] for key in self.test_monitors_dict.keys()}
        for batch_start, batch_end in self.test_batches:
            _outs = raise_to_list(self.test_monitor_function(batch_start, batch_end))
            current_monitors = zip(self.test_monitors_dict.keys(), _outs)
            for name, val in current_monitors:
                test_monitors[name].append(val)

        # get the mean values for the batches
        current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in test_monitors.items()}
        # log the mean values!
        log.info('Test monitors: %s', str(current_mean_monitors))
        # send the values to their outservices
        for name, service in self.test_monitors_outservice_dict.items():
            if name in current_mean_monitors and service:
                service.write(current_mean_monitors[name], TEST)
        # if there is a plot, also send them over!
        if plot:
            plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    # check for early stopping on train costs
    cost = numpy.sum(train_costs)
    if cost < self.best_cost * self.early_stop_threshold:
        self.patience = 0
        self.best_cost = cost
        # save the parameters that made it the best
        self.best_params = get_shared_values(self.params)
    else:
        self.patience += 1

    # check for stopping either from n_epochs or from threshold/patience
    stop = False
    if self.epoch_counter >= self.n_epoch:
        log.info("Stopping (reached max number of epochs)...")
        stop = True
    if self.patience >= self.early_stop_length:
        log.info("Stopping early (reached stop threshold)...")
        stop = True

    timing = time.time() - t
    self.times.append(timing)

    log.info('time: ' + make_time_units_string(timing))
    log.debug('remaining time: ' +
              make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

    if (self.epoch_counter % self.save_frequency) == 0:
        # save params
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

    # ANNEAL!
    if not stop:
        # perform the appropriate decay on the decay functions/parameters for this optimizer and model
        for decay_param in self.get_decay_params():
            decay_param.decay()

    # reset the switches
    if len(self.noise_switches) > 0:
        [switch.set_value(val) for switch, val in zip(self.noise_switches, switch_vals)]

    # return whether or not to stop this epoch
    return stop
def train(self, monitor_channels=None, train_outservice=None, plot=None, additional_cost=None):
    """
    This method performs the training!!! It is an online training method that goes over minibatches from
    the dataset for a number of epochs, updating parameters after each minibatch.

    You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and evaluate
        on the data.
    train_outservice : OutService, optional
        The OutService to use for the automatically created train_cost monitor. Default of None just
        outputs to logs.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).
    additional_cost : theano expression or list(theano expression), optional
        Any additional cost expressions to use during training (things like regularization). These will be
        summed with the existing cost.
    """
    if not self.model:
        log.error("No self.model for the Optimizer!")
        raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                             "was called from the Model. Try initializing the Optimizer with the model "
                             "param and calling optimizer.train().)")

    #####################################################
    # handle additional costs (normally regularization) #
    #####################################################
    # Create the gradient updates for the model - make sure to handle the possible
    # list of costs used for pretraining of certain parts of the model.
    train_costs = raise_to_list(self.model.get_train_cost())
    # deal with any other additional costs (like regularization, etc.)
    if additional_cost is not None:
        additional_costs = raise_to_list(additional_cost)
        if len(additional_costs) > 1:
            additional_cost = T.sum(additional_costs)

    #########################
    # gradients and updates #
    #########################
    train_updates = []
    self.gradients = []
    for i, train_cost in enumerate(train_costs):
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        if len(train_costs) > 1 and additional_cost is not None:
            log.warning("additional_cost will double count with gradients during layer-wise pretraining!")
            warnings.warn("additional_cost will double count with gradients during layer-wise pretraining!")
            # TODO: additional_cost will double count with gradients during layer-wise pretraining.
            # Need to somehow make w.r.t. params appropriate for the individual training costs.
        gradients, _ = self.model.get_gradient(cost=train_cost, additional_cost=additional_cost)
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)
        # append to list
        self.gradients.append(gradients)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates
        train_updates.append(updates)

    # grab the model parameters to use during training
    self.params = self.model.get_params()
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    ############
    # monitors #
    ############
    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])
    # finally deal with an outservice provided to monitor training cost
    self.train_outservice = train_outservice
    # remove redundant files made by the fileservice for the train monitor.
    # TODO: THIS FEELS LIKE A HACK. I don't like it.
    if isinstance(self.train_outservice, FileService):
        os.remove(self.train_outservice.valid_filename)
        os.remove(self.train_outservice.test_filename)

    #######################################
    # compile train and monitor functions #
    #######################################
    function_input = raise_to_list(self.model.get_inputs()) + raise_to_list(self.model.get_targets())
    train_functions = []
    for i, (updates, train_cost) in enumerate(zip(train_updates, train_costs)):
        # Compile the training function!
        log.info('Compiling f_learn %d/%d function for model %s...',
                 i + 1, len(train_updates), str(type(self.model)))
        t = time.time()
        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[train_cost] + list(self.train_monitors_dict.values()),
                           name='f_learn_%d' % i)
        log.info('f_learn %d compilation took %s', i + 1, make_time_units_string(time.time() - t))
        train_functions.append(f_learn)

    # figure out if we want valid and test (monitors)
    self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None

    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
    # this list of training functions was created during __init__()
    start_time = time.time()
    for func_i, train_function in enumerate(train_functions):
        log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS-----------",
                 str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(train_function, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
def _perform_one_epoch(self):
    self.epoch_counter += 1
    t = time.time()
    log.info('EPOCH %s', str(self.epoch_counter))

    # train
    train_costs = []
    train_monitors = {key: [] for key in self.monitors.keys()}
    for x, y in self.iterator(self.dataset, datasets.TRAIN, self.batch_size, self.minimum_batch_size, self.rng):
        if self.unsupervised:
            train_costs.append(self.f_learn(x))
            for key in self.monitors.keys():
                monitor_function = self.monitors[key]
                train_monitors[key].append(monitor_function(x))
        else:
            train_costs.append(self.f_learn(x, y))
            for key in self.monitors.keys():
                monitor_function = self.monitors[key]
                train_monitors[key].append(monitor_function(x, y))
    log.info('Train cost: %s', trunc(numpy.mean(train_costs, 0)))
    if len(self.monitors.keys()) > 0:
        log.info('Train monitors: %s',
                 str({key: numpy.mean(value, 0) for key, value in train_monitors.items()}))

    # valid
    if self.dataset.hasSubset(datasets.VALID) and len(self.monitors.keys()) > 0:
        valid_monitors = {key: [] for key in self.monitors.keys()}
        for x, y in self.iterator(self.dataset, datasets.VALID, self.batch_size, self.minimum_batch_size, self.rng):
            if self.unsupervised:
                for key in self.monitors.keys():
                    monitor_function = self.monitors[key]
                    valid_monitors[key].append(monitor_function(x))
            else:
                for key in self.monitors.keys():
                    monitor_function = self.monitors[key]
                    valid_monitors[key].append(monitor_function(x, y))
        log.info('Valid monitors: %s',
                 str({key: numpy.mean(value, 0) for key, value in valid_monitors.items()}))

    # test
    if self.dataset.hasSubset(datasets.TEST) and len(self.monitors.keys()) > 0:
        test_monitors = {key: [] for key in self.monitors.keys()}
        for x, y in self.iterator(self.dataset, datasets.TEST, self.batch_size, self.minimum_batch_size, self.rng):
            if self.unsupervised:
                for key in self.monitors.keys():
                    monitor_function = self.monitors[key]
                    test_monitors[key].append(monitor_function(x))
            else:
                for key in self.monitors.keys():
                    monitor_function = self.monitors[key]
                    test_monitors[key].append(monitor_function(x, y))
        log.info('Test monitors: %s',
                 str({key: numpy.mean(value, 0) for key, value in test_monitors.items()}))

    # check for early stopping on train costs
    cost = numpy.sum(train_costs)
    if cost < self.best_cost * self.early_stop_threshold:
        self.patience = 0
        self.best_cost = cost
        # save the parameters that made it the best
        self.best_params = get_shared_values(self.params)
    else:
        self.patience += 1

    if self.epoch_counter >= self.n_epoch or self.patience >= self.early_stop_length:
        log.info("Stopping early...")
        self.STOP = True

    timing = time.time() - t
    self.times.append(timing)

    log.info('time: ' + make_time_units_string(timing))
    log.info('remaining time: ' +
             make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

    if (self.epoch_counter % self.save_frequency) == 0:
        # save params
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

    # ANNEAL!
    if hasattr(self, 'learning_rate_decay'):
        self.learning_rate_decay.decay()
    if hasattr(self, 'momentum_decay'):
        self.momentum_decay.decay()
    for decay_param in self.model.get_decay_params():
        decay_param.decay()
def train(self, monitor_channels=None, train_outservice=None, plot=None, continue_training=False):
    """
    This method performs the training!!!

    It is an online training method that goes over minibatches from the dataset for a number of epochs,
    updating parameters after each minibatch.

    You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and
        evaluate on the data.
    train_outservice : OutService, optional
        The OutService to use for the automatically created train_cost monitor. Default of None just
        outputs to logs.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).
    continue_training : bool
        Whether to continue training from a previous point.
    """
    ###############################################
    # theano index variable to use on the dataset #
    ###############################################
    # index to a [mini]batch - both start and end
    data_idx = T.iscalar('data_index')
    data_end_idx = T.iscalar('data_end_index')
    function_input = [data_idx, data_end_idx]
    batch_slice = slice(data_idx, data_end_idx)

    # compute number of minibatches for training, validation and testing
    # shapes is list of list - input list of datasets to optimizer (for multiple inputs), and each
    # dataset could be a list of shared variables (like multiple sequences from files)
    train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
    valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
    test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

    # train_batches is going to be lists of tuples that contain the start and end indices for train data.
    # this is more useful in the case of datasets that are lists of sequences, so that the start and end
    # indices can make sure a batch does not cross the sequence boundary on the concatenated data
    train_data_lens = [shape[0] for shape in train_data_shapes]
    self.train_batches = self._get_batch_indices(train_data_lens)

    if valid_data_shapes is not None:
        valid_data_lens = [shape[0] for shape in valid_data_shapes]
        self.valid_batches = self._get_batch_indices(valid_data_lens)
    else:
        self.valid_batches = None
    if test_data_shapes is not None:
        test_data_lens = [shape[0] for shape in test_data_shapes]
        self.test_batches = self._get_batch_indices(test_data_lens)
    else:
        self.test_batches = None

    # create the givens for the input function as pairs of (input_variable: sliced_data)
    train_givens = self._get_givens_subset(TRAIN, batch_slice)
    valid_givens = self._get_givens_subset(VALID, batch_slice)
    test_givens = self._get_givens_subset(TEST, batch_slice)

    # Now time to create the gradient updates for the model - make sure to handle the possible
    # list of costs used for pretraining of certain parts of the model.
    train_costs = raise_to_list(self.model.get_train_cost())
    train_updates = []
    self.gradients = []
    for i, train_cost in enumerate(train_costs):
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        gradients, _ = self.model.get_gradient(cost=train_cost)
        self.gradients.append(gradients)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates
        train_updates.append(updates)

    # grab the model parameters to use during training
    self.params = self.model.get_params()
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])

    # finally deal with an outservice provided to monitor training cost
    self.train_outservice = train_outservice
    # remove redundant files made by the fileservice for the train monitor.
    # TODO: THIS FEELS LIKE A HACK. I don't like it.
    if isinstance(self.train_outservice, FileService):
        os.remove(self.train_outservice.valid_filename)
        os.remove(self.train_outservice.test_filename)

    #######################################
    # compile train and monitor functions #
    #######################################
    train_functions = []
    for i in range(len(train_costs)):
        updates = train_updates[i]
        train_cost = train_costs[i]
        # Compile the training function!
        log.info('Compiling f_learn %d/%d function for model %s...',
                 i + 1, len(train_updates), str(type(self.model)))
        t = time.time()
        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[train_cost] + list(self.train_monitors_dict.values()),
                           givens=train_givens,
                           name='f_learn_%d' % i)
        log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))
        train_functions.append(f_learn)

    # figure out if we want valid and test
    self.valid_flag = (self.dataset.getSubset(VALID)[0] is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.getSubset(TEST)[0] is not None) and (len(self.test_monitors_dict) > 0)

    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            givens=valid_givens,
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            givens=test_givens,
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None
    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
    # this list of training functions was created during __init__()
    start_time = time.time()
    for func_i, train_function in enumerate(train_functions):
        log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                 str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch, str(continue_training))
        log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
        if self.dataset.getSubset(VALID)[0] is not None:
            log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
        if self.dataset.getSubset(TEST)[0] is not None:
            log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

        self.STOP = False
        self.epoch_counter = 0
        if not continue_training:
            # reset any decay params
            for decay_param in self.get_decay_params():
                decay_param.reset()
        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(train_function, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
def __init__(self, model, dataset, iterator_class=SequentialIterator, config=None, defaults=_defaults,
             rng=None, n_epoch=None, batch_size=None, minimum_batch_size=None, save_frequency=None,
             early_stop_threshold=None, early_stop_length=None, learning_rate=None, lr_decay=None,
             lr_factor=None, momentum=None, momentum_decay=None, momentum_factor=None,
             nesterov_momentum=None, flag_para_load=None):
    # superclass init
    super(SGD, self).__init__(config=config, defaults=defaults)
    # config and defaults are now combined in self.args! yay!

    self.model = model
    self.dataset = dataset
    self.iterator = iterator_class

    # Training epochs - how many times to iterate over the whole dataset
    self.n_epoch = n_epoch or self.args.get('n_epoch')

    # Dataset iteration batch sizes - number of examples in each calculation
    self.batch_size = batch_size or self.args.get('batch_size')
    self.minimum_batch_size = minimum_batch_size or self.args.get('minimum_batch_size')

    # Number of epochs between saving model parameters
    self.save_frequency = save_frequency or self.args.get('save_frequency')

    # Early stopping threshold and patience - by how much does the cost have to improve over a number of epochs
    self.early_stop_threshold = early_stop_threshold or self.args.get('early_stop_threshold')
    self.early_stop_length = early_stop_length or self.args.get('early_stop_length')

    # Learning rate - how drastic of a step do the parameters change
    lr = learning_rate or self.args.get('learning_rate')
    self.learning_rate = sharedX(lr, 'learning_rate')
    self.lr_scalers = self.model.get_lr_scalers()
    if lr_decay or self.args.get('lr_decay'):
        self.learning_rate_decay = get_decay_function(lr_decay or self.args.get('lr_decay'),
                                                      self.learning_rate,
                                                      self.learning_rate.get_value(),
                                                      lr_factor or self.args.get('lr_factor'))

    # Momentum - smoothing over the parameter changes (see Hinton)
    self.momentum = sharedX(momentum or self.args.get('momentum'), 'momentum')
    if self.args.get('momentum_decay'):
        self.momentum_decay = get_decay_function(momentum_decay or self.args.get('momentum_decay'),
                                                 self.momentum,
                                                 self.momentum.get_value(),
                                                 momentum_factor or self.args.get('momentum_factor'))
    self.nesterov_momentum = nesterov_momentum or self.args.get('nesterov_momentum')

    # RNG for working on random iterator
    if rng is None:
        random.seed(123)
        self.rng = random
    else:
        self.rng = rng

    self.params = self.model.get_params()

    # Now create the training cost function for the model to use while training - update parameters
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    # gradient!
    gradient = grad(self.model.get_train_cost(), self.params)
    grads = OrderedDict(zip(self.params, gradient))

    # Calculate the optimizer updates each run
    # This is where the magic happens for a lot of sub-implementations of SGD, including AdaDelta!
    # It tells how to update the params each training epoch
    gradient_updates = self.get_updates(grads)

    # Combine the updates from the model also if applicable
    train_updates = model.get_updates()
    if train_updates:
        train_updates.update(gradient_updates)
    else:
        train_updates = gradient_updates

    # Compile the training function!
    log.info('Compiling f_learn function for model %s...', str(type(self.model)))
    t = time.time()
    self.f_learn = function(inputs=model.get_inputs(),
                            updates=train_updates,
                            outputs=self.model.get_train_cost(),
                            name='f_learn')
    log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

    # Determine if this function is unsupervised or not by looking at the number of inputs to the
    # f_learn function. If there is only one input, it is unsupervised; otherwise, it is supervised.
    # This workaround was provided by Pascal Lamblin on the theano-users google group.
    num_inputs = len([i for i in self.f_learn.maker.inputs if not i.shared])
    if num_inputs == 1:
        log.debug("Model is unsupervised: 1 input to f_learn.")
        self.unsupervised = True
    elif num_inputs == 2:
        log.debug("Model is supervised: 2 inputs to f_learn.")
        self.unsupervised = False
    else:
        log.error("Number of inputs to f_learn on model %s was %s. "
                  "Needs to be 1 for unsupervised or 2 for supervised.",
                  str(type(self.model)), str(num_inputs))
        raise AssertionError("Number of inputs to f_learn on model %s was %s. "
                             "Needs to be 1 for unsupervised or 2 for supervised." %
                             (str(type(self.model)), str(num_inputs)))

    # grab the function(s) to use to monitor different model values during training
    self.monitors = self.model.get_monitors()
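# The supervised/unsupervised check above counts the non-shared inputs of the compiled
# Theano function (the trick credited to Pascal Lamblin). A small standalone sketch of
# the same inspection; the toy variables here are illustrative assumptions.
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.ivector('y')
f = theano.function(inputs=[x, y], outputs=T.sum(x) + T.sum(y))

# Shared variables also appear in f.maker.inputs, so filter them out to count
# only the data inputs a caller must provide.
num_inputs = len([i for i in f.maker.inputs if not i.shared])
print(num_inputs)  # 2 -> would be treated as a supervised setup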
def _build_computation_graph(self):
    ###################### BUILD NETWORK ##########################
    # whether or not to mirror the input images before feeding them into the network
    if self.flag_datalayer:
        layer_1_input = mirror_images(input=self.x,
                                      image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                                      cropsize=227,
                                      rand=self.rand,
                                      flag_rand=self.rand_crop)
    else:
        layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

    # Start with 5 convolutional pooling layers
    log.debug("convpool layer 1...")
    convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227, 227), layer_1_input),
                                    filter_shape=(96, 3, 11, 11),
                                    convstride=4,
                                    padsize=0,
                                    group=1,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.0,
                                    local_response_normalization=True)
    # Add this layer's parameters!
    self.params += convpool_layer1.get_params()

    log.debug("convpool layer 2...")
    convpool_layer2 = ConvPoolLayer(inputs_hook=((self.batch_size, 96, 27, 27), convpool_layer1.get_outputs()),
                                    filter_shape=(256, 96, 5, 5),
                                    convstride=1,
                                    padsize=2,
                                    group=2,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.1,
                                    local_response_normalization=True)
    # Add this layer's parameters!
    self.params += convpool_layer2.get_params()

    log.debug("convpool layer 3...")
    convpool_layer3 = ConvPoolLayer(inputs_hook=((self.batch_size, 256, 13, 13), convpool_layer2.get_outputs()),
                                    filter_shape=(384, 256, 3, 3),
                                    convstride=1,
                                    padsize=1,
                                    group=1,
                                    poolsize=1,
                                    poolstride=0,
                                    bias_init=0.0,
                                    local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer3.get_params()

    log.debug("convpool layer 4...")
    convpool_layer4 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer3.get_outputs()),
                                    filter_shape=(384, 384, 3, 3),
                                    convstride=1,
                                    padsize=1,
                                    group=2,
                                    poolsize=1,
                                    poolstride=0,
                                    bias_init=0.1,
                                    local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer4.get_params()

    log.debug("convpool layer 5...")
    convpool_layer5 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer4.get_outputs()),
                                    filter_shape=(256, 384, 3, 3),
                                    convstride=1,
                                    padsize=1,
                                    group=2,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.0,
                                    local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer5.get_params()

    # Now onto the fully-connected layers!
    fc_config = {
        'activation': 'rectifier',  # type of activation function to use for output
        'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
        'weights_mean': 0.0,  # mean for gaussian weights init
        'weights_std': 0.005,  # standard deviation for gaussian weights init
        'bias_init': 0.0  # how to initialize the bias parameter
    }

    log.debug("fully connected layer 1 (model layer 6)...")
    # we want to have dropout applied to the training version, but not the test version.
    fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
    fc_layer6 = BasicLayer(inputs_hook=(9216, fc_layer6_input),
                           output_size=4096,
                           noise='dropout',
                           noise_level=0.5,
                           **fc_config)
    # Add this layer's parameters!
    self.params += fc_layer6.get_params()
    # Add the dropout noise switch
    self.noise_switches += fc_layer6.get_noise_switch()

    log.debug("fully connected layer 2 (model layer 7)...")
    fc_layer7 = BasicLayer(inputs_hook=(4096, fc_layer6.get_outputs()),
                           output_size=4096,
                           noise='dropout',
                           noise_level=0.5,
                           **fc_config)
    # Add this layer's parameters!
    self.params += fc_layer7.get_params()
    # Add the dropout noise switch
    self.noise_switches += fc_layer7.get_noise_switch()

    # last layer is a softmax prediction output layer
    softmax_config = {
        'weights_init': 'gaussian',
        'weights_mean': 0.0,
        'weights_std': 0.005,
        'bias_init': 0.0
    }
    log.debug("softmax classification layer (model layer 8)...")
    softmax_layer8 = SoftmaxLayer(inputs_hook=(4096, fc_layer7.get_outputs()),
                                  output_size=1000,
                                  **softmax_config)
    # Add this layer's parameters!
    self.params += softmax_layer8.get_params()

    # finally the softmax output from the whole thing!
    self.output = softmax_layer8.get_outputs()
    self.targets = softmax_layer8.get_targets()

    #####################
    # Cost and monitors #
    #####################
    self.train_cost = softmax_layer8.negative_log_likelihood()
    cost = softmax_layer8.negative_log_likelihood()
    errors = softmax_layer8.errors()
    train_errors = softmax_layer8.errors()

    self.monitors = OrderedDict([('cost', cost), ('errors', errors), ('dropout_errors', train_errors)])

    #########################
    # Compile the functions #
    #########################
    log.debug("Compiling functions!")
    t = time.time()
    log.debug("f_run...")
    # use the actual argmax from the classification
    self.f_run = function(inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
    log.debug("compilation took %s", make_time_units_string(time.time() - t))
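# Hypothetical usage of the compiled f_run above, which maps a batch of images straight
# to class indices via the softmax argmax. The `AlexNet` constructor and its arguments
# are assumptions for illustration; only the bc01 input shape comes from the code above.
import numpy

model = AlexNet(batch_size=128)  # assumed owner of _build_computation_graph()

# a random batch in bc01 format, matching the (batch_size, 3, 227, 227) input hook
batch_x = numpy.random.rand(128, 3, 227, 227).astype('float32')

predictions = model.f_run(batch_x)  # 128 predicted class indices in [0, 1000)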
def train(self, monitor_channels=None, plot=None):
    """
    This method performs the training!!!

    It is an online training method that goes over minibatches from the dataset for a number of epochs,
    updating parameters after each minibatch.

    You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and
        evaluate on the data.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).
    """
    if not self.model:
        log.error("No self.model for the Optimizer!")
        raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                             "was called from the Model. Try initializing the Optimizer with the model "
                             "param and calling optimizer.train().)")

    #########################
    # gradients and updates #
    #########################
    # grab the model parameters to use during training
    self.params = self.model.get_params()
    # Now create the training cost function for the model to use while training - update parameters
    # gradient!
    # First find the basic variables that will be updated
    params = set()
    for param in self.params.values():
        params.update(base_variables(param))
    params = list(params)
    gradients = grad(cost=self.loss_expression, wrt=params)
    # now create the dictionary mapping the parameter with its gradient
    gradients = OrderedDict(
        [(param, g) for param, g in zip(params, gradients)]
    )
    # clip gradients if we want.
    gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

    # Calculate the optimizer updates each run
    # This is where the magic happens for a lot of sub-implementations of SGD!
    # It tells how to update the params each training epoch
    gradient_updates = self.get_updates(gradients)

    # Combine the updates from the model also if applicable
    updates = self.model.get_updates()
    if updates:
        updates.update(gradient_updates)
    else:
        updates = gradient_updates

    log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

    ############
    # monitors #
    ############
    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])

    #######################################
    # compile train and monitor functions #
    #######################################
    function_input = raise_to_list(self.model.get_inputs())
    if self.loss_targets is not None:
        function_input += self.loss_targets
    # Compile the training function!
    log.info('Compiling f_learn function for model %s...', self.model._classname)
    t = time.time()
    f_learn = function(inputs=function_input,
                       updates=updates,
                       outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                       name='f_learn')
    log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

    # figure out if we want valid and test (monitors)
    self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)

    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None
    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    log.info("-----------TRAINING %s FOR %d EPOCHS-----------", self.model._classname, self.n_epoch)

    self.STOP = False
    self.epoch_counter = 0
    # reset any decay params
    for decay_param in self.get_decay_params():
        decay_param.reset()
    self.times = []
    self.best_cost = numpy.inf
    self.best_params = None
    self.patience = 0

    t = time.time()

    while not self.STOP:
        try:
            self.STOP = self._perform_one_epoch(f_learn, plot)
        except KeyboardInterrupt:
            log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
            self.STOP = True

    # save params
    if self.best_params is not None:
        log.debug("Restoring best model parameters...")
        self.model.set_param_values(self.best_params, borrow=False)
    log.debug("Saving model parameters...")
    self.model.save_params('trained_epoch_' + str(self.epoch_counter))

    log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
def _perform_one_epoch(self, f_learn, plot=None):
    """
    Performs a single training iteration with the given learn function.
    """
    self.epoch_counter += 1
    t = time.time()
    log.info('EPOCH %s', str(self.epoch_counter))

    # set the noise switches on for training function! (this is where things like dropout happen)
    if not self.model.switches_on:
        self.model.turn_on_switches()

    #########
    # train #
    #########
    train_costs = []
    train_monitors = {key: [] for key in self.train_monitors_dict.keys()}
    train_data = [
        minibatch(input_data, self.batch_size, self.min_batch_size)
        for input_data in raise_to_list(self.dataset.train_inputs)
    ]
    if self.dataset.train_targets is not None and not self.unsupervised:
        train_data += [
            minibatch(target, self.batch_size, self.min_batch_size)
            for target in raise_to_list(self.dataset.train_targets)
        ]

    for batch in min_normalized_izip(*train_data):
        _outs = raise_to_list(f_learn(*batch))
        train_costs.append(_outs[0])
        # handle any user defined monitors (if different from the train cost)
        if len(train_monitors) > 0:
            current_monitors = zip(self.train_monitors_dict.keys(), _outs[1:])
            for name, val in current_monitors:
                val = numpy.asarray(val)
                train_monitors[name].append(val)

    # get the mean values for the batches
    mean_train = numpy.mean(train_costs, 0)
    current_mean_monitors = {key: numpy.mean(vals, 0) for key, vals in train_monitors.items()}
    # log the mean values!
    log.info('Train cost: %s', trunc(mean_train))
    if len(current_mean_monitors) > 0:
        log.info('Train monitors: %s', str(current_mean_monitors))
    # send the values to their outservices
    for name, service in self.train_monitors_outservice_dict.items():
        if name in current_mean_monitors and service:
            service.write(current_mean_monitors[name], "train")
    # if there is a plot, also send them over!
    if plot:
        plot.update_plots(epoch=self.epoch_counter, monitors=current_mean_monitors)

    # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
    if self.model.switches_on:
        self.model.turn_off_switches()

    #########
    # valid #
    #########
    self._compute_over_subset("valid", self.dataset.valid_inputs, self.dataset.valid_targets,
                              self.valid_monitors_dict, self.valid_monitor_function,
                              self.valid_monitors_outservice_dict, plot)

    ########
    # test #
    ########
    self._compute_over_subset("test", self.dataset.test_inputs, self.dataset.test_targets,
                              self.test_monitors_dict, self.test_monitor_function,
                              self.test_monitors_outservice_dict, plot)

    ###########
    # cleanup #
    ###########
    # check for early stopping on train costs
    cost = numpy.sum(train_costs)
    # if the cost improved, reset the patience and record the best cost.
    if cost < self.best_cost * self.early_stop_threshold:
        self.patience = 0
        self.best_cost = cost
        # save the parameters that made it the best
        self.best_params = self.model.get_param_values(borrow=False)
    elif not numpy.isnan(cost):
        self.patience += 1

    # check for stopping either from n_epochs or from threshold/patience
    stop = False
    if self.epoch_counter >= self.n_epoch:
        log.info("Stopping (reached max number of epochs)...")
        stop = True
    if self.patience >= self.early_stop_length:
        log.info("Stopping early (reached stop threshold)...")
        stop = True

    timing = time.time() - t
    self.times.append(timing)

    log.info('time: ' + make_time_units_string(timing))
    log.debug('remaining time: ' +
              make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

    if (self.epoch_counter % self.save_frequency) == 0:
        # save params
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

    # ANNEAL!
    if not stop:
        # perform the appropriate decay on the decay functions/parameters for this optimizer and model
        for decay_param in self.get_decay_params():
            decay_param.decay()

    # return whether or not to stop this epoch
    return stop
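# The ANNEAL step above assumes each decay parameter exposes decay() and reset().
# A minimal sketch of one such object; the class name and the exponential schedule
# are illustrative assumptions, not the library's actual decay implementation.
class ExponentialDecayParam(object):
    def __init__(self, shared_var, decay_factor=0.95):
        self.shared_var = shared_var            # e.g. a learning-rate shared variable
        self.initial = shared_var.get_value()
        self.decay_factor = decay_factor

    def decay(self):
        # shrink the value once per epoch
        self.shared_var.set_value(self.shared_var.get_value() * self.decay_factor)

    def reset(self):
        # restore the starting value before a new training run
        self.shared_var.set_value(self.initial)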
def train(self, monitor_channels=None, train_outservice=None, plot=None, additional_cost=None):
    """
    This method performs the training!!!

    It is an online training method that goes over minibatches from the dataset for a number of epochs,
    updating parameters after each minibatch.

    You can disrupt training with a KeyboardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and
        evaluate on the data.
    train_outservice : OutService, optional
        The OutService to use for the automatically created train_cost monitor. Default of None just
        outputs to logs.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).
    additional_cost : theano expression or list(theano expression), optional
        Any additional cost expressions to use during training (things like regularization). These
        will be summed with the existing cost.
    """
    if not self.model:
        log.error("No self.model for the Optimizer!")
        raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                             "was called from the Model. Try initializing the Optimizer with the model "
                             "param and calling optimizer.train().)")

    #####################################################
    # handle additional costs (normally regularization) #
    #####################################################
    # Create the gradient updates for the model - make sure to handle the possible
    # list of costs used for pretraining of certain parts of the model.
    train_costs = raise_to_list(self.model.get_train_cost())
    # deal with any other additional costs (like regularization, etc.)
    if additional_cost is not None:
        additional_costs = raise_to_list(additional_cost)
        if len(additional_costs) > 1:
            additional_cost = T.sum(additional_costs)

    #########################
    # gradients and updates #
    #########################
    train_updates = []
    self.gradients = []
    for i, train_cost in enumerate(train_costs):
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        if len(train_costs) > 1 and additional_cost is not None:
            log.warning("additional_cost will double count with gradients during layer-wise pretraining!")
            warnings.warn("additional_cost will double count with gradients during layer-wise pretraining!")
        # TODO: additional_cost will double count with gradients during layer-wise pretraining.
        # Need to somehow make w.r.t. params appropriate for the individual training costs.
        gradients, _ = self.model.get_gradient(cost=train_cost, additional_cost=additional_cost)
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)
        # append to list
        self.gradients.append(gradients)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates
        train_updates.append(updates)

    # grab the model parameters to use during training
    self.params = self.model.get_params()
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    ############
    # monitors #
    ############
    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])

    # finally deal with an outservice provided to monitor training cost
    self.train_outservice = train_outservice
    # remove redundant files made by the fileservice for the train monitor.
    # TODO: THIS FEELS LIKE A HACK. I don't like it.
    if isinstance(self.train_outservice, FileService):
        os.remove(self.train_outservice.valid_filename)
        os.remove(self.train_outservice.test_filename)

    #######################################
    # compile train and monitor functions #
    #######################################
    function_input = raise_to_list(self.model.get_inputs()) + raise_to_list(self.model.get_targets())
    train_functions = []
    for i, (updates, train_cost) in enumerate(zip(train_updates, train_costs)):
        # Compile the training function!
        log.info('Compiling f_learn %d/%d function for model %s...',
                 i + 1, len(train_updates), str(type(self.model)))
        t = time.time()
        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[train_cost] + list(self.train_monitors_dict.values()),
                           name='f_learn_%d' % i)
        log.info('f_learn %d compilation took %s', i + 1, make_time_units_string(time.time() - t))
        train_functions.append(f_learn)

    # figure out if we want valid and test (monitors)
    self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)

    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None
    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
    # this list of training functions was created during __init__()
    start_time = time.time()
    for func_i, train_function in enumerate(train_functions):
        log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS-----------",
                 str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()
        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(train_function, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
def build_computation_graph(self):
    #################
    # Build the GSN #
    #################
    log.debug("Building GSN graphs...")

    # GSN for training - with noise specified in initialization
    # if there is no hiddens_hook, build the GSN normally using the input X
    if not self.hiddens_flag:
        p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)
    # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
    # generative from the hiddens
    else:
        p_X_chain, _ = self.build_gsn(hiddens=self.hiddens, add_noise=self.add_noise, reverse=True)

    # GSN for prediction - same as above but no noise
    # deal with hiddens_hook exactly as above.
    if not self.hiddens_flag:
        p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
    else:
        p_X_chain_recon, recon_hiddens = self.build_gsn(hiddens=self.hiddens, add_noise=False, reverse=True)

    ####################
    # Costs and output #
    ####################
    log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
    # use the noisy ones for training cost
    costs = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain]
    self.show_cost = costs[-1]  # for a monitor to show progress
    cost = numpy.sum(costs)  # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH

    # use the non-noisy graph for prediction
    gsn_costs_recon = [self.cost_function(output=rX, target=self.X, **self.cost_args)
                       for rX in p_X_chain_recon]
    # another monitor, same as self.show_cost but on the non-noisy graph.
    self.monitor = gsn_costs_recon[-1]
    # this should be considered the main output of the computation, the sample after the
    # last walkback from the non-noisy graph.
    output = p_X_chain_recon[-1]
    # these should be considered the model's hidden representation - the hidden representation after
    # the last walkback from the non-noisy graph.
    hiddens = recon_hiddens

    train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
    train_mse = T.mean(train_mse)
    mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
    mse = T.mean(mse)

    monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                            ('recon_cost', self.monitor),
                            ('mse', mse),
                            ('train_mse', train_mse)])

    ############
    # Sampling #
    ############
    # the input to the sampling function
    X_sample = T.matrix("X_sampling")
    self.network_state_input = [X_sample] + [T.matrix("H_sampling_" + str(i + 1))
                                             for i in range(self.layers)]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates
    self.network_state_output = [X_sample] + self.network_state_input[1:]
    visible_pX_chain = []

    # ONE update
    log.debug("Performing one walkback in network state sampling.")
    self.update_layers(self.network_state_output, visible_pX_chain, add_noise=True, reverse=False)

    #####################################################
    #        Create the run and monitor functions       #
    #####################################################
    log.debug("Compiling functions...")
    t = time.time()

    # doesn't make sense to have this if there is a hiddens_hook
    if not self.hiddens_flag:
        # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output
        # from the non-noisy computation graph
        log.debug("f_run...")
        self.f_run = function(inputs=[self.X], outputs=output, name='gsn_f_run')

        # this is a helper function - it corrupts inputs when testing the non-noisy graph
        # (aka before feeding the input to f_run)
        log.debug("f_noise...")
        self.f_noise = function(inputs=[self.X], outputs=self.input_noise(self.X), name='gsn_f_noise')

    # the sampling function, for creating lots of samples from the computational graph.
    # (mostly for log-likelihood or visualization)
    log.debug("f_sample...")
    if self.layers == 1:
        self.f_sample = function(inputs=[X_sample],
                                 outputs=visible_pX_chain[-1],
                                 name='gsn_f_sample_single_layer')
    else:
        # The unused-input warning here is expected: the first odd layers are not used directly -
        # they are computed from the even layers.
        self.f_sample = function(inputs=self.network_state_input,
                                 outputs=self.network_state_output + visible_pX_chain,
                                 name='gsn_f_sample')

    log.debug("GSN compiling done. Took %s", make_time_units_string(time.time() - t))

    return cost, monitors, output, hiddens
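# Per the comments above, f_noise corrupts clean inputs before they are fed to the
# non-noisy f_run graph. A hedged usage sketch: the `gsn` instance and the data shape
# are assumptions for illustration.
import numpy

x = numpy.random.rand(32, 784).astype('float32')  # a batch of flattened inputs

noisy_x = gsn.f_noise(x)              # corrupt the clean inputs
reconstruction = gsn.f_run(noisy_x)   # denoise through the non-noisy graph

# mean squared reconstruction error, mirroring the 'mse' monitor above
mse = numpy.mean((reconstruction - x) ** 2)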
def _perform_one_epoch(self, f_learn):
    self.epoch_counter += 1
    t = time.time()
    log.info('EPOCH %s', str(self.epoch_counter))

    # set the noise switches on for training function! (this is where things like dropout happen)
    if len(self.noise_switches) > 0:
        log.debug("Turning on %s noise switches", str(len(self.noise_switches)))
        switch_vals = [switch.get_value() for switch in self.noise_switches]
        [switch.set_value(1.) for switch in self.noise_switches]

    # train
    train_costs = []
    train_monitors = {key: [] for key in self.monitors.keys()}
    for batch_start, batch_end in self.train_batches:
        train_costs.append(f_learn(batch_start, batch_end))
        self.call_monitors(monitor_function=self.train_monitor_function,
                           monitors_dict=train_monitors,
                           inputs=[batch_start, batch_end])
    log.info('Train cost: %s', trunc(numpy.mean(train_costs, 0)))
    if len(self.monitors.keys()) > 0:
        log.info('Train monitors: %s',
                 str({key: numpy.mean(value, 0) for key, value in train_monitors.items()}))

    # set the noise switches off for valid and test sets! we assume unseen data is noisy anyway :)
    if len(self.noise_switches) > 0:
        log.debug("Turning off %s noise switches", str(len(self.noise_switches)))
        [switch.set_value(0.) for switch in self.noise_switches]

    # valid
    if self.dataset.hasSubset(VALID) and len(self.monitors.keys()) > 0:
        valid_monitors = {key: [] for key in self.monitors.keys()}
        for batch_start, batch_end in self.valid_batches:
            self.call_monitors(monitor_function=self.valid_monitor_function,
                               monitors_dict=valid_monitors,
                               inputs=[batch_start, batch_end])
        log.info('Valid monitors: %s',
                 str({key: numpy.mean(value, 0) for key, value in valid_monitors.items()}))

    # test
    if self.dataset.hasSubset(TEST) and len(self.monitors.keys()) > 0:
        test_monitors = {key: [] for key in self.monitors.keys()}
        for batch_start, batch_end in self.test_batches:
            self.call_monitors(monitor_function=self.test_monitor_function,
                               monitors_dict=test_monitors,
                               inputs=[batch_start, batch_end])
        log.info('Test monitors: %s',
                 str({key: numpy.mean(value, 0) for key, value in test_monitors.items()}))

    # check for early stopping on train costs
    cost = numpy.sum(train_costs)
    if cost < self.best_cost * self.early_stop_threshold:
        self.patience = 0
        self.best_cost = cost
        # save the parameters that made it the best
        self.best_params = get_shared_values(self.params)
    else:
        self.patience += 1

    # check for stopping either from n_epochs or from threshold/patience
    stop = False
    if self.epoch_counter >= self.n_epoch:
        log.info("Stopping (reached max number of epochs)...")
        stop = True
    if self.patience >= self.early_stop_length:
        log.info("Stopping early (reached stop threshold)...")
        stop = True

    timing = time.time() - t
    self.times.append(timing)
    log.info('time: ' + make_time_units_string(timing))
    log.info('remaining time: ' +
             make_time_units_string((self.n_epoch - self.epoch_counter) * numpy.mean(self.times)))

    if (self.epoch_counter % self.save_frequency) == 0:
        # save params
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

    # ANNEAL!
    if not stop:
        if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
            self.learning_rate_decay.decay()
        if hasattr(self, 'momentum_decay') and self.momentum_decay:
            self.momentum_decay.decay()
        for decay_param in self.model.get_decay_params():
            decay_param.decay()

    # reset the switches
    if len(self.noise_switches) > 0:
        [switch.set_value(val) for switch, val in zip(self.noise_switches, switch_vals)]

    # return whether or not to stop this epoch
    return stop
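# The noise switches toggled above are shared scalar gates multiplied into the noisy
# parts of the graph. A minimal sketch of that pattern with illustrative names - not
# the library's actual switch implementation (e.g. no 1/p rescaling of the mask).
import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

switch = theano.shared(numpy.cast['float32'](1.), name='noise_switch')
rng = MRG_RandomStreams(seed=123)

x = T.matrix('x')
mask = rng.binomial(size=x.shape, p=0.5, dtype='float32')
# switch == 1. applies the dropout mask; switch == 0. passes the input through.
noisy_x = switch * (x * mask) + (1. - switch) * x

f = theano.function([x], noisy_x)
switch.set_value(0.)  # turn noise off for valid/test evaluation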
def main():
    var = theano.shared(T.zeros(shape=(88, 100), dtype=theano.config.floatX).eval(), name='W')
    updates = [(var, add_uniform(input=var, noise_level=.02))]

    stats = get_stats(var)
    l1 = stats.pop('l1')
    l2 = stats.pop('l2')
    min = stats.pop('min')
    max = stats.pop('max')
    var = stats.pop('var')
    std = stats.pop('std')
    mean = stats.pop('mean')

    mean_monitor = Monitor('mean', mean, train=True, valid=True)
    var_monitor = Monitor('var', var)

    w_channel = MonitorsChannel('W', monitors=mean_monitor)
    stat_channel = MonitorsChannel('stats', monitors=[var_monitor])

    monitors = [w_channel, stat_channel]

    train_collapsed = collapse_channels(monitors, train=True)
    train_collapsed = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
    valid_collapsed = collapse_channels(monitors, valid=True)
    valid_collapsed = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])

    plot = Plot(bokeh_doc_name='test_plots', monitor_channels=monitors, open_browser=True)

    log.debug('compiling...')
    f = theano.function(inputs=[], outputs=list(train_collapsed.values()), updates=updates)
    f2 = theano.function(inputs=[], outputs=list(valid_collapsed.values()), updates=updates)
    log.debug('done')

    t1 = time.time()

    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f()
        m = OrderedDict(zip(train_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        log.debug('----- ' + make_time_units_string(time.time() - t))
    for epoch in range(100):
        t = time.time()
        log.debug(epoch)
        vals = f2()
        m = OrderedDict(zip(valid_collapsed.keys(), vals))
        plot.update_plots(epoch, m)
        log.debug('----- ' + make_time_units_string(time.time() - t))

    log.debug("TOTAL TIME " + make_time_units_string(time.time() - t1))