def compile_run_fn(self):
    """
    Compile and set the `f_run` function used for `run()` — the Theano function computing the
    model's outputs given inputs. Sets the `self.f_run` attribute; does nothing if it already exists.

    .. note::
        The run function defaults like so::

            self.f_run = function(inputs = raise_to_list(self.get_inputs()),
                                  outputs = raise_to_list(self.get_outputs()),
                                  updates = self.get_updates(),
                                  name = 'f_run')
    """
    if not hasattr(self, 'f_run'):
        log.debug("Compiling f_run...")
        t = time.time()
        outputs = raise_to_list(self.get_outputs())
        # unwrap a single-element output list so f_run returns the bare value instead of a 1-list
        if outputs is not None and len(outputs) == 1:
            outputs = outputs[0]
        self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                              outputs=outputs,
                              updates=self.get_updates(),
                              name='f_run')
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
    else:
        # fix: Logger.warn is a deprecated alias in the stdlib logging module; use warning()
        log.warning('f_run already exists!')
def compile_run_fn(self):
    """
    Helper that compiles the Theano `f_run` function used by `run()` for computing the model's
    outputs given inputs. The compiled function is cached on the `self.f_run` attribute, so
    repeated calls compile at most once.

    Returns
    -------
    Theano function
        The compiled theano function for running the model.
    """
    if getattr(self, 'f_run', None):
        # already compiled on a previous call - nothing to do
        log.debug('f_run already exists!')
    else:
        log.debug("Compiling f_run...")
        start = time.time()
        self.f_run = function(
            inputs=raise_to_list(self.get_inputs()),
            outputs=self.get_outputs(),
            updates=self.get_updates(),
            name='f_run',
        )
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - start))
    return self.f_run
def generate(self, initial=None, n_steps=None):
    """
    Generate visible inputs from the model for `n_steps` and starting at recurrent hidden
    state `initial`.

    Parameters
    ----------
    initial : tensor, optional
        Recurrent hidden state to start generation from. Defaults to `self.u0.eval()`.
    n_steps : int, optional
        Number of generation steps to do. Defaults to `self.generate_n_steps`.

    Returns
    -------
    tuple(array_like, array_like)
        The generated inputs and the ending recurrent hidden states.
    """
    # compile the generate function! (cached on self so this happens at most once)
    if not hasattr(self, 'f_generate'):
        log.debug("compiling f_generate...")
        self.f_generate = function(inputs=[self.generate_u0, self.n_steps],
                                   outputs=[self.x_ts, self.u_t],
                                   updates=self.updates_generate)
        log.debug("compilation done!")
    # fix: `initial or self.u0.eval()` raises ValueError for multi-element numpy arrays
    # (ambiguous truth value) and silently discards a legitimate all-zero state;
    # compare against None explicitly instead.
    if initial is None:
        initial = self.u0.eval()
    # fix: `n_steps or ...` would also discard an explicit n_steps=0
    if n_steps is None:
        n_steps = self.generate_n_steps
    return self.f_generate(initial, n_steps)
def compile_run_fn(self):
    """
    Compile and set the `f_run` function used for `run()` — the Theano function computing the
    model's outputs given inputs. Sets the `self.f_run` attribute; does nothing if it already exists.

    .. note::
        The run function defaults like so::

            self.f_run = function(inputs = raise_to_list(self.get_inputs()),
                                  outputs = raise_to_list(self.get_outputs()),
                                  updates = self.get_updates(),
                                  name = 'f_run')
    """
    if not hasattr(self, 'f_run'):
        log.debug("Compiling f_run...")
        t = time.time()
        outputs = raise_to_list(self.get_outputs())
        # unwrap a single-element output list so f_run returns the bare value instead of a 1-list
        if outputs is not None and len(outputs) == 1:
            outputs = outputs[0]
        self.f_run = function(inputs=raise_to_list(self.get_inputs()),
                              outputs=outputs,
                              updates=self.get_updates(),
                              name='f_run')
        log.debug("Compilation done. Took %s", make_time_units_string(time.time() - t))
    else:
        # fix: Logger.warn is a deprecated alias in the stdlib logging module; use warning()
        log.warning('f_run already exists!')
def compile_run_fn(self):
    """
    Compile (once) and return the Theano `f_run` function that `run()` uses to compute the
    model's outputs given inputs. The result is memoized on `self.f_run`.

    Returns
    -------
    Theano function
        The compiled theano function for running the model.
    """
    if not getattr(self, 'f_run', None):
        log.debug("Compiling f_run...")
        tic = time.time()
        compile_kwargs = dict(
            inputs=raise_to_list(self.get_inputs()),
            outputs=self.get_outputs(),
            updates=self.get_updates(),
            name='f_run',
        )
        self.f_run = function(**compile_kwargs)
        elapsed = make_time_units_string(time.time() - tic)
        log.debug("Compilation done. Took %s", elapsed)
    else:
        log.debug('f_run already exists!')
    return self.f_run
def _compile_csl_fn(): """ BUG HERE, not doing properly by chains (still has the bug, I don't see it) This is taking too much GPU mem mean: N(# of chains)*K(samples per chain)*D(data dim) minibatch: M(# of examples)*D (data dim) M * N matrix where each element is LL of one example against one chain. This function is for computing CSL over parallel chains of minibatches. Returns ------- theano function Function computing M * N matrix where each element is LL of one example against one chain. """ # when means is a 3D tensor (N, K, D) # When there are N chains, each chain having K samples of dimension D log.debug('building theano fn for Bernoulli CSL') means = T.tensor3('chains') minibatch = T.matrix('inputs') # how many chains CSL average over N = 5 # minibatch size M = 10 # data dim D = 784 minibatch.tag.test_value = as_floatX(numpy.random.binomial(1, 0.5, size=(M, D))) # chain length K = 100 means.tag.test_value = as_floatX(numpy.random.uniform(size=(N, K, D))) # computing LL # the length of each chain sample_size = means.shape[1] _minibatch = minibatch.dimshuffle(0, 'x', 'x', 1) _means = means.dimshuffle('x', 0, 1, 2) A = T.log(sample_size) B = _minibatch * T.log(_means) + (1. - _minibatch) * T.log(1. - _means) C = B.sum(axis=3) D = log_sum_exp_theano(C, axis=2) E = D - A # G = E.mean(axis=1) f = function( inputs=[minibatch, means], outputs=E, name='CSL_independent_bernoulli_fn' ) return f
def _compile_csl_fn():
    """
    Build the Theano function computing CSL over parallel chains of minibatches.

    BUG HERE, not doing properly by chains (still has the bug, I don't see it)
    This is taking too much GPU mem

    mean: N(# of chains)*K(samples per chain)*D(data dim)
    minibatch: M(# of examples)*D (data dim)

    Returns
    -------
    theano function
        Function computing M * N matrix where each element is LL of one example
        against one chain.
    """
    log.debug('building theano fn for Bernoulli CSL')
    # means: 3D tensor (N chains, K samples per chain, D data dims)
    means = T.tensor3('chains')
    minibatch = T.matrix('inputs')
    # debug test values for theano's compute_test_value mode:
    # N chains, M examples, D data dims, K samples per chain
    n_chains, batch_size, data_dim, chain_len = 5, 10, 784, 100
    minibatch.tag.test_value = as_floatX(numpy.random.binomial(1, 0.5, size=(batch_size, data_dim)))
    means.tag.test_value = as_floatX(numpy.random.uniform(size=(n_chains, chain_len, data_dim)))
    # symbolic chain length (K), taken from the means tensor itself
    sample_size = means.shape[1]
    # broadcast both operands to 4D (M, N, K, D) for the elementwise Bernoulli log-likelihood
    x4 = minibatch.dimshuffle(0, 'x', 'x', 1)
    mu4 = means.dimshuffle('x', 0, 1, 2)
    log_k = T.log(sample_size)
    pointwise_ll = x4 * T.log(mu4) + (1. - x4) * T.log(1. - mu4)
    # sum over data dims, then log-mean-exp over the K samples of each chain
    per_sample_ll = pointwise_ll.sum(axis=3)
    chain_ll = log_sum_exp_theano(per_sample_ll, axis=2) - log_k
    return function(inputs=[minibatch, means],
                    outputs=chain_ll,
                    name='CSL_independent_bernoulli_fn')
def run(self, input):
    """
    This method will return the Prototype's output (run through the `f_run` function), given an input.
    The input comes from all unique inputs to the models in the Prototype as calculated from
    `get_inputs()` and the outputs computed similarly from `get_outputs`.

    Try to avoid re-compiling the theano function created for run - the compiled function is
    cached on `self.f_run` and reused.

    Parameters
    ----------
    input: array_like
        Theano/numpy tensor-like object that is the input into the model's computation graph.

    Returns
    -------
    array_like
        Theano/numpy tensor-like object that is the output of the model's computation graph.
    """
    # set the noise switches off for running! we assume unseen data is noisy anyway :)
    # fix: hoist the repeated self.get_switches() calls and use plain for-loops instead of
    # throwaway list comprehensions for the set_value side effects.
    switches = self.get_switches()
    old_switch_vals = []
    if len(switches) > 0:
        log.debug("Turning off %s noise switches, resetting them after run!", str(len(switches)))
        old_switch_vals = [switch.get_value() for switch in switches]
        for switch in switches:
            switch.set_value(0.)

    # make sure the input is raised to a list - we are going to splat it!
    input = raise_to_list(input)

    # compile f_run if we haven't made it yet
    if not hasattr(self, 'f_run'):
        inputs = raise_to_list(self.get_inputs())
        outputs = raise_to_list(self.get_outputs())
        # unwrap a single-element output list so f_run returns the bare value, not a 1-list
        if outputs is not None and len(outputs) == 1:
            outputs = outputs[0]
        updates = self.get_updates()
        t = time.time()
        log.info("Compiling f_run...")
        self.f_run = function(inputs=inputs, outputs=outputs, updates=updates, name="f_run")
        log.info("Compilation done! Took %s", make_time_units_string(time.time() - t))
    output = self.f_run(*input)

    # reset any switches to how they were! (zip over the values saved above; no-op when empty)
    for switch, val in zip(switches, old_switch_vals):
        switch.set_value(val)

    return output
def _compile_csl_fn_v2(mu):
    """
    p(x) = sum_h p(x|h)p(h) where p(x|h) is independent Bernoulli with a vector mu,
    mu_i for dim_i

    This function is for computing CSL over minibatches (in a single chain).

    Parameters
    ----------
    mu : array_like
        mu is (N,D) numpy array

    Returns
    -------
    theano function
        Function computing the Bernoulli CSL log likelihood.
    """
    # log.debug('building theano fn for Bernoulli CSL')
    x = T.fmatrix('inputs')
    # test value only feeds theano's compute_test_value debugging mode
    x.tag.test_value = as_floatX(numpy.random.uniform(size=(10, 784)))
    # clip away exact 0/1 probabilities so the logs below stay finite
    mu = numpy.clip(mu, 1e-10, (1 - (1e-5)))
    # add a leading broadcast axis: (N, D) -> (1, N, D)
    mu = mu[None, :, :]
    inner_1 = numpy.log(mu)
    inner_2 = numpy.log(1. - mu)
    k = mu.shape[1]
    # there are two terms in the log(p(x|mu)): -log(k) normalizer and the log-sum-exp
    # over the k components of the per-example Bernoulli log-likelihood.
    # fix: removed the unused locals `D = mu.shape[2]` and `debug = c.sum(axis=1)`.
    term_1 = -T.log(k)
    c = T.sum(x.dimshuffle(0, 'x', 1) * inner_1 + (1. - x.dimshuffle(0, 'x', 1)) * inner_2,
              axis=2)
    term_2 = log_sum_exp_theano(c, axis=1)
    log_likelihood = term_1 + term_2
    f = function([x], log_likelihood, name='CSL_independent_bernoulli_fn')
    return f
def generate(self, initial=None, n_steps=None):
    """
    Generate visible inputs from the model for `n_steps`, starting at recurrent hidden
    state `initial`.

    Parameters
    ----------
    initial : tensor, optional
        Recurrent hidden state to start generation from. Defaults to `self.u0.eval()`.
    n_steps : int, optional
        Number of generation steps to do. Defaults to `self.generate_n_steps`.

    Returns
    -------
    tuple(matrix, matrix)
        The generated inputs and the ending recurrent hidden state.
    """
    # compile the generate function! (cached on self so this happens at most once)
    if not hasattr(self, 'f_generate'):
        self.f_generate = function(inputs=[self.generate_u0, self.n_steps],
                                   outputs=[self.v_ts, self.u_t],
                                   updates=self.updates_generate)
    # fix: `initial or self.u0.eval()` raises ValueError for multi-element numpy arrays
    # (ambiguous truth value) and silently discards a legitimate all-zero state;
    # compare against None explicitly instead.
    if initial is None:
        initial = self.u0.eval()
    # fix: `n_steps or ...` would also discard an explicit n_steps=0
    if n_steps is None:
        n_steps = self.generate_n_steps
    return self.f_generate(initial, n_steps)
def build_computation_graph(self):
    """
    Build the GSN training/prediction/sampling graphs and compile the run, noise, and
    sample functions.

    Returns
    -------
    tuple
        (cost, monitors, output, hiddens): the training cost expression, an OrderedDict of
        monitor expressions, the non-noisy reconstruction output, and the ending hidden
        representation from the non-noisy graph.
    """
    #################
    # Build the GSN #
    #################
    log.debug("Building GSN graphs...")
    # GSN for training - with noise specified in initialization
    # if there is no hiddens_hook, build the GSN normally using the input X
    if not self.hiddens_flag:
        p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)
    # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
    # generative from the hiddens
    else:
        p_X_chain, _, = self.build_gsn(hiddens=self.hiddens, add_noise=self.add_noise, reverse=True)

    # GSN for prediction - same as above but no noise
    # deal with hiddens_hook exactly as above.
    if not self.hiddens_flag:
        p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
    else:
        p_X_chain_recon, recon_hiddens = self.build_gsn(hiddens=self.hiddens, add_noise=False, reverse=True)

    ####################
    # Costs and output #
    ####################
    log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
    # use the noisy ones for training cost
    costs = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain]
    # for a monitor to show progress
    self.show_cost = costs[-1]
    # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH
    cost = numpy.sum(costs)

    # use the non-noisy graph for prediction
    gsn_costs_recon = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain_recon]
    # another monitor, same as self.show_cost but on the non-noisy graph.
    self.monitor = gsn_costs_recon[-1]
    # this should be considered the main output of the computation, the sample after the
    # last walkback from the non-noisy graph.
    output = p_X_chain_recon[-1]
    # these should be considered the model's hidden representation - the hidden representation after
    # the last walkback from the non-noisy graph.
    hiddens = recon_hiddens

    # mean squared error monitors on both the noisy (train) and non-noisy graphs
    train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
    train_mse = T.mean(train_mse)

    mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
    mse = T.mean(mse)

    monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                            ('recon_cost', self.monitor),
                            ('mse', mse),
                            ('train_mse', train_mse)])

    ############
    # Sampling #
    ############
    # the input to the sampling function
    X_sample = T.matrix("X_sampling")
    self.network_state_input = [X_sample] + [T.matrix("H_sampling_" + str(i + 1)) for i in range(self.layers)]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates
    self.network_state_output = [X_sample] + self.network_state_input[1:]
    visible_pX_chain = []

    # ONE update
    log.debug("Performing one walkback in network state sampling.")
    self.update_layers(self.network_state_output, visible_pX_chain, add_noise=True, reverse=False)

    #####################################################
    #     Create the run and monitor functions          #
    #####################################################
    log.debug("Compiling functions...")
    t = time.time()

    # doesn't make sense to have this if there is a hiddens_hook
    # NOTE(review): reconstructed from collapsed source - f_run/f_noise compile only when
    # there is no hiddens_hook; f_sample appears to compile unconditionally. Confirm
    # against the original indentation.
    if not self.hiddens_flag:
        # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output from the non-noisy
        # computation graph
        log.debug("f_run...")
        self.f_run = function(inputs=[self.X], outputs=output, name='gsn_f_run')

        # this is a helper function - it corrupts inputs when testing the non-noisy graph (aka before feeding the
        # input to f_run)
        log.debug("f_noise...")
        self.f_noise = function(inputs=[self.X], outputs=self.input_noise(self.X), name='gsn_f_noise')

    # the sampling function, for creating lots of samples from the computational graph. (mostly for log-likelihood
    # or visualization)
    log.debug("f_sample...")
    if self.layers == 1:
        self.f_sample = function(inputs=[X_sample],
                                 outputs=visible_pX_chain[-1],
                                 name='gsn_f_sample_single_layer')
    else:
        # WHY IS THERE A WARNING????
        # because the first odd layers are not used -> directly computed FROM THE EVEN layers
        # unused input = warn
        self.f_sample = function(inputs=self.network_state_input,
                                 outputs=self.network_state_output + visible_pX_chain,
                                 name='gsn_f_sample')

    log.debug("GSN compiling done. Took %s", make_time_units_string(time.time() - t))

    return cost, monitors, output, hiddens
def train(self, monitor_channels=None, train_outservice=None, plot=None, additional_cost=None):
    """
    This method performs the training!!!
    It is an online training method that goes over minibatches from the dataset for a number of epochs,
    updating parameters after each minibatch.

    You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and evaluate
        on the data.
    train_outservice : OutService, optional
        The OutService to use for the automatically created train_cost monitor. Default of None just
        outputs to logs.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).
    additional_cost : theano expression or list(theano expression), optional
        Any additional cost expressions to use during training (things like regularization). These
        will be summed with the existing cost.
    """
    if not self.model:
        log.error("No self.model for the Optimizer!")
        raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                             "was called from the Model. Try initializing the Optimizer with the model param "
                             "and calling optimizer.train().")

    #####################################################
    # handle additional costs (normally regularization) #
    #####################################################
    # Create the gradient updates for the model - make sure to handle the possible
    # list of costs used for pretraining of certain parts of the model.
    train_costs = raise_to_list(self.model.get_train_cost())
    # deal with any other additional costs (like regularization, etc.)
    if additional_cost is not None:
        additional_costs = raise_to_list(additional_cost)
        if len(additional_costs) > 1:
            additional_cost = T.sum(additional_costs)

    #########################
    # gradients and updates #
    #########################
    # one entry per training cost (layer-wise pretraining produces several)
    train_updates = []
    self.gradients = []
    for i, train_cost in enumerate(train_costs):
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        if len(train_costs) > 1 and additional_cost is not None:
            log.warning("additional_cost will double count with gradients during layer-wise pretraining!")
            warnings.warn("additional_cost will double count with gradients during layer-wise pretraining!")
        # TODO: additional_cost will double count with gradients during layer-wise pretraining.
        # Need to somehow make w.r.t. params appropriate for the individual training costs.
        gradients, _ = self.model.get_gradient(cost=train_cost, additional_cost=additional_cost)
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)
        # append to list
        self.gradients.append(gradients)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates
        train_updates.append(updates)

    # grab the model parameters to use during training
    self.params = self.model.get_params()
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    ############
    # monitors #
    ############
    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])

    # finally deal with an outservice provided to monitor training cost
    self.train_outservice = train_outservice
    # remove redundant files made by the fileservice for the train monitor.
    # TODO: THIS FEELS LIKE A HACK. I don't like it.
    if isinstance(self.train_outservice, FileService):
        os.remove(self.train_outservice.valid_filename)
        os.remove(self.train_outservice.test_filename)

    #######################################
    # compile train and monitor functions #
    #######################################
    function_input = raise_to_list(self.model.get_inputs()) + raise_to_list(self.model.get_targets())
    train_functions = []
    for i, (updates, train_cost) in enumerate(zip(train_updates, train_costs)):
        # Compile the training function!
        log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_updates),
                 str(type(self.model)))
        t = time.time()
        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[train_cost] + list(self.train_monitors_dict.values()),
                           name='f_learn_%d' % i)
        log.info('f_learn %d compilation took %s', i + 1, make_time_units_string(time.time() - t))
        train_functions.append(f_learn)

    # figure out if we want valid and test (monitors)
    self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None

    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
    # this list of training functions was created during __init__()
    start_time = time.time()
    for func_i, train_function in enumerate(train_functions):
        log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS-----------",
                 str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch)

        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()
        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        # epoch loop; _perform_one_epoch returns True when training should stop
        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(train_function, plot)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter))

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
def _build_computation_graph(self):
    """
    Build the AlexNet computation graph: five convolutional-pooling layers, two dropout
    fully-connected layers, and a softmax classification layer; then compile `self.f_run`
    for prediction.
    """
    ###################### BUILD NETWORK ##########################
    # whether or not to mirror the input images before feeding them into the network
    if self.flag_datalayer:
        layer_1_input = mirror_images(input=self.x,
                                      image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                                      cropsize=227,
                                      rand=self.rand,
                                      flag_rand=self.rand_crop)
    else:
        layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

    # Start with 5 convolutional pooling layers
    log.debug("convpool layer 1...")
    convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227, 227), layer_1_input),
                                    filter_shape=(96, 3, 11, 11),
                                    convstride=4,
                                    padsize=0,
                                    group=1,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.0,
                                    local_response_normalization=True)
    # Add this layer's parameters!
    self.params += convpool_layer1.get_params()

    log.debug("convpool layer 2...")
    convpool_layer2 = ConvPoolLayer(inputs_hook=((self.batch_size, 96, 27, 27, ), convpool_layer1.get_outputs()),
                                    filter_shape=(256, 96, 5, 5),
                                    convstride=1,
                                    padsize=2,
                                    group=2,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.1,
                                    local_response_normalization=True)
    # Add this layer's parameters!
    self.params += convpool_layer2.get_params()

    log.debug("convpool layer 3...")
    convpool_layer3 = ConvPoolLayer(inputs_hook=((self.batch_size, 256, 13, 13), convpool_layer2.get_outputs()),
                                    filter_shape=(384, 256, 3, 3),
                                    convstride=1,
                                    padsize=1,
                                    group=1,
                                    poolsize=1,
                                    poolstride=0,
                                    bias_init=0.0,
                                    local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer3.get_params()

    log.debug("convpool layer 4...")
    convpool_layer4 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer3.get_outputs()),
                                    filter_shape=(384, 384, 3, 3),
                                    convstride=1,
                                    padsize=1,
                                    group=2,
                                    poolsize=1,
                                    poolstride=0,
                                    bias_init=0.1,
                                    local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer4.get_params()

    log.debug("convpool layer 5...")
    convpool_layer5 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer4.get_outputs()),
                                    filter_shape=(256, 384, 3, 3),
                                    convstride=1,
                                    padsize=1,
                                    group=2,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.0,
                                    local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer5.get_params()

    # Now onto the fully-connected layers!
    fc_config = {
        'activation': 'rectifier',  # type of activation function to use for output
        'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
        'weights_mean': 0.0,  # mean for gaussian weights init
        'weights_std': 0.005,  # standard deviation for gaussian weights init
        'bias_init': 0.0  # how to initialize the bias parameter
    }
    log.debug("fully connected layer 1 (model layer 6)...")
    # we want to have dropout applied to the training version, but not the test version.
    fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
    fc_layer6 = Dense(inputs_hook=(9216, fc_layer6_input),
                      output_size=4096,
                      noise='dropout',
                      noise_level=0.5,
                      **fc_config)
    # Add this layer's parameters!
    self.params += fc_layer6.get_params()
    # Add the dropout noise switch
    self.noise_switches += fc_layer6.get_noise_switch()

    log.debug("fully connected layer 2 (model layer 7)...")
    fc_layer7 = Dense(inputs_hook=(4096, fc_layer6.get_outputs()),
                      output_size=4096,
                      noise='dropout',
                      noise_level=0.5,
                      **fc_config)
    # Add this layer's parameters!
    self.params += fc_layer7.get_params()
    # Add the dropout noise switch
    self.noise_switches += fc_layer7.get_noise_switch()

    # last layer is a softmax prediction output layer
    softmax_config = {
        'weights_init': 'gaussian',
        'weights_mean': 0.0,
        'weights_std': 0.005,
        'bias_init': 0.0
    }
    log.debug("softmax classification layer (model layer 8)...")
    softmax_layer8 = SoftmaxLayer(inputs_hook=(4096, fc_layer7.get_outputs()),
                                  output_size=1000,
                                  **softmax_config)
    # Add this layer's parameters!
    self.params += softmax_layer8.get_params()

    # finally the softmax output from the whole thing!
    self.output = softmax_layer8.get_outputs()
    self.targets = softmax_layer8.get_targets()

    #####################
    # Cost and monitors #
    #####################
    self.train_cost = softmax_layer8.negative_log_likelihood()
    cost = softmax_layer8.negative_log_likelihood()
    errors = softmax_layer8.errors()
    train_errors = softmax_layer8.errors()
    # NOTE(review): 'errors' and 'dropout_errors' are the same expression here; presumably
    # 'dropout_errors' was meant to be computed with the noise switches on - confirm.
    self.monitors = OrderedDict([('cost', cost), ('errors', errors), ('dropout_errors', train_errors)])

    #########################
    # Compile the functions #
    #########################
    log.debug("Compiling functions!")
    t = time.time()
    log.debug("f_run...")
    # use the actual argmax from the classification
    self.f_run = function(inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
    log.debug("compilation took %s", make_time_units_string(time.time() - t))
def train(self, monitor_channels=None, plot=None):
    """
    This method performs the training!!!
    It is an online training method that goes over minibatches from the dataset for a number of epochs,
    updating parameters after each minibatch.

    You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and evaluate
        on the data.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).
    """
    if not self.model:
        log.error("No self.model for the Optimizer!")
        raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                             "was called from the Model. Try initializing the Optimizer with the model param "
                             "and calling optimizer.train().")

    #########################
    # gradients and updates #
    #########################
    # grab the model parameters to use during training
    self.params = self.model.get_params()
    # Now create the training cost function for the model to use while training - update parameters
    # gradient!
    # First find the basic variables that will be updated
    params = set()
    for param in self.params.values():
        params.update(base_variables(param))
    params = list(params)
    gradients = grad(cost=self.loss_expression, wrt=params)
    # now create the dictionary mapping the parameter with its gradient
    gradients = OrderedDict(
        [(param, g) for param, g in zip(params, gradients)]
    )
    # clip gradients if we want.
    gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

    # Calculate the optimizer updates each run
    # This is where the magic happens for a lot of sub-implementations of SGD!
    # It tells how to update the params each training epoch
    gradient_updates = self.get_updates(gradients)

    # Combine the updates from the model also if applicable
    updates = self.model.get_updates()
    if updates:
        updates.update(gradient_updates)
    else:
        updates = gradient_updates

    log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

    ############
    # monitors #
    ############
    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])

    #######################################
    # compile train and monitor functions #
    #######################################
    function_input = raise_to_list(self.model.get_inputs())
    if self.loss_targets is not None:
        # NOTE(review): += mutates the list returned by raise_to_list in place - fine if it
        # is a fresh list; confirm raise_to_list does not return a shared reference.
        function_input += self.loss_targets
    # Compile the training function!
    log.info('Compiling f_learn function for model %s...', self.model._classname)
    t = time.time()
    f_learn = function(inputs=function_input,
                       updates=updates,
                       outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                       name='f_learn')
    log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

    # figure out if we want valid and test (monitors)
    self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None

    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    log.info("-----------TRAINING %s FOR %d EPOCHS-----------", self.model._classname, self.n_epoch)
    self.STOP = False
    self.epoch_counter = 0
    # reset any decay params
    for decay_param in self.get_decay_params():
        decay_param.reset()
    self.times = []
    self.best_cost = numpy.inf
    self.best_params = None
    self.patience = 0

    t = time.time()

    # epoch loop; _perform_one_epoch returns True when training should stop
    while not self.STOP:
        try:
            self.STOP = self._perform_one_epoch(f_learn, plot)
        except KeyboardInterrupt:
            log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
            self.STOP = True

    # save params
    if self.best_params is not None:
        log.debug("Restoring best model parameters...")
        self.model.set_param_values(self.best_params, borrow=False)
    log.debug("Saving model parameters...")
    self.model.save_params('trained_epoch_' + str(self.epoch_counter))

    log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
def _build_computation_graph(self):
    """
    Build the full feed-forward classification graph: five convolutional/pooling
    layers followed by two dropout fully-connected layers and a 1000-way softmax
    output (an AlexNet-style 8-layer network, judging by the filter shapes).

    Side effects on `self`:
      - appends every layer's parameters to `self.params`
      - appends the dropout switches of layers 6/7 to `self.noise_switches`
      - sets `self.output` (softmax probabilities) and `self.targets`
      - sets `self.train_cost` and `self.monitors`
      - compiles and sets `self.f_run` (argmax class prediction from `self.x`)
    """
    ###################### BUILD NETWORK ##########################
    # whether or not to mirror the input images before feeding them into the network
    if self.flag_datalayer:
        layer_1_input = mirror_images(input=self.x,
                                      image_shape=(self.batch_size, 3, 256, 256),  # bc01 format
                                      cropsize=227,
                                      rand=self.rand,
                                      flag_rand=self.rand_crop)
    else:
        layer_1_input = self.x  # 4D tensor (going to be in bc01 format)

    # Start with 5 convolutional pooling layers
    log.debug("convpool layer 1...")
    convpool_layer1 = ConvPoolLayer(inputs_hook=((self.batch_size, 3, 227, 227), layer_1_input),
                                    filter_shape=(96, 3, 11, 11),
                                    convstride=4,
                                    padsize=0,
                                    group=1,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.0,
                                    local_response_normalization=True)
    # Add this layer's parameters!
    self.params += convpool_layer1.get_params()

    log.debug("convpool layer 2...")
    convpool_layer2 = ConvPoolLayer(inputs_hook=((self.batch_size, 96, 27, 27, ), convpool_layer1.get_outputs()),
                                    filter_shape=(256, 96, 5, 5),
                                    convstride=1,
                                    padsize=2,
                                    group=2,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.1,
                                    local_response_normalization=True)
    # Add this layer's parameters!
    self.params += convpool_layer2.get_params()

    log.debug("convpool layer 3...")
    convpool_layer3 = ConvPoolLayer(inputs_hook=((self.batch_size, 256, 13, 13), convpool_layer2.get_outputs()),
                                    filter_shape=(384, 256, 3, 3),
                                    convstride=1,
                                    padsize=1,
                                    group=1,
                                    poolsize=1,
                                    poolstride=0,
                                    bias_init=0.0,
                                    local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer3.get_params()

    log.debug("convpool layer 4...")
    convpool_layer4 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer3.get_outputs()),
                                    filter_shape=(384, 384, 3, 3),
                                    convstride=1,
                                    padsize=1,
                                    group=2,
                                    poolsize=1,
                                    poolstride=0,
                                    bias_init=0.1,
                                    local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer4.get_params()

    log.debug("convpool layer 5...")
    convpool_layer5 = ConvPoolLayer(inputs_hook=((self.batch_size, 384, 13, 13), convpool_layer4.get_outputs()),
                                    filter_shape=(256, 384, 3, 3),
                                    convstride=1,
                                    padsize=1,
                                    group=2,
                                    poolsize=3,
                                    poolstride=2,
                                    bias_init=0.0,
                                    local_response_normalization=False)
    # Add this layer's parameters!
    self.params += convpool_layer5.get_params()

    # Now onto the fully-connected layers!
    fc_config = {
        'activation': 'rectifier',  # type of activation function to use for output
        'weights_init': 'gaussian',  # either 'gaussian' or 'uniform' - how to initialize weights
        'weights_mean': 0.0,  # mean for gaussian weights init
        'weights_std': 0.005,  # standard deviation for gaussian weights init
        'bias_init': 0.0  # how to initialize the bias parameter
    }
    log.debug("fully connected layer 1 (model layer 6)...")
    # we want to have dropout applied to the training version, but not the test version.
    # flatten the conv output to 2D (batch, features) before the dense layer.
    fc_layer6_input = T.flatten(convpool_layer5.get_outputs(), 2)
    fc_layer6 = Dense(inputs_hook=(9216, fc_layer6_input),
                      output_size=4096,
                      noise='dropout',
                      noise_level=0.5,
                      **fc_config)
    # Add this layer's parameters!
    self.params += fc_layer6.get_params()
    # Add the dropout noise switch
    self.noise_switches += fc_layer6.get_switches()

    log.debug("fully connected layer 2 (model layer 7)...")
    fc_layer7 = Dense(inputs_hook=(4096, fc_layer6.get_outputs()),
                      output_size=4096,
                      noise='dropout',
                      noise_level=0.5,
                      **fc_config)
    # Add this layer's parameters!
    self.params += fc_layer7.get_params()
    # Add the dropout noise switch
    self.noise_switches += fc_layer7.get_switches()

    # last layer is a softmax prediction output layer
    softmax_config = {
        'weights_init': 'gaussian',
        'weights_mean': 0.0,
        'weights_std': 0.005,
        'bias_init': 0.0
    }
    log.debug("softmax classification layer (model layer 8)...")
    softmax_layer8 = SoftmaxLayer(inputs_hook=(4096, fc_layer7.get_outputs()),
                                  output_size=1000,
                                  **softmax_config)
    # Add this layer's parameters!
    self.params += softmax_layer8.get_params()

    # finally the softmax output from the whole thing!
    self.output = softmax_layer8.get_outputs()
    self.targets = softmax_layer8.get_targets()

    #####################
    # Cost and monitors #
    #####################
    self.train_cost = softmax_layer8.negative_log_likelihood()
    cost = softmax_layer8.negative_log_likelihood()
    errors = softmax_layer8.errors()
    train_errors = softmax_layer8.errors()

    # NOTE(review): 'errors' and 'dropout_errors' are built from identical
    # `softmax_layer8.errors()` expressions here — presumably 'dropout_errors'
    # was meant to be evaluated with the dropout noise switches ON; confirm
    # whether the switch flipping happens elsewhere (e.g. in the monitor runner).
    self.monitors = OrderedDict([('cost', cost), ('errors', errors), ('dropout_errors', train_errors)])

    #########################
    # Compile the functions #
    #########################
    log.debug("Compiling functions!")
    t = time.time()
    log.debug("f_run...")
    # use the actual argmax from the classification
    self.f_run = function(inputs=[self.x], outputs=softmax_layer8.get_argmax_prediction())
    log.debug("compilation took %s", make_time_units_string(time.time() - t))
def train(self, monitor_channels=None, train_outservice=None, plot=None, additional_cost=None):
    """
    This method performs the training!!! It is an online training method that goes over minibatches from the
    dataset for a number of epochs, updating parameters after each minibatch.

    One `f_learn` function is compiled per training cost returned by
    `self.model.get_train_cost()` (a list supports layer-wise pretraining),
    and each is trained in sequence for up to `self.n_epoch` epochs.

    You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and evaluate
        on the data.
    train_outservice : OutService, optional
        The OutService to use for the automatically created train_cost monitor. Default of None just outputs
        to logs.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).
    additional_cost : theano expression or list(theano expression), optional
        Any additional cost expressions to use during training (things like regularization). These will be
        summed with the existing cost.

    Raises
    ------
    AssertionError
        If no model was attached to this Optimizer.
    """
    if not self.model:
        log.error("No self.model for the Optimizer!")
        raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                             "was called from the Model. Try initializing the Optimizer with the model param "
                             "and calling optimizer.train().")

    #####################################################
    # handle additional costs (normally regularization) #
    #####################################################
    # Create the gradient updates for the model - make sure to handle the possible
    # list of costs used for pretraining of certain parts of the model.
    train_costs = raise_to_list(self.model.get_train_cost())
    # deal with any other additional costs (like regularization, etc.)
    if additional_cost is not None:
        additional_costs = raise_to_list(additional_cost)
        if len(additional_costs) > 1:
            # collapse multiple additional costs into a single summed expression
            additional_cost = T.sum(additional_costs)

    #########################
    # gradients and updates #
    #########################
    train_updates = []
    self.gradients = []
    for i, train_cost in enumerate(train_costs):
        # Now create the training cost function for the model to use while training - update parameters
        # gradient!
        if len(train_costs) > 1 and additional_cost is not None:
            log.warning("additional_cost will double count with gradients during layer-wise pretraining!")
            warnings.warn("additional_cost will double count with gradients during layer-wise pretraining!")
        # TODO: additional_cost will double count with gradients during layer-wise pretraining.
        # Need to somehow make w.r.t. params appropriate for the individual training costs.
        gradients, _ = self.model.get_gradient(cost=train_cost, additional_cost=additional_cost)
        # clip gradients if we want.
        gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)
        # append to list
        self.gradients.append(gradients)

        # Calculate the optimizer updates each run
        # This is where the magic happens for a lot of sub-implementations of SGD!
        # It tells how to update the params each training epoch
        gradient_updates = self.get_updates(gradients)

        # Combine the updates from the model also if applicable
        # (model updates take precedence as the base dict; gradient updates are merged in).
        updates = self.model.get_updates()
        if updates:
            updates.update(gradient_updates)
        else:
            updates = gradient_updates
        train_updates.append(updates)

    # grab the model parameters to use during training
    self.params = self.model.get_params()
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    ############
    # monitors #
    ############
    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])
    # finally deal with an outservice provided to monitor training cost
    self.train_outservice = train_outservice
    # remove redundant files made by the fileservice for the train monitor.
    # TODO: THIS FEELS LIKE A HACK. I don't like it.
    if isinstance(self.train_outservice, FileService):
        os.remove(self.train_outservice.valid_filename)
        os.remove(self.train_outservice.test_filename)

    #######################################
    # compile train and monitor functions #
    #######################################
    function_input = raise_to_list(self.model.get_inputs()) + raise_to_list(self.model.get_targets())
    train_functions = []
    for i, (updates, train_cost) in enumerate(zip(train_updates, train_costs)):
        # Compile the training function!
        log.info('Compiling f_learn %d/%d function for model %s...', i + 1, len(train_updates),
                 str(type(self.model)))
        t = time.time()
        f_learn = function(inputs=function_input,
                           updates=updates,
                           outputs=[train_cost] + list(self.train_monitors_dict.values()),
                           name='f_learn_%d' % i)
        log.info('f_learn %d compilation took %s', i + 1, make_time_units_string(time.time() - t))
        train_functions.append(f_learn)

    # figure out if we want valid and test (monitors)
    # - only when the dataset has the split AND there are monitors for it.
    self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None

    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    # make sure to deal with a list of train_cost functions - for layer-wise pretraining!
    # this list of training functions was created during __init__()
    start_time = time.time()
    for func_i, train_function in enumerate(train_functions):
        log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS-----------",
                 str(type(self.model)), func_i + 1, len(train_functions), self.n_epoch)

        # reset per-function training state before each pretraining stage
        self.STOP = False
        self.epoch_counter = 0
        # reset any decay params
        for decay_param in self.get_decay_params():
            decay_param.reset()
        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        t = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(train_function, plot)
            except KeyboardInterrupt:
                # allow graceful manual early-stopping
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # save params
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
def train(self, monitor_channels=None, train_outservice=None, plot=None):
    """
    This method performs the training!!! It is an online training method that goes over minibatches from the
    dataset for a number of epochs, updating parameters after each minibatch.

    Unlike the multi-cost variant, this trains a single `f_learn` function built
    directly from `self.loss_expression` w.r.t. `self.params`.

    You can disrupt training with a KeyBoardInterrupt and it should exit/save parameters gracefully.

    Parameters
    ----------
    monitor_channels : list(MonitorsChannel or Monitor), optional
        The list of channels or monitors containing monitor expressions/variables to compile and evaluate
        on the data.
    train_outservice : OutService, optional
        The OutService to use for the automatically created train_cost monitor. Default of None just outputs
        to logs.
    plot : Plot, optional
        The Plot object to use if we want to graph the outputs (uses bokeh server).

    Raises
    ------
    AssertionError
        If no model was attached to this Optimizer.
    """
    if not self.model:
        log.error("No self.model for the Optimizer!")
        raise AssertionError("Needs to be initialized with a Model! (Or something went wrong if train() "
                             "was called from the Model. Try initializing the Optimizer with the model param "
                             "and calling optimizer.train().")

    #########################
    # gradients and updates #
    #########################
    # grab the model parameters to use during training
    self.params = self.model.get_params()
    # Now create the training cost function for the model to use while training - update parameters
    # gradient!
    gradients = grad(cost=self.loss_expression, wrt=list(self.params.values()))
    # now create the dictionary mapping the parameter with its gradient
    gradients = OrderedDict(
        [(param, g) for param, g in zip(list(self.params.values()), gradients)]
    )
    # clip gradients if we want.
    gradients = clip_gradients(gradients, self.grad_clip, self.hard_clip)

    # Calculate the optimizer updates each run
    # This is where the magic happens for a lot of sub-implementations of SGD!
    # It tells how to update the params each training epoch
    gradient_updates = self.get_updates(gradients)

    # Combine the updates from the model also if applicable
    # (model updates take precedence as the base dict; gradient updates are merged in).
    updates = self.model.get_updates()
    if updates:
        updates.update(gradient_updates)
    else:
        updates = gradient_updates

    log.info("%s params: %s", self.model._classname, str(list(self.params.keys())))

    ############
    # monitors #
    ############
    # deal with the monitor channels if they were given (or take them from the plot)
    if monitor_channels is None and plot is not None and len(plot.channels) > 0:
        monitor_channels = plot.channels
    self.train_monitors_dict = {}
    self.valid_monitors_dict = {}
    self.test_monitors_dict = {}
    self.train_monitors_outservice_dict = {}
    self.valid_monitors_outservice_dict = {}
    self.test_monitors_outservice_dict = {}
    if monitor_channels:
        # collapse the appropriate monitors into their (name, expression, out_service) tuples
        train_collapsed = collapse_channels(monitor_channels, train=True)
        valid_collapsed = collapse_channels(monitor_channels, valid=True)
        test_collapsed = collapse_channels(monitor_channels, test=True)
        # get name: expression dictionary
        self.train_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in train_collapsed])
        self.valid_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in valid_collapsed])
        self.test_monitors_dict = OrderedDict([(name, expression) for name, expression, _ in test_collapsed])
        # get name: outservice dictionary
        self.train_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in train_collapsed])
        self.valid_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in valid_collapsed])
        self.test_monitors_outservice_dict = OrderedDict([(name, out) for name, _, out in test_collapsed])
    # finally deal with an outservice provided to monitor training cost
    self.train_outservice = train_outservice
    # remove redundant files made by the fileservice for the train monitor.
    # TODO: THIS FEELS LIKE A HACK. I don't like it.
    if isinstance(self.train_outservice, FileService):
        os.remove(self.train_outservice.valid_filename)
        os.remove(self.train_outservice.test_filename)

    #######################################
    # compile train and monitor functions #
    #######################################
    # supervised losses also need the target variables as function inputs.
    function_input = raise_to_list(self.model.get_inputs())
    if self.loss_targets is not None:
        function_input += self.loss_targets
    # Compile the training function!
    log.info('Compiling f_learn function for model %s...', self.model._classname)
    t = time.time()
    f_learn = function(inputs=function_input,
                       updates=updates,
                       outputs=[self.loss_expression] + list(self.train_monitors_dict.values()),
                       name='f_learn')
    log.info('f_learn compilation took %s', make_time_units_string(time.time() - t))

    # figure out if we want valid and test (monitors)
    # - only when the dataset has the split AND there are monitors for it.
    self.valid_flag = (self.dataset.valid_inputs is not None) and (len(self.valid_monitors_dict) > 0)
    self.test_flag = (self.dataset.test_inputs is not None) and (len(self.test_monitors_dict) > 0)
    # Now compile the monitor functions!
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    # valid monitors
    if self.valid_flag:
        self.valid_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.valid_monitors_dict.values()),
            name='valid_monitor_function'
        )
    else:
        self.valid_monitor_function = None

    # test monitors
    if self.test_flag:
        self.test_monitor_function = function(
            inputs=function_input,
            updates=self.model.get_updates(),
            outputs=list(self.test_monitors_dict.values()),
            name='test_monitor_function'
        )
    else:
        self.test_monitor_function = None

    log.debug("Compilation done. Took %s", make_time_units_string(time.time() - monitor_t))

    ##################
    # start training #
    ##################
    log.info("-----------TRAINING %s FOR %d EPOCHS-----------", self.model._classname, self.n_epoch)

    # reset training state before the epoch loop
    self.STOP = False
    self.epoch_counter = 0
    # reset any decay params
    for decay_param in self.get_decay_params():
        decay_param.reset()
    self.times = []
    self.best_cost = numpy.inf
    self.best_params = None
    self.patience = 0

    t = time.time()

    while not self.STOP:
        try:
            self.STOP = self._perform_one_epoch(f_learn, plot)
        except KeyboardInterrupt:
            # allow graceful manual early-stopping
            log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
            self.STOP = True

    # save params
    if self.best_params is not None:
        log.debug("Restoring best model parameters...")
        for best_param, param_value in self.best_params.items():
            self.params[best_param].set_value(param_value, borrow=False)
    log.debug("Saving model parameters...")
    self.model.save_params('trained_epoch_' + str(self.epoch_counter))

    log.info("------------TRAIN TIME TOOK %s---------", make_time_units_string(time.time() - t))
def build_computation_graph(self):
    """
    Build the Generative Stochastic Network (GSN) computation graphs: a noisy
    graph for training and a noise-free graph for prediction, plus a one-step
    sampling graph. Compiles `self.f_run`, `self.f_noise` (only without a
    hiddens hook) and `self.f_sample`.

    Returns
    -------
    tuple(cost, monitors, output, hiddens)
        cost : theano expression
            The training cost - the sum of reconstruction costs over every
            walkback step of the noisy graph.
        monitors : OrderedDict
            Monitor expressions: noisy_recon_cost, recon_cost, mse, train_mse.
        output : theano expression
            The reconstruction after the last walkback of the non-noisy graph.
        hiddens : theano expression(s)
            The hidden representation after the last walkback of the
            non-noisy graph.
    """
    #################
    # Build the GSN #
    #################
    log.debug("Building GSN graphs...")
    # GSN for training - with noise specified in initialization
    # if there is no hiddens_hook, build the GSN normally using the input X
    if not self.hiddens_flag:
        p_X_chain, _ = self.build_gsn(add_noise=self.add_noise)
    # if there is a hiddens_hook, we want to change the order layers are updated and make this purely
    # generative from the hiddens
    else:
        p_X_chain, _, = self.build_gsn(hiddens=self.hiddens, add_noise=self.add_noise, reverse=True)

    # GSN for prediction - same as above but no noise
    # deal with hiddens_hook exactly as above.
    if not self.hiddens_flag:
        p_X_chain_recon, recon_hiddens = self.build_gsn(add_noise=False)
    else:
        p_X_chain_recon, recon_hiddens = self.build_gsn(hiddens=self.hiddens, add_noise=False, reverse=True)

    ####################
    # Costs and output #
    ####################
    log.debug('Cost w.r.t p(X|...) at every step in the graph for the GSN')
    # use the noisy ones for training cost
    costs = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain]
    self.show_cost = costs[-1]  # for a monitor to show progress
    cost = numpy.sum(costs)  # THIS IS THE TRAINING COST - RECONSTRUCTION OF OUTPUT FROM NOISY GRAPH

    # use the non-noisy graph for prediction
    gsn_costs_recon = [self.cost_function(output=rX, target=self.X, **self.cost_args) for rX in p_X_chain_recon]
    # another monitor, same as self.show_cost but on the non-noisy graph.
    self.monitor = gsn_costs_recon[-1]
    # this should be considered the main output of the computation, the sample after the
    # last walkback from the non-noisy graph.
    output = p_X_chain_recon[-1]
    # these should be considered the model's hidden representation - the hidden representation after
    # the last walkback from the non-noisy graph.
    hiddens = recon_hiddens

    # mean-squared-error monitors on the final reconstruction of each graph
    train_mse = T.mean(T.sqr(p_X_chain[-1] - self.X), axis=0)
    train_mse = T.mean(train_mse)

    mse = T.mean(T.sqr(p_X_chain_recon[-1] - self.X), axis=0)
    mse = T.mean(mse)

    monitors = OrderedDict([('noisy_recon_cost', self.show_cost),
                            ('recon_cost', self.monitor),
                            ('mse', mse),
                            ('train_mse', train_mse)])

    ############
    # Sampling #
    ############
    # the input to the sampling function
    X_sample = T.matrix("X_sampling")
    self.network_state_input = [X_sample] + [T.matrix("H_sampling_"+str(i+1)) for i in range(self.layers)]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates
    self.network_state_output = [X_sample] + self.network_state_input[1:]
    visible_pX_chain = []

    # ONE update
    log.debug("Performing one walkback in network state sampling.")
    self.update_layers(self.network_state_output, visible_pX_chain, add_noise=True, reverse=False)

    #####################################################
    #     Create the run and monitor functions          #
    #####################################################
    log.debug("Compiling functions...")
    t = time.time()

    # doesn't make sense to have this if there is a hiddens_hook
    if not self.hiddens_flag:
        # THIS IS THE MAIN PREDICT FUNCTION - takes in a real matrix and produces the output from the non-noisy
        # computation graph
        log.debug("f_run...")
        self.f_run = function(inputs = [self.X],
                              outputs = output,
                              name = 'gsn_f_run')

        # this is a helper function - it corrupts inputs when testing the non-noisy graph (aka before feeding the
        # input to f_run)
        log.debug("f_noise...")
        self.f_noise = function(inputs = [self.X],
                                outputs = self.input_noise(self.X),
                                name = 'gsn_f_noise')

    # the sampling function, for creating lots of samples from the computational graph. (mostly for log-likelihood
    # or visualization)
    log.debug("f_sample...")
    if self.layers == 1:
        # single layer: sampling only needs the visible state
        self.f_sample = function(inputs = [X_sample],
                                 outputs = visible_pX_chain[-1],
                                 name = 'gsn_f_sample_single_layer')
    else:
        # WHY IS THERE A WARNING????
        # because the first odd layers are not used -> directly computed FROM THE EVEN layers
        # unused input = warn
        self.f_sample = function(inputs = self.network_state_input,
                                 outputs = self.network_state_output + visible_pX_chain,
                                 name = 'gsn_f_sample')

    log.debug("GSN compiling done. Took %s", make_time_units_string(time.time() - t))

    return cost, monitors, output, hiddens