def gen_10k_samples(self):
    for i, x in enumerate(self.test_X):
        log.maybeLog(self.logger, 'Generating 10,000 samples {0!s}/{1!s}'.format(i, len(self.test_X)))
        # 10000 steps to match the 10k samples the log message advertises
        # (the original passed 1000 here, which looks like a typo)
        samples, _ = self.sample(x.get_value()[1:2], 10000, 1)
        f_samples = 'samples_test{0!s}.npy'.format(i)
        numpy.save(f_samples, samples)
        log.maybeLog(self.logger, 'saved digits')
def recurrent_step(x_t, u_tm1, add_noise):
    # Make the current guess for the hiddens based on U
    for i in range(self.gsn_layers):
        if i % 2 == 0:
            log.maybeLog(self.logger, "Using {0!s} and {1!s}".format(
                self.recurrent_to_gsn_weights_list[(i + 1) / 2], self.bias_list[i + 1]))
    h_t = T.concatenate([self.hidden_activation(self.bias_list[i + 1] +
                                                T.dot(u_tm1, self.recurrent_to_gsn_weights_list[(i + 1) / 2]))
                         for i in range(self.gsn_layers) if i % 2 == 0],
                        axis=0)

    # Make a GSN to update U
    _, hs = generative_stochastic_network.build_gsn(
        x_t, self.weights_list, self.bias_list, add_noise,
        self.noiseless_h1, self.hidden_add_noise_sigma,
        self.input_salt_and_pepper, self.input_sampling, self.MRG,
        self.visible_activation, self.hidden_activation,
        self.walkbacks, self.logger)
    htop_t = hs[-1]
    ins_t = htop_t

    ua_t = T.dot(ins_t, self.W_ins_u) + T.dot(u_tm1, self.W_u_u) + self.recurrent_bias
    u_t = self.recurrent_hidden_activation(ua_t)
    return [ua_t, u_t, h_t]
def build_gsn_given_hiddens(X, hiddens, weights_list, bias_list,
                            add_noise=defaults["add_noise"],
                            noiseless_h1=defaults["noiseless_h1"],
                            hidden_add_noise_sigma=defaults["hidden_add_noise_sigma"],
                            input_salt_and_pepper=defaults["input_salt_and_pepper"],
                            input_sampling=defaults["input_sampling"],
                            MRG=defaults["MRG"],
                            visible_activation=defaults["visible_activation"],
                            hidden_activation=defaults["hidden_activation"],
                            walkbacks=defaults["walkbacks"],
                            cost_function=defaults["cost_function"],
                            logger=None):
    log.maybeLog(logger, ["Building the GSN graph given hiddens with", walkbacks, "walkbacks"])
    p_X_chain = []
    for i in range(walkbacks):
        log.maybeLog(logger, "GSN (prediction) Walkback {!s}/{!s}".format(i + 1, walkbacks))
        update_layers_reverse(hiddens, weights_list, bias_list, p_X_chain,
                              add_noise, noiseless_h1, hidden_add_noise_sigma,
                              input_salt_and_pepper, input_sampling, MRG,
                              visible_activation, hidden_activation, logger)

    x_sample = p_X_chain[-1]

    costs = [cost_function(rX, X) for rX in p_X_chain]
    show_cost = costs[-1]  # for logging to show progress
    cost = numpy.sum(costs)

    return x_sample, cost, show_cost
def test(self, test_X=None):
    log.maybeLog(self.logger, "\nTesting---------\n")
    if test_X is None:
        log.maybeLog(self.logger, "Testing using data given during initialization of GSN.\n")
        test_X = self.test_X
        if test_X is None:
            log.maybeLog(self.logger, "\nPlease provide a test dataset!\n")
            raise AssertionError("Please provide a test dataset")
    else:
        log.maybeLog(self.logger, "Testing using data provided to test function.\n")

    ###########
    # TESTING #
    ###########
    n_examples = 100
    tests = test_X.get_value()[0:n_examples]
    noisy_tests = self.f_noise(test_X.get_value()[0:n_examples])
    cost, reconstructed = self.f_recon(noisy_tests)

    # Concatenate stuff if it is an image
    if self.is_image:
        stacked = numpy.vstack(
            [numpy.vstack([tests[i * 10:(i + 1) * 10],
                           noisy_tests[i * 10:(i + 1) * 10],
                           reconstructed[i * 10:(i + 1) * 10]])
             for i in range(10)])
        number_reconstruction = PIL.Image.fromarray(
            tile_raster_images(stacked, (self.image_height, self.image_width), (10, 30)))
        number_reconstruction.save(self.outdir + 'gsn_image_reconstruction_test.png')
    # Otherwise, save the reconstructed numpy array as csv
    else:
        numpy.savetxt(self.outdir + 'gsn_reconstruction_test.csv', reconstructed, delimiter=",")

    log.maybeLog(self.logger, "----------------\n\nAverage test cost is " + str(cost) + "\n\n-----------------")
def save_params(self, name, n, params):
    log.maybeLog(self.logger, 'saving parameters...')
    save_path = self.outdir + name + '_params_epoch_' + str(n) + '.pkl'
    f = open(save_path, 'wb')
    try:
        cPickle.dump(params, f, protocol=cPickle.HIGHEST_PROTOCOL)
    finally:
        f.close()
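# For reference, a minimal sketch of the matching load step. This is not part of
# the original class: it assumes the pickle was written by save_params above and
# that self.params holds the same shared variables in the same order.
def load_params_sketch(self, filename):
    f = open(filename, 'rb')  # binary mode, matching the 'wb' used when saving
    try:
        loaded = cPickle.load(f)
    finally:
        f.close()
    # restore each shared variable's value from the pickled copy
    for param, loaded_param in zip(self.params, loaded):
        param.set_value(loaded_param.get_value(borrow=False))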
def plot_samples(self, epoch_number="", leading_text="", n_samples=400):
    to_sample = time.time()
    initial = self.test_X.get_value()[:1]
    V = self.sample(initial, n_samples)
    img_samples = PIL.Image.fromarray(
        tile_raster_images(V, (self.root_N_input, self.root_N_input),
                           (ceil(sqrt(n_samples)), ceil(sqrt(n_samples)))))

    fname = self.outdir + leading_text + 'samples_epoch_' + str(epoch_number) + '.png'
    img_samples.save(fname)
    # str() around n_samples avoids a TypeError from concatenating int to str
    log.maybeLog(self.logger, 'Took ' + str(time.time() - to_sample) + ' to sample ' + str(n_samples) + ' numbers')
def update_layers_scan_step(hiddens_t, weights_list, bias_list,
                            add_noise=defaults["add_noise"],
                            noiseless_h1=defaults["noiseless_h1"],
                            hidden_add_noise_sigma=defaults["hidden_add_noise_sigma"],
                            input_salt_and_pepper=defaults["input_salt_and_pepper"],
                            input_sampling=defaults["input_sampling"],
                            MRG=defaults["MRG"],
                            visible_activation=defaults["visible_activation"],
                            hidden_activation=defaults["hidden_activation"],
                            logger=None):
    p_X_chain = []
    log.maybeLog(logger, "One full update step for layers.")
    # One update over the odd layers + one update over the even layers
    log.maybeLog(logger, 'odd layer updates')
    # update the odd layers
    update_odd_layers(hiddens_t, weights_list, bias_list, add_noise, noiseless_h1,
                      hidden_add_noise_sigma, input_salt_and_pepper, input_sampling,
                      MRG, visible_activation, hidden_activation, logger)
    log.maybeLog(logger, 'even layer updates')
    # update the even layers
    update_even_layers(hiddens_t, weights_list, bias_list, p_X_chain, add_noise,
                       noiseless_h1, hidden_add_noise_sigma, input_salt_and_pepper,
                       input_sampling, MRG, visible_activation, hidden_activation, logger)
    log.maybeLog(logger, 'done full update.\n')
    # return the generated visible sample and the updated hiddens
    return p_X_chain[0], hiddens_t
def update_odd_layers(hiddens, weights_list, bias_list,
                      add_noise=defaults["add_noise"],
                      noiseless_h1=defaults["noiseless_h1"],
                      hidden_add_noise_sigma=defaults["hidden_add_noise_sigma"],
                      input_salt_and_pepper=defaults["input_salt_and_pepper"],
                      input_sampling=defaults["input_sampling"],
                      MRG=defaults["MRG"],
                      visible_activation=defaults["visible_activation"],
                      hidden_activation=defaults["hidden_activation"],
                      logger=None):
    # Loop over the odd layers
    for i in range(1, len(hiddens), 2):
        log.maybeLog(logger, ['updating layer', i])
        simple_update_layer(hiddens, weights_list, bias_list, None, i, add_noise,
                            noiseless_h1, hidden_add_noise_sigma, input_salt_and_pepper,
                            input_sampling, MRG, visible_activation, hidden_activation, logger)
def plot_samples(self, epoch_number="", leading_text="", n_samples=400):
    to_sample = time.time()
    initial = self.test_X.get_value(borrow=True)[:1]
    rand_idx = numpy.random.choice(range(self.test_X.get_value(borrow=True).shape[0]))
    rand_init = self.test_X.get_value(borrow=True)[rand_idx:rand_idx + 1]

    V, _ = self.sample(initial, n_samples)
    rand_V, _ = self.sample(rand_init, n_samples)

    img_samples = PIL.Image.fromarray(
        tile_raster_images(V, (self.image_height, self.image_width),
                           closest_to_square_factors(n_samples)))
    rand_img_samples = PIL.Image.fromarray(
        tile_raster_images(rand_V, (self.image_height, self.image_width),
                           closest_to_square_factors(n_samples)))

    fname = self.outdir + leading_text + 'samples_epoch_' + str(epoch_number) + '.png'
    img_samples.save(fname)
    rfname = self.outdir + leading_text + 'samples_rand_epoch_' + str(epoch_number) + '.png'
    rand_img_samples.save(rfname)
    log.maybeLog(self.logger, 'Took ' + make_time_units_string(time.time() - to_sample) +
                 ' to sample ' + str(n_samples * 2) + ' numbers')
def build_gsn_scan(X, weights_list, bias_list,
                   add_noise=defaults["add_noise"],
                   noiseless_h1=defaults["noiseless_h1"],
                   hidden_add_noise_sigma=defaults["hidden_add_noise_sigma"],
                   input_salt_and_pepper=defaults["input_salt_and_pepper"],
                   input_sampling=defaults["input_sampling"],
                   MRG=defaults["MRG"],
                   visible_activation=defaults["visible_activation"],
                   hidden_activation=defaults["hidden_activation"],
                   walkbacks=defaults["walkbacks"],
                   cost_function=defaults["cost_function"],
                   logger=None):
    # Whether or not to corrupt the visible input X
    if add_noise:
        X_init = salt_and_pepper(X, input_salt_and_pepper)
    else:
        X_init = X
    # init hiddens with zeros
    hiddens_0 = [X_init]
    for w in weights_list:
        hiddens_0.append(T.zeros_like(T.dot(hiddens_0[-1], w)))

    log.maybeLog(logger, ["Building the GSN graph with", walkbacks, "walkbacks"])
    p_X_chain = []
    for i in range(walkbacks):
        log.maybeLog(logger, "GSN (after scan) Walkback {!s}/{!s}".format(i + 1, walkbacks))
        update_layers(hiddens_0, weights_list, bias_list, p_X_chain, add_noise,
                      noiseless_h1, hidden_add_noise_sigma, input_salt_and_pepper,
                      input_sampling, MRG, visible_activation, hidden_activation, logger)

    x_sample = p_X_chain[-1]

    costs = [cost_function(rX, X) for rX in p_X_chain]
    show_cost = costs[-1]  # for logging to show progress
    cost = numpy.sum(costs)

    return x_sample, cost, show_cost  # , updates
def recurrent_step(x_t, u_tm1, add_noise):
    # Make the current guess for the hiddens based on U
    for i in range(self.layers):
        if i % 2 == 0:
            log.maybeLog(self.logger, "Using {0!s} and {1!s}".format(
                self.recurrent_to_gsn_weights_list[(i + 1) / 2], self.bias_list[i + 1]))
    h_t = T.concatenate([self.hidden_activation(self.bias_list[i + 1] +
                                                T.dot(u_tm1, self.recurrent_to_gsn_weights_list[(i + 1) / 2]))
                         for i in range(self.layers) if i % 2 == 0],
                        axis=0)

    generate = x_t is None
    if generate:
        pass

    # Make a GSN to update U
    # chain, hs = gsn.build_gsn(x_t, weights_list, bias_list, add_noise, state.noiseless_h1,
    #                           state.hidden_add_noise_sigma, state.input_salt_and_pepper,
    #                           state.input_sampling, MRG, visible_activation, hidden_activation,
    #                           walkbacks, logger)
    # htop_t = hs[-1]
    # denoised_x_t = chain[-1]
    # Update U
    # ua_t = T.dot(denoised_x_t, W_x_u) + T.dot(htop_t, W_h_u) + T.dot(u_tm1, W_u_u) + recurrent_bias
    ua_t = T.dot(x_t, self.W_x_u) + T.dot(u_tm1, self.W_u_u) + self.recurrent_bias
    u_t = self.recurrent_hidden_activation(ua_t)
    return None if generate else [ua_t, u_t, h_t]
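# A hedged sketch of how a step function like recurrent_step is typically driven
# by theano.scan inside the enclosing method. X_sequence and initial_u are
# hypothetical names (not defined in this file): X_sequence would be the symbolic
# input sequence and initial_u the initial recurrent state. outputs_info marks
# only u as recurrent, so scan feeds u_tm1 back into each step while ua_t and h_t
# are simply collected per time step.
(ua, u, h), scan_updates = theano.scan(
    fn=lambda x_t, u_tm1: recurrent_step(x_t, u_tm1, add_noise=True),
    sequences=X_sequence,
    outputs_info=[None, initial_u, None])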
def update_layers_reverse(hiddens, weights_list, bias_list, p_X_chain,
                          add_noise=defaults["add_noise"],
                          noiseless_h1=defaults["noiseless_h1"],
                          hidden_add_noise_sigma=defaults["hidden_add_noise_sigma"],
                          input_salt_and_pepper=defaults["input_salt_and_pepper"],
                          input_sampling=defaults["input_sampling"],
                          MRG=defaults["MRG"],
                          visible_activation=defaults["visible_activation"],
                          hidden_activation=defaults["hidden_activation"],
                          logger=None):
    # One update over the even layers + one update over the odd layers
    log.maybeLog(logger, 'even layer updates')
    # update the even layers
    update_even_layers(hiddens, weights_list, bias_list, p_X_chain, add_noise,
                       noiseless_h1, hidden_add_noise_sigma, input_salt_and_pepper,
                       input_sampling, MRG, visible_activation, hidden_activation, logger)
    log.maybeLog(logger, 'odd layer updates')
    # update the odd layers
    update_odd_layers(hiddens, weights_list, bias_list, add_noise, noiseless_h1,
                      hidden_add_noise_sigma, input_salt_and_pepper, input_sampling,
                      MRG, visible_activation, hidden_activation, logger)
    log.maybeLog(logger, 'done full update.\n')
def load_params(self, filename):
    '''
    self.params = self.weights_list + self.bias_list + \
        self.recurrent_to_gsn_weights_list + [self.W_u_u, self.W_x_u, self.recurrent_bias]
    '''
    def set_param(loaded_params, start, param):
        # copy values into the shared variables for this slice of the param list
        for lp, p in zip(loaded_params[start:start + len(param)], param):
            p.set_value(lp.get_value(borrow=False))
        return start + len(param)

    if os.path.isfile(filename):
        log.maybeLog(self.logger, "\nLoading existing RNN-GSN parameters...")
        # binary mode for pickled data (the original opened with 'r')
        loaded_params = cPickle.load(open(filename, 'rb'))
        start = 0
        start = set_param(loaded_params, start, self.weights_list)
        start = set_param(loaded_params, start, self.bias_list)
        start = set_param(loaded_params, start, self.recurrent_to_gsn_weights_list)
        set_param(loaded_params, start, self.u_params)
        log.maybeLog(self.logger, "Parameters loaded.\n")
    else:
        log.maybeLog(self.logger, "\n\nCould not find existing RNN-GSN parameter file {}.\n\n".format(filename))
def __init__(self, train_X=None, valid_X=None, test_X=None, args=None, logger=None):
    # Output logger
    self.logger = logger
    self.outdir = args.get("output_path", defaults["output_path"])
    if self.outdir[-1] != '/':
        self.outdir = self.outdir + '/'
    # Input data - make sure it is a list of shared datasets
    self.train_X = raise_data_to_list(train_X)
    self.valid_X = raise_data_to_list(valid_X)
    self.test_X = raise_data_to_list(test_X)

    # variables from the dataset that are used for initialization and image reconstruction
    if train_X is None:
        self.N_input = args.get("input_size")
        if args.get("input_size") is None:
            raise AssertionError("Please either specify input_size in the arguments or provide an example train_X for input dimensionality.")
    else:
        self.N_input = train_X[0].eval().shape[1]
    self.root_N_input = numpy.sqrt(self.N_input)

    self.is_image = args.get('is_image', defaults['is_image'])
    if self.is_image:
        self.image_width = args.get('width', self.root_N_input)
        self.image_height = args.get('height', self.root_N_input)

    #######################################
    # Network and training specifications #
    #######################################
    self.layers = args.get('layers', defaults['layers'])  # number of hidden layers
    self.walkbacks = args.get('walkbacks', defaults['walkbacks'])  # number of walkbacks
    self.learning_rate = theano.shared(cast32(args.get('learning_rate', defaults['learning_rate'])))  # learning rate
    self.init_learn_rate = cast32(args.get('learning_rate', defaults['learning_rate']))
    self.momentum = theano.shared(cast32(args.get('momentum', defaults['momentum'])))  # momentum term
    self.annealing = cast32(args.get('annealing', defaults['annealing']))  # exponential annealing coefficient
    self.noise_annealing = cast32(args.get('noise_annealing', defaults['noise_annealing']))  # exponential noise annealing coefficient
    self.batch_size = args.get('batch_size', defaults['batch_size'])
    self.n_epoch = args.get('n_epoch', defaults['n_epoch'])
    self.early_stop_threshold = args.get('early_stop_threshold', defaults['early_stop_threshold'])
    self.early_stop_length = args.get('early_stop_length', defaults['early_stop_length'])
    self.save_frequency = args.get('save_frequency', defaults['save_frequency'])

    self.noiseless_h1 = args.get('noiseless_h1', defaults["noiseless_h1"])
    self.hidden_add_noise_sigma = theano.shared(cast32(args.get('hidden_add_noise_sigma', defaults["hidden_add_noise_sigma"])))
    self.input_salt_and_pepper = theano.shared(cast32(args.get('input_salt_and_pepper', defaults["input_salt_and_pepper"])))
    self.input_sampling = args.get('input_sampling', defaults["input_sampling"])
    self.vis_init = args.get('vis_init', defaults['vis_init'])

    self.layer_sizes = [self.N_input] + [args.get('hidden_size', defaults['hidden_size'])] * self.layers  # layer sizes, from h0 to hK (h0 is the visible layer)

    self.f_recon = None
    self.f_noise = None

    # Activation functions!
    # Hidden layer activation
    if args.get('hidden_activation') is not None:
        log.maybeLog(self.logger, 'Using specified activation for hiddens')
        self.hidden_activation = args.get('hidden_activation')
    elif args.get('hidden_act') is not None:
        self.hidden_activation = get_activation_function(args.get('hidden_act'))
        log.maybeLog(self.logger, 'Using {0!s} activation for hiddens'.format(args.get('hidden_act')))
    else:
        log.maybeLog(self.logger, "Using default activation for hiddens")
        self.hidden_activation = defaults['hidden_activation']
    # Visible layer activation
    if args.get('visible_activation') is not None:
        log.maybeLog(self.logger, 'Using specified activation for visible layer')
        self.visible_activation = args.get('visible_activation')
    elif args.get('visible_act') is not None:
        self.visible_activation = get_activation_function(args.get('visible_act'))
        log.maybeLog(self.logger, 'Using {0!s} activation for visible layer'.format(args.get('visible_act')))
    else:
        log.maybeLog(self.logger, 'Using default activation for visible layer')
        self.visible_activation = defaults['visible_activation']
    # Cost function!
    if args.get('cost_function') is not None:
        log.maybeLog(self.logger, '\nUsing specified cost function for training\n')
        self.cost_function = args.get('cost_function')
    elif args.get('cost_funct') is not None:
        self.cost_function = get_cost_function(args.get('cost_funct'))
        log.maybeLog(self.logger, 'Using {0!s} for cost function'.format(args.get('cost_funct')))
    else:
        log.maybeLog(self.logger, '\nUsing default cost function for training\n')
        self.cost_function = defaults['cost_function']

    ############################
    # Theano variables and RNG #
    ############################
    self.X = T.fmatrix('X')  # for use in sampling
    self.MRG = RNG_MRG.MRG_RandomStreams(1)
    rng.seed(1)

    ###############
    # Parameters! #
    ###############
    # initialize a list of weights and biases based on layer_sizes for the GSN
    if args.get('weights_list') is None:
        # initialize each layer to a uniform sample from +/- sqrt(6. / (n_in + n_out))
        self.weights_list = [get_shared_weights(self.layer_sizes[layer], self.layer_sizes[layer + 1],
                                                name="W_{0!s}_{1!s}".format(layer, layer + 1))
                             for layer in range(self.layers)]
    else:
        self.weights_list = args.get('weights_list')
    if args.get('bias_list') is None:
        # initialize each layer's bias to 0's
        self.bias_list = [get_shared_bias(self.layer_sizes[layer], name='b_' + str(layer))
                          for layer in range(self.layers + 1)]
    else:
        self.bias_list = args.get('bias_list')
    self.params = self.weights_list + self.bias_list

    #################
    # Build the GSN #
    #################
    log.maybeLog(self.logger, "\nBuilding GSN graphs for training and testing")
    # GSN for training - with noise
    add_noise = True
    p_X_chain, _ = build_gsn(self.X, self.weights_list, self.bias_list, add_noise,
                             self.noiseless_h1, self.hidden_add_noise_sigma,
                             self.input_salt_and_pepper, self.input_sampling, self.MRG,
                             self.visible_activation, self.hidden_activation,
                             self.walkbacks, self.logger)

    # GSN for reconstruction checks along the way - no noise
    add_noise = False
    p_X_chain_recon, _ = build_gsn(self.X, self.weights_list, self.bias_list, add_noise,
                                   self.noiseless_h1, self.hidden_add_noise_sigma,
                                   self.input_salt_and_pepper, self.input_sampling, self.MRG,
                                   self.visible_activation, self.hidden_activation,
                                   self.walkbacks, self.logger)

    #######################
    # Costs and gradients #
    #######################
    log.maybeLog(self.logger, 'Cost w.r.t p(X|...) at every step in the graph for the GSN')
    gsn_costs = [self.cost_function(rX, self.X) for rX in p_X_chain]
    show_gsn_cost = gsn_costs[-1]  # for logging to show progress
    gsn_cost = numpy.sum(gsn_costs)

    gsn_costs_recon = [self.cost_function(rX, self.X) for rX in p_X_chain_recon]
    show_gsn_cost_recon = gsn_costs_recon[-1]

    log.maybeLog(self.logger, ["gsn params:", self.params])

    # Stochastic gradient descent with momentum!
    gradient = T.grad(gsn_cost, self.params)
    gradient_buffer = [theano.shared(numpy.zeros(param.get_value().shape, dtype='float32'))
                       for param in self.params]
    m_gradient = [self.momentum * gb + (cast32(1) - self.momentum) * g
                  for (gb, g) in zip(gradient_buffer, gradient)]
    param_updates = [(param, param - self.learning_rate * mg)
                     for (param, mg) in zip(self.params, m_gradient)]
    gradient_buffer_updates = zip(gradient_buffer, m_gradient)
    updates = OrderedDict(param_updates + gradient_buffer_updates)

    ############
    # Sampling #
    ############
    # the input to the sampling function
    X_sample = T.fmatrix("X_sampling")
    self.network_state_input = [X_sample] + [T.fmatrix("H_sampling_" + str(i + 1)) for i in range(self.layers)]

    # "Output" state of the network (noisy)
    # initialized with input, then we apply updates
    self.network_state_output = [X_sample] + self.network_state_input[1:]
    visible_pX_chain = []

    # ONE update
    log.maybeLog(self.logger, "Performing one walkback in network state sampling.")
    update_layers(self.network_state_output, self.weights_list, self.bias_list,
                  visible_pX_chain, True, self.noiseless_h1, self.hidden_add_noise_sigma,
                  self.input_salt_and_pepper, self.input_sampling, self.MRG,
                  self.visible_activation, self.hidden_activation, self.logger)

    ########################
    # Create the functions #
    ########################
    log.maybeLog(self.logger, "Compiling functions...")
    t = time.time()

    self.f_learn = theano.function(inputs=[self.X],
                                   updates=updates,
                                   outputs=show_gsn_cost,
                                   name='gsn_f_learn')
    self.f_cost = theano.function(inputs=[self.X],
                                  outputs=show_gsn_cost,
                                  name='gsn_f_cost')

    # used for checkpoints and testing - no noise in network
    self.f_recon = theano.function(inputs=[self.X],
                                   outputs=[show_gsn_cost_recon, p_X_chain_recon[-1]],
                                   name='gsn_f_recon')

    self.f_noise = theano.function(inputs=[self.X],
                                   outputs=salt_and_pepper(self.X, self.input_salt_and_pepper),
                                   name='gsn_f_noise')

    if self.layers == 1:
        self.f_sample = theano.function(inputs=[X_sample],
                                        outputs=visible_pX_chain[-1],
                                        name='gsn_f_sample_single_layer')
    else:
        # The unused-input warning is expected here: the first odd layers are
        # computed directly from the even layers, so some sampling inputs go
        # unused in the graph.
        self.f_sample = theano.function(inputs=self.network_state_input,
                                        outputs=self.network_state_output + visible_pX_chain,
                                        on_unused_input='warn',
                                        name='gsn_f_sample')
    log.maybeLog(self.logger, "Compiling done. Took " + make_time_units_string(time.time() - t) + ".\n")
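# A construction sketch for the class above (argument values illustrative, not
# from this file; assumes MNIST-shaped shared datasets train_X/valid_X/test_X
# and this module's defaults dict):
args = {'layers': 3,
        'walkbacks': 5,
        'hidden_size': 1500,
        'is_image': True,
        'width': 28,
        'height': 28,
        'output_path': 'outputs/gsn/'}
gsn = GSN(train_X=train_X, valid_X=valid_X, test_X=test_X, args=args, logger=None)
gsn.test()  # runs the noisy-reconstruction check from the test() method shown earlier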
def train(self, train_X=None, train_Y=None, valid_X=None, valid_Y=None,
          test_X=None, test_Y=None, is_artificial=False, artificial_sequence=1,
          continue_training=False):
    log.maybeLog(self.logger, "\nTraining---------\n")
    if train_X is None:
        log.maybeLog(self.logger, "Training using data given during initialization of RNN-GSN.\n")
        train_X = self.train_X
        train_Y = self.train_Y
        if train_X is None:
            log.maybeLog(self.logger, "\nPlease provide a training dataset!\n")
            raise AssertionError("Please provide a training dataset!")
    else:
        log.maybeLog(self.logger, "Training using data provided to training function.\n")
    if valid_X is None:
        valid_X = self.valid_X
        valid_Y = self.valid_Y
    if test_X is None:
        test_X = self.test_X
        test_Y = self.test_Y

    # Input data - make sure it is a list of shared datasets
    train_X = raise_to_list(train_X)
    train_Y = raise_to_list(train_Y)
    valid_X = raise_to_list(valid_X)
    valid_Y = raise_to_list(valid_Y)
    test_X = raise_to_list(test_X)
    test_Y = raise_to_list(test_Y)

    ##########################################################
    # Train the GSN first to get good weights initialization #
    ##########################################################
    if self.train_gsn_first:
        log.maybeLog(self.logger, "\n\n----------Initially training the GSN---------\n\n")
        # init_gsn = GSN(train_X=train_X, valid_X=valid_X, test_X=test_X, state=self.gsn_args, logger=self.logger)
        # init_gsn.train()
        print "NOT IMPLEMENTED"

    #########################################
    # If we are using Hessian-free training #
    #########################################
    if self.hessian_free:
        pass
        # gradient_dataset = hf_sequence_dataset([train_X.get_value()], batch_size=None, number_batches=5000)
        # cg_dataset = hf_sequence_dataset([train_X.get_value()], batch_size=None, number_batches=1000)
        # valid_dataset = hf_sequence_dataset([valid_X.get_value()], batch_size=None, number_batches=1000)
        #
        # s = x_samples
        # costs = [cost, show_cost]
        # hf_optimizer(params, [Xs], s, costs, u, ua).train(gradient_dataset, cg_dataset, initial_lambda=1.0, preconditioner=True, validation=valid_dataset)

    ################################
    # If we are using SGD training #
    ################################
    else:
        log.maybeLog(self.logger, "\n-----------TRAINING RNN-GSN------------\n")
        # TRAINING
        STOP = False
        counter = 0
        if not continue_training:
            self.learning_rate.set_value(self.init_learn_rate)  # learning rate
        times = []
        best_cost = float('inf')
        best_params = None
        patience = 0

        log.maybeLog(self.logger, ['train X size:', str(train_X[0].get_value(borrow=True).shape)])
        if valid_X is not None:
            log.maybeLog(self.logger, ['valid X size:', str(valid_X[0].get_value(borrow=True).shape)])
        if test_X is not None:
            log.maybeLog(self.logger, ['test X size:', str(test_X[0].get_value(borrow=True).shape)])

        if self.vis_init:
            self.bias_list[0].set_value(logit(numpy.clip(0.9, 0.001, train_X[0].get_value(borrow=True).mean(axis=0))))

        start_time = time.time()

        while not STOP:
            counter += 1
            t = time.time()
            log.maybeAppend(self.logger, [counter, '\t'])

            # if is_artificial:
            #     data.sequence_mnist_data(train_X[0], train_Y[0], valid_X[0], valid_Y[0],
            #                              test_X[0], test_Y[0], artificial_sequence, rng)

            # train
            train_costs = []
            train_errors = []
            for train_data in train_X:
                costs_and_errors = data.apply_cost_function_to_dataset(self.f_learn, train_data, self.batch_size)
                train_costs.extend([cost for (cost, error) in costs_and_errors])
                train_errors.extend([error for (cost, error) in costs_and_errors])
            log.maybeAppend(self.logger, ['Train:', trunc(numpy.mean(train_costs)), trunc(numpy.mean(train_errors)), '\t'])

            # valid
            if valid_X is not None:
                valid_costs = []
                for valid_data in valid_X:
                    cs = data.apply_cost_function_to_dataset(self.f_cost, valid_data, self.batch_size)
                    valid_costs.extend([c for c, e in cs])
                log.maybeAppend(self.logger, ['Valid:', trunc(numpy.mean(valid_costs)), '\t'])

            # test
            if test_X is not None:
                test_costs = []
                test_errors = []
                for test_data in test_X:
                    costs_and_errors = data.apply_cost_function_to_dataset(self.f_cost, test_data, self.batch_size)
                    test_costs.extend([cost for (cost, error) in costs_and_errors])
                    test_errors.extend([error for (cost, error) in costs_and_errors])
                log.maybeAppend(self.logger, ['Test:', trunc(numpy.mean(test_costs)), trunc(numpy.mean(test_errors)), '\t'])

            # check for early stopping
            if valid_X is not None:
                cost = numpy.sum(valid_costs)
            else:
                cost = numpy.sum(train_costs)
            if cost < best_cost * self.early_stop_threshold:
                patience = 0
                best_cost = cost
                # save the parameters that made it the best
                best_params = copy_params(self.params)
            else:
                patience += 1

            if counter >= self.n_epoch or patience >= self.early_stop_length:
                STOP = True
                if best_params is not None:
                    restore_params(self.params, best_params)
                self.save_params('all', counter, self.params)

            timing = time.time() - t
            times.append(timing)

            log.maybeAppend(self.logger, 'time: ' + make_time_units_string(timing) + '\t')
            log.maybeLog(self.logger, 'remaining: ' + make_time_units_string((self.n_epoch - counter) * numpy.mean(times)))

            if (counter % self.save_frequency) == 0 or STOP is True:
                n_examples = 100
                xs_test = test_X[0].get_value(borrow=True)[range(n_examples)]
                noisy_xs_test = self.f_noise(test_X[0].get_value(borrow=True)[range(n_examples)])
                reconstructions = []
                for i in xrange(0, len(noisy_xs_test)):
                    recon, recon_cost = self.f_recon(noisy_xs_test[max(0, (i + 1) - self.batch_size):i + 1])
                    reconstructions.append(recon[-1])
                reconstructed = numpy.array(reconstructions)

                if self.is_image:
                    # Concatenate stuff
                    stacked = numpy.vstack(
                        [numpy.vstack([xs_test[i * 10:(i + 1) * 10],
                                       noisy_xs_test[i * 10:(i + 1) * 10],
                                       reconstructed[i * 10:(i + 1) * 10]])
                         for i in range(10)])
                    number_reconstruction = PIL.Image.fromarray(
                        tile_raster_images(stacked, (self.image_height, self.image_width), (10, 30)))
                    number_reconstruction.save(self.outdir + 'rnngsn_reconstruction_epoch_' + str(counter) + '.png')

                    # sample_numbers(counter, 'seven')
                    # plot_samples(counter, 'rnngsn')

                # save params
                self.save_params('all', counter, self.params)

            # ANNEAL!
            new_lr = self.learning_rate.get_value() * self.annealing
            self.learning_rate.set_value(new_lr)
            new_noise = self.input_salt_and_pepper.get_value() * self.noise_annealing
            self.input_salt_and_pepper.set_value(new_noise)

        log.maybeLog(self.logger, "\n------------TOTAL RNN-GSN TRAIN TIME TOOK {0!s}---------".format(make_time_units_string(time.time() - start_time)))
def train(self, train_X=None, train_Y=None, valid_X=None, valid_Y=None,
          test_X=None, test_Y=None, is_artificial=False, artificial_sequence=1,
          continue_training=False):
    log.maybeLog(self.logger, "\nTraining---------\n")
    if train_X is None:
        log.maybeLog(self.logger, "Training using data given during initialization of RNN-GSN.\n")
        train_X = self.train_X
        train_Y = self.train_Y
        if train_X is None:
            log.maybeLog(self.logger, "\nPlease provide a training dataset!\n")
            raise AssertionError("Please provide a training dataset!")
    else:
        log.maybeLog(self.logger, "Training using data provided to training function.\n")
    if valid_X is None:
        valid_X = self.valid_X
        valid_Y = self.valid_Y
    if test_X is None:
        test_X = self.test_X
        test_Y = self.test_Y

    ##########################################################
    # Train the GSN first to get good weights initialization #
    ##########################################################
    if self.train_gsn_first:
        log.maybeLog(self.logger, "\n\n----------Initially training the GSN---------\n\n")
        init_gsn = generative_stochastic_network.GSN(train_X=train_X, valid_X=valid_X, test_X=test_X,
                                                     args=self.gsn_args, logger=self.logger)
        init_gsn.train()

    #############################
    # Save the model parameters #
    #############################
    def save_params_to_file(name, n, gsn_params):
        # (a stray 'pass' before this body in the original was removed)
        print 'saving parameters...'
        save_path = self.outdir + name + '_params_epoch_' + str(n) + '.pkl'
        f = open(save_path, 'wb')
        try:
            cPickle.dump(gsn_params, f, protocol=cPickle.HIGHEST_PROTOCOL)
        finally:
            f.close()

    def save_params(params):
        values = [param.get_value(borrow=True) for param in params]
        return values

    def restore_params(params, values):
        for i in range(len(params)):
            params[i].set_value(values[i])

    #########################################
    # If we are using Hessian-free training #
    #########################################
    if self.hessian_free:
        pass
        # gradient_dataset = hf_sequence_dataset([train_X.get_value()], batch_size=None, number_batches=5000)
        # cg_dataset = hf_sequence_dataset([train_X.get_value()], batch_size=None, number_batches=1000)
        # valid_dataset = hf_sequence_dataset([valid_X.get_value()], batch_size=None, number_batches=1000)
        #
        # s = x_samples
        # costs = [cost, show_cost]
        # hf_optimizer(params, [Xs], s, costs, u, ua).train(gradient_dataset, cg_dataset, initial_lambda=1.0, preconditioner=True, validation=valid_dataset)

    ################################
    # If we are using SGD training #
    ################################
    else:
        log.maybeLog(self.logger, "\n-----------TRAINING RNN-GSN------------\n")
        # TRAINING
        STOP = False
        counter = 0
        if not continue_training:
            self.learning_rate.set_value(self.init_learn_rate)  # learning rate
        times = []
        best_cost = float('inf')
        best_params = None
        patience = 0

        log.maybeLog(self.logger, ['train X size:', str(train_X.shape.eval())])
        if valid_X is not None:
            log.maybeLog(self.logger, ['valid X size:', str(valid_X.shape.eval())])
        if test_X is not None:
            log.maybeLog(self.logger, ['test X size:', str(test_X.shape.eval())])

        if self.vis_init:
            self.bias_list[0].set_value(logit(numpy.clip(0.9, 0.001, train_X.get_value().mean(axis=0))))

        while not STOP:
            counter += 1
            t = time.time()
            log.maybeAppend(self.logger, [counter, '\t'])

            if is_artificial:
                data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y,
                                         test_X, test_Y, artificial_sequence, rng)

            # train
            train_costs = data.apply_cost_function_to_dataset(self.f_learn, train_X, self.batch_size)
            # record it
            log.maybeAppend(self.logger, ['Train:', trunc(train_costs), '\t'])

            # valid
            valid_costs = data.apply_cost_function_to_dataset(self.f_cost, valid_X, self.batch_size)
            # record it
            log.maybeAppend(self.logger, ['Valid:', trunc(valid_costs), '\t'])

            # test
            test_costs = data.apply_cost_function_to_dataset(self.f_cost, test_X, self.batch_size)
            # record it
            log.maybeAppend(self.logger, ['Test:', trunc(test_costs), '\t'])

            # check for early stopping
            cost = numpy.sum(valid_costs)
            if cost < best_cost * self.early_stop_threshold:
                patience = 0
                best_cost = cost
                # save the parameters that made it the best
                best_params = save_params(self.params)
            else:
                patience += 1

            if counter >= self.n_epoch or patience >= self.early_stop_length:
                STOP = True
                if best_params is not None:
                    restore_params(self.params, best_params)
                save_params_to_file('all', counter, self.params)

            timing = time.time() - t
            times.append(timing)

            log.maybeAppend(self.logger, 'time: ' + make_time_units_string(timing) + '\t')
            log.maybeLog(self.logger, 'remaining: ' + make_time_units_string((self.n_epoch - counter) * numpy.mean(times)))

            if (counter % self.save_frequency) == 0 or STOP is True:
                n_examples = 100
                nums = test_X.get_value(borrow=True)[range(n_examples)]
                noisy_nums = self.f_noise(test_X.get_value(borrow=True)[range(n_examples)])
                reconstructions = []
                for i in xrange(0, len(noisy_nums)):
                    recon = self.f_recon(noisy_nums[max(0, (i + 1) - self.batch_size):i + 1])
                    reconstructions.append(recon)
                reconstructed = numpy.array(reconstructions)

                # Concatenate stuff
                stacked = numpy.vstack(
                    [numpy.vstack([nums[i * 10:(i + 1) * 10],
                                   noisy_nums[i * 10:(i + 1) * 10],
                                   reconstructed[i * 10:(i + 1) * 10]])
                     for i in range(10)])
                number_reconstruction = PIL.Image.fromarray(
                    tile_raster_images(stacked, (self.root_N_input, self.root_N_input), (10, 30)))
                number_reconstruction.save(self.outdir + 'rnngsn_number_reconstruction_epoch_' + str(counter) + '.png')

                # save params
                save_params_to_file('all', counter, self.params)

            # ANNEAL!
            new_lr = self.learning_rate.get_value() * self.annealing
            self.learning_rate.set_value(new_lr)
def build_gsn(X, weights_list, bias_list,
              add_noise=defaults["add_noise"],
              noiseless_h1=defaults["noiseless_h1"],
              hidden_add_noise_sigma=defaults["hidden_add_noise_sigma"],
              input_salt_and_pepper=defaults["input_salt_and_pepper"],
              input_sampling=defaults["input_sampling"],
              MRG=defaults["MRG"],
              visible_activation=defaults["visible_activation"],
              hidden_activation=defaults["hidden_activation"],
              walkbacks=defaults["walkbacks"],
              logger=None):
    """
    Construct a GSN (unimodal transition operator) for k walkbacks on the input X.
    Returns the list of predicted X's after k walkbacks and the resulting layer values.

    @type X: Theano symbolic variable
    @param X: The variable representing the visible input.
    @type weights_list: List(matrix)
    @param weights_list: The list of weights to use between layers.
    @type bias_list: List(vector)
    @param bias_list: The list of biases to use for each layer.
    @type add_noise: Boolean
    @param add_noise: Whether or not to add noise in the computational graph.
    @type noiseless_h1: Boolean
    @param noiseless_h1: Whether or not to add noise in the first hidden layer.
    @type hidden_add_noise_sigma: Float
    @param hidden_add_noise_sigma: The sigma value for the hidden noise function.
    @type input_salt_and_pepper: Float
    @param input_salt_and_pepper: The amount of masking noise to use.
    @type input_sampling: Boolean
    @param input_sampling: Whether to sample from each walkback prediction (like Gibbs).
    @type MRG: Theano random generator
    @param MRG: Random generator.
    @type visible_activation: Function
    @param visible_activation: The visible layer X activation function.
    @type hidden_activation: Function
    @param hidden_activation: The hidden layer activation function.
    @type walkbacks: Integer
    @param walkbacks: The k number of walkbacks to use for the GSN.
    @type logger: Logger
    @param logger: The output log to use.

    @rtype: List
    @return: predicted_x_chain, hiddens
    """
    p_X_chain = []
    # Whether or not to corrupt the visible input X
    if add_noise:
        X_init = salt_and_pepper(X, input_salt_and_pepper)
    else:
        X_init = X
    # init hiddens with zeros
    hiddens = [X_init]
    for w in weights_list:
        hiddens.append(T.zeros_like(T.dot(hiddens[-1], w)))
    # The layer update scheme
    log.maybeLog(logger, ["Building the GSN graph :", walkbacks, "updates"])
    for i in range(walkbacks):
        log.maybeLog(logger, "GSN Walkback {!s}/{!s}".format(i + 1, walkbacks))
        update_layers(hiddens, weights_list, bias_list, p_X_chain, add_noise,
                      noiseless_h1, hidden_add_noise_sigma, input_salt_and_pepper,
                      input_sampling, MRG, visible_activation, hidden_activation, logger)

    return p_X_chain, hiddens
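# A minimal wiring sketch for build_gsn (layer sizes and variable names are
# illustrative, not from this file; assumes this module's get_shared_weights /
# get_shared_bias helpers and its theano/theano.tensor imports):
X_example = T.fmatrix('X_example')
example_weights = [get_shared_weights(784, 500, name="W_0_1"),
                   get_shared_weights(500, 500, name="W_1_2")]
example_biases = [get_shared_bias(784, name='b_0'),
                  get_shared_bias(500, name='b_1'),
                  get_shared_bias(500, name='b_2')]
example_chain, example_hiddens = build_gsn(X_example, example_weights, example_biases, walkbacks=4)
# compile a function returning the denoised reconstruction after each walkback
f_chain = theano.function([X_example], example_chain, name='example_f_chain')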
def simple_update_layer(hiddens, weights_list, bias_list, p_X_chain, i,
                        add_noise=defaults["add_noise"],
                        noiseless_h1=defaults["noiseless_h1"],
                        hidden_add_noise_sigma=defaults["hidden_add_noise_sigma"],
                        input_salt_and_pepper=defaults["input_salt_and_pepper"],
                        input_sampling=defaults["input_sampling"],
                        MRG=defaults["MRG"],
                        visible_activation=defaults["visible_activation"],
                        hidden_activation=defaults["hidden_activation"],
                        logger=None):
    # Compute the dot product, whatever layer
    # If the visible layer X
    if i == 0:
        log.maybeLog(logger, 'using ' + str(weights_list[i]) + '.T')
        hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + bias_list[i]
    # If the top layer
    elif i == len(hiddens) - 1:
        log.maybeLog(logger, ['using', weights_list[i - 1]])
        hiddens[i] = T.dot(hiddens[i - 1], weights_list[i - 1]) + bias_list[i]
    # Otherwise in-between layers
    else:
        log.maybeLog(logger, ["using {0!s} and {1!s}.T".format(weights_list[i - 1], weights_list[i])])
        # next layer     : hiddens[i+1], assigned weights : W_i
        # previous layer : hiddens[i-1], assigned weights : W_(i-1)
        hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + \
            T.dot(hiddens[i - 1], weights_list[i - 1]) + bias_list[i]

    # Add pre-activation noise if NOT input layer
    if i == 1 and noiseless_h1:
        log.maybeLog(logger, '>>NO noise in first hidden layer')
        add_noise = False

    # pre-activation noise
    if i != 0 and add_noise:
        log.maybeLog(logger, ['Adding pre-activation gaussian noise for layer', i])
        hiddens[i] = add_gaussian_noise(hiddens[i], hidden_add_noise_sigma)

    # ACTIVATION!
    if i == 0:
        log.maybeLog(logger, 'Activation for visible layer')
        hiddens[i] = visible_activation(hiddens[i])
    else:
        log.maybeLog(logger, ['Hidden units activation for layer', i])
        hiddens[i] = hidden_activation(hiddens[i])

    # post-activation noise
    # why is there post-activation noise? Because there is already pre-activation noise,
    # this just doubles the amount of noise between each activation of the hiddens.
    if i != 0 and add_noise:
        log.maybeLog(logger, ['Adding post-activation gaussian noise for layer', i])
        hiddens[i] = add_gaussian_noise(hiddens[i], hidden_add_noise_sigma)

    # build the reconstruction chain if updating the visible layer X
    if i == 0:
        # if input layer -> append p(X|...)
        p_X_chain.append(hiddens[i])

        # sample from p(X|...) - SAMPLING NEEDS TO BE CORRECT FOR INPUT TYPES,
        # I.E. FOR BINARY MNIST SAMPLING IS BINOMIAL. real-valued inputs should be gaussian
        if input_sampling:
            log.maybeLog(logger, 'Sampling from input')
            sampled = MRG.binomial(p=hiddens[i], size=hiddens[i].shape, dtype='float32')
        else:
            log.maybeLog(logger, '>>NO input sampling')
            sampled = hiddens[i]
        # add noise
        sampled = salt_and_pepper(sampled, input_salt_and_pepper)

        # set input layer
        hiddens[i] = sampled
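# To make the alternating update scheme concrete (illustrative, for a 3-hidden-layer
# GSN): hiddens == [x, h1, h2, h3], so the odd pass touches indices 1 and 3 while
# the even pass touches 0 and 2; only the i == 0 branch above appends to p_X_chain.
assert list(range(1, 4, 2)) == [1, 3]  # odd pass over hiddens
assert list(range(0, 4, 2)) == [0, 2]  # even pass over hiddens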
def build_gsn(X, weights_list, bias_list, add_noise=defaults["add_noise"], noiseless_h1=defaults["noiseless_h1"], hidden_add_noise_sigma=defaults["hidden_add_noise_sigma"], input_salt_and_pepper=defaults["input_salt_and_pepper"], input_sampling=defaults["input_sampling"], MRG=defaults["MRG"], visible_activation=defaults["visible_activation"], hidden_activation=defaults["hidden_activation"], walkbacks=defaults["walkbacks"], logger=None): """ Construct a GSN (unimodal transition operator) for k walkbacks on the input X. Returns the list of predicted X's after k walkbacks and the resulting layer values. @type X: Theano symbolic variable @param X: The variable representing the visible input. @type weights_list: List(matrix) @param weights_list: The list of weights to use between layers. @type bias_list: List(vector) @param bias_list: The list of biases to use for each layer. @type add_noise: Boolean @param add_noise: Whether or not to add noise in the computational graph. @type noiseless_h1: Boolean @param noiseless_h1: Whether to keep the first hidden layer noiseless (no noise is added to h1 when True). @type hidden_add_noise_sigma: Float @param hidden_add_noise_sigma: The sigma value for the hidden noise function. @type input_salt_and_pepper: Float @param input_salt_and_pepper: The amount of masking noise to use. @type input_sampling: Boolean @param input_sampling: Whether to sample from each walkback prediction (like Gibbs). @type MRG: Theano random generator @param MRG: Random generator. @type visible_activation: Function @param visible_activation: The visible layer X activation function. @type hidden_activation: Function @param hidden_activation: The hidden layer activation function. @type walkbacks: Integer @param walkbacks: The number k of walkbacks to use for the GSN. @type logger: Logger @param logger: The output log to use. @rtype: List @return: predicted_x_chain, hiddens """ p_X_chain = [] # Whether or not to corrupt the visible input X if add_noise: X_init = salt_and_pepper(X, input_salt_and_pepper) else: X_init = X # init hiddens with zeros hiddens = [X_init] for w in weights_list: hiddens.append(T.zeros_like(T.dot(hiddens[-1], w))) # The layer update scheme log.maybeLog(logger, ["Building the GSN graph :", walkbacks, "updates"]) for i in range(walkbacks): log.maybeLog(logger, "GSN Walkback {!s}/{!s}".format(i + 1, walkbacks)) update_layers(hiddens, weights_list, bias_list, p_X_chain, add_noise, noiseless_h1, hidden_add_noise_sigma, input_salt_and_pepper, input_sampling, MRG, visible_activation, hidden_activation, logger) return p_X_chain, hiddens
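# The zero-initialization trick above sizes each hidden layer by dotting a dummy
# through the weights, so the layer shapes never have to be spelled out. A small
# NumPy check of that bookkeeping (sizes are illustrative):
import numpy as np

X = np.zeros((10, 784), dtype='float32')  # a batch of visible units
weights = [np.zeros((784, 1500), dtype='float32'), np.zeros((1500, 1500), dtype='float32')]
hiddens = [X]
for w in weights:
    hiddens.append(np.zeros_like(hiddens[-1].dot(w)))
# hiddens shapes: (10, 784), (10, 1500), (10, 1500), i.e. one entry per layer h0..hK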
def sample(self, initial, n_samples=400, k=1): log.maybeLog(self.logger, "Starting sampling...") def sample_some_numbers_single_layer(n_samples): x0 = initial samples = [x0] x = self.f_noise(x0) for _ in xrange(n_samples-1): x = self.f_sample(x) samples.append(x) x = rng.binomial(n=1, p=x, size=x.shape).astype('float32') x = self.f_noise(x) log.maybeLog(self.logger, "Sampling done.") return numpy.vstack(samples), None def sampling_wrapper(NSI): # * is the "splat" operator: It takes a list as input, and expands it into actual positional arguments in the function call. out = self.f_sample(*NSI) NSO = out[:len(self.network_state_output)] vis_pX_chain = out[len(self.network_state_output):] return NSO, vis_pX_chain def sample_some_numbers(n_samples): # The network's initial state init_vis = initial noisy_init_vis = self.f_noise(init_vis) network_state = [[noisy_init_vis] + [numpy.zeros((initial.shape[0],self.hidden_size), dtype='float32') for _ in self.bias_list[1:]]] visible_chain = [init_vis] noisy_h0_chain = [noisy_init_vis] sampled_h = [] times = [] for i in xrange(n_samples-1): _t = time.time() # feed the last state into the network, compute new state, and obtain visible units expectation chain net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1]) # append to the visible chain visible_chain += vis_pX_chain # append state output to the network state chain network_state.append(net_state_out) noisy_h0_chain.append(net_state_out[0]) if i%k == 0: sampled_h.append(T.stack(net_state_out[1:])) if i == k: log.maybeLog(self.logger, "About "+make_time_units_string(numpy.mean(times)*(n_samples-1-i))+" remaining...") times.append(time.time() - _t) log.maybeLog(self.logger, "Sampling done.") return numpy.vstack(visible_chain), sampled_h if self.layers == 1: return sample_some_numbers_single_layer(n_samples) else: return sample_some_numbers(n_samples)
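# A minimal NumPy sketch of the single-layer sampling loop above: corrupt the
# input, let the network denoise it, re-binarize the reconstruction, and repeat.
# salt_and_pepper_np mirrors the corruption f_noise is assumed to apply (keep a
# unit with probability 1-p, otherwise replace it with a fair coin flip), and
# denoise_fn stands in for the compiled f_sample. All names are illustrative:
import numpy as np

rng = np.random.RandomState(1)

def salt_and_pepper_np(x, p=0.4):
    keep = rng.binomial(n=1, p=1.0 - p, size=x.shape)  # 1 where the unit survives
    coin = rng.binomial(n=1, p=0.5, size=x.shape)      # replacement salt/pepper values
    return (x * keep + coin * (1 - keep)).astype('float32')

def sample_chain(x0, denoise_fn, n_samples=10):
    samples = [x0]
    x = salt_and_pepper_np(x0)
    for _ in range(n_samples - 1):
        x = denoise_fn(x)                              # network's p(X|...)
        samples.append(x)
        # re-sample binary units from the reconstruction, then corrupt again
        x = rng.binomial(n=1, p=np.clip(x, 0.0, 1.0), size=x.shape).astype('float32')
        x = salt_and_pepper_np(x)
    return np.vstack(samples)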
def __init__(self, train_X=None, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None, args=None, logger=None): # Output logger self.logger = logger self.outdir = args.get("output_path", defaults["output_path"]) if self.outdir[-1] != '/': self.outdir = self.outdir+'/' data.mkdir_p(self.outdir) # Configuration config_filename = self.outdir+'config' logger.log('Saving config') with open(config_filename, 'w') as f: f.write(str(args)) # Input data - make sure it is a list of shared datasets if it isn't. THIS WILL KEEP 'NONE' AS 'NONE' no need to worry :) self.train_X = raise_to_list(train_X) self.train_Y = raise_to_list(train_Y) self.valid_X = raise_to_list(valid_X) self.valid_Y = raise_to_list(valid_Y) self.test_X = raise_to_list(test_X) self.test_Y = raise_to_list(test_Y) # variables from the dataset that are used for initialization and image reconstruction if self.train_X is None: self.N_input = args.get("input_size") if args.get("input_size") is None: raise AssertionError("Please either specify input_size in the arguments or provide an example train_X for input dimensionality.") else: self.N_input = self.train_X[0].get_value(borrow=True).shape[1] self.is_image = args.get('is_image', defaults['is_image']) if self.is_image: (_h, _w) = closest_to_square_factors(self.N_input) self.image_width = args.get('width', _w) self.image_height = args.get('height', _h) ####################################### # Network and training specifications # ####################################### self.layers = args.get('layers', defaults['layers']) # number hidden layers self.walkbacks = args.get('walkbacks', defaults['walkbacks']) # number of walkbacks self.learning_rate = theano.shared(cast32(args.get('learning_rate', defaults['learning_rate']))) # learning rate self.init_learn_rate = cast32(args.get('learning_rate', defaults['learning_rate'])) self.momentum = theano.shared(cast32(args.get('momentum', defaults['momentum']))) # momentum term self.annealing = cast32(args.get('annealing', defaults['annealing'])) # exponential annealing coefficient self.noise_annealing = cast32(args.get('noise_annealing', defaults['noise_annealing'])) # exponential noise annealing coefficient self.batch_size = args.get('batch_size', defaults['batch_size']) self.gsn_batch_size = args.get('gsn_batch_size', defaults['gsn_batch_size']) self.n_epoch = args.get('n_epoch', defaults['n_epoch']) self.early_stop_threshold = args.get('early_stop_threshold', defaults['early_stop_threshold']) self.early_stop_length = args.get('early_stop_length', defaults['early_stop_length']) self.save_frequency = args.get('save_frequency', defaults['save_frequency']) self.noiseless_h1 = args.get('noiseless_h1', defaults["noiseless_h1"]) self.hidden_add_noise_sigma = theano.shared(cast32(args.get('hidden_add_noise_sigma', defaults["hidden_add_noise_sigma"]))) self.input_salt_and_pepper = theano.shared(cast32(args.get('input_salt_and_pepper', defaults["input_salt_and_pepper"]))) self.input_sampling = args.get('input_sampling', defaults["input_sampling"]) self.vis_init = args.get('vis_init', defaults['vis_init']) self.initialize_gsn = args.get('initialize_gsn', defaults['initialize_gsn']) self.hessian_free = args.get('hessian_free', defaults['hessian_free']) self.hidden_size = args.get('hidden_size', defaults['hidden_size']) self.layer_sizes = [self.N_input] + [self.hidden_size] * self.layers # layer sizes, from h0 to hK (h0 is the visible layer) self.recurrent_hidden_size = args.get('recurrent_hidden_size', defaults['recurrent_hidden_size']) 
self.f_recon = None self.f_noise = None # Activation functions! # For the GSN: if args.get('hidden_activation') is not None: log.maybeLog(self.logger, 'Using specified activation for GSN hiddens') self.hidden_activation = args.get('hidden_activation') elif args.get('hidden_act') is not None: self.hidden_activation = get_activation_function(args.get('hidden_act')) log.maybeLog(self.logger, 'Using {0!s} activation for GSN hiddens'.format(args.get('hidden_act'))) else: log.maybeLog(self.logger, "Using default activation for GSN hiddens") self.hidden_activation = defaults['hidden_activation'] # For the RNN: if args.get('recurrent_hidden_activation') is not None: log.maybeLog(self.logger, 'Using specified activation for RNN hiddens') self.recurrent_hidden_activation = args.get('recurrent_hidden_activation') elif args.get('recurrent_hidden_act') is not None: self.recurrent_hidden_activation = get_activation_function(args.get('recurrent_hidden_act')) log.maybeLog(self.logger, 'Using {0!s} activation for RNN hiddens'.format(args.get('recurrent_hidden_act'))) else: log.maybeLog(self.logger, "Using default activation for RNN hiddens") self.recurrent_hidden_activation = defaults['recurrent_hidden_activation'] # Visible layer activation if args.get('visible_activation') is not None: log.maybeLog(self.logger, 'Using specified activation for visible layer') self.visible_activation = args.get('visible_activation') elif args.get('visible_act') is not None: self.visible_activation = get_activation_function(args.get('visible_act')) log.maybeLog(self.logger, 'Using {0!s} activation for visible layer'.format(args.get('visible_act'))) else: log.maybeLog(self.logger, 'Using default activation for visible layer') self.visible_activation = defaults['visible_activation'] # Cost function! if args.get('cost_function') is not None: log.maybeLog(self.logger, '\nUsing specified cost function for GSN training\n') self.cost_function = args.get('cost_function') elif args.get('cost_funct') is not None: self.cost_function = get_cost_function(args.get('cost_funct')) log.maybeLog(self.logger, 'Using {0!s} for cost function'.format(args.get('cost_funct'))) else: log.maybeLog(self.logger, '\nUsing default cost function for GSN training\n') self.cost_function = defaults['cost_function'] ############################ # Theano variables and RNG # ############################ self.X = T.fmatrix('X') #single (batch) for training gsn self.Xs = T.fmatrix('Xs') #sequence for training rnn-gsn self.MRG = RNG_MRG.MRG_RandomStreams(1) ############### # Parameters! # ############### #gsn self.weights_list = [get_shared_weights(self.layer_sizes[i], self.layer_sizes[i+1], name="W_{0!s}_{1!s}".format(i,i+1)) for i in range(self.layers)] # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out)) self.bias_list = [get_shared_bias(self.layer_sizes[i], name='b_'+str(i)) for i in range(self.layers + 1)] # initialize each layer to 0's. 
#recurrent self.recurrent_to_gsn_weights_list = [get_shared_weights(self.recurrent_hidden_size, self.layer_sizes[layer], name="W_u_h{0!s}".format(layer)) for layer in range(self.layers+1) if layer%2 != 0] self.W_u_u = get_shared_weights(self.recurrent_hidden_size, self.recurrent_hidden_size, name="W_u_u") self.W_x_u = get_shared_weights(self.N_input, self.recurrent_hidden_size, name="W_x_u") self.recurrent_bias = get_shared_bias(self.recurrent_hidden_size, name='b_u') #lists for use with gradients self.gsn_params = self.weights_list + self.bias_list self.u_params = [self.W_u_u, self.W_x_u, self.recurrent_bias] self.params = self.gsn_params + self.recurrent_to_gsn_weights_list + self.u_params ########################################################### # load initial parameters of gsn # ########################################################### self.train_gsn_first = False if self.initialize_gsn: params_to_load = 'gsn_params_epoch_30.pkl' if not os.path.isfile(params_to_load): self.train_gsn_first = True else: log.maybeLog(self.logger, "\nLoading existing GSN parameters\n") loaded_params = cPickle.load(open(params_to_load,'r')) [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[:len(self.weights_list)], self.weights_list)] [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.weights_list):], self.bias_list)] if self.initialize_gsn: self.gsn_args = {'weights_list': self.weights_list, 'bias_list': self.bias_list, 'hidden_activation': self.hidden_activation, 'visible_activation': self.visible_activation, 'cost_function': self.cost_function, 'layers': self.layers, 'walkbacks': self.walkbacks, 'hidden_size': self.hidden_size, 'learning_rate': args.get('learning_rate', defaults['learning_rate']), 'momentum': args.get('momentum', defaults['momentum']), 'annealing': self.annealing, 'noise_annealing': self.noise_annealing, 'batch_size': self.gsn_batch_size, 'n_epoch': self.n_epoch, 'early_stop_threshold': self.early_stop_threshold, 'early_stop_length': self.early_stop_length, 'save_frequency': self.save_frequency, 'noiseless_h1': self.noiseless_h1, 'hidden_add_noise_sigma': args.get('hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']), 'input_salt_and_pepper': args.get('input_salt_and_pepper', defaults['input_salt_and_pepper']), 'input_sampling': self.input_sampling, 'vis_init': self.vis_init, 'output_path': self.outdir+'gsn/', 'is_image': self.is_image, 'input_size': self.N_input } ############ # Sampling # ############ # the input to the sampling function X_sample = T.fmatrix("X_sampling") self.network_state_input = [X_sample] + [T.fmatrix("H_sampling_"+str(i+1)) for i in range(self.layers)] # "Output" state of the network (noisy) # initialized with input, then we apply updates self.network_state_output = [X_sample] + self.network_state_input[1:] visible_pX_chain = [] # ONE update _add_noise = True log.maybeLog(self.logger, "Performing one walkback in network state sampling.") GSN.update_layers(self.network_state_output, self.weights_list, self.bias_list, visible_pX_chain, _add_noise, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.logger) ############################################# # Build the graphs for the RNN-GSN # ############################################# # If `x_t` is given, deterministic recurrence to compute the u_t. 
Otherwise, first generate def recurrent_step(x_t, u_tm1, add_noise): # Make current guess for hiddens based on U for i in range(self.layers): if i%2 == 0: log.maybeLog(self.logger, "Using {0!s} and {1!s}".format(self.recurrent_to_gsn_weights_list[(i+1)/2],self.bias_list[i+1])) h_t = T.concatenate([self.hidden_activation(self.bias_list[i+1] + T.dot(u_tm1, self.recurrent_to_gsn_weights_list[(i+1)/2])) for i in range(self.layers) if i%2 == 0],axis=0) generate = x_t is None if generate: pass # Make a GSN to update U # chain, hs = gsn.build_gsn(x_t, weights_list, bias_list, add_noise, state.noiseless_h1, state.hidden_add_noise_sigma, state.input_salt_and_pepper, state.input_sampling, MRG, visible_activation, hidden_activation, walkbacks, logger) # htop_t = hs[-1] # denoised_x_t = chain[-1] # Update U # ua_t = T.dot(denoised_x_t, W_x_u) + T.dot(htop_t, W_h_u) + T.dot(u_tm1, W_u_u) + recurrent_bias ua_t = T.dot(x_t, self.W_x_u) + T.dot(u_tm1, self.W_u_u) + self.recurrent_bias u_t = self.recurrent_hidden_activation(ua_t) return None if generate else [ua_t, u_t, h_t] log.maybeLog(self.logger, "\nCreating recurrent step scan.") # For training, the deterministic recurrence is used to compute all the # {h_t, 1 <= t <= T} given Xs. Conditional GSNs can then be trained # in batches using those parameters. u0 = T.zeros((self.recurrent_hidden_size,)) # initial value for the RNN hidden units (ua, u, h_t), updates_recurrent = theano.scan(fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, True), sequences=self.Xs, outputs_info=[None, u0, None], non_sequences=self.params) log.maybeLog(self.logger, "Now for reconstruction sample without noise") (_, _, h_t_recon), updates_recurrent_recon = theano.scan(fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, False), sequences=self.Xs, outputs_info=[None, u0, None], non_sequences=self.params) # put together the hiddens list h_list = [T.zeros_like(self.Xs)] for layer, w in enumerate(self.weights_list): if layer%2 != 0: h_list.append(T.zeros_like(T.dot(h_list[-1], w))) else: h_list.append((h_t.T[(layer/2)*self.hidden_size:(layer/2+1)*self.hidden_size]).T) h_list_recon = [T.zeros_like(self.Xs)] for layer, w in enumerate(self.weights_list): if layer%2 != 0: h_list_recon.append(T.zeros_like(T.dot(h_list_recon[-1], w))) else: h_list_recon.append((h_t_recon.T[(layer/2)*self.hidden_size:(layer/2+1)*self.hidden_size]).T) #with noise _, _, cost, show_cost, error = GSN.build_gsn_given_hiddens(self.Xs, h_list, self.weights_list, self.bias_list, True, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function) #without noise for reconstruction x_sample_recon, _, _, recon_show_cost, _ = GSN.build_gsn_given_hiddens(self.Xs, h_list_recon, self.weights_list, self.bias_list, False, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function) updates_train = updates_recurrent updates_cost = updates_recurrent ############# # COSTS # ############# log.maybeLog(self.logger, '\nCost w.r.t p(X|...) 
at every step in the graph') start_functions_time = time.time() # if we are not using Hessian-free training create the normal sgd functions if not self.hessian_free: gradient = T.grad(cost, self.params) gradient_buffer = [theano.shared(numpy.zeros(param.get_value().shape, dtype='float32')) for param in self.params] m_gradient = [self.momentum * gb + (cast32(1) - self.momentum) * g for (gb, g) in zip(gradient_buffer, gradient)] param_updates = [(param, param - self.learning_rate * mg) for (param, mg) in zip(self.params, m_gradient)] gradient_buffer_updates = zip(gradient_buffer, m_gradient) updates = OrderedDict(param_updates + gradient_buffer_updates) updates_train.update(updates) log.maybeLog(self.logger, "rnn-gsn learn...") self.f_learn = theano.function(inputs = [self.Xs], updates = updates_train, outputs = [show_cost, error], on_unused_input='warn', name='rnngsn_f_learn') log.maybeLog(self.logger, "rnn-gsn cost...") self.f_cost = theano.function(inputs = [self.Xs], updates = updates_cost, outputs = [show_cost, error], on_unused_input='warn', name='rnngsn_f_cost') log.maybeLog(self.logger, "Training/cost functions done.") # Denoise some numbers : show number, noisy number, predicted number, reconstructed number log.maybeLog(self.logger, "Creating graph for noisy reconstruction function at checkpoints during training.") self.f_recon = theano.function(inputs=[self.Xs], outputs=[x_sample_recon[-1], recon_show_cost], name='rnngsn_f_recon') # a function to add salt and pepper noise self.f_noise = theano.function(inputs = [self.X], outputs = salt_and_pepper(self.X, self.input_salt_and_pepper), name='rnngsn_f_noise') # Sampling functions log.maybeLog(self.logger, "Creating sampling function...") if self.layers == 1: self.f_sample = theano.function(inputs = [X_sample], outputs = visible_pX_chain[-1], name='rnngsn_f_sample_single_layer') else: self.f_sample = theano.function(inputs = self.network_state_input, outputs = self.network_state_output + visible_pX_chain, on_unused_input='warn', name='rnngsn_f_sample') log.maybeLog(self.logger, "Done compiling all functions.") compilation_time = time.time() - start_functions_time # Show the compile time with appropriate easy-to-read units. log.maybeLog(self.logger, "Total compilation time took "+make_time_units_string(compilation_time)+".\n\n")
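# The recurrent_step above is a plain RNN cell sitting on top of the GSN: the
# new state u_t is an activation of contributions from the input frame and the
# previous state. A minimal NumPy sketch of that recurrence (tanh stands in for
# the configurable recurrent activation; names and sizes are illustrative):
import numpy as np

def recurrent_step_np(x_t, u_tm1, W_x_u, W_u_u, b_u):
    ua_t = x_t.dot(W_x_u) + u_tm1.dot(W_u_u) + b_u  # pre-activation, like ua_t above
    u_t = np.tanh(ua_t)                             # new recurrent hidden state
    return ua_t, u_t

# unrolled over a sequence, the way theano.scan does symbolically:
# u = np.zeros(100, dtype='float32')
# for x_t in sequence: _, u = recurrent_step_np(x_t, u, W_x_u, W_u_u, b_u)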
def train(self, train_X=None, valid_X=None, test_X=None, continue_training=False): log.maybeLog(self.logger, "\nTraining---------\n") if train_X is None: log.maybeLog(self.logger, "Training using data given during initialization of GSN.\n") train_X = self.train_X if train_X is None: log.maybeLog(self.logger, "\nPlease provide a training dataset!\n") raise AssertionError("Please provide a training dataset!") else: log.maybeLog(self.logger, "Training using data provided to training function.\n") if valid_X is None: valid_X = self.valid_X if test_X is None: test_X = self.test_X train_X = raise_data_to_list(train_X) valid_X = raise_data_to_list(valid_X) test_X = raise_data_to_list(test_X) ############ # TRAINING # ############ log.maybeLog(self.logger, "-----------TRAINING GSN FOR {0!s} EPOCHS-----------".format(self.n_epoch)) STOP = False counter = 0 if not continue_training: self.learning_rate.set_value(self.init_learn_rate) # learning rate times = [] best_cost = float('inf') best_params = None patience = 0 log.maybeLog(self.logger, ['train X size:', str(train_X[0].shape.eval())]) if valid_X is not None: log.maybeLog(self.logger, ['valid X size:', str(valid_X[0].shape.eval())]) if test_X is not None: log.maybeLog(self.logger, ['test X size:', str(test_X[0].shape.eval())]) if self.vis_init: self.bias_list[0].set_value(logit(numpy.clip(train_X[0].get_value().mean(axis=0), 0.001, 0.9))) while not STOP: counter += 1 t = time.time() log.maybeAppend(self.logger, [counter, '\t']) #train train_costs = data.apply_cost_function_to_dataset(self.f_learn, train_X, self.batch_size) log.maybeAppend(self.logger, ['Train:', trunc(numpy.mean(train_costs)), '\t']) #valid if valid_X is not None: valid_costs = data.apply_cost_function_to_dataset(self.f_cost, valid_X, self.batch_size) log.maybeAppend(self.logger, ['Valid:', trunc(numpy.mean(valid_costs)), '\t']) #test if test_X is not None: test_costs = data.apply_cost_function_to_dataset(self.f_cost, test_X, self.batch_size) log.maybeAppend(self.logger, ['Test:', trunc(numpy.mean(test_costs)), '\t']) #check for early stopping if valid_X is not None: cost = numpy.sum(valid_costs) else: cost = numpy.sum(train_costs) if cost < best_cost * self.early_stop_threshold: patience = 0 best_cost = cost # save the parameters that made it the best best_params = save_params(self.params) else: patience += 1 if counter >= self.n_epoch or patience >= self.early_stop_length: STOP = True if best_params is not None: restore_params(self.params, best_params) save_params_to_file(counter, self.params, self.outdir, self.logger) timing = time.time() - t times.append(timing) log.maybeAppend(self.logger, 'time: ' + make_time_units_string(timing) + '\t') log.maybeLog(self.logger, 'remaining: ' + make_time_units_string((self.n_epoch - counter) * numpy.mean(times))) if (counter % self.save_frequency) == 0 or STOP is True: if self.is_image: n_examples = 100 tests = test_X[0].get_value()[0:n_examples] noisy_tests = self.f_noise(test_X[0].get_value()[0:n_examples]) _, reconstructed = self.f_recon(noisy_tests) # Concatenate stuff if it is an image stacked = numpy.vstack([numpy.vstack([tests[i * 10:(i + 1) * 10], noisy_tests[i * 10:(i + 1) * 10], reconstructed[i * 10:(i + 1) * 10]]) for i in range(10)]) number_reconstruction = PIL.Image.fromarray(tile_raster_images(stacked, (self.image_height, self.image_width), (10, 30))) number_reconstruction.save(self.outdir + 'gsn_image_reconstruction_epoch_' + str(counter) + '.png') #save gsn_params save_params_to_file(counter, self.params,
self.outdir, self.logger) # ANNEAL! new_lr = self.learning_rate.get_value() * self.annealing self.learning_rate.set_value(new_lr) new_hidden_sigma = self.hidden_add_noise_sigma.get_value() * self.noise_annealing self.hidden_add_noise_sigma.set_value(new_hidden_sigma) new_salt_pepper = self.input_salt_and_pepper.get_value() * self.noise_annealing self.input_salt_and_pepper.set_value(new_salt_pepper)
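# The three set_value calls above anneal the learning rate and both noise levels
# exponentially: each epoch multiplies them by a constant decay factor, so
# value(t) = value(0) * decay**t. A quick NumPy check (the 0.25 / 0.995 numbers
# are illustrative, not this model's defaults):
import numpy as np

lr0, decay = 0.25, 0.995
epochs = np.arange(4)
lr_schedule = lr0 * decay ** epochs  # [0.25, 0.24875, 0.24750625, 0.24626872]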
def train(self, train_X=None, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None, is_artificial=False, artificial_sequence=1, continue_training=False): log.maybeLog(self.logger, "\nTraining---------\n") if train_X is None: log.maybeLog(self.logger, "Training using data given during initialization of RNN-GSN.\n") train_X = self.train_X train_Y = self.train_Y if train_X is None: log.maybeLog(self.logger, "\nPlease provide a training dataset!\n") raise AssertionError("Please provide a training dataset!") else: log.maybeLog(self.logger, "Training using data provided to training function.\n") if valid_X is None: valid_X = self.valid_X valid_Y = self.valid_Y if test_X is None: test_X = self.test_X test_Y = self.test_Y # Input data - make sure it is a list of shared datasets train_X = raise_to_list(train_X) train_Y = raise_to_list(train_Y) valid_X = raise_to_list(valid_X) valid_Y = raise_to_list(valid_Y) test_X = raise_to_list(test_X) test_Y = raise_to_list(test_Y) ########################################################## # Train the GSN first to get good weights initialization # ########################################################## # if self.train_gsn_first: # log.maybeLog(self.logger, "\n\n----------Initially training the GSN---------\n\n") # init_gsn = generative_stochastic_network.GSN(train_X=train_X, valid_X=valid_X, test_X=test_X, args=self.gsn_args, logger=self.logger) # init_gsn.train() ######################################### # If we are using Hessian-free training # ######################################### if self.hessian_free: pass # gradient_dataset = hf_sequence_dataset([train_X.get_value()], batch_size=None, number_batches=5000) # cg_dataset = hf_sequence_dataset([train_X.get_value()], batch_size=None, number_batches=1000) # valid_dataset = hf_sequence_dataset([valid_X.get_value()], batch_size=None, number_batches=1000) # # s = x_samples # costs = [cost, show_cost] # hf_optimizer(params, [Xs], s, costs, u, ua).train(gradient_dataset, cg_dataset, initial_lambda=1.0, preconditioner=True, validation=valid_dataset) ################################ # If we are using SGD training # ################################ else: log.maybeLog(self.logger, "\n-----------TRAINING RNN-GSN------------\n") # TRAINING STOP = False counter = 0 if not continue_training: self.learning_rate.set_value(self.init_learn_rate) # learning rate times = [] best_cost = float('inf') best_params = None patience = 0 log.maybeLog(self.logger, ['train X size:',str(train_X[0].get_value(borrow=True).shape)]) if valid_X is not None: log.maybeLog(self.logger, ['valid X size:',str(valid_X[0].get_value(borrow=True).shape)]) if test_X is not None: log.maybeLog(self.logger, ['test X size:',str(test_X[0].get_value(borrow=True).shape)]) if self.vis_init: self.bias_list[0].set_value(logit(numpy.clip(train_X[0].get_value(borrow=True).mean(axis=0), 0.001, 0.9))) start_time = time.time() while not STOP: counter += 1 t = time.time() log.maybeAppend(self.logger, [counter,'\t']) # if is_artificial: # data.sequence_mnist_data(train_X[0], train_Y[0], valid_X[0], valid_Y[0], test_X[0], test_Y[0], artificial_sequence, rng) #train train_costs = [] train_errors = [] for train_data in train_X: costs_and_errors = data.apply_cost_function_to_dataset(self.f_learn, train_data, self.batch_size) train_costs.extend([cost for (cost, error) in costs_and_errors]) train_errors.extend([error for (cost, error) in costs_and_errors]) log.maybeAppend(self.logger,
['Train:',trunc(numpy.mean(train_costs)),trunc(numpy.mean(train_errors)),'\t']) #valid if valid_X is not None: valid_costs = [] for valid_data in valid_X: cs = data.apply_cost_function_to_dataset(self.f_cost, valid_data, self.batch_size) valid_costs.extend([c for c,e in cs]) log.maybeAppend(self.logger, ['Valid:',trunc(numpy.mean(valid_costs)), '\t']) #test if test_X is not None: test_costs = [] test_errors = [] for test_data in test_X: costs_and_errors = data.apply_cost_function_to_dataset(self.f_cost, test_data, self.batch_size) test_costs.extend([cost for (cost, error) in costs_and_errors]) test_errors.extend([error for (cost, error) in costs_and_errors]) log.maybeAppend(self.logger, ['Test:',trunc(numpy.mean(test_costs)),trunc(numpy.mean(test_errors)), '\t']) #check for early stopping if valid_X is not None: cost = numpy.sum(valid_costs) else: cost = numpy.sum(train_costs) if cost < best_cost*self.early_stop_threshold: patience = 0 best_cost = cost # save the parameters that made it the best best_params = copy_params(self.params) else: patience += 1 if counter >= self.n_epoch or patience >= self.early_stop_length: STOP = True if best_params is not None: restore_params(self.params, best_params) self.save_params('all', counter, self.params) timing = time.time() - t times.append(timing) log.maybeAppend(self.logger, 'time: '+make_time_units_string(timing)+'\t') log.maybeLog(self.logger, 'remaining: '+make_time_units_string((self.n_epoch - counter) * numpy.mean(times))) if (counter % self.save_frequency) == 0 or STOP is True: n_examples = 100 xs_test = test_X[0].get_value(borrow=True)[range(n_examples)] noisy_xs_test = self.f_noise(test_X[0].get_value(borrow=True)[range(n_examples)]) reconstructions = [] for i in xrange(0, len(noisy_xs_test)): recon, recon_cost = self.f_recon(noisy_xs_test[max(0,(i+1)-self.batch_size):i+1]) reconstructions.append(recon) reconstructed = numpy.array(reconstructions) if (self.is_image): # Concatenate stuff # stacked = numpy.vstack([numpy.vstack([xs_test[i*10 : (i+1)*10], noisy_xs_test[i*10 : (i+1)*10], reconstructed[i*10 : (i+1)*10]]) for i in range(10)]) # number_reconstruction = PIL.Image.fromarray(tile_raster_images(stacked, (self.image_height, self.image_width), (10,30))) # number_reconstruction.save(self.outdir+'rnngsn_reconstruction_epoch_'+str(counter)+'.png') #sample_numbers(counter, 'seven') # plot_samples(counter, 'rnngsn') pass #save params self.save_params('all', counter, self.params) # ANNEAL! new_lr = self.learning_rate.get_value() * self.annealing self.learning_rate.set_value(new_lr) new_noise = self.input_salt_and_pepper.get_value() * self.noise_annealing self.input_salt_and_pepper.set_value(new_noise) log.maybeLog(self.logger, "\n------------TOTAL RNN-GSN TRAIN TIME TOOK {0!s}---------".format(make_time_units_string(time.time()-start_time)))
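# The reconstruction pass above feeds f_recon a sliding window of frames ending
# at i, so the recurrent state sees up to batch_size frames of context, and only
# the window's final reconstruction is kept. The window arithmetic in isolation
# (an illustrative helper, not part of this codebase):
def window_bounds(i, batch_size):
    # the [start, stop) slice handed to f_recon for frame i
    return max(0, (i + 1) - batch_size), i + 1

# e.g. with batch_size = 4: frame 0 -> (0, 1), frame 2 -> (0, 3), frame 9 -> (6, 10)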
def __init__(self, train_X=None, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None, args=None, logger=None): # Output logger self.logger = logger self.outdir = args.get("output_path", defaults["output_path"]) if self.outdir[-1] != '/': self.outdir = self.outdir + '/' # Input data self.train_X = train_X self.train_Y = train_Y self.valid_X = valid_X self.valid_Y = valid_Y self.test_X = test_X self.test_Y = test_Y # variables from the dataset that are used for initialization and image reconstruction if train_X is None: self.N_input = args.get("input_size") if args.get("input_size") is None: raise AssertionError("Please either specify input_size in the arguments or provide an example train_X for input dimensionality.") else: self.N_input = train_X.eval().shape[1] self.root_N_input = numpy.sqrt(self.N_input) self.is_image = args.get('is_image', defaults['is_image']) if self.is_image: self.image_width = args.get('width', self.root_N_input) self.image_height = args.get('height', self.root_N_input) ####################################### # Network and training specifications # ####################################### self.gsn_layers = args.get('gsn_layers', defaults['gsn_layers']) # number of hidden layers self.walkbacks = args.get('walkbacks', defaults['walkbacks']) # number of walkbacks self.learning_rate = theano.shared(cast32(args.get('learning_rate', defaults['learning_rate']))) # learning rate self.init_learn_rate = cast32(args.get('learning_rate', defaults['learning_rate'])) self.momentum = theano.shared(cast32(args.get('momentum', defaults['momentum']))) # momentum term self.annealing = cast32(args.get('annealing', defaults['annealing'])) # exponential annealing coefficient self.noise_annealing = cast32(args.get('noise_annealing', defaults['noise_annealing'])) # exponential noise annealing coefficient self.batch_size = args.get('batch_size', defaults['batch_size']) self.gsn_batch_size = args.get('gsn_batch_size', defaults['gsn_batch_size']) self.n_epoch = args.get('n_epoch', defaults['n_epoch']) self.early_stop_threshold = args.get('early_stop_threshold', defaults['early_stop_threshold']) self.early_stop_length = args.get('early_stop_length', defaults['early_stop_length']) self.save_frequency = args.get('save_frequency', defaults['save_frequency']) self.noiseless_h1 = args.get('noiseless_h1', defaults["noiseless_h1"]) self.hidden_add_noise_sigma = theano.shared(cast32(args.get('hidden_add_noise_sigma', defaults["hidden_add_noise_sigma"]))) self.input_salt_and_pepper = theano.shared(cast32(args.get('input_salt_and_pepper', defaults["input_salt_and_pepper"]))) self.input_sampling = args.get('input_sampling', defaults["input_sampling"]) self.vis_init = args.get('vis_init', defaults['vis_init']) self.load_params = args.get('load_params', defaults['load_params']) self.hessian_free = args.get('hessian_free', defaults['hessian_free']) self.hidden_size = args.get('hidden_size', defaults['hidden_size']) # stored here because it is needed later to slice the scan output back into per-layer hiddens self.layer_sizes = [self.N_input] + [self.hidden_size] * self.gsn_layers # layer sizes, from h0 to hK (h0 is the visible layer) self.recurrent_hidden_size = args.get('recurrent_hidden_size', defaults['recurrent_hidden_size']) self.top_layer_sizes = [self.recurrent_hidden_size] + [self.hidden_size] * self.gsn_layers # layer sizes for the top GSN, whose h0 is the recurrent hidden layer self.f_recon = None self.f_noise = None # Activation functions!
# For the GSN: if args.get('hidden_activation') is not None: log.maybeLog(self.logger, 'Using specified activation for GSN hiddens') self.hidden_activation = args.get('hidden_activation') elif args.get('hidden_act') == 'sigmoid': log.maybeLog(self.logger, 'Using sigmoid activation for GSN hiddens') self.hidden_activation = T.nnet.sigmoid elif args.get('hidden_act') == 'rectifier': log.maybeLog(self.logger, 'Using rectifier activation for GSN hiddens') self.hidden_activation = lambda x: T.maximum(cast32(0), x) elif args.get('hidden_act') == 'tanh': log.maybeLog(self.logger, 'Using hyperbolic tangent activation for GSN hiddens') self.hidden_activation = lambda x: T.tanh(x) elif args.get('hidden_act') is not None: log.maybeLog(self.logger, "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for GSN hiddens".format(args.get('hidden_act'))) raise NotImplementedError("Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for GSN hiddens".format(args.get('hidden_act'))) else: log.maybeLog(self.logger, "Using default activation for GSN hiddens") self.hidden_activation = defaults['hidden_activation'] # For the RNN: if args.get('recurrent_hidden_activation') is not None: log.maybeLog(self.logger, 'Using specified activation for RNN hiddens') self.recurrent_hidden_activation = args.get('recurrent_hidden_activation') elif args.get('recurrent_hidden_act') == 'sigmoid': log.maybeLog(self.logger, 'Using sigmoid activation for RNN hiddens') self.recurrent_hidden_activation = T.nnet.sigmoid elif args.get('recurrent_hidden_act') == 'rectifier': log.maybeLog(self.logger, 'Using rectifier activation for RNN hiddens') self.recurrent_hidden_activation = lambda x: T.maximum(cast32(0), x) elif args.get('recurrent_hidden_act') == 'tanh': log.maybeLog(self.logger, 'Using hyperbolic tangent activation for RNN hiddens') self.recurrent_hidden_activation = lambda x: T.tanh(x) elif args.get('recurrent_hidden_act') is not None: log.maybeLog(self.logger, "Did not recognize recurrent hidden activation {0!s}, please use tanh, rectifier, or sigmoid for RNN hiddens".format(args.get('recurrent_hidden_act'))) raise NotImplementedError("Did not recognize recurrent hidden activation {0!s}, please use tanh, rectifier, or sigmoid for RNN hiddens".format(args.get('recurrent_hidden_act'))) else: log.maybeLog(self.logger, "Using default activation for RNN hiddens") self.recurrent_hidden_activation = defaults['recurrent_hidden_activation'] # Visible layer activation if args.get('visible_activation') is not None: log.maybeLog(self.logger, 'Using specified activation for visible layer') self.visible_activation = args.get('visible_activation') elif args.get('visible_act') == 'sigmoid': log.maybeLog(self.logger, 'Using sigmoid activation for visible layer') self.visible_activation = T.nnet.sigmoid elif args.get('visible_act') == 'softmax': log.maybeLog(self.logger, 'Using softmax activation for visible layer') self.visible_activation = T.nnet.softmax elif args.get('visible_act') is not None: log.maybeLog(self.logger, "Did not recognize visible activation {0!s}, please use sigmoid or softmax".format(args.get('visible_act'))) raise NotImplementedError("Did not recognize visible activation {0!s}, please use sigmoid or softmax".format(args.get('visible_act'))) else: log.maybeLog(self.logger, 'Using default activation for visible layer') self.visible_activation = defaults['visible_activation'] # Cost function!
if args.get('cost_function') is not None: log.maybeLog(self.logger, '\nUsing specified cost function for GSN training\n') self.cost_function = args.get('cost_function') elif args.get('cost_funct') == 'binary_crossentropy': log.maybeLog(self.logger, '\nUsing binary cross-entropy cost!\n') self.cost_function = lambda x, y: T.mean( T.nnet.binary_crossentropy(x, y)) elif args.get('cost_funct') == 'square': log.maybeLog(self.logger, "\nUsing square error cost!\n") #cost_function = lambda x,y: T.log(T.mean(T.sqr(x-y))) self.cost_function = lambda x, y: T.log(T.sum(T.pow((x - y), 2))) elif args.get('cost_funct') is not None: log.maybeLog( self.logger, "\nDid not recognize cost function {0!s}, please use binary_crossentropy or square\n" .format(args.get('cost_funct'))) raise NotImplementedError( "Did not recognize cost function {0!s}, please use binary_crossentropy or square" .format(args.get('cost_funct'))) else: log.maybeLog(self.logger, '\nUsing default cost function for GSN training\n') self.cost_function = defaults['cost_function'] ############################ # Theano variables and RNG # ############################ self.X = T.fmatrix('X') #single (batch) for training gsn self.Xs = T.fmatrix('Xs') #sequence for training rnn self.MRG = RNG_MRG.MRG_RandomStreams(1) ############### # Parameters! # ############### #visible gsn self.weights_list = [ get_shared_weights(self.layer_sizes[i], self.layer_sizes[i + 1], name="W_{0!s}_{1!s}".format(i, i + 1)) for i in range(self.gsn_layers) ] # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out)) self.bias_list = [ get_shared_bias(self.layer_sizes[i], name='b_' + str(i)) for i in range(self.gsn_layers + 1) ] # initialize each layer to 0's. #recurrent self.recurrent_to_gsn_weights_list = [ get_shared_weights(self.recurrent_hidden_size, self.layer_sizes[layer], name="W_u_h{0!s}".format(layer)) for layer in range(self.gsn_layers + 1) if layer % 2 != 0 ] self.W_u_u = get_shared_weights(self.recurrent_hidden_size, self.recurrent_hidden_size, name="W_u_u") self.W_ins_u = get_shared_weights(args.get('hidden_size', defaults['hidden_size']), self.recurrent_hidden_size, name="W_ins_u") self.recurrent_bias = get_shared_bias(self.recurrent_hidden_size, name='b_u') #top layer gsn self.top_weights_list = [ get_shared_weights(self.top_layer_sizes[i], self.top_layer_sizes[i + 1], name="Wtop_{0!s}_{1!s}".format(i, i + 1)) for i in range(self.gsn_layers) ] # initialize each layer to uniform sample from sqrt(6. / (n_in + n_out)) self.top_bias_list = [ get_shared_bias(self.top_layer_sizes[i], name='btop_' + str(i)) for i in range(self.gsn_layers + 1) ] # initialize each layer to 0's. 
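# NumPy versions of the two built-in costs selected above, for reference: the
# mean binary cross-entropy, and the log of the summed squared error (note the
# log, which makes the 'square' option a log-sum-of-squares rather than a plain
# MSE). A sketch with illustrative names:
import numpy as np

def binary_crossentropy_np(x, y, eps=1e-8):
    x = np.clip(x, eps, 1.0 - eps)  # guard the logs
    return np.mean(-(y * np.log(x) + (1.0 - y) * np.log(1.0 - x)))

def log_square_error_np(x, y):
    return np.log(np.sum((x - y) ** 2))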
#lists for use with gradients self.gsn_params = self.weights_list + self.bias_list self.u_params = [self.W_u_u, self.W_ins_u, self.recurrent_bias] self.top_params = self.top_weights_list + self.top_bias_list self.params = self.gsn_params + self.recurrent_to_gsn_weights_list + self.u_params + self.top_params ################################################### # load initial parameters # ################################################### # Each pickle is assumed to hold one flat list of shared variables, sliced apart below by list lengths (see the save/load sketch after this constructor). if self.load_params: params_to_load = 'gsn_params.pkl' log.maybeLog(self.logger, "\nLoading existing GSN parameters\n") loaded_params = cPickle.load(open(params_to_load, 'r')) [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[:len(self.weights_list)], self.weights_list)] [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.weights_list):], self.bias_list)] params_to_load = 'rnn_params.pkl' log.maybeLog(self.logger, "\nLoading existing RNN parameters\n") loaded_params = cPickle.load(open(params_to_load, 'r')) [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[:len(self.recurrent_to_gsn_weights_list)], self.recurrent_to_gsn_weights_list)] [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.recurrent_to_gsn_weights_list):len(self.recurrent_to_gsn_weights_list) + 1], [self.W_u_u])] [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.recurrent_to_gsn_weights_list) + 1:len(self.recurrent_to_gsn_weights_list) + 2], [self.W_ins_u])] [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.recurrent_to_gsn_weights_list) + 2:], [self.recurrent_bias])] params_to_load = 'top_gsn_params.pkl' log.maybeLog(self.logger, "\nLoading existing top level GSN parameters\n") loaded_params = cPickle.load(open(params_to_load, 'r')) [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[:len(self.top_weights_list)], self.top_weights_list)] [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.top_weights_list):], self.top_bias_list)] self.gsn_args = {'weights_list': self.weights_list, 'bias_list': self.bias_list, 'hidden_activation': self.hidden_activation, 'visible_activation': self.visible_activation, 'cost_function': self.cost_function, 'layers': self.gsn_layers, 'walkbacks': self.walkbacks, 'hidden_size': args.get('hidden_size', defaults['hidden_size']), 'learning_rate': args.get('learning_rate', defaults['learning_rate']), 'momentum': args.get('momentum', defaults['momentum']), 'annealing': self.annealing, 'noise_annealing': self.noise_annealing, 'batch_size': self.gsn_batch_size, 'n_epoch': self.n_epoch, 'early_stop_threshold': self.early_stop_threshold, 'early_stop_length': self.early_stop_length, 'save_frequency': self.save_frequency, 'noiseless_h1': self.noiseless_h1, 'hidden_add_noise_sigma': args.get('hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']), 'input_salt_and_pepper': args.get('input_salt_and_pepper', defaults['input_salt_and_pepper']), 'input_sampling': self.input_sampling, 'vis_init': self.vis_init, 'output_path': self.outdir + 'gsn/', 'is_image': self.is_image, 'input_size': self.N_input} self.top_gsn_args = {'weights_list': self.top_weights_list, 'bias_list': self.top_bias_list, 'hidden_activation': self.hidden_activation, 'visible_activation': self.recurrent_hidden_activation, 'cost_function': self.cost_function, 'layers': self.gsn_layers, 'walkbacks': self.walkbacks, 'hidden_size': args.get('hidden_size', defaults['hidden_size']),
'learning_rate': args.get('learning_rate', defaults['learning_rate']), 'momentum': args.get('momentum', defaults['momentum']), 'annealing': self.annealing, 'noise_annealing': self.noise_annealing, 'batch_size': self.gsn_batch_size, 'n_epoch': self.n_epoch, 'early_stop_threshold': self.early_stop_threshold, 'early_stop_length': self.early_stop_length, 'save_frequency': self.save_frequency, 'noiseless_h1': self.noiseless_h1, 'hidden_add_noise_sigma': args.get('hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']), 'input_salt_and_pepper': args.get('input_salt_and_pepper', defaults['input_salt_and_pepper']), 'input_sampling': self.input_sampling, 'vis_init': self.vis_init, 'output_path': self.outdir + 'top_gsn/', 'is_image': False, 'input_size': self.recurrent_hidden_size } ############ # Sampling # ############ # the input to the sampling function X_sample = T.fmatrix("X_sampling") self.network_state_input = [X_sample] + [ T.fmatrix("H_sampling_" + str(i + 1)) for i in range(self.gsn_layers) ] # "Output" state of the network (noisy) # initialized with input, then we apply updates self.network_state_output = [X_sample] + self.network_state_input[1:] visible_pX_chain = [] # ONE update log.maybeLog(self.logger, "Performing one walkback in network state sampling.") generative_stochastic_network.update_layers( self.network_state_output, self.weights_list, self.bias_list, visible_pX_chain, True, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.logger) ############################################## # Build the graphs for the SEN # ############################################## # If `x_t` is given, deterministic recurrence to compute the u_t. Otherwise, first generate def recurrent_step(x_t, u_tm1, add_noise): # Make current guess for hiddens based on U for i in range(self.gsn_layers): if i % 2 == 0: log.maybeLog( self.logger, "Using {0!s} and {1!s}".format( self.recurrent_to_gsn_weights_list[(i + 1) / 2], self.bias_list[i + 1])) h_t = T.concatenate([ self.hidden_activation(self.bias_list[i + 1] + T.dot( u_tm1, self.recurrent_to_gsn_weights_list[(i + 1) / 2])) for i in range(self.gsn_layers) if i % 2 == 0 ], axis=0) # Make a GSN to update U _, hs = generative_stochastic_network.build_gsn( x_t, self.weights_list, self.bias_list, add_noise, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.logger) htop_t = hs[-1] ins_t = htop_t ua_t = T.dot(ins_t, self.W_ins_u) + T.dot( u_tm1, self.W_u_u) + self.recurrent_bias u_t = self.recurrent_hidden_activation(ua_t) return [ua_t, u_t, h_t] log.maybeLog(self.logger, "\nCreating recurrent step scan.") # For training, the deterministic recurrence is used to compute all the # {h_t, 1 <= t <= T} given Xs. Conditional GSNs can then be trained # in batches using those parameters. 
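# The scan below emits, per time step, one wide vector h_t holding all even GSN
# layers' hiddens concatenated; the h_list loops that follow slice each layer's
# block of columns back out with a double transpose. The same slice in NumPy
# (hidden_size and the array are illustrative):
import numpy as np

hidden_size = 4
h_t = np.arange(16).reshape(2, 8)                  # (time, 2 even layers * hidden_size)
layer0 = h_t.T[0 * hidden_size:1 * hidden_size].T  # equivalent to h_t[:, 0:4]
layer2 = h_t.T[1 * hidden_size:2 * hidden_size].T  # equivalent to h_t[:, 4:8]
assert (layer0 == h_t[:, :4]).all() and (layer2 == h_t[:, 4:]).all()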
u0 = T.zeros((self.recurrent_hidden_size, )) # initial value for the RNN hidden units (ua, u, h_t), updates_recurrent = theano.scan( fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, True), sequences=self.Xs, outputs_info=[None, u0, None], non_sequences=self.params) log.maybeLog(self.logger, "Now for reconstruction sample without noise") (_, _, h_t_recon), updates_recurrent_recon = theano.scan( fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, False), sequences=self.Xs, outputs_info=[None, u0, None], non_sequences=self.params) # put together the hiddens list h_list = [T.zeros_like(self.Xs)] for layer, w in enumerate(self.weights_list): if layer % 2 != 0: h_list.append(T.zeros_like(T.dot(h_list[-1], w))) else: h_list.append( (h_t.T[(layer / 2) * self.hidden_size:(layer / 2 + 1) * self.hidden_size]).T) h_list_recon = [T.zeros_like(self.Xs)] for layer, w in enumerate(self.weights_list): if layer % 2 != 0: h_list_recon.append(T.zeros_like(T.dot(h_list_recon[-1], w))) else: h_list_recon.append( (h_t_recon.T[(layer / 2) * self.hidden_size:(layer / 2 + 1) * self.hidden_size]).T) #with noise _, cost, show_cost = generative_stochastic_network.build_gsn_given_hiddens( self.Xs, h_list, self.weights_list, self.bias_list, True, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function, self.logger) #without noise for reconstruction x_sample_recon, _, _ = generative_stochastic_network.build_gsn_given_hiddens( self.Xs, h_list_recon, self.weights_list, self.bias_list, False, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function, self.logger) updates_train = updates_recurrent updates_cost = updates_recurrent ############# # COSTS # ############# log.maybeLog(self.logger, '\nCost w.r.t p(X|...) at every step in the graph') start_functions_time = time.time() # if we are not using Hessian-free training create the normal sgd functions if not self.hessian_free: gradient = T.grad(cost, self.params) gradient_buffer = [ theano.shared( numpy.zeros(param.get_value().shape, dtype='float32')) for param in self.params ] m_gradient = [ self.momentum * gb + (cast32(1) - self.momentum) * g for (gb, g) in zip(gradient_buffer, gradient) ] param_updates = [(param, param - self.learning_rate * mg) for (param, mg) in zip(self.params, m_gradient)] gradient_buffer_updates = zip(gradient_buffer, m_gradient) updates = OrderedDict(param_updates + gradient_buffer_updates) updates_train.update(updates) log.maybeLog(self.logger, "rnn-gsn learn...") self.f_learn = theano.function(inputs=[self.Xs], updates=updates_train, outputs=show_cost, on_unused_input='warn', name='rnngsn_f_learn') log.maybeLog(self.logger, "rnn-gsn cost...") self.f_cost = theano.function(inputs=[self.Xs], updates=updates_cost, outputs=show_cost, on_unused_input='warn', name='rnngsn_f_cost') log.maybeLog(self.logger, "Training/cost functions done.") # Denoise some numbers : show number, noisy number, predicted number, reconstructed number log.maybeLog( self.logger, "Creating graph for noisy reconstruction function at checkpoints during training." 
) self.f_recon = theano.function(inputs=[self.Xs], outputs=x_sample_recon[-1], updates=updates_recurrent_recon, name='rnngsn_f_recon') # a function to add salt and pepper noise self.f_noise = theano.function(inputs=[self.X], outputs=salt_and_pepper( self.X, self.input_salt_and_pepper), name='rnngsn_f_noise') # Sampling functions log.maybeLog(self.logger, "Creating sampling function...") if self.gsn_layers == 1: self.f_sample = theano.function( inputs=[X_sample], outputs=visible_pX_chain[-1], name='rnngsn_f_sample_single_layer') else: # WHY IS THERE A WARNING???? # because the first odd layers are not used -> directly computed FROM THE EVEN layers # unused input = warn self.f_sample = theano.function(inputs=self.network_state_input, outputs=self.network_state_output + visible_pX_chain, on_unused_input='warn', name='rnngsn_f_sample') log.maybeLog(self.logger, "Done compiling all functions.") compilation_time = time.time() - start_functions_time # Show the compile time with appropriate easy-to-read units. log.maybeLog( self.logger, "Total compilation time took " + make_time_units_string(compilation_time) + ".\n\n")
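# The parameter loading in this constructor assumes each .pkl holds one flat
# list ordered [weights..., biases...] (and the RNN file [W_u_h..., W_u_u,
# W_ins_u, b_u]), sliced apart by list lengths. A sketch of a matching
# save/load convention on plain arrays (pickle here stands in for the cPickle
# used above; filenames and helpers are illustrative):
import pickle

def save_flat(filename, weights, biases):
    with open(filename, 'wb') as f:
        pickle.dump(weights + biases, f)  # one flat list, weights first

def load_flat(filename, n_weights):
    with open(filename, 'rb') as f:
        flat = pickle.load(f)
    return flat[:n_weights], flat[n_weights:]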
def __init__(self, train_X=None, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None, args=None, logger=None):
    # Output logger
    self.logger = logger
    self.outdir = args.get("output_path", defaults["output_path"])
    if self.outdir[-1] != '/':
        self.outdir = self.outdir + '/'
    data.mkdir_p(self.outdir)

    # Configuration
    config_filename = self.outdir + 'config'
    log.maybeLog(self.logger, 'Saving config')
    with open(config_filename, 'w') as f:
        f.write(str(args))

    # Input data - make sure it is a list of shared datasets if it isn't.
    # (raise_to_list keeps None as None, so missing datasets are fine.)
    self.train_X = raise_to_list(train_X)
    self.train_Y = raise_to_list(train_Y)
    self.valid_X = raise_to_list(valid_X)
    self.valid_Y = raise_to_list(valid_Y)
    self.test_X = raise_to_list(test_X)
    self.test_Y = raise_to_list(test_Y)

    # variables from the dataset that are used for initialization and image reconstruction
    if self.train_X is None:
        self.N_input = args.get("input_size")
        if args.get("input_size") is None:
            raise AssertionError("Please either specify input_size in the arguments or provide an example train_X for input dimensionality.")
    else:
        self.N_input = self.train_X[0].get_value(borrow=True).shape[1]

    self.is_image = args.get('is_image', defaults['is_image'])
    if self.is_image:
        (_h, _w) = closest_to_square_factors(self.N_input)
        self.image_width = args.get('width', _w)
        self.image_height = args.get('height', _h)

    #######################################
    # Network and training specifications #
    #######################################
    self.layers = args.get('layers', defaults['layers'])  # number of hidden layers
    self.walkbacks = args.get('walkbacks', defaults['walkbacks'])  # number of walkbacks
    self.learning_rate = theano.shared(cast32(args.get('learning_rate', defaults['learning_rate'])))  # learning rate
    self.init_learn_rate = cast32(args.get('learning_rate', defaults['learning_rate']))
    self.momentum = theano.shared(cast32(args.get('momentum', defaults['momentum'])))  # momentum term
    self.annealing = cast32(args.get('annealing', defaults['annealing']))  # exponential annealing coefficient
    self.noise_annealing = cast32(args.get('noise_annealing', defaults['noise_annealing']))  # exponential noise annealing coefficient
    self.batch_size = args.get('batch_size', defaults['batch_size'])
    self.gsn_batch_size = args.get('gsn_batch_size', defaults['gsn_batch_size'])
    self.n_epoch = args.get('n_epoch', defaults['n_epoch'])
    self.early_stop_threshold = args.get('early_stop_threshold', defaults['early_stop_threshold'])
    self.early_stop_length = args.get('early_stop_length', defaults['early_stop_length'])
    self.save_frequency = args.get('save_frequency', defaults['save_frequency'])

    self.noiseless_h1 = args.get('noiseless_h1', defaults["noiseless_h1"])
    self.hidden_add_noise_sigma = theano.shared(cast32(args.get('hidden_add_noise_sigma', defaults["hidden_add_noise_sigma"])))
    self.input_salt_and_pepper = theano.shared(cast32(args.get('input_salt_and_pepper', defaults["input_salt_and_pepper"])))
    self.input_sampling = args.get('input_sampling', defaults["input_sampling"])
    self.vis_init = args.get('vis_init', defaults['vis_init'])
    self.initialize_gsn = args.get('initialize_gsn', defaults['initialize_gsn'])
    self.hessian_free = args.get('hessian_free', defaults['hessian_free'])

    self.hidden_size = args.get('hidden_size', defaults['hidden_size'])
    self.layer_sizes = [self.N_input] + [self.hidden_size] * self.layers  # layer sizes, from h0 to hK (h0 is the visible layer)
    self.recurrent_hidden_size = args.get('recurrent_hidden_size', defaults['recurrent_hidden_size'])
    self.f_recon = None
    self.f_noise = None

    # Activation functions!
    # For the GSN:
    if args.get('hidden_activation') is not None:
        log.maybeLog(self.logger, 'Using specified activation for GSN hiddens')
        self.hidden_activation = args.get('hidden_activation')
    elif args.get('hidden_act') is not None:
        self.hidden_activation = get_activation_function(args.get('hidden_act'))
        log.maybeLog(self.logger, 'Using {0!s} activation for GSN hiddens'.format(args.get('hidden_act')))
    else:
        log.maybeLog(self.logger, "Using default activation for GSN hiddens")
        self.hidden_activation = defaults['hidden_activation']

    # For the RNN:
    if args.get('recurrent_hidden_activation') is not None:
        log.maybeLog(self.logger, 'Using specified activation for RNN hiddens')
        self.recurrent_hidden_activation = args.get('recurrent_hidden_activation')
    elif args.get('recurrent_hidden_act') is not None:
        self.recurrent_hidden_activation = get_activation_function(args.get('recurrent_hidden_act'))
        log.maybeLog(self.logger, 'Using {0!s} activation for RNN hiddens'.format(args.get('recurrent_hidden_act')))
    else:
        log.maybeLog(self.logger, "Using default activation for RNN hiddens")
        self.recurrent_hidden_activation = defaults['recurrent_hidden_activation']

    # Visible layer activation
    if args.get('visible_activation') is not None:
        log.maybeLog(self.logger, 'Using specified activation for visible layer')
        self.visible_activation = args.get('visible_activation')
    elif args.get('visible_act') is not None:
        self.visible_activation = get_activation_function(args.get('visible_act'))
        log.maybeLog(self.logger, 'Using {0!s} activation for visible layer'.format(args.get('visible_act')))
    else:
        log.maybeLog(self.logger, 'Using default activation for visible layer')
        self.visible_activation = defaults['visible_activation']

    # Cost function!
    if args.get('cost_function') is not None:
        log.maybeLog(self.logger, '\nUsing specified cost function for GSN training\n')
        self.cost_function = args.get('cost_function')
    elif args.get('cost_funct') is not None:
        self.cost_function = get_cost_function(args.get('cost_funct'))
        log.maybeLog(self.logger, 'Using {0!s} for cost function'.format(args.get('cost_funct')))
    else:
        log.maybeLog(self.logger, '\nUsing default cost function for GSN training\n')
        self.cost_function = defaults['cost_function']

    ############################
    # Theano variables and RNG #
    ############################
    self.X = T.fmatrix('X')    # single (batch) for training the gsn
    self.Xs = T.fmatrix('Xs')  # sequence for training the rnn-gsn
    self.MRG = RNG_MRG.MRG_RandomStreams(1)

    ###############
    # Parameters! #
    ###############
    # gsn
    self.weights_list = [get_shared_weights(self.layer_sizes[i], self.layer_sizes[i+1], name="W_{0!s}_{1!s}".format(i, i+1)) for i in range(self.layers)]  # initialize each layer to a uniform sample from +/- sqrt(6. / (n_in + n_out))
    self.bias_list = [get_shared_bias(self.layer_sizes[i], name='b_' + str(i)) for i in range(self.layers + 1)]  # initialize each layer to 0's.
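    # The "uniform sample from sqrt(6. / (n_in + n_out))" comment above is the
    # usual Glorot-style interval; get_shared_weights presumably does something
    # along these lines (an illustrative sketch only, not the actual helper):
    #     interval = numpy.sqrt(6. / (n_in + n_out))
    #     values = numpy.asarray(rng.uniform(low=-interval, high=interval, size=(n_in, n_out)), dtype='float32')
    #     return theano.shared(value=values, name=name)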
    # recurrent
    self.recurrent_to_gsn_weights_list = [get_shared_weights(self.recurrent_hidden_size, self.layer_sizes[layer], name="W_u_h{0!s}".format(layer)) for layer in range(self.layers + 1) if layer % 2 != 0]
    self.W_u_u = get_shared_weights(self.recurrent_hidden_size, self.recurrent_hidden_size, name="W_u_u")
    self.W_x_u = get_shared_weights(self.N_input, self.recurrent_hidden_size, name="W_x_u")
    self.recurrent_bias = get_shared_bias(self.recurrent_hidden_size, name='b_u')

    # lists for use with gradients
    self.gsn_params = self.weights_list + self.bias_list
    self.u_params = [self.W_u_u, self.W_x_u, self.recurrent_bias]
    self.params = self.gsn_params + self.recurrent_to_gsn_weights_list + self.u_params

    ###########################################################
    #              load initial parameters of gsn             #
    ###########################################################
    self.train_gsn_first = False
    if self.initialize_gsn:
        params_to_load = 'gsn_params.pkl'
        if not os.path.isfile(params_to_load):
            self.train_gsn_first = True
        else:
            log.maybeLog(self.logger, "\nLoading existing GSN parameters\n")
            loaded_params = cPickle.load(open(params_to_load, 'r'))
            [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[:len(self.weights_list)], self.weights_list)]
            [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.weights_list):], self.bias_list)]

        self.gsn_args = {'weights_list': self.weights_list,
                         'bias_list': self.bias_list,
                         'hidden_activation': self.hidden_activation,
                         'visible_activation': self.visible_activation,
                         'cost_function': self.cost_function,
                         'layers': self.layers,
                         'walkbacks': self.walkbacks,
                         'hidden_size': self.hidden_size,
                         'learning_rate': args.get('learning_rate', defaults['learning_rate']),
                         'momentum': args.get('momentum', defaults['momentum']),
                         'annealing': self.annealing,
                         'noise_annealing': self.noise_annealing,
                         'batch_size': self.gsn_batch_size,
                         'n_epoch': self.n_epoch,
                         'early_stop_threshold': self.early_stop_threshold,
                         'early_stop_length': self.early_stop_length,
                         'save_frequency': self.save_frequency,
                         'noiseless_h1': self.noiseless_h1,
                         'hidden_add_noise_sigma': args.get('hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']),
                         'input_salt_and_pepper': args.get('input_salt_and_pepper', defaults['input_salt_and_pepper']),
                         'input_sampling': self.input_sampling,
                         'vis_init': self.vis_init,
                         'output_path': self.outdir + 'gsn/',
                         'is_image': self.is_image,
                         'input_size': self.N_input}

    ############
    # Sampling #
    ############
    # the input to the sampling function
    X_sample = T.fmatrix("X_sampling")
    self.network_state_input = [X_sample] + [T.fmatrix("H_sampling_" + str(i+1)) for i in range(self.layers)]

    # "Output" state of the network (noisy),
    # initialized with the input; then we apply updates
    self.network_state_output = [X_sample] + self.network_state_input[1:]
    visible_pX_chain = []

    # ONE update
    _add_noise = True
    log.maybeLog(self.logger, "Performing one walkback in network state sampling.")
    GSN.update_layers(self.network_state_output, self.weights_list, self.bias_list, visible_pX_chain, _add_noise, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.logger)

    ########################################
    #   Build the graphs for the RNN-GSN   #
    ########################################
    # If `x_t` is given, use the deterministic recurrence to compute u_t;
    # otherwise this would be a generation step (not implemented here).
    def recurrent_step(x_t, u_tm1, add_noise):
        # Make the current guess for the hiddens based on U
        for i in range(self.layers):
            if i % 2 == 0:
                log.maybeLog(self.logger, "Using {0!s} and {1!s}".format(self.recurrent_to_gsn_weights_list[(i+1)/2], self.bias_list[i+1]))
        h_t = T.concatenate([self.hidden_activation(self.bias_list[i+1] + T.dot(u_tm1, self.recurrent_to_gsn_weights_list[(i+1)/2])) for i in range(self.layers) if i % 2 == 0], axis=0)

        generate = x_t is None
        if generate:
            pass  # generation from the prior is not implemented in this version

        # Make a GSN to update U
        # chain, hs = gsn.build_gsn(x_t, weights_list, bias_list, add_noise, state.noiseless_h1, state.hidden_add_noise_sigma, state.input_salt_and_pepper, state.input_sampling, MRG, visible_activation, hidden_activation, walkbacks, logger)
        # htop_t = hs[-1]
        # denoised_x_t = chain[-1]

        # Update U
        # ua_t = T.dot(denoised_x_t, W_x_u) + T.dot(htop_t, W_h_u) + T.dot(u_tm1, W_u_u) + recurrent_bias
        ua_t = T.dot(x_t, self.W_x_u) + T.dot(u_tm1, self.W_u_u) + self.recurrent_bias
        u_t = self.recurrent_hidden_activation(ua_t)
        # inside theano.scan, x_t always comes from `sequences`, so generate is False
        return None if generate else [ua_t, u_t, h_t]

    log.maybeLog(self.logger, "\nCreating recurrent step scan.")
    # For training, the deterministic recurrence is used to compute all the
    # {h_t, 1 <= t <= T} given Xs. Conditional GSNs can then be trained
    # in batches using those parameters.
    u0 = T.zeros((self.recurrent_hidden_size,))  # initial value for the RNN hidden units
    (ua, u, h_t), updates_recurrent = theano.scan(fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, True),
                                                  sequences=self.Xs,
                                                  outputs_info=[None, u0, None],
                                                  non_sequences=self.params)

    log.maybeLog(self.logger, "Now for reconstruction sample without noise")
    (_, _, h_t_recon), updates_recurrent_recon = theano.scan(fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, False),
                                                             sequences=self.Xs,
                                                             outputs_info=[None, u0, None],
                                                             non_sequences=self.params)

    # put together the hiddens list: h_t stacks the even layers' hiddens along
    # the feature axis, so slice each even layer back out; odd layers get zeros
    h_list = [T.zeros_like(self.Xs)]
    for layer, w in enumerate(self.weights_list):
        if layer % 2 != 0:
            h_list.append(T.zeros_like(T.dot(h_list[-1], w)))
        else:
            h_list.append((h_t.T[(layer/2)*self.hidden_size:(layer/2+1)*self.hidden_size]).T)

    h_list_recon = [T.zeros_like(self.Xs)]
    for layer, w in enumerate(self.weights_list):
        if layer % 2 != 0:
            h_list_recon.append(T.zeros_like(T.dot(h_list_recon[-1], w)))
        else:
            h_list_recon.append((h_t_recon.T[(layer/2)*self.hidden_size:(layer/2+1)*self.hidden_size]).T)

    # with noise
    _, _, cost, show_cost, error = GSN.build_gsn_given_hiddens(self.Xs, h_list, self.weights_list, self.bias_list, True, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function)

    # without noise for reconstruction
    x_sample_recon, _, _, recon_show_cost, _ = GSN.build_gsn_given_hiddens(self.Xs, h_list_recon, self.weights_list, self.bias_list, False, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function)

    updates_train = updates_recurrent
    updates_cost = updates_recurrent

    #############
    #   COSTS   #
    #############
    log.maybeLog(self.logger, '\nCost w.r.t p(X|...) at every step in the graph')
    start_functions_time = time.time()

    # if we are not using Hessian-free training, create the normal SGD functions
    if not self.hessian_free:
        gradient = T.grad(cost, self.params)
        gradient_buffer = [theano.shared(numpy.zeros(param.get_value().shape, dtype='float32')) for param in self.params]
        m_gradient = [self.momentum * gb + (cast32(1) - self.momentum) * g for (gb, g) in zip(gradient_buffer, gradient)]
        param_updates = [(param, param - self.learning_rate * mg) for (param, mg) in zip(self.params, m_gradient)]
        gradient_buffer_updates = zip(gradient_buffer, m_gradient)

        updates = OrderedDict(param_updates + gradient_buffer_updates)
        updates_train.update(updates)

        log.maybeLog(self.logger, "rnn-gsn learn...")
        self.f_learn = theano.function(inputs=[self.Xs],
                                       updates=updates_train,
                                       outputs=[show_cost, error],
                                       on_unused_input='warn',
                                       name='rnngsn_f_learn')

        log.maybeLog(self.logger, "rnn-gsn cost...")
        self.f_cost = theano.function(inputs=[self.Xs],
                                      updates=updates_cost,
                                      outputs=[show_cost, error],
                                      on_unused_input='warn',
                                      name='rnngsn_f_cost')

    log.maybeLog(self.logger, "Training/cost functions done.")

    # Denoise some numbers: show number, noisy number, predicted number, reconstructed number
    log.maybeLog(self.logger, "Creating graph for noisy reconstruction function at checkpoints during training.")
    self.f_recon = theano.function(inputs=[self.Xs],
                                   outputs=[x_sample_recon[-1], recon_show_cost],
                                   name='rnngsn_f_recon')

    # a function to add salt and pepper noise
    self.f_noise = theano.function(inputs=[self.X],
                                   outputs=salt_and_pepper(self.X, self.input_salt_and_pepper),
                                   name='rnngsn_f_noise')

    # Sampling functions
    log.maybeLog(self.logger, "Creating sampling function...")
    if self.layers == 1:
        self.f_sample = theano.function(inputs=[X_sample],
                                        outputs=visible_pX_chain[-1],
                                        name='rnngsn_f_sample_single_layer')
    else:
        self.f_sample = theano.function(inputs=self.network_state_input,
                                        outputs=self.network_state_output + visible_pX_chain,
                                        on_unused_input='warn',
                                        name='rnngsn_f_sample')

    log.maybeLog(self.logger, "Done compiling all functions.")
    compilation_time = time.time() - start_functions_time
    # Show the compile time with appropriate easy-to-read units.
    log.maybeLog(self.logger, "Total compilation time took " + make_time_units_string(compilation_time) + ".\n\n")
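# A minimal numpy-style sketch of the momentum-SGD rule compiled above; the
# function name and argument defaults here are illustrative, not part of the
# original module. The gradient buffer keeps an exponential moving average of
# the gradient, and the parameters step against that average:
#     buffer <- momentum * buffer + (1 - momentum) * grad
#     param  <- param - learning_rate * buffer
def momentum_sgd_step_example(param, grad, buf, learning_rate=0.25, momentum=0.5):
    m_grad = momentum * buf + (1.0 - momentum) * grad  # mirrors m_gradient above
    new_param = param - learning_rate * m_grad         # mirrors param_updates above
    return new_param, m_grad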
def __init__(self, train_X=None, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None, args=None, logger=None):
    # Output logger
    self.logger = logger
    self.outdir = args.get("output_path", defaults["output_path"])
    if self.outdir[-1] != '/':
        self.outdir = self.outdir + '/'

    # Input data
    self.train_X = train_X
    self.train_Y = train_Y
    self.valid_X = valid_X
    self.valid_Y = valid_Y
    self.test_X = test_X
    self.test_Y = test_Y

    # variables from the dataset that are used for initialization and image reconstruction
    if train_X is None:
        self.N_input = args.get("input_size")
        if args.get("input_size") is None:
            raise AssertionError("Please either specify input_size in the arguments or provide an example train_X for input dimensionality.")
    else:
        self.N_input = train_X.eval().shape[1]
    self.root_N_input = int(numpy.sqrt(self.N_input))  # cast to int so it can be used directly as an image dimension

    self.is_image = args.get('is_image', defaults['is_image'])
    if self.is_image:
        self.image_width = args.get('width', self.root_N_input)
        self.image_height = args.get('height', self.root_N_input)

    #######################################
    # Network and training specifications #
    #######################################
    self.gsn_layers = args.get('gsn_layers', defaults['gsn_layers'])  # number of hidden layers
    self.walkbacks = args.get('walkbacks', defaults['walkbacks'])  # number of walkbacks
    self.learning_rate = theano.shared(cast32(args.get('learning_rate', defaults['learning_rate'])))  # learning rate
    self.init_learn_rate = cast32(args.get('learning_rate', defaults['learning_rate']))
    self.momentum = theano.shared(cast32(args.get('momentum', defaults['momentum'])))  # momentum term
    self.annealing = cast32(args.get('annealing', defaults['annealing']))  # exponential annealing coefficient
    self.noise_annealing = cast32(args.get('noise_annealing', defaults['noise_annealing']))  # exponential noise annealing coefficient
    self.batch_size = args.get('batch_size', defaults['batch_size'])
    self.gsn_batch_size = args.get('gsn_batch_size', defaults['gsn_batch_size'])
    self.n_epoch = args.get('n_epoch', defaults['n_epoch'])
    self.early_stop_threshold = args.get('early_stop_threshold', defaults['early_stop_threshold'])
    self.early_stop_length = args.get('early_stop_length', defaults['early_stop_length'])
    self.save_frequency = args.get('save_frequency', defaults['save_frequency'])

    self.noiseless_h1 = args.get('noiseless_h1', defaults["noiseless_h1"])
    self.hidden_add_noise_sigma = theano.shared(cast32(args.get('hidden_add_noise_sigma', defaults["hidden_add_noise_sigma"])))
    self.input_salt_and_pepper = theano.shared(cast32(args.get('input_salt_and_pepper', defaults["input_salt_and_pepper"])))
    self.input_sampling = args.get('input_sampling', defaults["input_sampling"])
    self.vis_init = args.get('vis_init', defaults['vis_init'])
    self.load_params = args.get('load_params', defaults['load_params'])
    self.hessian_free = args.get('hessian_free', defaults['hessian_free'])

    self.hidden_size = args.get('hidden_size', defaults['hidden_size'])  # needed below when slicing h_t back into layers
    self.layer_sizes = [self.N_input] + [self.hidden_size] * self.gsn_layers  # layer sizes of the visible GSN, from h0 to hK (h0 is the visible layer)
    self.recurrent_hidden_size = args.get('recurrent_hidden_size', defaults['recurrent_hidden_size'])
    self.top_layer_sizes = [self.recurrent_hidden_size] + [self.hidden_size] * self.gsn_layers  # layer sizes of the top GSN, whose visible layer is the RNN hidden state u

    self.f_recon = None
    self.f_noise = None
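    # Illustrative shapes, assuming MNIST input (N_input = 784), hidden_size = 1500,
    # recurrent_hidden_size = 100, and gsn_layers = 3:
    #     layer_sizes     = [784, 1500, 1500, 1500]   # visible GSN, h0 = the image
    #     top_layer_sizes = [100, 1500, 1500, 1500]   # top GSN, h0 = the RNN state u
    # Note that root_N_input only gives a sensible default width/height when the
    # input really is a square image.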
    # Activation functions!
    # For the GSN:
    if args.get('hidden_activation') is not None:
        log.maybeLog(self.logger, 'Using specified activation for GSN hiddens')
        self.hidden_activation = args.get('hidden_activation')
    elif args.get('hidden_act') == 'sigmoid':
        log.maybeLog(self.logger, 'Using sigmoid activation for GSN hiddens')
        self.hidden_activation = T.nnet.sigmoid
    elif args.get('hidden_act') == 'rectifier':
        log.maybeLog(self.logger, 'Using rectifier activation for GSN hiddens')
        self.hidden_activation = lambda x: T.maximum(cast32(0), x)
    elif args.get('hidden_act') == 'tanh':
        log.maybeLog(self.logger, 'Using hyperbolic tangent activation for GSN hiddens')
        self.hidden_activation = lambda x: T.tanh(x)
    elif args.get('hidden_act') is not None:
        log.maybeLog(self.logger, "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for GSN hiddens".format(args.get('hidden_act')))
        raise NotImplementedError("Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for GSN hiddens".format(args.get('hidden_act')))
    else:
        log.maybeLog(self.logger, "Using default activation for GSN hiddens")
        self.hidden_activation = defaults['hidden_activation']

    # For the RNN:
    if args.get('recurrent_hidden_activation') is not None:
        log.maybeLog(self.logger, 'Using specified activation for RNN hiddens')
        self.recurrent_hidden_activation = args.get('recurrent_hidden_activation')
    elif args.get('recurrent_hidden_act') == 'sigmoid':
        log.maybeLog(self.logger, 'Using sigmoid activation for RNN hiddens')
        self.recurrent_hidden_activation = T.nnet.sigmoid
    elif args.get('recurrent_hidden_act') == 'rectifier':
        log.maybeLog(self.logger, 'Using rectifier activation for RNN hiddens')
        self.recurrent_hidden_activation = lambda x: T.maximum(cast32(0), x)
    elif args.get('recurrent_hidden_act') == 'tanh':
        log.maybeLog(self.logger, 'Using hyperbolic tangent activation for RNN hiddens')
        self.recurrent_hidden_activation = lambda x: T.tanh(x)
    elif args.get('recurrent_hidden_act') is not None:
        log.maybeLog(self.logger, "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for RNN hiddens".format(args.get('recurrent_hidden_act')))
        raise NotImplementedError("Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid for RNN hiddens".format(args.get('recurrent_hidden_act')))
    else:
        log.maybeLog(self.logger, "Using default activation for RNN hiddens")
        self.recurrent_hidden_activation = defaults['recurrent_hidden_activation']

    # Visible layer activation
    if args.get('visible_activation') is not None:
        log.maybeLog(self.logger, 'Using specified activation for visible layer')
        self.visible_activation = args.get('visible_activation')
    elif args.get('visible_act') == 'sigmoid':
        log.maybeLog(self.logger, 'Using sigmoid activation for visible layer')
        self.visible_activation = T.nnet.sigmoid
    elif args.get('visible_act') == 'softmax':
        log.maybeLog(self.logger, 'Using softmax activation for visible layer')
        self.visible_activation = T.nnet.softmax
    elif args.get('visible_act') is not None:
        log.maybeLog(self.logger, "Did not recognize visible activation {0!s}, please use sigmoid or softmax".format(args.get('visible_act')))
        raise NotImplementedError("Did not recognize visible activation {0!s}, please use sigmoid or softmax".format(args.get('visible_act')))
    else:
        log.maybeLog(self.logger, 'Using default activation for visible layer')
        self.visible_activation = defaults['visible_activation']
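    # The branches above amount to a small lookup table; a hypothetical
    # equivalent (mirroring the get_activation_function helper used by the
    # other constructor variant) would look roughly like:
    #     _acts = {'sigmoid': T.nnet.sigmoid,
    #              'rectifier': lambda x: T.maximum(cast32(0), x),
    #              'tanh': T.tanh}
    #     self.hidden_activation = _acts[args.get('hidden_act')]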
    # Cost function!
    if args.get('cost_function') is not None:
        log.maybeLog(self.logger, '\nUsing specified cost function for GSN training\n')
        self.cost_function = args.get('cost_function')
    elif args.get('cost_funct') == 'binary_crossentropy':
        log.maybeLog(self.logger, '\nUsing binary cross-entropy cost!\n')
        self.cost_function = lambda x, y: T.mean(T.nnet.binary_crossentropy(x, y))
    elif args.get('cost_funct') == 'square':
        log.maybeLog(self.logger, "\nUsing square error cost!\n")
        # note this is log-sum-of-squares, not mean squared error:
        # cost_function = lambda x, y: T.log(T.mean(T.sqr(x - y)))
        self.cost_function = lambda x, y: T.log(T.sum(T.pow((x - y), 2)))
    elif args.get('cost_funct') is not None:
        log.maybeLog(self.logger, "\nDid not recognize cost function {0!s}, please use binary_crossentropy or square\n".format(args.get('cost_funct')))
        raise NotImplementedError("Did not recognize cost function {0!s}, please use binary_crossentropy or square".format(args.get('cost_funct')))
    else:
        log.maybeLog(self.logger, '\nUsing default cost function for GSN training\n')
        self.cost_function = defaults['cost_function']

    ############################
    # Theano variables and RNG #
    ############################
    self.X = T.fmatrix('X')    # single (batch) for training the gsn
    self.Xs = T.fmatrix('Xs')  # sequence for training the rnn
    self.MRG = RNG_MRG.MRG_RandomStreams(1)

    ###############
    # Parameters! #
    ###############
    # visible gsn
    self.weights_list = [get_shared_weights(self.layer_sizes[i], self.layer_sizes[i+1], name="W_{0!s}_{1!s}".format(i, i+1)) for i in range(self.gsn_layers)]  # initialize each layer to a uniform sample from +/- sqrt(6. / (n_in + n_out))
    self.bias_list = [get_shared_bias(self.layer_sizes[i], name='b_' + str(i)) for i in range(self.gsn_layers + 1)]  # initialize each layer to 0's.

    # recurrent
    self.recurrent_to_gsn_weights_list = [get_shared_weights(self.recurrent_hidden_size, self.layer_sizes[layer], name="W_u_h{0!s}".format(layer)) for layer in range(self.gsn_layers + 1) if layer % 2 != 0]
    self.W_u_u = get_shared_weights(self.recurrent_hidden_size, self.recurrent_hidden_size, name="W_u_u")
    self.W_ins_u = get_shared_weights(self.hidden_size, self.recurrent_hidden_size, name="W_ins_u")
    self.recurrent_bias = get_shared_bias(self.recurrent_hidden_size, name='b_u')

    # top layer gsn
    self.top_weights_list = [get_shared_weights(self.top_layer_sizes[i], self.top_layer_sizes[i+1], name="Wtop_{0!s}_{1!s}".format(i, i+1)) for i in range(self.gsn_layers)]  # initialize each layer to a uniform sample from +/- sqrt(6. / (n_in + n_out))
    self.top_bias_list = [get_shared_bias(self.top_layer_sizes[i], name='btop_' + str(i)) for i in range(self.gsn_layers + 1)]  # initialize each layer to 0's.
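    # Parameter shapes at this point, writing N = N_input, H = hidden_size,
    # R = recurrent_hidden_size, K = gsn_layers:
    #     weights_list[i]               : layer_sizes[i] x layer_sizes[i+1]  (K matrices)
    #     bias_list[i]                  : layer_sizes[i]                     (K+1 vectors)
    #     recurrent_to_gsn_weights_list : R x H, one per odd GSN layer
    #     W_u_u : R x R,   W_ins_u : H x R,   recurrent_bias : R
    #     top_weights_list[i]           : top_layer_sizes[i] x top_layer_sizes[i+1]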
    # lists for use with gradients
    self.gsn_params = self.weights_list + self.bias_list
    self.u_params = [self.W_u_u, self.W_ins_u, self.recurrent_bias]
    self.top_params = self.top_weights_list + self.top_bias_list
    self.params = self.gsn_params + self.recurrent_to_gsn_weights_list + self.u_params + self.top_params

    ###################################################
    #             load initial parameters             #
    ###################################################
    if self.load_params:
        params_to_load = 'gsn_params.pkl'
        log.maybeLog(self.logger, "\nLoading existing GSN parameters\n")
        loaded_params = cPickle.load(open(params_to_load, 'r'))
        [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[:len(self.weights_list)], self.weights_list)]
        [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.weights_list):], self.bias_list)]

        params_to_load = 'rnn_params.pkl'
        log.maybeLog(self.logger, "\nLoading existing RNN parameters\n")
        loaded_params = cPickle.load(open(params_to_load, 'r'))
        [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[:len(self.recurrent_to_gsn_weights_list)], self.recurrent_to_gsn_weights_list)]
        # W_u_u, W_ins_u, and recurrent_bias are single shared variables, so wrap them in lists for zip
        [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.recurrent_to_gsn_weights_list):len(self.recurrent_to_gsn_weights_list)+1], [self.W_u_u])]
        [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.recurrent_to_gsn_weights_list)+1:len(self.recurrent_to_gsn_weights_list)+2], [self.W_ins_u])]
        [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.recurrent_to_gsn_weights_list)+2:], [self.recurrent_bias])]

        params_to_load = 'top_gsn_params.pkl'
        log.maybeLog(self.logger, "\nLoading existing top level GSN parameters\n")
        loaded_params = cPickle.load(open(params_to_load, 'r'))
        [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[:len(self.top_weights_list)], self.top_weights_list)]
        [p.set_value(lp.get_value(borrow=False)) for lp, p in zip(loaded_params[len(self.top_weights_list):], self.top_bias_list)]

    self.gsn_args = {'weights_list': self.weights_list,
                     'bias_list': self.bias_list,
                     'hidden_activation': self.hidden_activation,
                     'visible_activation': self.visible_activation,
                     'cost_function': self.cost_function,
                     'layers': self.gsn_layers,
                     'walkbacks': self.walkbacks,
                     'hidden_size': self.hidden_size,
                     'learning_rate': args.get('learning_rate', defaults['learning_rate']),
                     'momentum': args.get('momentum', defaults['momentum']),
                     'annealing': self.annealing,
                     'noise_annealing': self.noise_annealing,
                     'batch_size': self.gsn_batch_size,
                     'n_epoch': self.n_epoch,
                     'early_stop_threshold': self.early_stop_threshold,
                     'early_stop_length': self.early_stop_length,
                     'save_frequency': self.save_frequency,
                     'noiseless_h1': self.noiseless_h1,
                     'hidden_add_noise_sigma': args.get('hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']),
                     'input_salt_and_pepper': args.get('input_salt_and_pepper', defaults['input_salt_and_pepper']),
                     'input_sampling': self.input_sampling,
                     'vis_init': self.vis_init,
                     'output_path': self.outdir + 'gsn/',
                     'is_image': self.is_image,
                     'input_size': self.N_input}

    self.top_gsn_args = {'weights_list': self.top_weights_list,
                         'bias_list': self.top_bias_list,
                         'hidden_activation': self.hidden_activation,
                         'visible_activation': self.recurrent_hidden_activation,
                         'cost_function': self.cost_function,
                         'layers': self.gsn_layers,
                         'walkbacks': self.walkbacks,
                         'hidden_size': self.hidden_size,
                         'learning_rate': args.get('learning_rate', defaults['learning_rate']),
                         'momentum': args.get('momentum', defaults['momentum']),
                         'annealing': self.annealing,
                         'noise_annealing': self.noise_annealing,
                         'batch_size': self.gsn_batch_size,
                         'n_epoch': self.n_epoch,
                         'early_stop_threshold': self.early_stop_threshold,
                         'early_stop_length': self.early_stop_length,
                         'save_frequency': self.save_frequency,
                         'noiseless_h1': self.noiseless_h1,
                         'hidden_add_noise_sigma': args.get('hidden_add_noise_sigma', defaults['hidden_add_noise_sigma']),
                         'input_salt_and_pepper': args.get('input_salt_and_pepper', defaults['input_salt_and_pepper']),
                         'input_sampling': self.input_sampling,
                         'vis_init': self.vis_init,
                         'output_path': self.outdir + 'top_gsn/',
                         'is_image': False,
                         'input_size': self.recurrent_hidden_size}

    ############
    # Sampling #
    ############
    # the input to the sampling function
    X_sample = T.fmatrix("X_sampling")
    self.network_state_input = [X_sample] + [T.fmatrix("H_sampling_" + str(i+1)) for i in range(self.gsn_layers)]

    # "Output" state of the network (noisy),
    # initialized with the input; then we apply updates
    self.network_state_output = [X_sample] + self.network_state_input[1:]
    visible_pX_chain = []

    # ONE update
    log.maybeLog(self.logger, "Performing one walkback in network state sampling.")
    generative_stochastic_network.update_layers(self.network_state_output, self.weights_list, self.bias_list, visible_pX_chain, True, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.logger)

    ##############################################
    #         Build the graphs for the SEN       #
    ##############################################
    # Given x_t, use the deterministic recurrence to compute u_t.
    def recurrent_step(x_t, u_tm1, add_noise):
        # Make the current guess for the hiddens based on U
        for i in range(self.gsn_layers):
            if i % 2 == 0:
                log.maybeLog(self.logger, "Using {0!s} and {1!s}".format(self.recurrent_to_gsn_weights_list[(i+1)/2], self.bias_list[i+1]))
        h_t = T.concatenate([self.hidden_activation(self.bias_list[i+1] + T.dot(u_tm1, self.recurrent_to_gsn_weights_list[(i+1)/2])) for i in range(self.gsn_layers) if i % 2 == 0], axis=0)

        # Make a GSN to update U
        _, hs = generative_stochastic_network.build_gsn(x_t, self.weights_list, self.bias_list, add_noise, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.logger)
        htop_t = hs[-1]
        ins_t = htop_t

        # Update U
        ua_t = T.dot(ins_t, self.W_ins_u) + T.dot(u_tm1, self.W_u_u) + self.recurrent_bias
        u_t = self.recurrent_hidden_activation(ua_t)
        return [ua_t, u_t, h_t]

    log.maybeLog(self.logger, "\nCreating recurrent step scan.")
    # For training, the deterministic recurrence is used to compute all the
    # {h_t, 1 <= t <= T} given Xs. Conditional GSNs can then be trained
    # in batches using those parameters.
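    # theano.scan below iterates recurrent_step over the rows (time steps) of
    # Xs. outputs_info=[None, u0, None] means only the middle output u_t is fed
    # back into the next call as u_tm1; ua_t and h_t are collected but not
    # recurrent. Passing self.params as non_sequences keeps the shared
    # parameters as explicit inputs of the scan graph.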
    u0 = T.zeros((self.recurrent_hidden_size,))  # initial value for the RNN hidden units
    (ua, u, h_t), updates_recurrent = theano.scan(fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, True),
                                                  sequences=self.Xs,
                                                  outputs_info=[None, u0, None],
                                                  non_sequences=self.params)

    log.maybeLog(self.logger, "Now for reconstruction sample without noise")
    (_, _, h_t_recon), updates_recurrent_recon = theano.scan(fn=lambda x_t, u_tm1, *_: recurrent_step(x_t, u_tm1, False),
                                                             sequences=self.Xs,
                                                             outputs_info=[None, u0, None],
                                                             non_sequences=self.params)

    # put together the hiddens list: h_t stacks the even layers' hiddens along
    # the feature axis, so slice each even layer back out; odd layers get zeros
    h_list = [T.zeros_like(self.Xs)]
    for layer, w in enumerate(self.weights_list):
        if layer % 2 != 0:
            h_list.append(T.zeros_like(T.dot(h_list[-1], w)))
        else:
            h_list.append((h_t.T[(layer/2)*self.hidden_size:(layer/2+1)*self.hidden_size]).T)

    h_list_recon = [T.zeros_like(self.Xs)]
    for layer, w in enumerate(self.weights_list):
        if layer % 2 != 0:
            h_list_recon.append(T.zeros_like(T.dot(h_list_recon[-1], w)))
        else:
            h_list_recon.append((h_t_recon.T[(layer/2)*self.hidden_size:(layer/2+1)*self.hidden_size]).T)

    # with noise
    _, cost, show_cost = generative_stochastic_network.build_gsn_given_hiddens(self.Xs, h_list, self.weights_list, self.bias_list, True, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function, self.logger)

    # without noise for reconstruction
    x_sample_recon, _, _ = generative_stochastic_network.build_gsn_given_hiddens(self.Xs, h_list_recon, self.weights_list, self.bias_list, False, self.noiseless_h1, self.hidden_add_noise_sigma, self.input_salt_and_pepper, self.input_sampling, self.MRG, self.visible_activation, self.hidden_activation, self.walkbacks, self.cost_function, self.logger)

    updates_train = updates_recurrent
    updates_cost = updates_recurrent

    #############
    #   COSTS   #
    #############
    log.maybeLog(self.logger, '\nCost w.r.t p(X|...) at every step in the graph')
    start_functions_time = time.time()

    # if we are not using Hessian-free training, create the normal SGD functions
    if not self.hessian_free:
        gradient = T.grad(cost, self.params)
        gradient_buffer = [theano.shared(numpy.zeros(param.get_value().shape, dtype='float32')) for param in self.params]
        m_gradient = [self.momentum * gb + (cast32(1) - self.momentum) * g for (gb, g) in zip(gradient_buffer, gradient)]
        param_updates = [(param, param - self.learning_rate * mg) for (param, mg) in zip(self.params, m_gradient)]
        gradient_buffer_updates = zip(gradient_buffer, m_gradient)

        updates = OrderedDict(param_updates + gradient_buffer_updates)
        updates_train.update(updates)

        log.maybeLog(self.logger, "rnn-gsn learn...")
        self.f_learn = theano.function(inputs=[self.Xs],
                                       updates=updates_train,
                                       outputs=show_cost,
                                       on_unused_input='warn',
                                       name='rnngsn_f_learn')

        log.maybeLog(self.logger, "rnn-gsn cost...")
        self.f_cost = theano.function(inputs=[self.Xs],
                                      updates=updates_cost,
                                      outputs=show_cost,
                                      on_unused_input='warn',
                                      name='rnngsn_f_cost')

    log.maybeLog(self.logger, "Training/cost functions done.")

    # Denoise some numbers: show number, noisy number, predicted number, reconstructed number
    log.maybeLog(self.logger, "Creating graph for noisy reconstruction function at checkpoints during training.")
    self.f_recon = theano.function(inputs=[self.Xs],
                                   outputs=x_sample_recon[-1],
                                   updates=updates_recurrent_recon,
                                   name='rnngsn_f_recon')

    # a function to add salt and pepper noise
    self.f_noise = theano.function(inputs=[self.X],
                                   outputs=salt_and_pepper(self.X, self.input_salt_and_pepper),
                                   name='rnngsn_f_noise')

    # Sampling functions
    log.maybeLog(self.logger, "Creating sampling function...")
    if self.gsn_layers == 1:
        self.f_sample = theano.function(inputs=[X_sample],
                                        outputs=visible_pX_chain[-1],
                                        name='rnngsn_f_sample_single_layer')
    else:
        # The unused-input warning is expected here: the odd layers of the
        # network state are not read directly, they are recomputed from the
        # even layers, hence on_unused_input='warn'.
        self.f_sample = theano.function(inputs=self.network_state_input,
                                        outputs=self.network_state_output + visible_pX_chain,
                                        on_unused_input='warn',
                                        name='rnngsn_f_sample')

    log.maybeLog(self.logger, "Done compiling all functions.")
    compilation_time = time.time() - start_functions_time
    # Show the compile time with appropriate easy-to-read units.
    log.maybeLog(self.logger, "Total compilation time took " + make_time_units_string(compilation_time) + ".\n\n")
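# `salt_and_pepper` is assumed to implement the usual GSN input corruption:
# keep each pixel with probability (1 - p), otherwise set it to 0 or 1
# uniformly at random. A minimal Theano sketch under that assumption (the
# function name here is illustrative, not the module's actual helper):
def salt_and_pepper_example(X, p, rng):
    # X: float32 matrix in [0, 1]; p: corruption probability; rng: an MRG_RandomStreams instance
    keep = rng.binomial(size=X.shape, n=1, p=1 - p, dtype='float32')  # 1 where the pixel survives
    salt = rng.binomial(size=X.shape, n=1, p=0.5, dtype='float32')    # random 0/1 replacement value
    return X * keep + T.eq(keep, 0) * salt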
def train(self, train_X=None, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None, is_artificial=False, artificial_sequence=1, continue_training=False):
    log.maybeLog(self.logger, "\nTraining---------\n")
    if train_X is None:
        log.maybeLog(self.logger, "Training using data given during initialization of RNN-GSN.\n")
        train_X = self.train_X
        train_Y = self.train_Y
        if train_X is None:
            log.maybeLog(self.logger, "\nPlease provide a training dataset!\n")
            raise AssertionError("Please provide a training dataset!")
    else:
        log.maybeLog(self.logger, "Training using data provided to training function.\n")
    if valid_X is None:
        valid_X = self.valid_X
        valid_Y = self.valid_Y
    if test_X is None:
        test_X = self.test_X
        test_Y = self.test_Y

    ##########################################################
    # Train the GSN first to get good weights initialization #
    ##########################################################
    if self.train_gsn_first:
        log.maybeLog(self.logger, "\n\n----------Initially training the GSN---------\n\n")
        init_gsn = generative_stochastic_network.GSN(train_X=train_X, valid_X=valid_X, test_X=test_X, args=self.gsn_args, logger=self.logger)
        init_gsn.train()

    #############################
    # Save the model parameters #
    #############################
    def save_params_to_file(name, n, gsn_params):
        print 'saving parameters...'
        save_path = self.outdir + name + '_params_epoch_' + str(n) + '.pkl'
        f = open(save_path, 'wb')
        try:
            cPickle.dump(gsn_params, f, protocol=cPickle.HIGHEST_PROTOCOL)
        finally:
            f.close()

    def save_params(params):
        values = [param.get_value(borrow=True) for param in params]
        return values

    def restore_params(params, values):
        for i in range(len(params)):
            params[i].set_value(values[i])

    #########################################
    # If we are using Hessian-free training #
    #########################################
    if self.hessian_free:
        pass
        # gradient_dataset = hf_sequence_dataset([train_X.get_value()], batch_size=None, number_batches=5000)
        # cg_dataset = hf_sequence_dataset([train_X.get_value()], batch_size=None, number_batches=1000)
        # valid_dataset = hf_sequence_dataset([valid_X.get_value()], batch_size=None, number_batches=1000)
        #
        # s = x_samples
        # costs = [cost, show_cost]
        # hf_optimizer(params, [Xs], s, costs, u, ua).train(gradient_dataset, cg_dataset, initial_lambda=1.0, preconditioner=True, validation=valid_dataset)

    ################################
    # If we are using SGD training #
    ################################
    else:
        log.maybeLog(self.logger, "\n-----------TRAINING RNN-GSN------------\n")

        # TRAINING
        STOP = False
        counter = 0
        if not continue_training:
            self.learning_rate.set_value(self.init_learn_rate)  # reset the learning rate
        times = []
        best_cost = float('inf')
        best_params = None
        patience = 0

        log.maybeLog(self.logger, ['train X size:', str(train_X.shape.eval())])
        if valid_X is not None:
            log.maybeLog(self.logger, ['valid X size:', str(valid_X.shape.eval())])
        if test_X is not None:
            log.maybeLog(self.logger, ['test X size:', str(test_X.shape.eval())])

        if self.vis_init:
            # initialize the visible bias from the clipped data mean (numpy.clip takes the array first)
            self.bias_list[0].set_value(logit(numpy.clip(train_X.get_value().mean(axis=0), 0.001, 0.9)))

        while not STOP:
            counter += 1
            t = time.time()
            log.maybeAppend(self.logger, [counter, '\t'])

            if is_artificial:
                data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y, test_X, test_Y, artificial_sequence, rng)

            # train
            train_costs = data.apply_cost_function_to_dataset(self.f_learn, train_X, self.batch_size)
            # record it
            log.maybeAppend(self.logger, ['Train:', trunc(train_costs), '\t'])

            # valid
            valid_costs = data.apply_cost_function_to_dataset(self.f_cost, valid_X, self.batch_size)
            # record it
            log.maybeAppend(self.logger, ['Valid:', trunc(valid_costs), '\t'])

            # test
            test_costs = data.apply_cost_function_to_dataset(self.f_cost, test_X, self.batch_size)
            # record it
            log.maybeAppend(self.logger, ['Test:', trunc(test_costs), '\t'])

            # check for early stopping
            cost = numpy.sum(valid_costs)
            if cost < best_cost * self.early_stop_threshold:
                patience = 0
                best_cost = cost
                # save the parameters that made it the best
                best_params = save_params(self.params)
            else:
                patience += 1

            if counter >= self.n_epoch or patience >= self.early_stop_length:
                STOP = True
                if best_params is not None:
                    restore_params(self.params, best_params)
                save_params_to_file('all', counter, self.params)

            timing = time.time() - t
            times.append(timing)

            log.maybeAppend(self.logger, 'time: ' + make_time_units_string(timing) + '\t')
            log.maybeLog(self.logger, 'remaining: ' + make_time_units_string((self.n_epoch - counter) * numpy.mean(times)))

            if (counter % self.save_frequency) == 0 or STOP is True:
                n_examples = 100
                nums = test_X.get_value(borrow=True)[range(n_examples)]
                noisy_nums = self.f_noise(test_X.get_value(borrow=True)[range(n_examples)])
                reconstructions = []
                for i in xrange(0, len(noisy_nums)):
                    recon = self.f_recon(noisy_nums[max(0, (i+1) - self.batch_size):i+1])
                    reconstructions.append(recon)
                reconstructed = numpy.array(reconstructions)

                # Concatenate stuff: rows of original, noisy, and reconstructed digits
                stacked = numpy.vstack([numpy.vstack([nums[i*10:(i+1)*10], noisy_nums[i*10:(i+1)*10], reconstructed[i*10:(i+1)*10]]) for i in range(10)])
                number_reconstruction = PIL.Image.fromarray(tile_raster_images(stacked, (self.root_N_input, self.root_N_input), (10, 30)))
                number_reconstruction.save(self.outdir + 'rnngsn_number_reconstruction_epoch_' + str(counter) + '.png')

                # save params
                save_params_to_file('all', counter, self.params)

            # ANNEAL!
            new_lr = self.learning_rate.get_value() * self.annealing
            self.learning_rate.set_value(new_lr)
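# Hypothetical driver showing the intended call pattern for this class; the
# class name (RNN_GSN) and the dataset loader are illustrative, and the args
# keys match the ones read by __init__ above:
#
#     train_X, valid_X, test_X = load_mnist_as_shared()   # theano shared float32 matrices
#     rnngsn = RNN_GSN(train_X=train_X, valid_X=valid_X, test_X=test_X,
#                      args={'gsn_layers': 3, 'walkbacks': 5, 'hidden_size': 1500,
#                            'recurrent_hidden_size': 100, 'output_path': 'outputs/rnngsn/'},
#                      logger=None)
#     rnngsn.train()
#     rnngsn.test(test_X)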