# NOTE: assumed imports for this excerpt (the original file's import block is not
# shown): argparse, os, time, numpy, cPickle, PIL.Image, theano, theano.tensor as T,
# theano.sandbox.rng_mrg as RNG_MRG, numpy.random as rng, random as R,
# OrderedDict from collections, plus the repo's own helpers (data_tools as data,
# logger as log / Logger, and utils such as cast32, trunc, logit, get_shared_weights,
# get_shared_bias, get_shared_regression_weights, add_gaussian_noise, salt_and_pepper,
# fix_input_size, init_empty_file, load_from_config, make_time_units_string,
# tile_raster_images, and the GSN class).


def main():
    parser = argparse.ArgumentParser()

    # GSN settings
    parser.add_argument('--layers', type=int, default=3)  # number of hidden layers
    parser.add_argument('--walkbacks', type=int, default=5)  # number of walkbacks
    parser.add_argument('--hidden_size', type=int, default=1500)
    parser.add_argument('--hidden_act', type=str, default='tanh')
    parser.add_argument('--visible_act', type=str, default='sigmoid')
    # training
    parser.add_argument('--cost_funct', type=str, default='binary_crossentropy')  # the cost function for training
    parser.add_argument('--n_epoch', type=int, default=200)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--save_frequency', type=int, default=10)  # number of epochs between parameter saves
    parser.add_argument('--early_stop_threshold', type=float, default=0.9995)
    parser.add_argument('--early_stop_length', type=int, default=30)  # the patience number of epochs
    # noise
    parser.add_argument('--hidden_add_noise_sigma', type=float, default=2)
    parser.add_argument('--input_salt_and_pepper', type=float, default=0.4)
    # hyper parameters
    parser.add_argument('--learning_rate', type=float, default=0.25)
    parser.add_argument('--momentum', type=float, default=0.5)
    parser.add_argument('--annealing', type=float, default=0.995)
    parser.add_argument('--noise_annealing', type=float, default=0.99)
    # data
    parser.add_argument('--dataset', type=str, default='MNIST')
    parser.add_argument('--data_path', type=str, default='../data/')
    parser.add_argument('--classes', type=int, default=10)
    parser.add_argument('--output_path', type=str, default='../outputs/gsn/')
    # argparse does not deal well with booleans, so use 0/1 ints instead
    parser.add_argument('--vis_init', type=int, default=0)
    parser.add_argument('--noiseless_h1', type=int, default=1)
    parser.add_argument('--input_sampling', type=int, default=1)
    parser.add_argument('--test_model', type=int, default=0)
    parser.add_argument('--continue_training', type=int, default=0)

    args = parser.parse_args()

    ########################################
    # Initialization things with arguments #
    ########################################
    outdir = args.output_path + "/" + args.dataset + "/"
    data.mkdir_p(outdir)
    args.output_path = outdir

    # Create the logger
    logger = log.Logger(outdir)
    logger.log("---------CREATING GSN------------\n\n")
    logger.log(args)

    # See if we should load args from a previous config file (during testing)
    config_filename = outdir + 'config'
    if args.test_model and 'config' in os.listdir(outdir):
        config_vals = load_from_config(config_filename)
        for CV in config_vals:
            logger.log(CV)
            if CV.startswith('test'):
                logger.log('Do not override testing switch')
                continue
            try:
                # Python 2 exec-in-namespace syntax
                exec('args.' + CV) in globals(), locals()
            except:
                exec('args.' + CV.split('=')[0] + "='" + CV.split('=')[1] + "'") in globals(), locals()
    else:
        # Save the current configuration - useful for logs/experiments
        logger.log('Saving config')
        with open(config_filename, 'w') as f:
            f.write(str(args))

    ######################################
    # Load the data, train = train+valid #
    ######################################
    if args.dataset.lower() == 'mnist':
        (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = data.load_mnist(args.data_path)
        train_X = numpy.concatenate((train_X, valid_X))
        train_Y = numpy.concatenate((train_Y, valid_Y))
    else:
        raise AssertionError("Dataset not recognized. Please try MNIST, or implement "
                             "your own data processing method in data_tools.py")
    # transfer the datasets into theano shared variables
    train_X, train_Y = data.shared_dataset((train_X, train_Y), borrow=True)
    valid_X, valid_Y = data.shared_dataset((valid_X, valid_Y), borrow=True)
    test_X, test_Y = data.shared_dataset((test_X, test_Y), borrow=True)

    ##########################
    # Initialize the new GSN #
    ##########################
    gsn = GSN(train_X, valid_X, test_X, vars(args), logger)

    # Load initial weights and biases from file if testing
    params_to_load = 'gsn_params.pkl'
    if args.test_model and os.path.isfile(params_to_load):
        logger.log("\nLoading existing GSN parameters")
        # 'rb': the parameters were pickled with a binary protocol
        loaded_params = cPickle.load(open(params_to_load, 'rb'))
        [p.set_value(lp.get_value(borrow=False))
         for lp, p in zip(loaded_params[:len(gsn.weights_list)], gsn.weights_list)]
        [p.set_value(lp.get_value(borrow=False))
         for lp, p in zip(loaded_params[len(gsn.weights_list):], gsn.bias_list)]
    else:
        logger.log("Could not find existing GSN parameter file {}, training instead.".format(params_to_load))
        args.test_model = False

    #########################################
    # Train or test the new GSN on the data #
    #########################################
    # Train if not test
    if not args.test_model:
        gsn.train()
    # Otherwise, test
    else:
        gsn.test()
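

# NOTE: a minimal sketch of what data.shared_dataset is assumed to do (the real
# implementation lives in data_tools.py; this is illustrative, not the repo's code).
# Theano shared variables keep the whole dataset on the device, so minibatches can
# be sliced without host-device copies.
def shared_dataset_sketch(data_xy, borrow=True):
    data_x, data_y = data_xy
    shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    return shared_x, shared_y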
def main():
    parser = argparse.ArgumentParser()

    # GSN settings
    parser.add_argument('--layers', type=int, default=3)  # number of hidden layers
    parser.add_argument('--walkbacks', type=int, default=5)  # number of walkbacks
    parser.add_argument('--hidden_size', type=int, default=1500)
    parser.add_argument('--hidden_act', type=str, default='tanh')
    parser.add_argument('--visible_act', type=str, default='sigmoid')
    # training
    parser.add_argument('--cost_funct', type=str, default='binary_crossentropy')  # the cost function for training
    parser.add_argument('--n_epoch', type=int, default=500)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--save_frequency', type=int, default=5)  # number of epochs between parameter saves
    parser.add_argument('--early_stop_threshold', type=float, default=0.9995)
    parser.add_argument('--early_stop_length', type=int, default=30)  # the patience number of epochs
    # noise
    parser.add_argument('--hidden_add_noise_sigma', type=float, default=2)  # default=2
    parser.add_argument('--input_salt_and_pepper', type=float, default=0.4)  # default=0.4
    # hyper parameters
    parser.add_argument('--learning_rate', type=float, default=0.25)
    parser.add_argument('--momentum', type=float, default=0.5)
    parser.add_argument('--annealing', type=float, default=0.995)
    parser.add_argument('--noise_annealing', type=float, default=1)
    # data
    parser.add_argument('--dataset', type=str, default='MNIST')
    parser.add_argument('--data_path', type=str, default='../data/')
    parser.add_argument('--classes', type=int, default=10)
    parser.add_argument('--output_path', type=str, default='../outputs/gsn/')
    # argparse does not deal well with booleans, so use 0/1 ints instead
    parser.add_argument('--vis_init', type=int, default=0)
    parser.add_argument('--noiseless_h1', type=int, default=1)
    parser.add_argument('--input_sampling', type=int, default=1)
    parser.add_argument('--test_model', type=int, default=0)
    parser.add_argument('--continue_training', type=int, default=0)

    args = parser.parse_args()

    ########################################
    # Initialization things with arguments #
    ########################################
    outdir = args.output_path + "/" + args.dataset + "/"
    data.mkdir_p(outdir)
    args.output_path = outdir

    # Create the logger
    logger = log.Logger(outdir)
    logger.log("---------CREATING GSN------------\n\n")
    logger.log(args)

    # See if we should load args from a previous config file (during testing)
    config_filename = outdir + 'config'
    if args.test_model and 'config' in os.listdir(outdir):
        config_vals = load_from_config(config_filename)
        for CV in config_vals:
            logger.log(CV)
            if CV.startswith('test'):
                logger.log('Do not override testing switch')
                continue
            try:
                exec('args.' + CV) in globals(), locals()
            except:
                exec('args.' + CV.split('=')[0] + "='" + CV.split('=')[1] + "'") in globals(), locals()
    else:
        # Save the current configuration - useful for logs/experiments
        logger.log('Saving config')
        with open(config_filename, 'w') as f:
            f.write(str(args))

    ######################################
    # Load the data, train = train+valid #
    ######################################
    if args.dataset.lower() == 'mnist':
        (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = data.load_mnist(args.data_path)
        train_X = numpy.concatenate((train_X, valid_X))
        train_Y = numpy.concatenate((train_Y, valid_Y))
    else:
        raise AssertionError("Dataset not recognized. Please try MNIST, or implement "
                             "your own data processing method in data_tools.py")
    # transfer the datasets into theano shared variables
    train_X, train_Y = data.shared_dataset((train_X, train_Y), borrow=True)
    valid_X, valid_Y = data.shared_dataset((valid_X, valid_Y), borrow=True)
    test_X, test_Y = data.shared_dataset((test_X, test_Y), borrow=True)

    ##########################
    # Initialize the new GSN #
    ##########################
    gsn = GSN(train_X, valid_X, test_X, vars(args), logger)

    # load pre-trained parameters and generate samples instead of training
    # gsn.train()
    gsn.load_params('gsn_params_mnist.pkl')
    gsn.gen_10k_samples()

    # parzen window log-likelihood estimate of the generated samples
    print 'Evaluating parzen window'
    import utils.likelihood_estimation as ll
    ll.main(0.20, 'mnist', '../data/', 'samples.npy')
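

# NOTE: a minimal sketch of the Parzen-window estimate that utils.likelihood_estimation
# is assumed to compute (illustrative, not the repo's code): fit an isotropic Gaussian
# kernel density on the generated samples and report the mean log-likelihood it
# assigns to held-out test points.
def parzen_log_likelihood_sketch(samples, test_x, sigma=0.20):
    # log p(x) = logsumexp_i(-||x - s_i||^2 / (2*sigma^2)) - log N - (D/2)*log(2*pi*sigma^2)
    d = (test_x[:, None, :] - samples[None, :, :]) / sigma
    e = -0.5 * (d ** 2).sum(axis=2)
    m = e.max(axis=1)
    log_p = m + numpy.log(numpy.exp(e - m[:, None]).sum(axis=1))
    log_p -= numpy.log(len(samples)) + 0.5 * test_x.shape[1] * numpy.log(2 * numpy.pi * sigma ** 2)
    return log_p.mean()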
def experiment(state, outdir_base='./'):
    rng.seed(1)  # seed the numpy random generator
    R.seed(1)  # seed the other random generator (for reconstruction function indices)

    # Initialize the output directories and files
    data.mkdir_p(outdir_base)
    outdir = outdir_base + "/" + state.dataset + "/"
    data.mkdir_p(outdir)
    logger = Logger(outdir)
    train_convergence = outdir + "train_convergence.csv"
    valid_convergence = outdir + "valid_convergence.csv"
    test_convergence = outdir + "test_convergence.csv"
    regression_train_convergence = outdir + "regression_train_convergence.csv"
    regression_valid_convergence = outdir + "regression_valid_convergence.csv"
    regression_test_convergence = outdir + "regression_test_convergence.csv"
    init_empty_file(train_convergence)
    init_empty_file(valid_convergence)
    init_empty_file(test_convergence)
    init_empty_file(regression_train_convergence)
    init_empty_file(regression_valid_convergence)
    init_empty_file(regression_test_convergence)

    logger.log("----------MODEL 1, {0!s}--------------\n\n".format(state.dataset))

    # load parameters from config file if this is a test
    config_filename = outdir + 'config'
    if state.test_model and 'config' in os.listdir(outdir):
        config_vals = load_from_config(config_filename)
        for CV in config_vals:
            logger.log(CV)
            if CV.startswith('test'):
                logger.log('Do not override testing switch')
                continue
            try:
                exec('state.' + CV) in globals(), locals()
            except:
                exec('state.' + CV.split('=')[0] + "='" + CV.split('=')[1] + "'") in globals(), locals()
    else:
        # Save the current configuration - useful for logs/experiments
        logger.log('Saving config')
        with open(config_filename, 'w') as f:
            f.write(str(state))
    logger.log(state)

    ####################################################
    # Load the data, train = train+valid, and sequence #
    ####################################################
    # internal flag for whether the dataset is one of the artificially-sequenced MNIST varieties
    artificial = False
    if state.dataset in ('MNIST_1', 'MNIST_2', 'MNIST_3', 'MNIST_4'):
        (train_X, train_Y), (valid_X, valid_Y), (test_X, test_Y) = data.load_mnist(state.data_path)
        train_X = numpy.concatenate((train_X, valid_X))
        train_Y = numpy.concatenate((train_Y, valid_Y))
        artificial = True
        try:
            dataset = int(state.dataset.split('_')[1])
        except:
            raise AssertionError("artificial dataset number not recognized. Input was " + state.dataset)
    else:
        raise AssertionError("dataset not recognized.")

    # transfer the datasets into theano shared variables
    train_X = theano.shared(train_X)
    train_Y = theano.shared(train_Y)
    valid_X = theano.shared(valid_X)
    valid_Y = theano.shared(valid_Y)
    test_X = theano.shared(test_X)
    test_Y = theano.shared(test_Y)
    if artificial:
        # it is an artificially-sequenced MNIST variety, so order it appropriately
        logger.log('Sequencing MNIST data...')
        logger.log(['train set size:', len(train_Y.eval())])
        logger.log(['valid set size:', len(valid_Y.eval())])
        logger.log(['test set size:', len(test_Y.eval())])
        data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y, test_X, test_Y, dataset, rng)
        logger.log(['train set size:', len(train_Y.eval())])
        logger.log(['valid set size:', len(valid_Y.eval())])
        logger.log(['test set size:', len(test_Y.eval())])
        logger.log('Sequencing done.\n')

    # variables from the dataset that are used for initialization and image reconstruction
    N_input = train_X.eval().shape[1]
    root_N_input = int(numpy.sqrt(N_input))  # cast to int so it can be used as an image dimension

    # Network and training specifications
    layers = state.layers  # number of hidden layers
    walkbacks = state.walkbacks  # number of walkbacks
    sequence_window_size = state.sequence_window_size  # number of previous hidden states to consider for the regression
    layer_sizes = [N_input] + [state.hidden_size] * layers  # layer sizes, from h0 to hK (h0 is the visible layer)
    learning_rate = theano.shared(cast32(state.learning_rate))  # learning rate
    regression_learning_rate = theano.shared(cast32(state.learning_rate))  # learning rate
    annealing = cast32(state.annealing)  # exponential annealing coefficient
    momentum = theano.shared(cast32(state.momentum))  # momentum term

    # Theano variables and RNG
    X = T.fmatrix('X')  # for use in sampling
    # for use in training - need one X variable for each input in the sequence history window,
    # plus what the current one should be
    Xs = [T.fmatrix(name="X_t") if i == 0 else T.fmatrix(name="X_{t-" + str(i) + "}")
          for i in range(sequence_window_size + 1)]
    # the same window, but as vectors, for the checkpoint reconstruction function
    Xs_recon = [T.fvector(name="Xrecon_t") if i == 0 else T.fvector(name="Xrecon_{t-" + str(i) + "}")
                for i in range(sequence_window_size + 1)]
    # sequence_graph_output_index = T.lscalar("i")
    MRG = RNG_MRG.MRG_RandomStreams(1)

    ##############
    # PARAMETERS #
    ##############
    # initialize a list of weights and biases based on layer_sizes for the GSN:
    # each weight matrix is a uniform sample from +/- sqrt(6. / (n_in + n_out))
    weights_list = [get_shared_weights(layer_sizes[layer], layer_sizes[layer + 1],
                                       name="W_{0!s}_{1!s}".format(layer, layer + 1))
                    for layer in range(layers)]
    # biases are initialized to 0's
    bias_list = [get_shared_bias(layer_sizes[layer], name='b_' + str(layer))
                 for layer in range(layers + 1)]

    # parameters for the regression - only need them for the odd layers in the network!
    # initialized to the identity matrix the size of a hidden layer
    regression_weights_list = [
        [get_shared_regression_weights(state.hidden_size, name="V_{t-" + str(window + 1) + "}_layer" + str(layer))
         for layer in range(layers + 1) if (layer % 2) != 0]
        for window in range(sequence_window_size)]
    # regression biases are initialized to 0's
    regression_bias_list = [get_shared_bias(state.hidden_size, name='vb_' + str(layer))
                            for layer in range(layers + 1) if (layer % 2) != 0]
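    # NOTE: hedged sketch of the parameter-initialization helpers used above (the
    # real ones live in the repo's utils; these bodies are illustrative assumptions
    # consistent with the comments):
    # def get_shared_weights(n_in, n_out, name):
    #     interval = numpy.sqrt(6. / (n_in + n_out))
    #     vals = rng.uniform(-interval, interval, (n_in, n_out)).astype('float32')
    #     return theano.shared(vals, name=name)
    # def get_shared_bias(n, name):
    #     return theano.shared(numpy.zeros(n, dtype='float32'), name=name)
    # def get_shared_regression_weights(hidden_size, name):
    #     return theano.shared(numpy.identity(hidden_size, dtype='float32'), name=name)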
    # need initial biases (tau) as well, for when there aren't yet sequence_window_size hiddens in the history
    tau_list = [
        [get_shared_bias(state.hidden_size, name='tau_{t-' + str(window + 1) + "}_layer" + str(layer))
         for layer in range(layers + 1) if (layer % 2) != 0]
        for window in range(sequence_window_size)]

    ###########################################################
    # load initial parameters of gsn to speed up my debugging #
    ###########################################################
    params_to_load = 'gsn_params.pkl'
    initialized_gsn = False
    if os.path.isfile(params_to_load):
        logger.log("\nLoading existing GSN parameters")
        # 'rb': the parameters were pickled with a binary protocol
        loaded_params = cPickle.load(open(params_to_load, 'rb'))
        [p.set_value(lp.get_value(borrow=False))
         for lp, p in zip(loaded_params[:len(weights_list)], weights_list)]
        [p.set_value(lp.get_value(borrow=False))
         for lp, p in zip(loaded_params[len(weights_list):], bias_list)]
        initialized_gsn = True

    ########################
    # ACTIVATION FUNCTIONS #
    ########################
    if state.hidden_act == 'sigmoid':
        logger.log('Using sigmoid activation for hiddens')
        hidden_activation = T.nnet.sigmoid
    elif state.hidden_act == 'rectifier':
        logger.log('Using rectifier activation for hiddens')
        hidden_activation = lambda x: T.maximum(cast32(0), x)
    elif state.hidden_act == 'tanh':
        logger.log('Using hyperbolic tangent activation for hiddens')
        hidden_activation = lambda x: T.tanh(x)
    else:
        message = "Did not recognize hidden activation {0!s}, please use tanh, rectifier, or sigmoid".format(
            state.hidden_act)
        logger.log(message)
        raise AssertionError(message)

    if state.visible_act == 'sigmoid':
        logger.log('Using sigmoid activation for visible layer')
        visible_activation = T.nnet.sigmoid
    elif state.visible_act == 'softmax':
        logger.log('Using softmax activation for visible layer')
        visible_activation = T.nnet.softmax
    else:
        message = "Did not recognize visible activation {0!s}, please use sigmoid or softmax".format(state.visible_act)
        logger.log(message)
        raise AssertionError(message)

    ###############################################
    # COMPUTATIONAL GRAPH HELPER METHODS FOR TGSN #
    ###############################################
    def update_layers(hiddens, p_X_chain, noisy=True):
        # one full update = update the odd layers, then the even layers
        logger.log('odd layer updates')
        update_odd_layers(hiddens, noisy)
        logger.log('even layer updates')
        update_even_layers(hiddens, p_X_chain, noisy)
        logger.log('done full update.\n')

    def update_layers_reverse(hiddens, p_X_chain, noisy=True):
        # the reverse scheme: even layers first, then odd layers
        logger.log('even layer updates')
        update_even_layers(hiddens, p_X_chain, noisy)
        logger.log('odd layer updates')
        update_odd_layers(hiddens, noisy)
        logger.log('done full update.\n')

    # Odd layer update function
    # just a loop over the odd layers
    def update_odd_layers(hiddens, noisy):
        for i in range(1, len(hiddens), 2):
            logger.log(['updating layer', i])
            simple_update_layer(hiddens, None, i, add_noise=noisy)

    # Even layer update
    # p_X_chain is given to append the p(X|...) at each full update (one update = odd update + even update)
    def update_even_layers(hiddens, p_X_chain, noisy):
        for i in range(0, len(hiddens), 2):
            logger.log(['updating layer', i])
            simple_update_layer(hiddens, p_X_chain, i, add_noise=noisy)
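    # The alternating odd/even scheme is effectively block Gibbs sampling over the
    # layers: each layer only depends on its neighbors (i-1 and i+1), so all odd
    # layers can be updated in parallel given the even layers, and vice versa.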
    # The layer update function
    # hiddens   : list containing the symbolic theano variables [visible, hidden1, hidden2, ...]
    #             (layer_update will modify this list in place)
    # p_X_chain : list containing the successive p(X|...) at each update
    #             (update_layer will append to this list)
    # i         : the current layer being updated
    # add_noise : pre (and post) activation gaussian noise flag
    def simple_update_layer(hiddens, p_X_chain, i, add_noise=True):
        # Compute the dot product, whatever the layer
        # If the visible layer X
        if i == 0:
            logger.log('using ' + str(weights_list[i]) + '.T')
            hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + bias_list[i]
        # If the top layer
        elif i == len(hiddens) - 1:
            logger.log(['using', weights_list[i - 1]])
            hiddens[i] = T.dot(hiddens[i - 1], weights_list[i - 1]) + bias_list[i]
        # Otherwise in-between layers
        else:
            logger.log(["using {0!s} and {1!s}.T".format(weights_list[i - 1], weights_list[i])])
            # next layer     : hiddens[i+1], assigned weights : W_i
            # previous layer : hiddens[i-1], assigned weights : W_(i-1)
            hiddens[i] = T.dot(hiddens[i + 1], weights_list[i].T) + \
                         T.dot(hiddens[i - 1], weights_list[i - 1]) + bias_list[i]

        # skip the pre-activation noise for the first hidden layer if noiseless_h1 is set
        if i == 1 and state.noiseless_h1:
            logger.log('>>NO noise in first hidden layer')
            add_noise = False

        # pre-activation noise (never for the input layer)
        if i != 0 and add_noise:
            logger.log(['Adding pre-activation gaussian noise for layer', i])
            hiddens[i] = add_gaussian_noise(hiddens[i], state.hidden_add_noise_sigma)

        # ACTIVATION!
        if i == 0:
            logger.log('{} activation for visible layer'.format(state.visible_act))
            hiddens[i] = visible_activation(hiddens[i])
        else:
            logger.log(['Hidden units {} activation for layer'.format(state.hidden_act), i])
            hiddens[i] = hidden_activation(hiddens[i])

        # post-activation noise is disabled: pre-activation noise is already applied,
        # and adding it here would just double the amount of noise between each
        # activation of the hiddens
        # if i != 0 and add_noise:
        #     logger.log(['Adding post-activation gaussian noise for layer', i])
        #     hiddens[i] = add_gaussian(hiddens[i], state.hidden_add_noise_sigma)

        # build the reconstruction chain if updating the visible layer X
        if i == 0:
            # if input layer -> append p(X|...)
            p_X_chain.append(hiddens[i])

            # sample from p(X|...) - sampling needs to be correct for the input type:
            # binomial for binary MNIST; real-valued inputs should be gaussian
            if state.input_sampling:
                logger.log('Sampling from input')
                sampled = MRG.binomial(p=hiddens[i], size=hiddens[i].shape, dtype='float32')
            else:
                logger.log('>>NO input sampling')
                sampled = hiddens[i]

            # add noise
            sampled = salt_and_pepper(sampled, state.input_salt_and_pepper)

            # set input layer
            hiddens[i] = sampled
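    # NOTE: hedged sketch of the two noise helpers called above (the real ones live
    # in the repo's utils; these bodies are illustrative assumptions). Both operate
    # symbolically through the MRG random stream:
    # def add_gaussian_noise(x, sigma):
    #     return x + MRG.normal(avg=0, std=sigma, size=x.shape, dtype='float32')
    # def salt_and_pepper(x, p):
    #     # drop a fraction p of the pixels, then set roughly half of those to 1
    #     a = MRG.binomial(size=x.shape, n=1, p=1 - p, dtype='float32')
    #     b = MRG.binomial(size=x.shape, n=1, p=0.5, dtype='float32')
    #     return x * a + T.eq(a, 0) * b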
    def perform_regression_step(hiddens, sequence_history):
        logger.log(["Sequence history length:", len(sequence_history)])
        # only need to work over the odd layers of the hiddens
        odd_layers = [i for i in range(len(hiddens)) if (i % 2) != 0]
        # depending on the size of the sequence history, it could be 0, 1, 2, 3, ..., sequence_window_size
        for (hidden_index, regression_index) in zip(odd_layers, range(len(odd_layers))):
            terms_used = []
            sequence_terms = []
            for history_index in range(sequence_window_size):
                if history_index < len(sequence_history):
                    # dot product with the history term
                    sequence_terms.append(T.dot(sequence_history[history_index][regression_index],
                                                regression_weights_list[history_index][regression_index]))
                    terms_used.append(regression_weights_list[history_index][regression_index])
                else:
                    # otherwise, there is no history for the necessary spot, so use the tau bias
                    sequence_terms.append(tau_list[history_index][regression_index])
                    terms_used.append(tau_list[history_index][regression_index])
            if len(sequence_terms) > 0:
                sequence_terms.append(regression_bias_list[regression_index])
                terms_used.append(regression_bias_list[regression_index])
                logger.log(["REGRESSION for hidden layer {0!s} using:".format(hidden_index), terms_used])
                # numpy.sum over a list of theano expressions reduces with +, giving a symbolic sum
                hiddens[hidden_index] = numpy.sum(sequence_terms)

    def build_gsn_graph(x, noiseflag):
        p_X_chain = []
        if noiseflag:
            X_init = salt_and_pepper(x, state.input_salt_and_pepper)
        else:
            X_init = x
        # init hiddens with zeros
        hiddens = [X_init]
        for w in weights_list:
            hiddens.append(T.zeros_like(T.dot(hiddens[-1], w)))
        # The layer update scheme
        logger.log(["Building the gsn graph :", walkbacks, "updates"])
        for i in range(walkbacks):
            logger.log("GSN Walkback {!s}/{!s}".format(i + 1, walkbacks))
            update_layers(hiddens, p_X_chain, noisy=noiseflag)
        return p_X_chain

    def build_sequence_graph(xs, noiseflag):
        predicted_X_chains = []
        p_X_chains = []
        sequence_history = []
        # The layer update scheme
        logger.log(["Building the regression graph :", len(Xs), "updates"])
        for x_index in range(len(xs)):
            x = xs[x_index]

            # Predict what the current X should be
            ''' hidden layer init '''
            pred_hiddens = [T.zeros_like(x)]
            for w in weights_list:  # init with zeros
                pred_hiddens.append(T.zeros_like(T.dot(pred_hiddens[-1], w)))
            logger.log("Performing regression step!")
            perform_regression_step(pred_hiddens, sequence_history)  # do the regression!
logger.log("\n") predicted_X_chain = [] for i in range(walkbacks): logger.log("Prediction Walkback {!s}/{!s}".format(i + 1, walkbacks)) update_layers_reverse(pred_hiddens, predicted_X_chain, noisy=False) # no noise in the prediction because x_prediction can't be recovered from x anyway predicted_X_chains.append(predicted_X_chain) # Now do the actual GSN step and add it to the sequence history # corrupt x if noisy if noiseflag: X_init = salt_and_pepper(x, state.input_salt_and_pepper) else: X_init = x ''' hidden layer init ''' hiddens = [T.zeros_like(x)] for w in weights_list: # init with zeros hiddens.append(T.zeros_like(T.dot(hiddens[-1], w))) # # substitute some of the zero layers for what was predicted - need to advance the prediction by 1 layer so it is the evens # update_even_layers(pred_hiddens,[],noisy=False) # for i in [layer for layer in range(len(hiddens)) if (layer%2 == 0)]: # hiddens[i] = pred_hiddens[i] hiddens[0] = X_init chain = [] for i in range(walkbacks): logger.log("GSN walkback {!s}/{!s}".format(i + 1, walkbacks)) update_layers(hiddens, chain, noisy=noiseflag) # Append the p_X_chain p_X_chains.append(chain) # Append the odd layers of the hiddens to the sequence history sequence_history.append([hiddens[layer] for layer in range(len(hiddens)) if (layer % 2) != 0]) # select the prediction and reconstruction from the lists # prediction_chain = T.stacklists(predicted_X_chains)[sequence_graph_output_index] # reconstruction_chain = T.stacklists(p_X_chains)[sequence_graph_output_index] return predicted_X_chains, p_X_chains ############################################## # Build the training graph for the GSN # ############################################## logger.log("\nBuilding GSN graphs") p_X_chain_init = build_gsn_graph(X, noiseflag=True) predicted_X_chain_gsns, p_X_chains = build_sequence_graph(Xs, noiseflag=True) predicted_X_chain_gsn = predicted_X_chain_gsns[-1] p_X_chain = p_X_chains[-1] ############################################### # Build the training graph for the regression # ############################################### logger.log("\nBuilding regression graph") # no noise! noise is only used as regularization for GSN stage predicted_X_chains_regression, _ = build_sequence_graph(Xs, noiseflag=False) predicted_X_chain = predicted_X_chains_regression[-1] ###################### # COST AND GRADIENTS # ###################### if state.cost_funct == 'binary_crossentropy': logger.log('\nUsing binary cross-entropy cost!') cost_function = lambda x, y: T.mean(T.nnet.binary_crossentropy(x, y)) elif state.cost_funct == 'square': logger.log("\nUsing square error cost!") # cost_function = lambda x,y: T.log(T.mean(T.sqr(x-y))) cost_function = lambda x, y: T.log(T.sum(T.pow((x - y), 2))) else: raise AssertionError( "Did not recognize cost function {0!s}, please use binary_crossentropy or square".format(state.cost_funct)) logger.log('Cost w.r.t p(X|...) 
    logger.log('Cost w.r.t p(X|...) at every step in the graph for the TGSN')

    gsn_costs_init = [cost_function(rX, X) for rX in p_X_chain_init]
    show_gsn_cost_init = gsn_costs_init[-1]
    gsn_cost_init = numpy.sum(gsn_costs_init)
    gsn_init_mse = T.mean(T.sqr(p_X_chain_init[-1] - X), axis=0)
    gsn_init_error = T.mean(gsn_init_mse)

    # gsn_costs = T.mean(T.mean(T.nnet.binary_crossentropy(p_X_chain, T.stacklists(Xs)[sequence_graph_output_index]), 2), 1)
    gsn_costs = [cost_function(rX, Xs[-1]) for rX in predicted_X_chain_gsn]
    show_gsn_cost = gsn_costs[-1]
    gsn_cost = T.sum(gsn_costs)
    gsn_mse = T.mean(T.sqr(predicted_X_chain_gsn[-1] - Xs[-1]), axis=0)
    gsn_error = T.mean(gsn_mse)

    gsn_params = weights_list + bias_list
    logger.log(["gsn params:", gsn_params])

    # l2 regularization
    # regression_regularization_cost = T.sum([T.sum(recurrent_weights ** 2) for recurrent_weights in regression_weights_list])
    regression_regularization_cost = 0
    regression_costs = [cost_function(rX, Xs[-1]) for rX in predicted_X_chain]
    show_regression_cost = regression_costs[-1]
    regression_cost = T.sum(regression_costs) + state.regularize_weight * regression_regularization_cost
    regression_mse = T.mean(T.sqr(predicted_X_chain[-1] - Xs[-1]), axis=0)
    regression_error = T.mean(regression_mse)

    # only using the odd layers update -> even-indexed parameters in the list because it starts at v1
    # flatten the nested regression weights list
    regression_weights_flattened = []
    for weights in regression_weights_list:
        regression_weights_flattened.extend(weights)
    tau_flattened = []
    for tau in tau_list:
        tau_flattened.extend(tau)

    regression_params = regression_weights_flattened + regression_bias_list  # + tau_flattened
    logger.log(["regression params:", regression_params])

    logger.log("creating functions...")
    t = time.time()

    gradient_init = T.grad(gsn_cost_init, gsn_params)
    gradient_buffer_init = [theano.shared(numpy.zeros(param.get_value().shape, dtype='float32'))
                            for param in gsn_params]
    m_gradient_init = [momentum * gb + (cast32(1) - momentum) * g
                       for (gb, g) in zip(gradient_buffer_init, gradient_init)]
    param_updates_init = [(param, param - learning_rate * mg) for (param, mg) in zip(gsn_params, m_gradient_init)]
    gradient_buffer_updates_init = zip(gradient_buffer_init, m_gradient_init)
    updates_init = OrderedDict(param_updates_init + gradient_buffer_updates_init)
    gsn_f_learn_init = theano.function(inputs=[X], updates=updates_init, outputs=[show_gsn_cost_init, gsn_init_error])
    gsn_f_cost_init = theano.function(inputs=[X], outputs=[show_gsn_cost_init, gsn_init_error])

    gradient = T.grad(gsn_cost, gsn_params)
    gradient_buffer = [theano.shared(numpy.zeros(param.get_value().shape, dtype='float32')) for param in gsn_params]
    m_gradient = [momentum * gb + (cast32(1) - momentum) * g for (gb, g) in zip(gradient_buffer, gradient)]
    param_updates = [(param, param - learning_rate * mg) for (param, mg) in zip(gsn_params, m_gradient)]
    gradient_buffer_updates = zip(gradient_buffer, m_gradient)
    updates = OrderedDict(param_updates + gradient_buffer_updates)
    gsn_f_cost = theano.function(inputs=Xs, outputs=[show_gsn_cost, gsn_error])
    gsn_f_learn = theano.function(inputs=Xs, updates=updates, outputs=[show_gsn_cost, gsn_error])
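    # Note on the update rule above: this is an exponential-moving-average variant
    # of momentum, m <- mu*m + (1 - mu)*g followed by theta <- theta - lr*m, rather
    # than classical momentum (which omits the (1 - mu) factor on the gradient).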
    regression_gradient = T.grad(regression_cost, regression_params)
    regression_gradient_buffer = [theano.shared(numpy.zeros(rparam.get_value().shape, dtype='float32'))
                                  for rparam in regression_params]
    regression_m_gradient = [momentum * rgb + (cast32(1) - momentum) * rg
                             for (rgb, rg) in zip(regression_gradient_buffer, regression_gradient)]
    regression_param_updates = [(rparam, rparam - regression_learning_rate * rmg)
                                for (rparam, rmg) in zip(regression_params, regression_m_gradient)]
    regression_gradient_buffer_updates = zip(regression_gradient_buffer, regression_m_gradient)
    regression_updates = OrderedDict(regression_param_updates + regression_gradient_buffer_updates)
    regression_f_cost = theano.function(inputs=Xs, outputs=[show_regression_cost, regression_error])
    regression_f_learn = theano.function(inputs=Xs, updates=regression_updates,
                                         outputs=[show_regression_cost, regression_error])

    logger.log("functions done. took " + make_time_units_string(time.time() - t) + ".\n")

    ############################################################################################
    # Denoise some numbers : show number, noisy number, predicted number, reconstructed number #
    ############################################################################################
    # Recompile the graph without noise for the reconstruction function
    # The layer update scheme
    logger.log("Creating graph for noisy reconstruction function at checkpoints during training.")
    predicted_X_chains_R, p_X_chains_R = build_sequence_graph(Xs_recon, noiseflag=False)
    predicted_X_chain_R = predicted_X_chains_R[-1]
    p_X_chain_R = p_X_chains_R[-1]
    f_recon = theano.function(inputs=Xs_recon, outputs=[predicted_X_chain_R[-1], p_X_chain_R[-1]])

    # Now do the same but for the GSN in the initial run
    # (reusing the p_X_chain_R name is safe here: f_recon has already been compiled)
    p_X_chain_R = build_gsn_graph(X, noiseflag=False)
    f_recon_init = theano.function(inputs=[X], outputs=p_X_chain_R[-1])

    ############
    # Sampling #
    ############
    f_noise = theano.function(inputs=[X], outputs=salt_and_pepper(X, state.input_salt_and_pepper))

    # the input to the sampling function
    network_state_input = [X] + [T.fmatrix() for i in range(layers)]

    # "Output" state of the network (noisy): initialized with input, then we apply updates
    # network_state_output = network_state_input
    network_state_output = [X] + network_state_input[1:]
    visible_pX_chain = []

    # ONE update
    logger.log("Performing one walkback in network state sampling.")
    update_layers(network_state_output, visible_pX_chain, noisy=True)

    if layers == 1:
        f_sample_simple = theano.function(inputs=[X], outputs=visible_pX_chain[-1])

    # The "unused input" warning is expected: the first odd layers are not used,
    # since they are directly computed FROM THE EVEN layers - hence on_unused_input='warn'
    f_sample2 = theano.function(inputs=network_state_input, outputs=network_state_output + visible_pX_chain,
                                on_unused_input='warn')

    def sample_some_numbers_single_layer():
        x0 = test_X.get_value()[7:8]
        samples = [x0]
        x = f_noise(x0)
        for i in range(399):
            x = f_sample_simple(x)
            samples.append(x)
            x = numpy.random.binomial(n=1, p=x, size=x.shape).astype('float32')
            x = f_noise(x)
        return numpy.vstack(samples)
    def sampling_wrapper(NSI):
        # * is the "splat" operator: it takes a list and expands it into actual
        # positional arguments in the function call
        out = f_sample2(*NSI)
        NSO = out[:len(network_state_output)]
        vis_pX_chain = out[len(network_state_output):]
        return NSO, vis_pX_chain

    def sample_some_numbers(N=400):
        # The network's initial state
        init_vis = test_X.get_value()[7:8]
        noisy_init_vis = f_noise(init_vis)
        network_state = [[noisy_init_vis] + [numpy.zeros((1, len(b.get_value())), dtype='float32')
                                             for b in bias_list[1:]]]
        visible_chain = [init_vis]
        noisy_h0_chain = [noisy_init_vis]
        for i in range(N - 1):
            # feed the last state into the network, compute a new state, and obtain
            # the visible units expectation chain
            net_state_out, vis_pX_chain = sampling_wrapper(network_state[-1])
            # append to the visible chain
            visible_chain += vis_pX_chain
            # append state output to the network state chain
            network_state.append(net_state_out)
            noisy_h0_chain.append(net_state_out[0])
        return numpy.vstack(visible_chain), numpy.vstack(noisy_h0_chain)

    def plot_samples(epoch_number, iteration):
        to_sample = time.time()
        if layers == 1:
            # one layer model
            V = sample_some_numbers_single_layer()
        else:
            V, _ = sample_some_numbers()
        img_samples = PIL.Image.fromarray(tile_raster_images(V, (root_N_input, root_N_input), (20, 20)))
        fname = outdir + 'samples_iteration_' + str(iteration) + '_epoch_' + str(epoch_number) + '.png'
        img_samples.save(fname)
        logger.log('Took ' + str(time.time() - to_sample) + ' to sample 400 numbers')

    #############################
    # Save the model parameters #
    #############################
    def save_params_to_file(name, n, gsn_params, iteration):
        logger.log('saving parameters...')
        save_path = outdir + name + '_params_iteration_' + str(iteration) + '_epoch_' + str(n) + '.pkl'
        f = open(save_path, 'wb')
        try:
            cPickle.dump(gsn_params, f, protocol=cPickle.HIGHEST_PROTOCOL)
        finally:
            f.close()

    def save_params(params):
        values = [param.get_value(borrow=True) for param in params]
        return values

    def restore_params(params, values):
        for i in range(len(params)):
            params[i].set_value(values[i])
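    # NOTE: hedged sketch of fix_input_size, which the training loops below rely on
    # (the real implementation lives in the repo's utils; this is an illustrative
    # assumption): the sequence minibatches are offset slices of the data, so near
    # the end of the set they come back with unequal lengths - trim all of them to
    # the shortest.
    # def fix_input_size(xs):
    #     min_len = min([len(x) for x in xs])
    #     return [x[:min_len] for x in xs], min_len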
    ################
    # GSN TRAINING #
    ################
    def train_GSN(iteration, train_X, train_Y, valid_X, valid_Y, test_X, test_Y):
        logger.log('----------------TRAINING GSN FOR ITERATION ' + str(iteration) + "--------------\n")

        # TRAINING
        n_epoch = state.n_epoch
        batch_size = state.batch_size
        STOP = False
        counter = 0
        if iteration == 0:
            learning_rate.set_value(cast32(state.learning_rate))  # learning rate
        times = []
        best_cost = float('inf')
        best_params = None
        patience = 0

        logger.log(['learning rate:', learning_rate.get_value()])
        logger.log(['train X size:', str(train_X.shape.eval())])
        logger.log(['valid X size:', str(valid_X.shape.eval())])
        logger.log(['test X size:', str(test_X.shape.eval())])

        if state.vis_init:
            # initialize the visible bias to the logit of the mean pixel values,
            # clipped into (0.001, 0.9) to keep the logit finite
            bias_list[0].set_value(logit(numpy.clip(train_X.get_value().mean(axis=0), 0.001, 0.9)))

        if state.test_model:
            # If testing, do not train and go directly to generating samples,
            # parzen window estimation, and inpainting
            logger.log('Testing : skip training')
            STOP = True

        while not STOP:
            counter += 1
            t = time.time()
            logger.append([counter, '\t'])

            # shuffle the data
            # data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y, test_X, test_Y, dataset, rng)

            # train
            train_costs = []
            train_errors = []
            if iteration == 0:
                for i in range(len(train_X.get_value(borrow=True)) // batch_size):
                    x = train_X.get_value(borrow=True)[i * batch_size: (i + 1) * batch_size]
                    cost, error = gsn_f_learn_init(x)
                    train_costs.append([cost])
                    train_errors.append([error])
            else:
                for i in range(len(train_X.get_value(borrow=True)) // batch_size):
                    xs = [train_X.get_value(borrow=True)[(i * batch_size) + sequence_idx:
                                                         ((i + 1) * batch_size) + sequence_idx]
                          for sequence_idx in range(len(Xs))]
                    xs, _ = fix_input_size(xs)
                    _ins = xs  # + [sequence_window_size]
                    cost, error = gsn_f_learn(*_ins)
                    train_costs.append(cost)
                    train_errors.append(error)
            train_costs = numpy.mean(train_costs)
            train_errors = numpy.mean(train_errors)
            logger.append(['Train: ', trunc(train_costs), trunc(train_errors), '\t'])
            with open(train_convergence, 'a') as f:
                f.write("{0!s},".format(train_costs))
                f.write("\n")

            # valid
            valid_costs = []
            if iteration == 0:
                for i in range(len(valid_X.get_value(borrow=True)) // batch_size):
                    x = valid_X.get_value(borrow=True)[i * batch_size: (i + 1) * batch_size]
                    cost, _ = gsn_f_cost_init(x)
                    valid_costs.append([cost])
            else:
                for i in range(len(valid_X.get_value(borrow=True)) // batch_size):
                    xs = [valid_X.get_value(borrow=True)[(i * batch_size) + sequence_idx:
                                                         ((i + 1) * batch_size) + sequence_idx]
                          for sequence_idx in range(len(Xs))]
                    xs, _ = fix_input_size(xs)
                    _ins = xs  # + [sequence_window_size]
                    costs, _ = gsn_f_cost(*_ins)
                    valid_costs.append(costs)
            valid_costs = numpy.mean(valid_costs)
            logger.append(['Valid: ', trunc(valid_costs), '\t'])
            with open(valid_convergence, 'a') as f:
                f.write("{0!s},".format(valid_costs))
                f.write("\n")

            # test
            test_costs = []
            test_errors = []
            if iteration == 0:
                for i in range(len(test_X.get_value(borrow=True)) // batch_size):
                    x = test_X.get_value(borrow=True)[i * batch_size: (i + 1) * batch_size]
                    cost, error = gsn_f_cost_init(x)
                    test_costs.append([cost])
                    test_errors.append([error])
            else:
                for i in range(len(test_X.get_value(borrow=True)) // batch_size):
                    xs = [test_X.get_value(borrow=True)[(i * batch_size) + sequence_idx:
                                                        ((i + 1) * batch_size) + sequence_idx]
                          for sequence_idx in range(len(Xs))]
                    xs, _ = fix_input_size(xs)
                    _ins = xs  # + [sequence_window_size]
                    costs, errors = gsn_f_cost(*_ins)
                    test_costs.append(costs)
                    test_errors.append(errors)
            test_costs = numpy.mean(test_costs)
            test_errors = numpy.mean(test_errors)
            logger.append(['Test: ', trunc(test_costs), trunc(test_errors), '\t'])
            with open(test_convergence, 'a') as f:
                f.write("{0!s},".format(test_costs))
                f.write("\n")

            # check for early stopping
            cost = numpy.sum(valid_costs)
            if cost < best_cost * state.early_stop_threshold:
                patience = 0
                best_cost = cost
                # save the parameters that made it the best
                best_params = save_params(gsn_params)
            else:
                patience += 1

            if counter >= n_epoch or patience >= state.early_stop_length:
                STOP = True
                if best_params is not None:
                    restore_params(gsn_params, best_params)
                save_params_to_file('gsn', counter, gsn_params, iteration)
                logger.log(["next learning rate should be", learning_rate.get_value() * annealing])

            timing = time.time() - t
            times.append(timing)
            logger.append('time: ' + make_time_units_string(timing))
            logger.log('remaining: ' + make_time_units_string((n_epoch - counter) * numpy.mean(times)))

            if (counter % state.save_frequency) == 0 or STOP is True:
                n_examples = 100
                if iteration == 0:
                    random_idx = numpy.array(R.sample(range(len(test_X.get_value())), n_examples))
                    numbers = test_X.get_value()[random_idx]
                    noisy_numbers = f_noise(test_X.get_value()[random_idx])
                    reconstructed = f_recon_init(noisy_numbers)
                    # Concatenate stuff
                    stacked = numpy.vstack([numpy.vstack([numbers[i * 10: (i + 1) * 10],
                                                          noisy_numbers[i * 10: (i + 1) * 10],
                                                          reconstructed[i * 10: (i + 1) * 10]])
                                            for i in range(10)])
                    number_reconstruction = PIL.Image.fromarray(
                        tile_raster_images(stacked, (root_N_input, root_N_input), (10, 30)))
                else:
                    n_examples = n_examples + sequence_window_size
                    # Checking reconstruction: grab the first n_examples numbers in the sequence from the test set
                    nums = test_X.get_value()[range(n_examples)]
                    noisy_nums = f_noise(test_X.get_value()[range(n_examples)])
                    reconstructed_prediction = []
                    reconstructed = []
                    for i in range(n_examples):
                        if i >= sequence_window_size:
                            xs = [noisy_nums[i - x] for x in range(len(Xs))]
                            xs.reverse()
                            _ins = xs  # + [sequence_window_size]
                            _outs = f_recon(*_ins)
                            prediction = _outs[0]
                            reconstruction = _outs[1]
                            reconstructed_prediction.append(prediction)
                            reconstructed.append(reconstruction)
                    nums = nums[sequence_window_size:]
                    noisy_nums = noisy_nums[sequence_window_size:]
                    reconstructed_prediction = numpy.array(reconstructed_prediction)
                    reconstructed = numpy.array(reconstructed)

                    # Concatenate stuff
                    stacked = numpy.vstack([numpy.vstack([nums[i * 10: (i + 1) * 10],
                                                          noisy_nums[i * 10: (i + 1) * 10],
                                                          reconstructed_prediction[i * 10: (i + 1) * 10],
                                                          reconstructed[i * 10: (i + 1) * 10]])
                                            for i in range(10)])
                    number_reconstruction = PIL.Image.fromarray(
                        tile_raster_images(stacked, (root_N_input, root_N_input), (10, 40)))

                # epoch_number = reduce(lambda x, y: x + y, ['_'] * (4 - len(str(counter)))) + str(counter)
                number_reconstruction.save(
                    outdir + 'gsn_number_reconstruction_iteration_' + str(iteration) + '_epoch_' + str(counter) + '.png')

                # sample_numbers(counter, 'seven')
                plot_samples(counter, iteration)

                # save gsn_params
                save_params_to_file('gsn', counter, gsn_params, iteration)

            # ANNEAL!
            new_lr = learning_rate.get_value() * annealing
            learning_rate.set_value(new_lr)

        # 10k samples
        logger.log('Generating 10,000 samples')
        samples, _ = sample_some_numbers(N=10000)
        f_samples = outdir + 'samples.npy'
        numpy.save(f_samples, samples)
        logger.log('saved digits')

    #######################
    # REGRESSION TRAINING #
    #######################
    def train_regression(iteration, train_X, train_Y, valid_X, valid_Y, test_X, test_Y):
        logger.log('-------------TRAINING REGRESSION FOR ITERATION {0!s}-------------'.format(iteration))

        # TRAINING
        n_epoch = state.n_epoch
        batch_size = state.batch_size
        STOP = False
        counter = 0
        best_cost = float('inf')
        best_params = None
        patience = 0
        if iteration == 0:
            regression_learning_rate.set_value(cast32(state.learning_rate))  # learning rate
        times = []

        logger.log(['learning rate:', regression_learning_rate.get_value()])
        logger.log(['train X size:', str(train_X.shape.eval())])
        logger.log(['valid X size:', str(valid_X.shape.eval())])
        logger.log(['test X size:', str(test_X.shape.eval())])

        if state.test_model:
            # If testing, do not train and go directly to generating samples,
            # parzen window estimation, and inpainting
            logger.log('Testing : skip training')
            STOP = True

        while not STOP:
            counter += 1
            t = time.time()
            logger.append([counter, '\t'])

            # shuffle the data
            # data.sequence_mnist_data(train_X, train_Y, valid_X, valid_Y, test_X, test_Y, dataset, rng)

            # train
            train_costs = []
            train_errors = []
            for i in range(len(train_X.get_value(borrow=True)) // batch_size):
                xs = [train_X.get_value(borrow=True)[(i * batch_size) + sequence_idx:
                                                     ((i + 1) * batch_size) + sequence_idx]
                      for sequence_idx in range(len(Xs))]
                xs, _ = fix_input_size(xs)
                _ins = xs  # + [sequence_window_size]
                cost, error = regression_f_learn(*_ins)
                # print trunc(cost)
                # print [numpy.asarray(a) for a in f_check(*_ins)]
                train_costs.append(cost)
                train_errors.append(error)
            train_costs = numpy.mean(train_costs)
            train_errors = numpy.mean(train_errors)
            logger.append(['rTrain: ', trunc(train_costs), trunc(train_errors), '\t'])
            with open(regression_train_convergence, 'a') as f:
                f.write("{0!s},".format(train_costs))
                f.write("\n")

            # valid
            valid_costs = []
            for i in range(len(valid_X.get_value(borrow=True)) // batch_size):
                xs = [valid_X.get_value(borrow=True)[(i * batch_size) + sequence_idx:
                                                     ((i + 1) * batch_size) + sequence_idx]
                      for sequence_idx in range(len(Xs))]
                xs, _ = fix_input_size(xs)
                _ins = xs  # + [sequence_window_size]
                cost, _ = regression_f_cost(*_ins)
                valid_costs.append(cost)
            valid_costs = numpy.mean(valid_costs)
            logger.append(['rValid: ', trunc(valid_costs), '\t'])
            with open(regression_valid_convergence, 'a') as f:
                f.write("{0!s},".format(valid_costs))
                f.write("\n")

            # test
            test_costs = []
            test_errors = []
            for i in range(len(test_X.get_value(borrow=True)) // batch_size):
                xs = [test_X.get_value(borrow=True)[(i * batch_size) + sequence_idx:
                                                    ((i + 1) * batch_size) + sequence_idx]
                      for sequence_idx in range(len(Xs))]
                xs, _ = fix_input_size(xs)
                _ins = xs  # + [sequence_window_size]
                cost, error = regression_f_cost(*_ins)
                test_costs.append(cost)
                test_errors.append(error)
            test_costs = numpy.mean(test_costs)
            test_errors = numpy.mean(test_errors)
            logger.append(['rTest: ', trunc(test_costs), trunc(test_errors), '\t'])
            with open(regression_test_convergence, 'a') as f:
                f.write("{0!s},".format(test_costs))
                f.write("\n")

            # check for early stopping
            cost = numpy.sum(valid_costs)
            if cost < best_cost * state.early_stop_threshold:
                patience = 0
                best_cost = cost
                # keep the best params so far
                best_params = save_params(regression_params)
            else:
                patience += 1

            if counter >= n_epoch or patience >= state.early_stop_length:
                STOP = True
                if best_params is not None:
                    restore_params(regression_params, best_params)
                save_params_to_file('regression', counter, regression_params, iteration)
                logger.log(["next learning rate should be", regression_learning_rate.get_value() * annealing])

            timing = time.time() - t
            times.append(timing)
            logger.append('time: ' + make_time_units_string(timing))
            logger.log('remaining: ' + make_time_units_string((n_epoch - counter) * numpy.mean(times)))

            if (counter % state.save_frequency) == 0 or STOP is True:
                n_examples = 100 + sequence_window_size
                # Checking reconstruction: grab the first n_examples numbers in the sequence from the test set
                nums = test_X.get_value()[range(n_examples)]
                noisy_nums = f_noise(test_X.get_value()[range(n_examples)])
                reconstructed_prediction = []
                reconstructed = []
                for i in range(n_examples):
                    if i >= sequence_window_size:
                        xs = [noisy_nums[i - x] for x in range(len(Xs))]
                        xs.reverse()
                        _ins = xs  # + [sequence_window_size]
                        _outs = f_recon(*_ins)
                        prediction = _outs[0]
                        reconstruction = _outs[1]
                        reconstructed_prediction.append(prediction)
                        reconstructed.append(reconstruction)
                nums = nums[sequence_window_size:]
                noisy_nums = noisy_nums[sequence_window_size:]
                reconstructed_prediction = numpy.array(reconstructed_prediction)
                reconstructed = numpy.array(reconstructed)

                # Concatenate stuff
                stacked = numpy.vstack([numpy.vstack([nums[i * 10: (i + 1) * 10],
                                                      noisy_nums[i * 10: (i + 1) * 10],
                                                      reconstructed_prediction[i * 10: (i + 1) * 10],
                                                      reconstructed[i * 10: (i + 1) * 10]])
                                        for i in range(10)])
                number_reconstruction = PIL.Image.fromarray(
                    tile_raster_images(stacked, (root_N_input, root_N_input), (10, 40)))

                # epoch_number = reduce(lambda x, y: x + y, ['_'] * (4 - len(str(counter)))) + str(counter)
                number_reconstruction.save(
                    outdir + 'regression_number_reconstruction_iteration_' + str(iteration) + '_epoch_' + str(counter) + '.png')

                # save regression_params
                save_params_to_file('regression', counter, regression_params, iteration)
            # ANNEAL!
            new_r_lr = regression_learning_rate.get_value() * annealing
            regression_learning_rate.set_value(new_r_lr)

    #####################
    # STORY 1 ALGORITHM #
    #####################
    # alternate training the gsn and training the regression
    for iteration in range(state.max_iterations):
        # if iteration is 0 and initialized_gsn is False:
        #     train_regression(iteration, train_X, train_Y, valid_X, valid_Y, test_X, test_Y)
        # else:
        #     train_GSN(iteration, train_X, train_Y, valid_X, valid_Y, test_X, test_Y)
        #     train_regression(iteration, train_X, train_Y, valid_X, valid_Y, test_X, test_Y)
        train_GSN(iteration, train_X, train_Y, valid_X, valid_Y, test_X, test_Y)
        train_regression(iteration, train_X, train_Y, valid_X, valid_Y, test_X, test_Y)
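

# NOTE: hypothetical usage sketch (not part of the original file). `experiment`
# expects a `state` namespace carrying the fields referenced above; every value
# below is an illustrative assumption, not a recommended setting.
if __name__ == '__main__':
    state = argparse.Namespace(
        dataset='MNIST_1', data_path='../data/', test_model=0,
        layers=3, walkbacks=5, hidden_size=1500, sequence_window_size=2,
        hidden_act='tanh', visible_act='sigmoid', cost_funct='binary_crossentropy',
        n_epoch=200, batch_size=100, save_frequency=10,
        early_stop_threshold=0.9995, early_stop_length=30,
        learning_rate=0.25, momentum=0.5, annealing=0.995,
        hidden_add_noise_sigma=2., input_salt_and_pepper=0.4,
        noiseless_h1=1, input_sampling=1, vis_init=0,
        regularize_weight=0., max_iterations=10)
    experiment(state, outdir_base='../outputs/tgsn/')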