import os
import argparse

# set up the command-line interface (other argument definitions, e.g. the --group and
# --model flags used below, are truncated in the source; only the --gpu flag is
# reconstructed here from its help text)
parser = argparse.ArgumentParser()
parser.add_argument("--gpu", type=int, default=0, help="ID of GPU device (if there are multiple)")
args = parser.parse_args()

import tensorflow as tf
import data, graph, graphST, warp
from params import Params

print("=======================================================")
print("train.py (training on MNIST)")
print("=======================================================")

# load data
print("loading MNIST dataset...")
trainData, validData, testData = data.loadMNIST("data/MNIST.npz")

# set parameters
print("setting configurations...")
params = Params(args)

# create directories for model output
suffix = args.group
if not os.path.exists("models_{0}".format(suffix)):
    os.mkdir("models_{0}".format(suffix))
if not os.path.exists("models_{0}/interm".format(suffix)):
    os.mkdir("models_{0}/interm".format(suffix))
if not os.path.exists("models_{0}/final".format(suffix)):
    os.mkdir("models_{0}/final".format(suffix))
saveFname = args.model
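# The three existence checks above can be written more compactly with os.makedirs,
# which also creates missing parent directories and, with exist_ok=True (Python 3),
# is a no-op when a directory already exists. The helper below is only an
# illustrative sketch and is not called by the script:
def _make_output_dirs(suffix):
    os.makedirs("models_{0}/interm".format(suffix), exist_ok=True)
    os.makedirs("models_{0}/final".format(suffix), exist_ok=True)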
import argparse
import options

print("setting configurations...")
opt = options.set()

import tensorflow as tf
import data, graph, warp, util

print("=======================================================")
print("train.py (training on MNIST)")
print("=======================================================")

# load data
print("loading MNIST dataset...")
trainData, validData, testData = data.loadMNIST("data/MNIST.npz")

# create directories for model output
util.mkdir("models_{0}".format(opt.group))
util.mkdir("models_{0}/interm".format(opt.group))
util.mkdir("models_{0}/final".format(opt.group))

print("training model {0}...".format(opt.model))
print("------------------------------------------")
print("warpScale: (pert) {0} (trans) {1}".format(opt.warpScale["pert"], opt.warpScale["trans"]))
print("warpType: {0}".format(opt.warpType))
print("batchSize: {0}".format(opt.batchSize))
print("GPU device: {0}".format(opt.gpu))
print("------------------------------------------")

tf.reset_default_graph()
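# util.mkdir is defined in the repository's util module and is not shown in this
# excerpt. The sketch below illustrates the behaviour assumed by the calls above
# (create the directory only if it does not already exist); it is not the actual
# helper and is not used by the script:
def _mkdir_sketch(path):
    import os
    if not os.path.exists(path):
        os.mkdir(path)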
# NOTE: this excerpt starts at main(); the small helpers it relies on (data_path,
# bernoullisample, enumerate_reversed) are defined earlier in the original file.
# The import block below is reconstructed under that assumption, using the
# standard Theano/Lasagne/Parmesan module paths.
import time
import pickle
from itertools import product

import numpy
import theano
import theano.tensor as T
from scipy.stats import norm as gaussian

from lasagne import updates
from lasagne.layers import (InputLayer, DenseLayer, ReshapeLayer, Pool2DLayer,
                            NonlinearityLayer, get_output, get_all_params)
from lasagne.nonlinearities import softplus, identity, sigmoid
from parmesan.layers import SimpleSampleLayer
from parmesan.distributions import (log_bernoulli, log_normal2, log_stdnormal,
                                    kl_normal2_stdnormal)

import data


def main():
    # Main setup
    latent_sizes = [2, 5, 10, 20, 30, 50, 100]
    downsampling_factors = [1, 2, 4]
    N_epochs = 50
    binarise_downsampling = False
    bernoulli_sampling = True

    # Setup
    C = 1   # number of channels in image
    H = 28  # height of image
    W = 28  # width of image
    # K = 10  # number of classes
    hidden_sizes = [200, 200]
    batch_size = 100
    analytic_kl_term = True
    learning_rate = 0.001  # 0.0003
    shape = [H * W * C]

    # Symbolic variables
    symbolic_x_LR = T.matrix()
    symbolic_x_HR = T.matrix()
    symbolic_z = T.matrix()
    symbolic_learning_rate = T.scalar('learning_rate')

    # Fix random seed for reproducibility
    numpy.random.seed(1234)

    # Data
    file_name = "mnist.pkl.gz"
    file_path = data_path(file_name)
    (X_train, y_train), (X_valid, y_valid), (X_test, y_test) = data.loadMNIST(file_path, shape)

    X_train = numpy.concatenate([X_train, X_valid])
    X_train = X_train.astype(theano.config.floatX)
    X_test = X_test.astype(theano.config.floatX)

    N_train_batches = X_train.shape[0] // batch_size
    N_test_batches = X_test.shape[0] // batch_size

    if bernoulli_sampling:
        preprocess = bernoullisample
    else:
        preprocess = numpy.round

    # Setup shared variables
    X_train_shared = theano.shared(preprocess(X_train), borrow=True)
    X_test_shared = theano.shared(preprocess(X_test), borrow=True)
    X_test_shared_fixed = theano.shared(numpy.round(X_test), borrow=True)
    X_test_shared_normal = theano.shared(X_test, borrow=True)

    all_runs_duration = 0

    for latent_size, downsampling_factor in product(latent_sizes, downsampling_factors):
        run_start = time.time()
        print("Training model with a latent size of {} and images downsampled by {}:\n".format(
            latent_size, downsampling_factor))

        # Models
        h = H // downsampling_factor
        w = W // downsampling_factor

        ## Recognition model q(z|x)
        l_enc_HR_in = InputLayer((None, H * W * C), name="ENC_HR_INPUT")

        l_enc_HR_downsample = l_enc_HR_in
        l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C, H, W))
        if downsampling_factor != 1:
            l_enc_HR_downsample = Pool2DLayer(l_enc_HR_downsample,
                                              pool_size=downsampling_factor,
                                              mode="average_exc_pad")
        # TODO Should downsampled data be binarised? (worse performance)
        if binarise_downsampling:
            l_enc_HR_downsample = NonlinearityLayer(l_enc_HR_downsample, nonlinearity=T.round)
        l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, h * w * C))

        l_enc_LR_in = InputLayer((None, h * w * C), name="ENC_LR_INPUT")

        l_enc = l_enc_LR_in
        for i, hidden_size in enumerate(hidden_sizes, start=1):
            l_enc = DenseLayer(l_enc, num_units=hidden_size, nonlinearity=softplus,
                               name='ENC_DENSE{:d}'.format(i))

        l_z_mu = DenseLayer(l_enc, num_units=latent_size, nonlinearity=identity, name='ENC_Z_MU')
        l_z_log_var = DenseLayer(l_enc, num_units=latent_size, nonlinearity=identity,
                                 name='ENC_Z_LOG_VAR')

        # Sample the latent variables using mu(x) and log(sigma^2(x))
        l_z = SimpleSampleLayer(mean=l_z_mu, log_var=l_z_log_var)  # as Kingma
        # l_z = SampleLayer(mean = l_z_mu, log_var = l_z_log_var)

        ## Generative model p(x|z)
        l_dec_in = InputLayer((None, latent_size), name="DEC_INPUT")

        l_dec = l_dec_in
        for i, hidden_size in enumerate_reversed(hidden_sizes, start=0):
            l_dec = DenseLayer(l_dec, num_units=hidden_size, nonlinearity=softplus,
                               name='DEC_DENSE{:d}'.format(i))

        l_dec_x_mu = DenseLayer(l_dec, num_units=H * W * C, nonlinearity=sigmoid,
                                name='DEC_X_MU')
        l_dec_x_log_var = DenseLayer(l_dec, num_units=H * W * C, nonlinearity=sigmoid,
                                     name='DEC_X_LOG_VAR')
        # TRY relu instead of softplus (maybe with more hidden units)
        # TRY softmax instead of sigmoid
        # PROBLEM with this is that we have several pixels activated.

        ## Get outputs from models
        # With noise
        x_LR_train = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic=False)
        z_train, z_mu_train, z_log_var_train = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_train}, deterministic=False
        )
        x_mu_train, x_log_var_train = get_output([l_dec_x_mu, l_dec_x_log_var],
                                                 {l_dec_in: z_train}, deterministic=False)

        # Without noise
        x_LR_eval = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic=True)
        z_eval, z_mu_eval, z_log_var_eval = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_eval}, deterministic=True
        )
        x_mu_eval, x_log_var_eval = get_output([l_dec_x_mu, l_dec_x_log_var],
                                               {l_dec_in: z_eval}, deterministic=True)

        # Sampling
        x_mu_sample = get_output(l_dec_x_mu, {l_dec_in: symbolic_z}, deterministic=True)

        # Likelihood
        # Calculate the log-likelihood(x) = E_q[ log p(x|z) + log p(z) - log q(z|x) ]
        def log_likelihood(z, z_mu, z_log_var, x_mu, x_log_var, x, analytic_kl_term):
            if analytic_kl_term:
                kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
                log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
                # log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis = 1)
                LL = T.mean(-kl_term + log_px_given_z)
            else:
                log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
                log_pz = log_stdnormal(z).sum(axis=1)
                log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
                # log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis = 1)
                LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
            return LL

        # log-likelihood for training
        ll_train = log_likelihood(
            z_train, z_mu_train, z_log_var_train, x_mu_train, x_log_var_train,
            symbolic_x_HR, analytic_kl_term)
        # log-likelihood for evaluating
        ll_eval = log_likelihood(
            z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, x_log_var_eval,
            symbolic_x_HR, analytic_kl_term)

        # Parameters to train
        parameters = get_all_params([l_z, l_dec_x_mu], trainable=True)
        # parameters = get_all_params([l_z, l_dec_x_mu, l_dec_x_log_var], trainable = True)
        print("Parameters that will be trained:")
        for parameter in parameters:
            print("{}: {}".format(parameter, parameter.get_value().shape))

        ### Take gradient of negative log-likelihood
        gradients = T.grad(-ll_train, parameters)

        # Adding gradient clipping to reduce the effects of exploding gradients,
        # and hence speed up convergence
        gradient_clipping = 1
        gradient_norm_max = 5
        gradient_constrained = updates.total_norm_constraint(gradients,
                                                             max_norm=gradient_norm_max)
        gradients_clipped = [T.clip(g, -gradient_clipping, gradient_clipping)
                             for g in gradient_constrained]

        # Setting up functions for training
        symbolic_batch_index = T.iscalar('index')
        batch_slice = slice(symbolic_batch_index * batch_size,
                            (symbolic_batch_index + 1) * batch_size)

        update_expressions = updates.adam(gradients_clipped, parameters,
                                          learning_rate=symbolic_learning_rate)

        train_model = theano.function(
            [symbolic_batch_index, symbolic_learning_rate], ll_train,
            updates=update_expressions,
            givens={symbolic_x_HR: X_train_shared[batch_slice]}
        )
        test_model = theano.function(
            [symbolic_batch_index], ll_eval,
            givens={symbolic_x_HR: X_test_shared[batch_slice]}
        )
        test_model_fixed = theano.function(
            [symbolic_batch_index], ll_eval,
            givens={symbolic_x_HR: X_test_shared_fixed[batch_slice]}
        )

        def train_epoch(learning_rate):
            costs = []
            for i in range(N_train_batches):
                cost_batch = train_model(i, learning_rate)
                costs += [cost_batch]
            return numpy.mean(costs)

        def test_epoch():
            costs = []
            for i in range(N_test_batches):
                cost_batch = test_model(i)
                costs += [cost_batch]
            return numpy.mean(costs)

        def test_epoch_fixed():
            costs = []
            for i in range(N_test_batches):
                cost_batch = test_model_fixed(i)
                costs += [cost_batch]
            return numpy.mean(costs)

        # Training
        epochs = []
        cost_train = []
        cost_test = []

        print()
        for epoch in range(N_epochs):
            epoch_start = time.time()

            # Shuffle train data
            numpy.random.shuffle(X_train)
            X_train_shared.set_value(preprocess(X_train))

            # TODO: Use a dynamically changing learning rate
            train_cost = train_epoch(learning_rate)
            test_cost = test_epoch()
            test_cost_fixed = test_epoch_fixed()

            epoch_duration = time.time() - epoch_start

            epochs.append(epoch + 1)
            cost_train.append(train_cost)
            cost_test.append(test_cost)

            # line = "Epoch: %i\tTime: %0.2f\tLR: %0.5f\tLL Train: %0.3f\tLL test: %0.3f\t" % (
            #     epoch, t, learning_rate, train_cost, test_cost)
            print("Epoch {:d} (duration: {:.2f} s, learning rate: {:.1e}):".format(
                epoch + 1, epoch_duration, learning_rate))
            print("    log-likelihood: {:.3f} (training set), {:.3f} (test set)".format(
                train_cost, test_cost))
            print()

        # Results

        ## Reconstruction
        N_reconstructions = 50

        X_test_eval = X_test_shared.eval()
        X_test_eval_fixed = X_test_shared_fixed.eval()
        X_test_eval_normal = X_test_shared_normal.eval()

        subset = numpy.random.randint(0, len(X_test_eval), size=N_reconstructions)

        x_original = X_test_eval[numpy.array(subset)]
        x_LR = get_output(l_enc_HR_downsample, x_original).eval()
        z = get_output(l_z, x_LR).eval()
        x_reconstructed = x_mu_sample.eval({symbolic_z: z})

        x_original_fixed = X_test_eval_fixed[numpy.array(subset)]
        x_LR_fixed = get_output(l_enc_HR_downsample, x_original_fixed).eval()
        z_fixed = get_output(l_z, x_LR_fixed).eval()
        x_reconstructed_fixed = x_mu_sample.eval({symbolic_z: z_fixed})

        originals = X_test_eval_normal[numpy.array(subset)]
        reconstructions = {
            "originals": x_original,
            "downsampled": x_LR,
            "reconstructions": x_reconstructed
        }
        reconstructions_fixed = {
            "originals": x_original_fixed,
            "downsampled": x_LR_fixed,
            "reconstructions": x_reconstructed_fixed
        }

        ## Manifold
        if latent_size == 2:
            x = numpy.linspace(0.1, 0.9, 20)  # TODO: Ideally sample from the real p(z)
            v = gaussian.ppf(x)
            z = numpy.zeros((20**2, 2))
            i = 0
            for a in v:
                for b in v:
                    z[i, 0] = a
                    z[i, 1] = b
                    i += 1
            z = z.astype('float32')
            samples = x_mu_sample.eval({symbolic_z: z})
        else:
            samples = None

        ## Reconstructions of homemade numbers
        if downsampling_factor == 2:
            file_names = [
                "hm_7_Avenir.png", "hm_7_Noteworthy.png", "hm_7_Chalkboard.png",
                "hm_7_drawn.png", "hm_A_Noteworthy.png", "hm_A_drawn.png",
                "hm_7_0.txt", "hm_7_1.txt", "hm_7_2.txt", "hm_A.txt"
            ]
            x_LR_HM = data.loadHomemade(list(map(data_path, file_names)), [h * w])
            z = get_output(l_z, x_LR_HM).eval()
            x_HM_reconstructed = x_mu_sample.eval({symbolic_z: z})
            reconstructions_homemade = {
                "originals": x_LR_HM,
                "reconstructions": x_HM_reconstructed
            }
        else:
            reconstructions_homemade = None

        # Saving
        setup_and_results = {
            "setup": {
                "image size": (C, H, W),
                "downsampling factor": downsampling_factor,
                "learning rate": learning_rate,
                "analytic K-L term": analytic_kl_term,
                "batch size": batch_size,
                "hidden layer sizes": hidden_sizes,
                "latent size": latent_size,
                "number of epochs": N_epochs
            },
            "results": {
                "learning curve": {
                    "epochs": epochs,
                    "training cost function": cost_train,
                    "test cost function": cost_test
                },
                "originals": originals,
                "reconstructions": reconstructions,
                "reconstructions (fixed)": reconstructions_fixed,
                "manifold": {
                    "samples": samples
                },
                "reconstructed homemade numbers": reconstructions_homemade
            }
        }

        file_name = "results{}_ds{}{}_l{}_e{}.pkl".format(
            "_bs" if bernoulli_sampling else "",
            downsampling_factor,
            "b" if binarise_downsampling else "",
            latent_size, N_epochs)
        with open(data_path(file_name), "wb") as f:
            pickle.dump(setup_and_results, f)

        run_duration = time.time() - run_start
        all_runs_duration += run_duration
        print("Run took {:.2f} minutes.".format(run_duration / 60))
        print("\n")

    print("All runs took {:.2f} minutes in total.".format(all_runs_duration / 60))
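# For reference, the two summed terms used in log_likelihood() above have simple
# closed forms when q(z|x) = N(mu, exp(log_var)) and the decoder is Bernoulli with
# mean x_mu. This NumPy sketch (not part of the original script) computes the same
# per-example quantities:
#
#   KL(q(z|x) || N(0, I)) = -0.5 * sum_j (1 + log_var_j - mu_j^2 - exp(log_var_j))
#   log p(x|z)            = sum_i (x_i * log(x_mu_i) + (1 - x_i) * log(1 - x_mu_i))
import numpy as np

def kl_to_stdnormal(mu, log_var):
    # analytic KL divergence, summed over latent dimensions (one value per example)
    return -0.5 * np.sum(1 + log_var - mu ** 2 - np.exp(log_var), axis=1)

def bernoulli_log_px(x, x_mu, eps=1e-6):
    # Bernoulli log-likelihood, summed over pixels, with the same epsilon guard
    # as the eps=1e-6 argument used above
    x_mu = np.clip(x_mu, eps, 1 - eps)
    return np.sum(x * np.log(x_mu) + (1 - x) * np.log(1 - x_mu), axis=1)

# The objective maximised during training (analytic_kl_term = True) is then
# mean(-kl_to_stdnormal(z_mu, z_log_var) + bernoulli_log_px(x, x_mu)).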
# (excerpt from a PyTorch variant of train.py; torch and the geometric predictor
# network "geometric" are presumably imported/constructed above this excerpt)
classifier = graph.CNN(opt)

# ------ define loss ------
loss = torch.nn.CrossEntropyLoss()

# ------ optimizer ------
optimList = [
    {"params": geometric.parameters(), "lr": opt.lrGP},
    {"params": classifier.parameters(), "lr": opt.lrC},
]
optim = torch.optim.SGD(optimList)

# load data
print(util.toMagenta("loading MNIST dataset..."))
trainData, testData = data.loadMNIST(opt, "data")

# visdom visualizer
vis = util.Visdom(opt)

print(util.toYellow("======= TRAINING START ======="))
timeStart = time.time()

# start session
with torch.cuda.device(0):
    geometric.train()
    classifier.train()
    if opt.fromIt != 0:
        util.restoreModel(opt, geometric, classifier, opt.fromIt)
        print(util.toMagenta("resuming from iteration {0}...".format(opt.fromIt)))
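# The training loop itself is not part of this excerpt. The sketch below shows a
# single iteration with the two-group optimizer defined above; the batch tensors
# and the way the two networks are composed here are illustrative placeholders,
# not the pipeline of the original script.
def _train_step_sketch(geometric, classifier, optim, loss, imageBatch, labelBatch):
    # forward pass: geometric module first, then the classifier on its output
    logits = classifier(geometric(imageBatch))
    # cross-entropy between logits and integer class labels
    error = loss(logits, labelBatch)
    # backward pass and SGD update using the per-group learning rates (lrGP, lrC)
    optim.zero_grad()
    error.backward()
    optim.step()
    return error.item()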
def main():
    # TODO Make this work better.
    # See https://swarbrickjones.wordpress.com/2015/04/29/convolutional-autoencoders-in-pythontheanolasagne/.
    # NOTE: imports are not part of this excerpt; this script uses the same
    # Theano/Lasagne/Parmesan names as the dense VAE above, plus Conv2DLayer,
    # Deconv2DLayer, PadLayer (lasagne.layers) and rectify (lasagne.nonlinearities).

    # Setup
    C = 1   # number of channels in image
    H = 28  # height of image
    W = 28  # width of image
    # K = 10  # number of classes
    shape = [C * H * W]

    padding_size = 2
    downsampling_factor = 2

    # Dense layers
    hidden_sizes = [200, 200]
    latent_size = 2

    # Convolutional layers
    filters = [{"number": 16, "size": 3, "stride": 1}]

    batch_size = 100
    analytic_kl_term = True
    learning_rate = 0.01
    N_epochs = 10  # 1000

    # Symbolic variables
    symbolic_x_LR = T.matrix()
    symbolic_x_HR = T.matrix()
    symbolic_z = T.matrix()
    symbolic_learning_rate = T.scalar('learning_rate')

    # Fix random seed for reproducibility
    numpy.random.seed(1234)

    # Data
    file_name = "mnist.pkl.gz"
    file_path = data_path(file_name)
    (X_train, y_train), (X_valid, y_valid), (X_test, y_test) = data.loadMNIST(file_path, shape)

    X_train = numpy.concatenate([X_train, X_valid])
    X_train = X_train.astype(theano.config.floatX)
    X_test = X_test.astype(theano.config.floatX)

    N_train_batches = X_train.shape[0] // batch_size
    N_test_batches = X_test.shape[0] // batch_size

    # Setup shared variables
    X_train_shared = theano.shared(X_train, borrow=True)
    X_test_shared = theano.shared(X_test, borrow=True)

    # Models

    ## Recognition model q(z|x)
    pool_size = 2

    l_enc_HR_in = InputLayer((None, C * H * W), name="ENC_HR_INPUT")

    l_enc_HR_downsample = l_enc_HR_in
    l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C, H, W))
    l_enc_HR_downsample = PadLayer(l_enc_HR_downsample, width=padding_size)
    l_enc_HR_downsample = Pool2DLayer(l_enc_HR_downsample, pool_size=downsampling_factor,
                                      mode="average_exc_pad")
    _, _, h, w = l_enc_HR_downsample.output_shape
    l_enc_HR_downsample = ReshapeLayer(l_enc_HR_downsample, (-1, C * h * w))

    l_enc_LR_in = InputLayer((None, C * h * w), name="ENC_LR_INPUT")

    l_enc = l_enc_LR_in
    l_enc = ReshapeLayer(l_enc, (-1, C, h, w))
    for i, filter_ in enumerate(filters):
        l_enc = Conv2DLayer(l_enc, filter_["number"], filter_["size"], filter_["stride"],
                            pad="same", nonlinearity=rectify, name='ENC_CONV_{:d}'.format(i))
        # l_enc = Pool2DLayer(l_enc, pool_size)

    l_z_mu = DenseLayer(l_enc, num_units=latent_size, nonlinearity=None, name='ENC_Z_MU')
    l_z_log_var = DenseLayer(l_enc, num_units=latent_size, nonlinearity=None,
                             name='ENC_Z_LOG_VAR')

    # Sample the latent variables using mu(x) and log(sigma^2(x))
    l_z = SimpleSampleLayer(mean=l_z_mu, log_var=l_z_log_var)  # as Kingma
    # l_z = SampleLayer(mean = l_z_mu, log_var = l_z_log_var)

    ## Generative model p(x|z)
    l_dec_in = InputLayer((None, latent_size), name="DEC_INPUT")
    l_dec = DenseLayer(l_dec_in, num_units=C * H * W, nonlinearity=rectify, name="DEC_DENSE")
    l_dec = ReshapeLayer(l_dec, (-1, C, H, W))
    for i, filter_ in enumerate_reversed(filters, start=0):
        if filter_["stride"] == 1:
            l_dec = Conv2DLayer(l_dec, filter_["number"], filter_["size"], filter_["stride"],
                                pad="same", nonlinearity=rectify,
                                name='DEC_CONV_{:d}'.format(i))
        else:
            l_dec = Deconv2DLayer(l_dec, filter_["number"], filter_["size"], filter_["stride"],
                                  nonlinearity=rectify, name='DEC_CONV_{:d}'.format(i))

    l_dec_x_mu = Conv2DLayer(l_dec, num_filters=C, filter_size=(3, 3), stride=1, pad='same',
                             nonlinearity=None, name='DEC_X_MU')
    l_dec_x_mu = ReshapeLayer(l_dec_x_mu, (-1, C * H * W))

    ## Get outputs from models
    # With noise
    x_LR_train = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic=False)
    z_train, z_mu_train, z_log_var_train = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_train}, deterministic=False
    )
    x_mu_train = get_output(l_dec_x_mu, {l_dec_in: z_train}, deterministic=False)

    # Without noise
    x_LR_eval = get_output(l_enc_HR_downsample, symbolic_x_HR, deterministic=True)
    z_eval, z_mu_eval, z_log_var_eval = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_LR_in: x_LR_eval}, deterministic=True
    )
    x_mu_eval = get_output(l_dec_x_mu, {l_dec_in: z_eval}, deterministic=True)

    # Sampling
    x_mu_sample = get_output(l_dec_x_mu, {l_dec_in: symbolic_z}, deterministic=True)

    # Likelihood
    # Calculate the log-likelihood(x) = E_q[ log p(x|z) + log p(z) - log q(z|x) ]
    def log_likelihood(z, z_mu, z_log_var, x_mu, x, analytic_kl_term):
        if analytic_kl_term:
            kl_term = kl_normal2_stdnormal(z_mu, z_log_var).sum(axis=1)
            log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
            LL = T.mean(-kl_term + log_px_given_z)
        else:
            log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=1)
            log_pz = log_stdnormal(z).sum(axis=1)
            log_px_given_z = log_bernoulli(x, x_mu, eps=1e-6).sum(axis=1)
            LL = T.mean(log_pz + log_px_given_z - log_qz_given_x)
        return LL

    # log-likelihood for training
    ll_train = log_likelihood(
        z_train, z_mu_train, z_log_var_train, x_mu_train, symbolic_x_HR, analytic_kl_term)
    # log-likelihood for evaluating
    ll_eval = log_likelihood(
        z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, symbolic_x_HR, analytic_kl_term)

    # Parameters to train
    parameters = get_all_params([l_z_mu, l_dec_x_mu], trainable=True)
    print("Parameters that will be trained:")
    for parameter in parameters:
        print("{}: {}".format(parameter, parameter.get_value().shape))

    ### Take gradient of negative log-likelihood
    gradients = T.grad(-ll_train, parameters)

    # Adding gradient clipping to reduce the effects of exploding gradients,
    # and hence speed up convergence
    gradient_clipping = 1
    gradient_norm_max = 5
    gradient_constrained = updates.total_norm_constraint(gradients, max_norm=gradient_norm_max)
    gradients_clipped = [T.clip(g, -gradient_clipping, gradient_clipping)
                         for g in gradient_constrained]

    # Setting up functions for training
    symbolic_batch_index = T.iscalar('index')
    batch_slice = slice(symbolic_batch_index * batch_size,
                        (symbolic_batch_index + 1) * batch_size)

    update_expressions = updates.adam(gradients_clipped, parameters,
                                      learning_rate=symbolic_learning_rate)

    train_model = theano.function(
        [symbolic_batch_index, symbolic_learning_rate], ll_train,
        updates=update_expressions,
        givens={symbolic_x_HR: X_train_shared[batch_slice]}
    )
    test_model = theano.function(
        [symbolic_batch_index], ll_eval,
        givens={symbolic_x_HR: X_test_shared[batch_slice]}
    )

    def train_epoch(learning_rate):
        costs = []
        for i in range(N_train_batches):
            cost_batch = train_model(i, learning_rate)
            costs += [cost_batch]
        return numpy.mean(costs)

    def test_epoch():
        costs = []
        for i in range(N_test_batches):
            cost_batch = test_model(i)
            costs += [cost_batch]
        return numpy.mean(costs)

    # Training
    epochs = []
    cost_train = []
    cost_test = []

    for epoch in range(N_epochs):
        start = time.time()

        # Shuffle train data
        numpy.random.shuffle(X_train)
        X_train_shared.set_value(X_train)

        train_cost = train_epoch(learning_rate)
        test_cost = test_epoch()

        duration = time.time() - start

        epochs.append(epoch)
        cost_train.append(train_cost)
        cost_test.append(test_cost)

        # line = "Epoch: %i\tTime: %0.2f\tLR: %0.5f\tLL Train: %0.3f\tLL test: %0.3f\t" % (
        #     epoch, t, learning_rate, train_cost, test_cost)
        print("Epoch {:d} (duration: {:.2f} s, learning rate: {:.1e}):".format(
            epoch + 1, duration, learning_rate))
        print("    log-likelihood: {:.3f} (training set), {:.3f} (test set)".format(
            train_cost, test_cost))
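# Both VAE scripts above clip gradients in two stages: the gradient list is first
# rescaled so its global L2 norm is at most gradient_norm_max
# (lasagne.updates.total_norm_constraint), and each entry is then clipped
# element-wise to [-gradient_clipping, gradient_clipping]. A small NumPy sketch
# (not from the original code) of the same two steps:
import numpy as np

def clip_gradients(gradients, max_norm=5.0, clip_value=1.0):
    # global L2 norm across all gradient arrays
    total_norm = np.sqrt(sum(np.sum(g ** 2) for g in gradients))
    # rescale only when the norm exceeds the maximum
    scale = min(1.0, max_norm / (total_norm + 1e-7))
    return [np.clip(g * scale, -clip_value, clip_value) for g in gradients]

# Example: two parameter gradients with large entries all end up in [-1, 1].
grads = [np.full((3, 3), 10.0), np.full((4,), -20.0)]
clipped = clip_gradients(grads)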