def visualize_attention(result, pre_tag="AAA", post_tag="AAA"):
    seq_len = result[0].shape[0]
    samp_count = result[0].shape[1]
    # get generated predictions
    x_samps = np.zeros((seq_len*samp_count, obs_dim))
    y_samps = np.zeros((seq_len*samp_count, label_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            x_samps[idx] = result[0][s2,s1,:obs_dim]
            y_samps[idx] = result[0][s2,s1,obs_dim:]
            idx += 1
    file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(x_samps, file_name, num_rows=20)
    file_name = "{0:s}_traj_ys_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(y_samps, file_name, num_rows=20)
    # get sequential attention maps
    seq_samps = np.zeros((seq_len*samp_count, x_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = result[1][s2,s1,:x_dim] + result[1][s2,s1,x_dim:]
            idx += 1
    file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=20)
    # get sequential attention maps (read out values)
    seq_samps = np.zeros((seq_len*samp_count, x_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = result[2][s2,s1,:x_dim] + result[2][s2,s1,x_dim:]
            idx += 1
    file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=20)
    return
def process_samples(step_type='add', data_name='MNIST'):
    # visualize samples from several interchangeable versions of the model
    if data_name == 'MNIST':
        conditions = [{'occ_dim': 0, 'drop_prob': 0.8},
                      {'occ_dim': 16, 'drop_prob': 0.0}]
    if data_name == 'SVHN':
        conditions = [{'occ_dim': 0, 'drop_prob': 0.8},
                      {'occ_dim': 17, 'drop_prob': 0.0}]
    if data_name == 'TFD':
        conditions = [{'occ_dim': 0, 'drop_prob': 0.8},
                      {'occ_dim': 25, 'drop_prob': 0.0}]
    for cond_dict in conditions:
        occ_dim = cond_dict['occ_dim']
        drop_prob = cond_dict['drop_prob']
        dp_int = int(100.0 * drop_prob)
        # load the samples, in their numpy array form, from a pkl file
        sample_pkl_name = "IMP-{}-OD{}-DP{}-{}.pkl".format(
            data_name, occ_dim, dp_int, step_type)
        pickle_file = open(sample_pkl_name)
        samples = cPickle.load(pickle_file)
        pickle_file.close()
        print("Loaded some samples from: {}".format(sample_pkl_name))
        sample_list = []
        for i in range(samples.shape[0]):
            sample_list.append(samples[i, :, :])
        # downsample the sequence....
        #keep_idx = range(len(sample_list))
        keep_idx = [0, 2, 4, 6, 9, 12, 15]
        sample_list = [sample_list[i] for i in keep_idx]
        seq_len = len(sample_list)
        samp_count = sample_list[0].shape[0]
        obs_dim = sample_list[0].shape[1]
        seq_samps = np.zeros((seq_len * samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = sample_list[s2][s1, :].ravel()
                idx += 1
        sample_img_name = "IMP-{}-OD{}-DP{}-{}.png".format(
            data_name, occ_dim, dp_int, step_type)
        row_count = int(samp_count / 16)
        print("row_count: {}".format(row_count))
        utils.visualize_samples(seq_samps, sample_img_name, num_rows=row_count)
    return
def test_mnist_img(occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = RESULT_PATH + "TM_OD{}_DP{}".format(occ_dim, dp_int)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    TM = TemplateMatchImputer(x_train=Xtr, x_type='bernoulli')

    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    xi, xo, xm = construct_masked_data(Xva[:500], drop_prob=drop_prob,
                                       occ_dim=occ_dim, data_mean=data_mean)
    img_match_on_known, img_match_on_unknown = TM.best_match_img(xo, xm)

    display_count = 100
    # visualize matches on known elements
    Xs = np.zeros((2 * display_count, Xva.shape[1]))
    for idx in range(display_count):
        Xs[2 * idx] = xi[idx]
        Xs[(2 * idx) + 1] = img_match_on_known[idx]
    file_name = "{0:s}_SAMPLES_MOK.png".format(result_tag)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    # visualize matches on unknown elements
    Xs = np.zeros((2 * display_count, Xva.shape[1]))
    for idx in range(display_count):
        Xs[2 * idx] = xi[idx]
        Xs[(2 * idx) + 1] = img_match_on_unknown[idx]
    file_name = "{0:s}_SAMPLES_MOU.png".format(result_tag)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    return
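# A short driver sketch (not in the original script): it runs the template-
# matching test above under the same MNIST occlusion/drop conditions that
# process_samples() uses; treating these as the intended settings for this
# test is an assumption.
def run_mnist_template_match_tests():
    # random dropping of 80% of the pixels, no square occlusion
    test_mnist_img(occ_dim=0, drop_prob=0.8)
    # a 16x16 square occlusion, no random dropping
    test_mnist_img(occ_dim=16, drop_prob=0.0)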
def visualize_attention_joint(result, pre_tag="AAA", post_tag="AAA"):
    seq_len = result[0].shape[0]
    samp_count = result[0].shape[1]
    # get generated predictions
    seq_samps = np.zeros((3*seq_len*samp_count, obs_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = result[3][s2,s1,:]
            idx += 1
        for s2 in range(seq_len):
            seq_samps[idx] = result[0][s2,s1,:]
            idx += 1
        for s2 in range(seq_len):
            seq_samps[idx] = result[1][s2,s1,:]
            idx += 1
    file_name = "{0:s}_traj_joint_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=(3*samp_count))
    return
def visualize_attention(result, pre_tag="AAA", post_tag="AAA"):
    seq_len = result[0].shape[0]
    samp_count = result[0].shape[1]
    # get generated predictions
    x_samps = np.zeros((seq_len*samp_count, obs_dim))
    y_samps = np.zeros((seq_len*samp_count, label_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            x_samps[idx] = result[0][s2,s1,:obs_dim]
            y_samps[idx] = result[0][s2,s1,obs_dim:]
            # add ticks at the corners of label predictions, to make them
            # easier to parse visually.
            max_val = np.mean(result[0][s2,s1,obs_dim:])
            y_samps[idx][0] = max_val
            y_samps[idx][9] = max_val
            y_samps[idx][-1] = max_val
            y_samps[idx][-10] = max_val
            idx += 1
    file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(x_samps, file_name, num_rows=20)
    file_name = "{0:s}_traj_ys_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(y_samps, file_name, num_rows=20)
    # get sequential attention maps
    seq_samps = np.zeros((seq_len*samp_count, x_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = result[1][s2,s1,:x_dim] + result[1][s2,s1,x_dim:]
            idx += 1
    file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=20)
    # get sequential attention maps (read out values)
    seq_samps = np.zeros((seq_len*samp_count, x_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = result[2][s2,s1,:x_dim] + result[2][s2,s1,x_dim:]
            idx += 1
    file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=20)
    return
def visualize_attention(result, pre_tag="AAA", post_tag="AAA"):
    seq_len = result[0].shape[0]
    samp_count = result[0].shape[1]
    # get generated predictions
    x_samps = np.zeros((seq_len * samp_count, obs_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            x_samps[idx] = result[0][s2, s1, :]
            idx += 1
    file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(x_samps, file_name, num_rows=samp_count)
    # get sequential attention maps
    seq_samps = np.zeros((seq_len * samp_count, obs_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = result[1][s2, s1, :]
            idx += 1
    file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
    # get sequential attention maps (read out values)
    seq_samps = np.zeros((seq_len * samp_count, obs_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = result[2][s2, s1, :]
            idx += 1
    file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
    # get original input sequences
    seq_samps = np.zeros((seq_len * samp_count, obs_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = result[3][s2, s1, :]
            idx += 1
    file_name = "{0:s}_traj_xs_in_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
    return
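# A small sketch (not from the original code) of the input layout expected by
# the visualizer defined directly above: result[0..3] are arrays shaped
# (seq_len, samp_count, obs_dim) holding predictions, attention maps,
# read-outs, and original inputs, respectively. It assumes the module-level
# `obs_dim` global used by that function; the dummy data below is
# illustrative only.
def _demo_visualize_attention(seq_len=16, samp_count=20):
    dummy_dim = obs_dim  # module-level global assumed by the visualizer
    result = [np.random.rand(seq_len, samp_count, dummy_dim) for _ in range(4)]
    visualize_attention(result, pre_tag="DEMO", post_tag="b0")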
def visualize_attention(sampler_result, pre_tag="AAA", post_tag="AAA"):
    # get generated predictions
    seq_len = sampler_result[0].shape[0]
    samp_count = sampler_result[0].shape[1]
    x_dim = sampler_result[0].shape[2]
    seq_samps = np.zeros((samp_count, 28*28))
    for samp in range(samp_count):
        step = 0
        samp_vals = np.zeros((28,28))
        for col in range(28):
            col_vals = np.zeros((28,))
            for rep in range(step_reps):
                if (rep == (step_reps-1)):
                    col_vals = sampler_result[0][step,samp,:]
                step += 1
            samp_vals[:,col] = col_vals
        seq_samps[samp,:] = samp_vals.ravel()
    file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=10)
    # get sequential attention maps
    seq_samps = np.zeros((samp_count, 28*28))
    for samp in range(samp_count):
        step = 0
        samp_vals = np.zeros((28,28))
        for col in range(28):
            col_vals = np.zeros((28,))
            for rep in range(step_reps):
                col_vals = col_vals + sampler_result[1][step,samp,:x_dim]
                col_vals = col_vals + sampler_result[1][step,samp,x_dim:]
                step += 1
            samp_vals[:,col] = col_vals / (2.0*step_reps)
        seq_samps[samp,:] = samp_vals.ravel()
    file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=10)
    # get sequential attention maps (read out values)
    seq_samps = np.zeros((samp_count, 28*28))
    for samp in range(samp_count):
        step = 0
        samp_vals = np.zeros((28,28))
        for col in range(28):
            col_vals = np.zeros((28,))
            for rep in range(step_reps):
                col_vals = col_vals + sampler_result[2][step,samp,:x_dim]
                col_vals = col_vals + sampler_result[2][step,samp,x_dim:]
                step += 1
            samp_vals[:,col] = col_vals / (2.0*step_reps)
        seq_samps[samp,:] = samp_vals.ravel()
    file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag)
    utils.visualize_samples(seq_samps, file_name, num_rows=10)
    return
def train_walk_from_pretrained_osm(lam_kld=0.0):
    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    data_file = 'data/tfd_data_48x48.pkl'
    dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all')
    Xtr_unlabeled = dataset[0]
    dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all')
    Xtr_train = dataset[0]
    Xtr = np.vstack([Xtr_unlabeled, Xtr_train])
    dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all')
    Xva = dataset[0]
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape)))

    # get and set some basic dataset information
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    data_dim = Xtr.shape[1]
    batch_size = 400
    batch_reps = 5
    prior_sigma = 1.0
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(np.mean(Xtr, axis=1))
    Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0)

    # Symbolic inputs
    Xd = T.matrix(name='Xd')
    Xc = T.matrix(name='Xc')
    Xm = T.matrix(name='Xm')
    Xt = T.matrix(name='Xt')

    ###############################
    # Setup discriminator network #
    ###############################
    # Set some reasonable mlp parameters
    dn_params = {}
    # Set up some proto-networks
    pc0 = [data_dim, (300, 4), (300, 4), 10]
    dn_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    dn_params['spawn_configs'] = [sc0]
    dn_params['spawn_weights'] = [1.0]
    # Set remaining params
    dn_params['init_scale'] = 1.0
    dn_params['lam_l2a'] = 1e-2
    dn_params['vis_drop'] = 0.2
    dn_params['hid_drop'] = 0.5
    # Initialize a network object to use as the discriminator
    DN = PeaNet(rng=rng, Xd=Xd, params=dn_params)
    DN.init_biases(0.0)

    #######################################################
    # Load inferencer and generator from saved parameters #
    #######################################################
    gn_fname = RESULT_PATH + "pt_osm_params_b100000_GN.pkl"
    in_fname = RESULT_PATH + "pt_osm_params_b100000_IN.pkl"
    IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd)

    ########################################################
    # Define parameters for the VCGLoop, and initialize it #
    ########################################################
    print("Building the VCGLoop...")
    vcgl_params = {}
    vcgl_params['x_type'] = 'gaussian'
    vcgl_params['xt_transform'] = 'sigmoid'
    vcgl_params['logvar_bound'] = LOGVAR_BOUND
    vcgl_params['cost_decay'] = 0.1
    vcgl_params['chain_type'] = 'walkout'
    vcgl_params['lam_l2d'] = 5e-2
    VCGL = VCGLoop(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, Xt=Xt,
                   i_net=IN, g_net=GN, d_net=DN, chain_len=5,
                   data_dim=data_dim, prior_dim=PRIOR_DIM, params=vcgl_params)

    out_file = open(RESULT_PATH + "pt_walk_results.txt", 'wb')

    ####################################################
    # Train the VCGLoop by unrolling and applying BPTT #
    ####################################################
    learn_rate = 0.0005
    cost_1 = [0. for i in range(10)]
    for i in range(100000):
        scale = float(min((i+1), 5000)) / 5000.0
        if (((i+1) % 25000) == 0):
            learn_rate = learn_rate * 0.8
        ########################################
        # TRAIN THE CHAIN IN FREE-RUNNING MODE #
        ########################################
        VCGL.set_all_sgd_params(learn_rate=(scale*learn_rate),
                                mom_1=0.9, mom_2=0.99)
        VCGL.set_disc_weights(dweight_gn=25.0, dweight_dn=25.0)
        VCGL.set_lam_chain_nll(1.0)
        VCGL.set_lam_chain_kld(lam_kld)
        # get some data to train with
        tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size,))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xc_batch = 0.0 * Xd_batch
        Xm_batch = 0.0 * Xd_batch
        # examples from the target distribution, to train discriminator
        tr_idx = npr.randint(low=0, high=tr_samples, size=(2*batch_size,))
        Xt_batch = Xtr.take(tr_idx, axis=0)
        # do a minibatch update of the model, and compute some costs
        outputs = VCGL.train_joint(Xd_batch, Xc_batch, Xm_batch, Xt_batch, batch_reps)
        cost_1 = [(cost_1[k] + 1.*outputs[k]) for k in range(len(outputs))]
        if ((i % 500) == 0):
            cost_1 = [(v / 500.0) for v in cost_1]
            o_str_1 = ("batch: {0:d}, joint_cost: {1:.4f}, chain_nll_cost: {2:.4f}, "
                       "chain_kld_cost: {3:.4f}, disc_cost_gn: {4:.4f}, "
                       "disc_cost_dn: {5:.4f}").format(
                i, cost_1[0], cost_1[1], cost_1[2], cost_1[5], cost_1[6])
            print(o_str_1)
            cost_1 = [0. for v in cost_1]
        if ((i % 1000) == 0):
            tr_idx = npr.randint(low=0, high=Xtr.shape[0], size=(5,))
            va_idx = npr.randint(low=0, high=Xva.shape[0], size=(5,))
            Xd_batch = np.vstack([Xtr.take(tr_idx, axis=0), Xva.take(va_idx, axis=0)])
            # draw some chains of samples from the VAE loop
            file_name = RESULT_PATH + "pt_walk_chain_samples_b{0:d}.png".format(i)
            Xd_samps = np.repeat(Xd_batch, 3, axis=0)
            sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, loop_iters=20)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # draw some masked chains of samples from the VAE loop
            file_name = RESULT_PATH + "pt_walk_mask_samples_b{0:d}.png".format(i)
            Xd_samps = np.repeat(Xc_mean[0:Xd_batch.shape[0], :], 3, axis=0)
            Xc_samps = np.repeat(Xd_batch, 3, axis=0)
            Xm_rand = sample_masks(Xc_samps, drop_prob=0.0)
            Xm_patch = sample_patch_masks(Xc_samps, (48,48), (25,25))
            Xm_samps = Xm_rand * Xm_patch
            sample_lists = VCGL.OSM.sample_from_chain(Xd_samps,
                                                      X_c=Xc_samps, X_m=Xm_samps,
                                                      loop_iters=20)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # draw some samples independently from the GenNet's prior
            file_name = RESULT_PATH + "pt_walk_prior_samples_b{0:d}.png".format(i)
            Xs = VCGL.sample_from_prior(20*20)
            utils.visualize_samples(Xs, file_name, num_rows=20)
        # DUMP PARAMETERS FROM TIME-TO-TIME
        if (i % 5000 == 0):
            DN.save_to_file(f_name=RESULT_PATH + "pt_walk_params_b{0:d}_DN.pkl".format(i))
            IN.save_to_file(f_name=RESULT_PATH + "pt_walk_params_b{0:d}_IN.pkl".format(i))
            GN.save_to_file(f_name=RESULT_PATH + "pt_walk_params_b{0:d}_GN.pkl".format(i))
    return
def evaluate_lenet5(learning_rate=0.05, n_epochs=500,
                    dataset='./data/mnist.pkl.gz',
                    nkerns=[48, 64], batch_size=256):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training/testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of
                         # [int] labels

    ishape = (28, 28)  # this is the size of MNIST images

    start_rate = numpy.asarray([0.05]).astype(theano.config.floatX)
    learning_rate = theano.shared(value=start_rate, name='learning_rate')

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    tanh = lambda vals: T.tanh(vals)
    relu = lambda vals: relu_actfun(vals)

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_prep = Reshape2D4DLayer(input=x, out_shape=(1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-7+1,28-7+1)=(22,22)
    # maxpooling reduces this further to (22/2,22/2) = (11,11)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],11,11)
    layer0 = ConvPoolLayer(rng, input=layer0_prep.output,
                           filt_def=(nkerns[0], 1, 7, 7), pool_def=(2, 2),
                           activation=relu, drop_rate=0.0,
                           input_noise=0.1, bias_noise=0.05,
                           W=None, b=None, name="layer0", W_scale=2.0)

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (11-4+1,11-4+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (batch_size,nkerns[1],4,4)
    layer1 = ConvPoolLayer(rng, input=layer0.output,
                           filt_def=(nkerns[1], nkerns[0], 4, 4), pool_def=(2, 2),
                           activation=relu, drop_rate=0.0,
                           input_noise=0.0, bias_noise=0.05,
                           W=None, b=None, name="layer1", W_scale=2.0)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1]*4*4)
    layer2_prep = Reshape4D2DLayer(layer1.output)

    # construct a fully-connected relu layer
    layer2 = HiddenLayer(rng, layer2_prep.output, nkerns[1]*4*4, 512,
                         activation=relu, pool_size=0,
                         drop_rate=0.0, input_noise=0.0, bias_noise=0.05,
                         W=None, b=None, name="layer2", W_scale=2.0)

    # construct an output layer to predict classes
    layer3 = HiddenLayer(rng, layer2.output, 512, 10,
                         activation=relu, pool_size=0,
                         drop_rate=0.5, input_noise=0.0, bias_noise=0.0,
                         W=None, b=None, name="layer3", W_scale=2.0)

    # get a loss function to apply to the output layer
    loss_func = LogisticRegression(layer3)

    # the cost we minimize during training is the NLL of the model
    cost = loss_func.loss_func(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        loss_func.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        loss_func.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    moms = OrderedDict()
    for p in params:
        moms[p] = theano.shared(value=numpy.zeros(
            p.get_value(borrow=True).shape).astype(theano.config.floatX))

    # create a list of gradients for all model parameters
    grads = OrderedDict()
    for p in params:
        grads[p] = T.grad(cost, p)

    # train_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each model parameter. We thus create the
    # updates list by automatically looping over all (params[i], grads[i])
    # pairs, using a simple momentum-smoothed gradient step.
    updates = []
    for p in params:
        mom_update = (moms[p], (0.8 * moms[p]) + (0.2 * grads[p]))
        param_update = (p, p - learning_rate[0] * moms[p])
        updates.append(mom_update)
        updates.append(param_update)

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation
                                   # set; in this case we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses) / batch_size
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses) / batch_size
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

        if (((epoch + 1) % 30) == 0):
            # halve the learning rate every 30 epochs
            new_rate = 0.5 * learning_rate.get_value(borrow=True)
            learning_rate.set_value(new_rate)
        if ((epoch % 10) == 0):
            # visualize the first-layer convolutional filters
            W_l0 = layer0.W.get_value(borrow=False)
            W_l0 = W_l0.reshape((W_l0.shape[0], numpy.prod(W_l0.shape[1:])))
            visualize_samples(W_l0, 'A1_CONV_FILTS.png')

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
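# A minimal command-line entry point (not part of the original tutorial-style
# script) for running the training loop above with its default settings.
if __name__ == '__main__':
    evaluate_lenet5()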
def test_lstm_structpred(step_type='add', use_pol=True, use_binary=False):
    ###########################################
    # Make a tag for identifying result files #
    ###########################################
    pol_tag = "P1" if use_pol else "P0"
    bin_tag = "B1" if use_binary else "B0"
    res_tag = "STRUCT_PRED_RESULTS/SP_LSTM_{}_{}_{}".format(step_type, pol_tag, bin_tag)

    if use_binary:
        ############################
        # Get binary training data #
        ############################
        rng = np.random.RandomState(1234)
        Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/')
        #Xtr = np.vstack((Xtr, Xva))
        #Xva = Xte
    else:
        ################################
        # Get continuous training data #
        ################################
        rng = np.random.RandomState(1234)
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm(dataset, as_shared=False, zero_mean=False)
        Xtr = datasets[0][0]
        Xva = datasets[1][0]
        Xte = datasets[2][0]
        #Xtr = np.concatenate((Xtr, Xva), axis=0)
        #Xva = Xte
        Xtr = to_fX(shift_and_scale_into_01(Xtr))
        Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200

    ########################################################
    # Split data into "observation" and "prediction" parts #
    ########################################################
    obs_cols = 14              # number of columns to observe
    pred_cols = 28 - obs_cols  # number of columns to predict
    x_dim = obs_cols * 28      # dimensionality of observations
    y_dim = pred_cols * 28     # dimensionality of predictions
    Xtr, Ytr = img_split(Xtr, im_dim=(28, 28), split_col=obs_cols, transposed=True)
    Xva, Yva = img_split(Xva, im_dim=(28, 28), split_col=obs_cols, transposed=True)

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    read_dim = 128
    write_dim = 128
    mlp_dim = 128
    rnn_dim = 128
    z_dim = 64
    n_iter = 15

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # setup reader/writer models
    reader_mlp = MLP([Rectifier(), Tanh()], [x_dim, mlp_dim, read_dim],
                     name="reader_mlp", **inits)
    writer_mlp = MLP([Rectifier(), None], [rnn_dim, mlp_dim, y_dim],
                     name="writer_mlp", **inits)

    # setup submodels for processing LSTM inputs
    pol_inp_dim = y_dim + read_dim + rnn_dim
    var_inp_dim = y_dim + y_dim + read_dim + rnn_dim
    pol_mlp_in = MLP([Identity()], [pol_inp_dim, 4*rnn_dim],
                     name="pol_mlp_in", **inits)
    var_mlp_in = MLP([Identity()], [var_inp_dim, 4*rnn_dim],
                     name="var_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [z_dim, 4*rnn_dim],
                     name="dec_mlp_in", **inits)
    # setup submodels for turning LSTM states into conditionals over z
    pol_mlp_out = CondNet([], [rnn_dim, z_dim], name="pol_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)
    dec_mlp_out = CondNet([], [rnn_dim, z_dim], name="dec_mlp_out", **inits)
    # setup the LSTMs for primary policy, guide policy, and shared dynamics
    pol_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0,
                         name="pol_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0,
                         name="var_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0,
                         name="dec_rnn", **rnninits)

    model = IRStructPredModel(
                n_iter,
                step_type=step_type,
                use_pol=use_pol,
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                pol_mlp_in=pol_mlp_in,
                pol_mlp_out=pol_mlp_out,
                pol_rnn=pol_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn,
                dec_mlp_in=dec_mlp_in,
                dec_mlp_out=dec_mlp_out,
                dec_rnn=dec_rnn)
    model.initialize()

    compile_start_time = time.time()

    # build the cost gradients, training function, samplers, etc.
    model.build_sampling_funcs()
    print("Testing model sampler...")
    # draw some independent samples from the model
    samp_count = 10
    samp_reps = 3
    x_in = Xtr[:10, :].repeat(samp_reps, axis=0)
    y_in = Ytr[:10, :].repeat(samp_reps, axis=0)
    x_samps, y_samps = model.sample_model(x_in, y_in, sample_source='p')
    # TODO: visualize sample prediction trajectories
    img_seq = seq_img_join(x_samps, y_samps, im_dim=(28, 28), transposed=True)
    seq_len = len(img_seq)
    samp_count = img_seq[0].shape[0]
    seq_samps = np.zeros((seq_len*samp_count, img_seq[0].shape[1]))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = img_seq[s2][s1]
            idx += 1
    file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, 0)
    utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)

    model.build_model_funcs()

    compile_end_time = time.time()
    compile_minutes = (compile_end_time - compile_start_time) / 60.0
    print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes))

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("{}_results.txt".format(res_tag), 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.9
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(300000):
        scale = min(1.0, ((i+1) / 5000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr, Ytr = row_shuffle(Xtr, Ytr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        model.set_sgd_params(lr=scale*learn_rate, mom_1=scale*momentum, mom_2=0.98)
        model.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.1)
        model.set_grad_noise(grad_noise=0.02)
        # perform a minibatch update and record the cost for this batch
        Xb = to_fX(Xtr.take(batch_idx, axis=0))
        Yb = to_fX(Ytr.take(batch_idx, axis=0))
        result = model.train_joint(Xb, Yb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]

        # diagnostics
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_term  : {0:.4f}".format(costs[2])
            str5 = "    kld_q2p   : {0:.4f}".format(costs[3])
            str6 = "    kld_p2q   : {0:.4f}".format(costs[4])
            str7 = "    reg_term  : {0:.4f}".format(costs[5])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            model.save_model_params("{}_params.pkl".format(res_tag))
            # compute a small-sample estimate of NLL bound on validation set
            Xva, Yva = row_shuffle(Xva, Yva)
            Xb = to_fX(Xva[:5000])
            Yb = to_fX(Yva[:5000])
            va_costs = model.compute_nll_bound(Xb, Yb)
            str1 = "    va_nll_bound : {}".format(va_costs[1])
            str2 = "    va_nll_term  : {}".format(va_costs[2])
            str3 = "    va_kld_q2p   : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            # draw some independent samples from the model
            samp_count = 10
            samp_reps = 3
            x_in = Xva[:samp_count, :].repeat(samp_reps, axis=0)
            y_in = Yva[:samp_count, :].repeat(samp_reps, axis=0)
            x_samps, y_samps = model.sample_model(x_in, y_in, sample_source='p')
            # visualize sample prediction trajectories
            img_seq = seq_img_join(x_samps, y_samps, im_dim=(28, 28), transposed=True)
            seq_len = len(img_seq)
            samp_count = img_seq[0].shape[0]
            seq_samps = np.zeros((seq_len*samp_count, img_seq[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    if use_binary:
                        seq_samps[idx] = binarize_data(img_seq[s2][s1])
                    else:
                        seq_samps[idx] = img_seq[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
def main(args):
    cfg = cfg_dict[args.cfg_name]
    writer = SummaryWriter(os.path.join("runs", args.cfg_name))
    train_loader = get_data_loader(cfg, cfg["train_dir"])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = EDSR(cfg).to(device)
    criterion = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg["init_lr"],
                                 betas=(0.9, 0.999), eps=1e-8)
    global_batches = 0

    if args.train:
        for epoch in range(cfg["n_epoch"]):
            model.train()
            running_loss = 0.0
            for i, batch in enumerate(train_loader):
                lr, hr = batch[0].to(device), batch[1].to(device)
                optimizer.zero_grad()
                sr = model(lr)
                loss = model.loss(sr, hr)
                # loss = criterion(model(lr), hr)
                running_loss += loss.item()
                loss.backward()
                optimizer.step()
                global_batches += 1
                if global_batches % cfg["lr_decay_every"] == 0:
                    for param_group in optimizer.param_groups:
                        print(f"decay lr to {param_group['lr'] / 10}")
                        param_group["lr"] /= 10

            if epoch % args.log_every == 0:
                model.eval()
                with torch.no_grad():
                    batch_samples = {"lr": batch[0], "hr": batch[1], "sr": sr.cpu()}
                    writer.add_scalar("training-loss",
                                      running_loss / len(train_loader),
                                      global_step=global_batches)
                    writer.add_scalar("PSNR", compute_psnr(batch_samples),
                                      global_step=global_batches)
                    samples = {k: v[:3] for k, v in batch_samples.items()}
                    fig = visualize_samples(samples, f"epoch-{epoch}")
                    writer.add_figure("sample-visualization", fig,
                                      global_step=global_batches)

            if epoch % args.save_every == 0:
                state = {"net": model.state_dict(), "optim": optimizer.state_dict()}
                checkpoint_dir = args.checkpoint_dir
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                path = os.path.join(checkpoint_dir, args.cfg_name)
                torch.save(state, path)

    # eval
    if args.eval:
        assert args.model_path and args.lr_img_path
        print(f"evaluating {args.lr_img_path}")
        state = torch.load(args.model_path, map_location=device)
        model.load_state_dict(state["net"])
        optimizer.load_state_dict(state["optim"])
        with torch.no_grad():
            lr = img2tensor(args.lr_img_path)
            sr = model(lr.clone().to(device)).cpu()
            samples = {"lr": lr, "sr": sr}
            if args.hr_img_path:
                samples["hr"] = img2tensor(args.hr_img_path)
                print(f"PSNR: {compute_psnr(samples)}")
            directory = os.path.dirname(args.lr_img_path)
            name = f"eval-{args.cfg_name}-{args.lr_img_path.split('/')[-1]}"
            visualize_samples(samples, name, save=True, directory=directory, size=6)
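# A minimal argument-parser sketch (not part of the original file) wiring up
# the attributes that main() reads; the flag names match the attribute
# accesses above, but the defaults and grouping here are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="EDSR training / evaluation")
    parser.add_argument("--cfg_name", required=True)   # key into cfg_dict
    parser.add_argument("--train", action="store_true")
    parser.add_argument("--eval", action="store_true")
    parser.add_argument("--log_every", type=int, default=1)
    parser.add_argument("--save_every", type=int, default=1)
    parser.add_argument("--checkpoint_dir", default="checkpoints")
    parser.add_argument("--model_path", default=None)
    parser.add_argument("--lr_img_path", default=None)
    parser.add_argument("--hr_img_path", default=None)
    main(parser.parse_args())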
def test_gip_sigma_scale_mnist():
    from LogPDFs import cross_validate_sigma
    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(12345)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape)))

    # get and set some basic dataset information
    tr_samples = Xtr.shape[0]
    batch_size = 100
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr)
    Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0).astype(theano.config.floatX)

    # Symbolic inputs
    Xd = T.matrix(name='Xd')
    Xc = T.matrix(name='Xc')
    Xm = T.matrix(name='Xm')
    Xt = T.matrix(name='Xt')

    # Load inferencer and generator from saved parameters
    gn_fname = "MNIST_WALKOUT_TEST_MAX_KLD/pt_walk_params_b70000_GN.pkl"
    in_fname = "MNIST_WALKOUT_TEST_MAX_KLD/pt_walk_params_b70000_IN.pkl"
    IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd)
    x_dim = IN.shared_layers[0].in_dim
    z_dim = IN.mu_layers[-1].out_dim
    # construct a OneStageModel from the loaded InfNet and GenNet
    osm_params = {}
    osm_params['x_type'] = 'gaussian'
    osm_params['xt_transform'] = 'sigmoid'
    osm_params['logvar_bound'] = LOGVAR_BOUND
    OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm,
                        p_x_given_z=GN, q_z_given_x=IN,
                        x_dim=x_dim, z_dim=z_dim, params=osm_params)

    # compute variational likelihood bound and its sub-components
    Xva = row_shuffle(Xva)
    Xb = Xva[0:5000]
    file_name = "A_MNIST_POST_KLDS.png"
    post_klds = OSM.compute_post_klds(Xb)
    post_dim_klds = np.mean(post_klds, axis=0)
    utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds,
                    file_name)
    # compute information about free-energy on validation set
    file_name = "A_MNIST_FREE_ENERGY.png"
    fe_terms = OSM.compute_fe_terms(Xb, 20)
    utils.plot_scatter(fe_terms[1], fe_terms[0], file_name,
                       x_label='Posterior KLd', y_label='Negative Log-likelihood')

    # bound_results = OSM.compute_ll_bound(Xva)
    # ll_bounds = bound_results[0]
    # post_klds = bound_results[1]
    # log_likelihoods = bound_results[2]
    # max_lls = bound_results[3]
    # print("mean ll bound: {0:.4f}".format(np.mean(ll_bounds)))
    # print("mean posterior KLd: {0:.4f}".format(np.mean(post_klds)))
    # print("mean log-likelihood: {0:.4f}".format(np.mean(log_likelihoods)))
    # print("mean max log-likelihood: {0:.4f}".format(np.mean(max_lls)))
    # print("min ll bound: {0:.4f}".format(np.min(ll_bounds)))
    # print("max posterior KLd: {0:.4f}".format(np.max(post_klds)))
    # print("min log-likelihood: {0:.4f}".format(np.min(log_likelihoods)))
    # print("min max log-likelihood: {0:.4f}".format(np.min(max_lls)))

    # # compute some information about the approximate posteriors
    # post_stats = OSM.compute_post_stats(Xva, 0.0*Xva, 0.0*Xva)
    # all_post_klds = np.sort(post_stats[0].ravel())  # post KLds for each obs and dim
    # obs_post_klds = np.sort(post_stats[1])          # summed post KLds for each obs
    # post_dim_klds = post_stats[2]                   # average post KLds for each post dim
    # post_dim_vars = post_stats[3]                   # average squared mean for each post dim
    # utils.plot_line(np.arange(all_post_klds.shape[0]), all_post_klds, "AAA_ALL_POST_KLDS.png")
    # utils.plot_line(np.arange(obs_post_klds.shape[0]), obs_post_klds, "AAA_OBS_POST_KLDS.png")
    # utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, "AAA_POST_DIM_KLDS.png")
    # utils.plot_stem(np.arange(post_dim_vars.shape[0]), post_dim_vars, "AAA_POST_DIM_VARS.png")

    # draw many samples from the GIP
    for i in range(5):
        tr_idx = npr.randint(low=0, high=tr_samples, size=(100,))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xs = []
        for row in range(3):
            Xs.append([])
            for col in range(3):
                sample_lists = OSM.sample_from_chain(Xd_batch[0:10, :],
                                                     loop_iters=100,
                                                     sigma_scale=1.0)
                Xs[row].append(group_chains(sample_lists['data samples']))
        Xs, block_im_dim = block_video(Xs, (28, 28), (3, 3))
        to_video(Xs, block_im_dim,
                 "A_MNIST_KLD_CHAIN_VIDEO_{0:d}.avi".format(i), frame_rate=10)
        #sample_lists = GIP.sample_from_chain(Xd_batch[0,:].reshape((1,data_dim)), loop_iters=300, \
        #                                     sigma_scale=1.0)
        #Xs = np.vstack(sample_lists["data samples"])
        #file_name = "TFD_TEST_{0:d}.png".format(i)
        #utils.visualize_samples(Xs, file_name, num_rows=15)
    file_name = "A_MNIST_KLD_PRIOR_SAMPLE.png"
    Xs = OSM.sample_from_prior(20*20)
    utils.visualize_samples(Xs, file_name, num_rows=20)

    # # test Parzen density estimator built from prior samples
    # Xs = OSM.sample_from_prior(10000)
    # [best_sigma, best_ll, best_lls] = \
    #     cross_validate_sigma(Xs, Xva, [0.12, 0.14, 0.15, 0.16, 0.18], 20)
    # sort_idx = np.argsort(best_lls)
    # sort_idx = sort_idx[0:400]
    # utils.plot_line(np.arange(sort_idx.shape[0]), best_lls[sort_idx], "A_MNIST_BEST_LLS_1.png")
    # utils.visualize_samples(Xva[sort_idx], "A_MNIST_BAD_DIGITS_1.png", num_rows=20)
    # ##########
    # # AGAIN! #
    # ##########
    # Xs = OSM.sample_from_prior(10000)
    # tr_idx = npr.randint(low=0, high=tr_samples, size=(5000,))
    # Xva = Xtr.take(tr_idx, axis=0)
    # [best_sigma, best_ll, best_lls] = \
    #     cross_validate_sigma(Xs, Xva, [0.12, 0.14, 0.15, 0.16, 0.18], 20)
    # sort_idx = np.argsort(best_lls)
    # sort_idx = sort_idx[0:400]
    # utils.plot_line(np.arange(sort_idx.shape[0]), best_lls[sort_idx], "A_MNIST_BEST_LLS_2.png")
    # utils.visualize_samples(Xva[sort_idx], "A_MNIST_BAD_DIGITS_2.png", num_rows=20)
    return
def test_tfd(step_type='add', occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_OD{}_DP{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, step_type)

    ##########################
    # Get some training data #
    ##########################
    data_file = 'data/tfd_data_48x48.pkl'
    dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all')
    Xtr_unlabeled = dataset[0]
    dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all')
    Xtr_train = dataset[0]
    Xtr = np.vstack([Xtr_unlabeled, Xtr_train])
    dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all')
    Xva = dataset[0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    obs_dim = Xtr.shape[1]
    z_dim = 200
    imp_steps = 6
    init_scale = 1.0

    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')
    x_mask_sym = T.matrix('x_mask_sym')

    #################
    # p_zi_given_xi #
    #################
    params = {}
    shared_config = [obs_dim, 1500, 1500]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym,
                           params=params, shared_param_dicts=None)
    p_zi_given_xi.init_biases(0.2)
    ###################
    # p_xip1_given_zi #
    ###################
    params = {}
    shared_config = [z_dim, 1500, 1500]
    output_config = [obs_dim, obs_dim]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym,
                               params=params, shared_param_dicts=None)
    p_xip1_given_zi.init_biases(0.2)
    ###################
    # q_zi_given_x_xi #
    ###################
    params = {}
    shared_config = [(obs_dim + obs_dim), 1500, 1500]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_zi_given_x_xi = InfNet(rng=rng, Xd=x_in_sym,
                             params=params, shared_param_dicts=None)
    q_zi_given_x_xi.init_biases(0.2)

    ###########################################################
    # Define parameters for the GPSImputer, and initialize it #
    ###########################################################
    print("Building the GPSImputer...")
    gpsi_params = {}
    gpsi_params['obs_dim'] = obs_dim
    gpsi_params['z_dim'] = z_dim
    gpsi_params['imp_steps'] = imp_steps
    gpsi_params['step_type'] = step_type
    gpsi_params['x_type'] = 'bernoulli'
    gpsi_params['obs_transform'] = 'sigmoid'
    GPSI = GPSImputer(rng=rng,
                      x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym,
                      p_zi_given_xi=p_zi_given_xi,
                      p_xip1_given_zi=p_xip1_given_zi,
                      q_zi_given_x_xi=q_zi_given_x_xi,
                      params=gpsi_params,
                      shared_param_dicts=None)

    # # test model saving
    # print("Testing model save to file...")
    # GPSI.save_to_file("AAA_GPSI_SAVE_TEST.pkl")
    # # test model loading
    # print("Testing model load from file...")
    # GPSI = load_gpsimputer_from_file(f_name="AAA_GPSI_SAVE_TEST.pkl", rng=rng)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.5
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(200005):
        scale = min(1.0, ((i+1) / 5000.0))
        if (((i + 1) % 15000) == 0):
            learn_rate = learn_rate * 0.92
        if (i > 10000):
            momentum = 0.90
        else:
            momentum = 0.50
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        GPSI.set_sgd_params(lr=scale*learn_rate,
                            mom_1=scale*momentum, mom_2=0.98)
        GPSI.set_train_switch(1.0)
        GPSI.set_lam_nll(lam_nll=1.0)
        GPSI.set_lam_kld(lam_kld_p=0.1, lam_kld_q=0.9)
        GPSI.set_lam_l2w(1e-4)
        # perform a minibatch update and record the cost for this batch
        xb = to_fX(Xtr.take(batch_idx, axis=0))
        xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob,
                                           occ_dim=occ_dim, data_mean=data_mean)
        result = GPSI.train_joint(xi, xo, xm, batch_reps)
        # do diagnostics and general training tracking
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_cost  : {0:.4f}".format(costs[2])
            str5 = "    kld_cost  : {0:.4f}".format(costs[3])
            str6 = "    reg_cost  : {0:.4f}".format(costs[4])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            Xva = row_shuffle(Xva)
            # record an estimate of performance on the test set
            xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob,
                                               occ_dim=occ_dim, data_mean=data_mean)
            nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10)
            vfe = np.mean(nll) + np.mean(kld)
            str1 = "    va_nll_bound : {}".format(vfe)
            str2 = "    va_nll_term  : {}".format(np.mean(nll))
            str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag))
        if ((i % 20000) == 0):
            # Get some validation samples for evaluating model performance
            xb = to_fX(Xva[0:100])
            xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob,
                                               occ_dim=occ_dim, data_mean=data_mean)
            xi = np.repeat(xi, 2, axis=0)
            xo = np.repeat(xo, 2, axis=0)
            xm = np.repeat(xm, 2, axis=0)
            # draw some sample imputations from the model
            samp_count = xi.shape[0]
            _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
            # get visualizations of policy parameters
            file_name = "{0:s}_gen_gen_weights_b{1:d}.png".format(result_tag, i)
            W = GPSI.gen_gen_weights.get_value(borrow=False)
            utils.visualize_samples(W[:, :obs_dim], file_name, num_rows=20)
            file_name = "{0:s}_gen_inf_weights_b{1:d}.png".format(result_tag, i)
            W = GPSI.gen_inf_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:, :obs_dim], file_name, num_rows=20)
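# A short driver sketch (not in the original script) for the TFD imputation
# test above; the occlusion/drop settings mirror the TFD conditions used by
# process_samples(), and pairing them with this test is an assumption.
def run_tfd_imputation_tests(step_type='add'):
    # a 25x25 square occlusion, no random dropping
    test_tfd(step_type=step_type, occ_dim=25, drop_prob=0.0)
    # random dropping of 80% of the pixels, no square occlusion
    test_tfd(step_type=step_type, occ_dim=0, drop_prob=0.8)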
def test_lstm_structpred(step_type='add', use_pol=True, use_binary=False): ########################################### # Make a tag for identifying result files # ########################################### pol_tag = "P1" if use_pol else "P0" bin_tag = "B1" if use_binary else "B0" res_tag = "STRUCT_PRED_RESULTS/SP_LSTM_{}_{}_{}".format(step_type, pol_tag, bin_tag) if use_binary: ############################ # Get binary training data # ############################ rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') #Xtr = np.vstack((Xtr, Xva)) #Xva = Xte else: ################################ # Get continuous training data # ################################ rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = datasets[0][0] Xva = datasets[1][0] Xte = datasets[2][0] #Xtr = np.concatenate((Xtr, Xva), axis=0) #Xva = Xte Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 ######################################################## # Split data into "observation" and "prediction" parts # ######################################################## obs_cols = 14 # number of columns to observe pred_cols = 28 - obs_cols # number of columns to predict x_dim = obs_cols * 28 # dimensionality of observations y_dim = pred_cols * 28 # dimensionality of predictions Xtr, Ytr = img_split(Xtr, im_dim=(28, 28), split_col=obs_cols, transposed=True) Xva, Yva = img_split(Xva, im_dim=(28, 28), split_col=obs_cols, transposed=True) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ read_dim = 128 write_dim = 128 mlp_dim = 128 rnn_dim = 128 z_dim = 64 n_iter = 15 rnninits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } # setup reader/writer models reader_mlp = MLP([Rectifier(), Tanh()], [x_dim, mlp_dim, read_dim], name="reader_mlp", **inits) writer_mlp = MLP([Rectifier(), None], [rnn_dim, mlp_dim, y_dim], name="writer_mlp", **inits) # setup submodels for processing LSTM inputs pol_inp_dim = y_dim + read_dim + rnn_dim var_inp_dim = y_dim + y_dim + read_dim + rnn_dim pol_mlp_in = MLP([Identity()], [pol_inp_dim, 4*rnn_dim], name="pol_mlp_in", **inits) var_mlp_in = MLP([Identity()], [var_inp_dim, 4*rnn_dim], name="var_mlp_in", **inits) dec_mlp_in = MLP([Identity()], [z_dim, 4*rnn_dim], name="dec_mlp_in", **inits) # setup submodels for turning LSTM states into conditionals over z pol_mlp_out = CondNet([], [rnn_dim, z_dim], name="pol_mlp_out", **inits) var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits) dec_mlp_out = CondNet([], [rnn_dim, z_dim], name="dec_mlp_out", **inits) # setup the LSTMs for primary policy, guide policy, and shared dynamics pol_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="pol_rnn", **rnninits) var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="var_rnn", **rnninits) dec_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="dec_rnn", **rnninits) model = IRStructPredModel( n_iter, step_type=step_type, use_pol=use_pol, reader_mlp=reader_mlp, writer_mlp=writer_mlp, pol_mlp_in=pol_mlp_in, pol_mlp_out=pol_mlp_out, pol_rnn=pol_rnn, var_mlp_in=var_mlp_in, var_mlp_out=var_mlp_out, 
var_rnn=var_rnn, dec_mlp_in=dec_mlp_in, dec_mlp_out=dec_mlp_out, dec_rnn=dec_rnn) model.initialize() compile_start_time = time.time() # build the cost gradients, training function, samplers, etc. model.build_sampling_funcs() print("Testing model sampler...") # draw some independent samples from the model samp_count = 10 samp_reps = 3 x_in = Xtr[:10,:].repeat(samp_reps, axis=0) y_in = Ytr[:10,:].repeat(samp_reps, axis=0) x_samps, y_samps = model.sample_model(x_in, y_in, sample_source='p') # TODO: visualize sample prediction trajectories img_seq = seq_img_join(x_samps, y_samps, im_dim=(28,28), transposed=True) seq_len = len(img_seq) samp_count = img_seq[0].shape[0] seq_samps = np.zeros((seq_len*samp_count, img_seq[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = img_seq[s2][s1] idx += 1 file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, 0) utils.visualize_samples(seq_samps, file_name, num_rows=samp_count) model.build_model_funcs() compile_end_time = time.time() compile_minutes = (compile_end_time - compile_start_time) / 60.0 print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes)) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ print("Beginning to train the model...") out_file = open("{}_results.txt".format(res_tag), 'wb') costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(300000): scale = min(1.0, ((i+1) / 5000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr, Ytr = row_shuffle(Xtr, Ytr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update model.set_sgd_params(lr=scale*learn_rate, mom_1=scale*momentum, mom_2=0.98) model.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.1) model.set_grad_noise(grad_noise=0.02) # perform a minibatch update and record the cost for this batch Xb = to_fX(Xtr.take(batch_idx, axis=0)) Yb = to_fX(Ytr.take(batch_idx, axis=0)) result = model.train_joint(Xb, Yb) costs = [(costs[j] + result[j]) for j in range(len(result))] # diagnostics if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " total_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_term : {0:.4f}".format(costs[2]) str5 = " kld_q2p : {0:.4f}".format(costs[3]) str6 = " kld_p2q : {0:.4f}".format(costs[4]) str7 = " reg_term : {0:.4f}".format(costs[5]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): model.save_model_params("{}_params.pkl".format(res_tag)) # compute a small-sample estimate of NLL bound on validation set Xva, Yva = row_shuffle(Xva, Yva) Xb = to_fX(Xva[:5000]) Yb = to_fX(Yva[:5000]) va_costs = model.compute_nll_bound(Xb, Yb) str1 = " va_nll_bound : {}".format(va_costs[1]) str2 = " va_nll_term : {}".format(va_costs[2]) str3 = " va_kld_q2p : {}".format(va_costs[3]) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # draw some independent samples from the model samp_count = 10 samp_reps = 3 x_in = 
Xva[:samp_count,:].repeat(samp_reps, axis=0) y_in = Yva[:samp_count,:].repeat(samp_reps, axis=0) x_samps, y_samps = model.sample_model(x_in, y_in, sample_source='p') # visualize sample prediction trajectories img_seq = seq_img_join(x_samps, y_samps, im_dim=(28,28), transposed=True) seq_len = len(img_seq) samp_count = img_seq[0].shape[0] seq_samps = np.zeros((seq_len*samp_count, img_seq[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): if use_binary: seq_samps[idx] = binarize_data(img_seq[s2][s1]) else: seq_samps[idx] = img_seq[s2][s1] idx += 1 file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
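# --- Sketch: column-wise split/join for flattened MNIST digits ---------------
# The structured-prediction test above relies on the repo's img_split and
# seq_img_join helpers to carve each flattened 28x28 digit into "observed"
# columns and "to-be-predicted" columns, and to stitch sampled predictions back
# onto the observed half. A minimal numpy approximation is sketched below; the
# exact conventions of the repo's helpers (argument names, the meaning of
# transposed=True) are assumptions, so treat this as illustrative only.
import numpy as np

def split_columns(X, im_dim=(28, 28), split_col=14, transposed=False):
    # X: (n, rows*cols) flattened images; returns (observed, predicted) halves
    rows, cols = im_dim
    imgs = X.reshape((-1, rows, cols))
    if transposed:
        imgs = imgs.transpose(0, 2, 1)  # index image columns along axis 1
    obs = imgs[:, :split_col, :].reshape((X.shape[0], -1))
    pred = imgs[:, split_col:, :].reshape((X.shape[0], -1))
    return obs, pred

def join_columns(X_obs, X_pred, im_dim=(28, 28), transposed=False):
    # inverse of split_columns (square images assumed for simplicity);
    # seq_img_join presumably applies a join like this at every sequence step
    rows, cols = im_dim
    n = X_obs.shape[0]
    parts = [X_obs.reshape((n, -1, cols)), X_pred.reshape((n, -1, cols))]
    imgs = np.concatenate(parts, axis=1)
    if transposed:
        imgs = imgs.transpose(0, 2, 1)
    return imgs.reshape((n, rows * cols))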
def test_one_stage_model(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') Xtr = np.vstack((Xtr, Xva)) Xva = Xte #del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 128 batch_reps = 1 ############################################### # Setup some parameters for the OneStageModel # ############################################### x_dim = Xtr.shape[1] z_dim = 64 x_type = 'bernoulli' xin_sym = T.matrix('xin_sym') ############### # p_x_given_z # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': z_dim, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': 7*7*128, 'activation': relu_actfun, 'apply_bn': True, 'shape_func_out': lambda x: T.reshape(x, (-1, 128, 7, 7))}, \ {'layer_type': 'conv', 'in_chans': 128, # in shape: (batch, 128, 7, 7) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': True} ] output_config = \ [ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 1, # out shape: (batch, 1, 28, 28) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'conv', 'in_chans': 64, 'out_chans': 1, 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False p_x_given_z = HydraNet(rng=rng, Xd=xin_sym, \ params=params, shared_param_dicts=None) p_x_given_z.init_biases(0.0) ############### # q_z_given_x # ############### params = {} shared_config = \ [ {'layer_type': 'conv', 'in_chans': 1, # in shape: (batch, 784) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': True, 'shape_func_in': lambda x: T.reshape(x, (-1, 1, 28, 28))}, \ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 128, # out shape: (batch, 128, 7, 7) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': True, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'fc', 'in_chans': 128*7*7, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False q_z_given_x = HydraNet(rng=rng, Xd=xin_sym, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.0) ############################################################## # Define parameters for the TwoStageModel, and initialize it # ############################################################## print("Building the OneStageModel...") osm_params = {} osm_params['x_type'] = x_type osm_params['obs_transform'] = 'sigmoid' OSM = OneStageModel(rng=rng, x_in=xin_sym, x_dim=x_dim, z_dim=z_dim, p_x_given_z=p_x_given_z, q_z_given_x=q_z_given_x, params=osm_params) ################################################################ # Apply 
some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format("OSM_TEST") out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.0005 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(500000): scale = min(0.5, ((i + 1) / 5000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) Xb = to_fX(Xtr.take(batch_idx, axis=0)) #Xb = binarize_data(Xtr.take(batch_idx, axis=0)) # set sgd and objective function hyperparams for this update OSM.set_sgd_params(lr=scale*learn_rate, \ mom_1=(scale*momentum), mom_2=0.98) OSM.set_lam_nll(lam_nll=1.0) OSM.set_lam_kld(lam_kld=1.0) OSM.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch result = OSM.train_joint(Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) joint_str = "\n".join([str1, str2, str3, str4, str5]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 5000) == 0) or ((i < 10000) and ((i % 1000) == 0))): # draw some independent random samples from the model samp_count = 300 model_samps = OSM.sample_from_prior(samp_count) file_name = "OSM_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=15) # compute free energy estimate for validation samples Xva = row_shuffle(Xva) fe_terms = OSM.compute_fe_terms(Xva[0:5000], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) out_str = " nll_bound : {0:.4f}".format(fe_mean) print(out_str) out_file.write(out_str + "\n") out_file.flush() return
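# --- Sketch: warm-up and learning-rate decay schedule ------------------------
# The training loop above combines a linear warm-up of the SGD scale (capped at
# 0.5, and also applied to the momentum term mom_1) with a 0.95x learning-rate
# decay every 10000 batches. The helper below is a stand-alone restatement of
# that schedule; the function and argument names are illustrative, not part of
# the repo.
def warmup_decay_schedule(batch_idx, base_lr=0.0005, warmup_batches=5000.0,
                          max_scale=0.5, decay_every=10000, decay_rate=0.95):
    scale = min(max_scale, (batch_idx + 1) / warmup_batches)
    lr = base_lr * (decay_rate ** ((batch_idx + 1) // decay_every))
    return scale * lr, scale

# e.g. warmup_decay_schedule(0) -> (1e-07, 0.0002); the returned scale is what
# the loop above also multiplies into mom_1.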
for i in range(MCS.chain_len):
    Xtr_chains.append(0.0 * Xtr)

print("Testing chain sampler....")
loop_times = []
# TESTING SAMPLING SPEED!
for i in range(batch_count):
    start_time = time.clock()
    batch_start = i * batch_size
    batch_end = min(tr_samples, (batch_start + batch_size))
    Xd_batch = Xtr[batch_start:batch_end]
    Xd_chain = MCS.sample_from_chain(Xd_batch)
    Xs = [Xd_batch[0:50]]
    Xs.extend([xd[0:50] for xd in Xd_chain])
    file_name = "MCS_TEST_{0:d}.png".format(i)
    utils.visualize_samples(np.vstack(Xs), file_name, num_rows=10)
    loop_times.append((time.clock() - start_time))
total_time = sum(loop_times)
mean_time = total_time / batch_count
# report the standard deviation of per-batch times (square root of the
# mean squared deviation), not the variance
time_std = (sum([(t - mean_time)**2.0 for t in loop_times]) / batch_count)**0.5
print("total_time: {0:.4f}".format(total_time))
print("mean_time: {0:.4f}, time_std: {1:.4f}".format(mean_time, time_std))
start_time = time.clock()
Xtr_chains = resample_chain_steps(MCS, Xtr_chains)
total_time = time.clock() - start_time
print("total_time: {0:.4f}".format(total_time))
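# --- Sketch: reusable timing helper for the sampling-speed test ---------------
# The loop above records wall-clock time per batch with time.clock and reports
# the mean and spread. A small reusable version using the monotonic
# time.perf_counter is sketched below; the name time_loop is illustrative.
import time
import numpy as np

def time_loop(fn, n_reps=10):
    times = []
    for _ in range(n_reps):
        t0 = time.perf_counter()
        fn()
        times.append(time.perf_counter() - t0)
    times = np.asarray(times)
    return times.sum(), times.mean(), times.std()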
def test_two_stage_model2(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') Xtr = np.vstack((Xtr, Xva)) Xva = Xte #del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 128 batch_reps = 1 ############################################### # Setup some parameters for the TwoStageModel # ############################################### x_dim = Xtr.shape[1] z_dim = 50 h_dim = 100 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from xin_sym = T.matrix('xin_sym') xout_sym = T.matrix('xout_sym') ############### # p_h_given_z # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': z_dim, 'out_chans': 100, 'activation': tanh_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': 100, 'activation': tanh_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': h_dim, 'activation': tanh_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': h_dim, 'activation': tanh_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False p_h_given_z = HydraNet(rng=rng, Xd=xin_sym, params=params, shared_param_dicts=None) p_h_given_z.init_biases(0.0) ############### # p_x_given_h # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': h_dim, 'out_chans': 200, 'activation': tanh_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': 200, 'activation': tanh_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': x_dim, 'activation': tanh_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': x_dim, 'activation': tanh_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False p_x_given_h = HydraNet(rng=rng, Xd=xin_sym, params=params, shared_param_dicts=None) p_x_given_h.init_biases(0.0) ############### # q_h_given_x # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': x_dim, 'out_chans': 200, 'activation': tanh_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': 200, 'activation': tanh_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': h_dim, 'activation': tanh_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': h_dim, 'activation': tanh_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False q_h_given_x = HydraNet(rng=rng, Xd=xin_sym, params=params, shared_param_dicts=None) q_h_given_x.init_biases(0.0) ############### # q_z_given_h # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': h_dim, 'out_chans': 100, 'activation': tanh_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': 100, 'activation': tanh_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': z_dim, 'activation': tanh_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': z_dim, 'activation': tanh_actfun, 'apply_bn': False} ] params['shared_config'] = 
shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False q_z_given_h = HydraNet(rng=rng, Xd=xin_sym, params=params, shared_param_dicts=None) q_z_given_h.init_biases(0.0) ############################################################## # Define parameters for the TwoStageModel, and initialize it # ############################################################## print("Building the TwoStageModel...") tsm_params = {} tsm_params['x_type'] = x_type tsm_params['obs_transform'] = 'sigmoid' TSM = TwoStageModel2(rng=rng, x_in=xin_sym, x_out=xout_sym, x_dim=x_dim, z_dim=z_dim, h_dim=h_dim, q_h_given_x=q_h_given_x, q_z_given_h=q_z_given_h, p_h_given_z=p_h_given_z, p_x_given_h=p_x_given_h, params=tsm_params) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format("TSM2A_TEST") out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.001 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(500000): scale = min(1.0, ((i + 1) / 5000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) Xb = to_fX(Xtr.take(batch_idx, axis=0)) #Xb = binarize_data(Xtr.take(batch_idx, axis=0)) # set sgd and objective function hyperparams for this update TSM.set_sgd_params(lr=scale * learn_rate, mom_1=(scale * momentum), mom_2=0.98) TSM.set_train_switch(1.0) TSM.set_lam_nll(lam_nll=1.0) TSM.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0) TSM.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch result = TSM.train_joint(Xb, Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) str6 = " nll : {0:.4f}".format(np.mean(costs[4])) str7 = " kld_z : {0:.4f}".format(np.mean(costs[5])) str8 = " kld_h : {0:.4f}".format(np.mean(costs[6])) joint_str = "\n".join( [str1, str2, str3, str4, str5, str6, str7, str8]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 5000) == 0) or ((i < 10000) and ((i % 1000) == 0))): # draw some independent random samples from the model samp_count = 300 model_samps = TSM.sample_from_prior(samp_count) file_name = "TSM2A_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=15) # compute free energy estimate for validation samples Xva = row_shuffle(Xva) fe_terms = TSM.compute_fe_terms(Xva[0:5000], Xva[0:5000], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) out_str = " nll_bound : {0:.4f}".format(fe_mean) print(out_str) out_file.write(out_str + "\n") out_file.flush() return
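# --- Sketch: assembling the validation NLL bound ------------------------------
# In the diagnostics above, compute_fe_terms returns per-example reconstruction
# NLL and KL terms, and their means are summed into an upper bound on the
# average negative log-likelihood. The helper below restates that computation;
# the callable's signature mirrors how TSM.compute_fe_terms is used above and
# is otherwise an assumption.
import numpy as np

def estimate_nll_bound(compute_fe_terms, Xva, n_eval=5000, n_samples=20):
    fe_terms = compute_fe_terms(Xva[:n_eval], Xva[:n_eval], n_samples)
    return np.mean(fe_terms[0]) + np.mean(fe_terms[1])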
def test_mnist(step_type='add', imp_steps=6, occ_dim=15, drop_prob=0.0): ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}GPSI_conv_bn_OD{}_DP{}_IS{}_{}_NA".format( RESULT_PATH, occ_dim, dp_int, imp_steps, step_type) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = datasets[0][0] Xva = datasets[1][0] Xte = datasets[2][0] # Merge validation set and training set, and test on test set. Xtr = np.concatenate((Xtr, Xva), axis=0) Xva = Xte Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 batch_reps = 1 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1], ))) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] z_dim = 100 init_scale = 1.0 use_bn = True x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ################# # p_zi_given_xi # ################# params = {} shared_config = \ [ {'layer_type': 'conv', 'in_chans': 1, # in shape: (batch, 784) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_in': lambda x: T.reshape(x, (-1, 1, 28, 28))}, \ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 128, # out shape: (batch, 128, 7, 7) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'fc', 'in_chans': 128*7*7, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': use_bn} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = init_scale params['build_theano_funcs'] = False p_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.0) ################### # p_sip1_given_zi # ################### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': z_dim, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': use_bn}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': 7*7*128, 'activation': relu_actfun, 'apply_bn': use_bn, 'shape_func_out': lambda x: T.reshape(x, (-1, 128, 7, 7))}, \ {'layer_type': 'conv', 'in_chans': 128, # in shape: (batch, 128, 7, 7) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': use_bn} ] output_config = \ [ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 1, # out shape: (batch, 1, 28, 28) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'conv', 'in_chans': 64, 'out_chans': 1, 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 
'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'conv', 'in_chans': 64, 'out_chans': 1, 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = init_scale params['build_theano_funcs'] = False p_sip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_sip1_given_zi.init_biases(0.0) ################# # q_zi_given_xi # ################# params = {} shared_config = \ [ {'layer_type': 'conv', 'in_chans': 2, # in shape: (batch, 784+784) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_in': lambda x: T.reshape(x, (-1, 2, 28, 28))}, \ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 128, # out shape: (batch, 128, 7, 7) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'fc', 'in_chans': 128*7*7, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': use_bn} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = init_scale params['build_theano_funcs'] = False q_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_xi.init_biases(0.0) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['x_dim'] = x_dim gpsi_params['z_dim'] = z_dim # switch between direct construction and construction via p_x_given_si gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = step_type gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 'sigmoid' GPSI = GPSImputer(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, p_zi_given_xi=p_zi_given_xi, p_sip1_given_zi=p_sip1_given_zi, q_zi_given_xi=q_zi_given_xi, params=gpsi_params, shared_param_dicts=None) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. 
for i in range(10)] learn_rate = 0.0001 momentum = 0.90 batch_idx = np.arange(batch_size) + tr_samples for i in range(200000): scale = min(1.0, ((i + 1) / 5000.0)) if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update GPSI.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.98) GPSI.set_train_switch(1.0) GPSI.set_lam_nll(lam_nll=1.0) GPSI.set_lam_kld(lam_kld_q=1.0, lam_kld_p=0.1, lam_kld_g=0.0) GPSI.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch xb = to_fX(Xtr.take(batch_idx, axis=0)) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) result = GPSI.train_joint(xi, xo, xm, batch_reps) # do diagnostics and general training tracking costs = [(costs[j] + result[j]) for j in range(len(result) - 1)] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10) vfe = np.mean(nll) + np.mean(kld) str1 = " va_nll_bound : {}".format(vfe) str2 = " va_nll_term : {}".format(np.mean(nll)) str3 = " va_kld_q2p : {}".format(np.mean(kld)) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() if ((i % 2000) == 0): #GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag)) # Get some validation samples for evaluating model performance xb = to_fX(Xva[0:100]) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros( (seq_len * samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20)
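# --- Sketch: masked-data construction for imputation training -----------------
# construct_masked_data above produces (masked input, clean target, known-value
# mask) by dropping pixels at random and/or occluding a square patch, and fills
# unknown pixels with the mean pixel value. The repo's exact behavior and
# return order are assumptions; the version below is a minimal numpy sketch.
import numpy as np

def make_masked_data(X, drop_prob=0.0, occ_dim=0, data_mean=None, rng=None):
    rng = rng if rng is not None else np.random.RandomState()
    n, d = X.shape
    side = int(np.sqrt(d))                     # assumes square images
    mask = np.ones((n, d))
    if drop_prob > 0.0:
        mask *= (rng.rand(n, d) > drop_prob)   # 1 = known pixel, 0 = dropped
    if occ_dim > 0:
        for i in range(n):
            r = rng.randint(0, side - occ_dim + 1)
            c = rng.randint(0, side - occ_dim + 1)
            m = mask[i].reshape((side, side))  # view into mask, writes through
            m[r:r + occ_dim, c:c + occ_dim] = 0.0
    fill = data_mean if data_mean is not None else 0.0
    x_in = (mask * X) + ((1.0 - mask) * fill)  # unknowns replaced by fill value
    return x_in, X.copy(), mask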
def pretrain_gip(extra_lam_kld=0.0, kld2_scale=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with tr_file = 'data/svhn_train_gray.pkl' te_file = 'data/svhn_test_gray.pkl' ex_file = 'data/svhn_extra_gray.pkl' data = load_svhn_gray(tr_file, te_file, ex_file=ex_file, ex_count=200000) #all_file = 'data/svhn_all_gray_zca.pkl' #data = load_svhn_all_gray_zca(all_file) Xtr = np.vstack([data['Xtr'], data['Xex']]) Xtr = Xtr - np.mean(Xtr, axis=1, keepdims=True) Xtr = Xtr / np.std(Xtr, axis=1, keepdims=True) Xtr = shift_and_scale_into_01(Xtr) Xtr, Xva = train_valid_split(Xtr, valid_count=5000) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # setup some symbolic variables and stuff Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} gn_config = [PRIOR_DIM, 2400, 2400, data_dim] gn_params['mlp_config'] = gn_config gn_params['activation'] = relu_actfun gn_params['out_type'] = 'gaussian' gn_params['mean_transform'] = 'sigmoid' gn_params['logvar_type'] = 'single_shared' gn_params['init_scale'] = 1.2 gn_params['lam_l2a'] = 1e-2 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 2400, 2400] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.2 in_params['lam_l2a'] = 1e-2 in_params['vis_drop'] = 0.2 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.1 in_params['input_noise'] = 0.0 in_params['kld2_scale'] = kld2_scale # Initialize the base networks for this GIPair IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.1) GN.init_biases(0.1) ###################################### # LOAD AND RESTART FROM SAVED PARAMS # ###################################### # new_in_params = {'kld2_scale': kld2_scale, 'bias_noise': 0.2} # new_gn_params = {'bias_noise': 0.2} # # Load inferencer and generator from saved parameters # gn_fname = "TMS_RESULTS_DROPLESS/pt_params_b50000_GN.pkl" # in_fname = "TMS_RESULTS_DROPLESS/pt_params_b50000_IN.pkl" # IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd, \ # Xc=Xc, Xm=Xm, new_params=new_in_params) # GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp, \ # new_params=new_gn_params) # in_params = IN.params # gn_params = GN.params ######################### # INITIALIZE THE GIPAIR # ######################### GIP = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=PRIOR_DIM, params=None) GIP.set_lam_l2w(1e-4) #################### # RICA PRETRAINING # #################### IN.W_rica.set_value(0.05 * IN.W_rica.get_value(borrow=False)) GN.W_rica.set_value(0.05 * GN.W_rica.get_value(borrow=False)) for i in range(6000): scale = min(1.0, (float(i+1) / 6000.0)) l_rate = 0.0001 * scale lam_l1 = 0.025 tr_idx = npr.randint(low=0,high=tr_samples,size=(1000,)) Xd_batch = Xtr.take(tr_idx, axis=0) inr_out = IN.train_rica(Xd_batch, l_rate, 
lam_l1) gnr_out = GN.train_rica(Xd_batch, l_rate, lam_l1) inr_out = [v for v in gnr_out] if ((i % 1000) == 0): print("rica batch {0:d}: in_recon={1:.4f}, in_spars={2:.4f}, gn_recon={3:.4f}, gn_spars={4:.4f}".format( \ i, 1.*inr_out[1], 1.*inr_out[2], 1.*gnr_out[1], 1.*gnr_out[2])) # draw inference net first layer weights file_name = RESULT_PATH+"pt_rica_inf_weights.png".format(i) utils.visualize_samples(IN.W_rica.get_value(borrow=False).T, file_name, num_rows=20) # draw generator net final layer weights file_name = RESULT_PATH+"pt_rica_gen_weights.png".format(i) if ('gaussian' in gn_params['out_type']): lay_num = -2 else: lay_num = -1 utils.visualize_samples(GN.W_rica.get_value(borrow=False), file_name, num_rows=20) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH+"pt_gip_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters cost_1 = [0. for i in range(10)] learn_rate = 0.0002 for i in range(300000): scale = min(1.0, float(i) / 40000.0) if ((i + 1) % 100000 == 0): learn_rate = learn_rate * 0.8 # do a minibatch update of the model, and compute some costs tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xd_batch = Xtr.take(tr_idx, axis=0) Xd_batch = np.repeat(Xd_batch, batch_reps, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs GIP.set_all_sgd_params(lr_gn=(scale*learn_rate), \ lr_in=(scale*learn_rate), mom_1=0.9, mom_2=0.999) GIP.set_lam_nll(1.0) GIP.set_lam_kld(1.0 + extra_lam_kld*scale) outputs = GIP.train_joint(Xd_batch, Xc_batch, Xm_batch) cost_1 = [(cost_1[k] + 1.*outputs[k]) for k in range(len(outputs))] if ((i % 1000) == 0): cost_1 = [(v / 1000.) for v in cost_1] o_str = "batch: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \ i, cost_1[0], cost_1[1], cost_1[2], cost_1[3]) print(o_str) out_file.write(o_str+"\n") out_file.flush() cost_1 = [0. for v in cost_1] if ((i % 5000) == 0): cost_2 = GIP.compute_costs(Xva, 0.*Xva, 0.*Xva) o_str = "--val: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \ i, 1.*cost_2[0], 1.*cost_2[1], 1.*cost_2[2], 1.*cost_2[3]) print(o_str) out_file.write(o_str+"\n") out_file.flush() if ((i % 5000) == 0): tr_idx = npr.randint(low=0,high=va_samples,size=(100,)) Xd_batch = Xva.take(tr_idx, axis=0) file_name = RESULT_PATH+"pt_gip_chain_samples_b{0:d}.png".format(i) Xd_samps = np.repeat(Xd_batch[0:10,:], 3, axis=0) sample_lists = GIP.sample_from_chain(Xd_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw samples freely from the generative model's prior file_name = RESULT_PATH+"pt_gip_prior_samples_b{0:d}.png".format(i) Xs = GIP.sample_from_prior(20*20) utils.visualize_samples(Xs, file_name, num_rows=20) # draw inference net first layer weights file_name = RESULT_PATH+"pt_gip_inf_weights_b{0:d}.png".format(i) utils.visualize_net_layer(GIP.IN.shared_layers[0], file_name) # draw generator net final layer weights file_name = RESULT_PATH+"pt_gip_gen_weights_b{0:d}.png".format(i) if (gn_params['out_type'] == 'gaussian'): lay_num = -2 else: lay_num = -1 utils.visualize_net_layer(GIP.GN.mlp_layers[lay_num], file_name, \ colorImg=False, use_transpose=True) ######################### # Check posterior KLds. 
# ######################### post_klds = posterior_klds(IN, Xtr, 5000, 5) file_name = RESULT_PATH+"pt_gip_post_klds_b{0:d}.png".format(i) utils.plot_kde_histogram2( \ np.asarray(post_klds), np.asarray(post_klds), file_name, bins=30) if ((i % 10000) == 0): IN.save_to_file(f_name=RESULT_PATH+"pt_gip_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH+"pt_gip_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH+"pt_gip_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH+"pt_gip_params_GN.pkl") return
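# --- Sketch: the RICA pretraining objective ------------------------------------
# The RICA warm-up above (IN.train_rica / GN.train_rica) presumably minimizes a
# tied-weight linear reconstruction cost plus an L1 sparsity penalty on the
# codes, as in "ICA with Reconstruction Cost" (Le et al., 2011). The numpy cost
# below is a sketch of that objective, not the repo's implementation.
import numpy as np

def rica_cost(W, X, lam_l1=0.025):
    # W: (input_dim, code_dim) filter matrix, X: (batch, input_dim) data
    codes = X.dot(W)                 # linear encoding
    recon = codes.dot(W.T)           # tied-weight linear decoding
    recon_cost = np.mean(np.sum((recon - X) ** 2, axis=1))
    spars_cost = lam_l1 * np.mean(np.sum(np.abs(codes), axis=1))
    return recon_cost + spars_cost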
def train_walk_from_pretrained_gip(extra_lam_kld=0.0): # Simple test code, to check that everything is basically functional. print("TESTING...") # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with tr_file = 'data/svhn_train_gray.pkl' te_file = 'data/svhn_test_gray.pkl' ex_file = 'data/svhn_extra_gray.pkl' data = load_svhn_gray(tr_file, te_file, ex_file=ex_file, ex_count=200000) #all_file = 'data/svhn_all_gray_zca.pkl' #data = load_svhn_all_gray_zca(all_file) Xtr = np.vstack([data['Xtr'], data['Xex']]) Xtr = Xtr - np.mean(Xtr, axis=1, keepdims=True) Xtr = Xtr / np.std(Xtr, axis=1, keepdims=True) Xtr = shift_and_scale_into_01(Xtr) Xtr, Xva = train_valid_split(Xtr, valid_count=5000) print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),str(Xva.shape))) # get and set some basic dataset information tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] data_dim = Xtr.shape[1] batch_size = 100 batch_reps = 5 prior_sigma = 1.0 Xtr_mean = np.mean(Xtr, axis=0, keepdims=True) Xtr_mean = (0.0 * Xtr_mean) + np.mean(np.mean(Xtr,axis=1)) Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0) # Symbolic inputs Xd = T.matrix(name='Xd') Xc = T.matrix(name='Xc') Xm = T.matrix(name='Xm') Xt = T.matrix(name='Xt') Xp = T.matrix(name='Xp') START_FRESH = True if START_FRESH: ############################### # Setup discriminator network # ############################### # Set some reasonable mlp parameters dn_params = {} # Set up some proto-networks pc0 = [data_dim, (300, 4), (300, 4), 10] dn_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} dn_params['spawn_configs'] = [sc0] dn_params['spawn_weights'] = [1.0] # Set remaining params dn_params['init_scale'] = 0.25 dn_params['lam_l2a'] = 1e-2 dn_params['vis_drop'] = 0.2 dn_params['hid_drop'] = 0.5 # Initialize a network object to use as the discriminator DN = PeaNet(rng=rng, Xd=Xd, params=dn_params) DN.init_biases(0.0) ####################################################### # Load inferencer and generator from saved parameters # ####################################################### gn_fname = RESULT_PATH+"pt_gip_params_b200000_GN.pkl" in_fname = RESULT_PATH+"pt_gip_params_b200000_IN.pkl" IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd, Xc=Xc, Xm=Xm) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) else: ########################################################### # Load all networks from partially-trained VCGLoop params # ########################################################### gn_fname = RESULT_PATH+"pt_walk_params_GN.pkl" in_fname = RESULT_PATH+"pt_walk_params_IN.pkl" dn_fname = RESULT_PATH+"pt_walk_params_DN.pkl" IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd, Xc=Xc, Xm=Xm) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) DN = PNet.load_peanet_from_file(f_name=dn_fname, rng=rng, Xd=Xd) ############################### # Initialize the main VCGLoop # ############################### vcgl_params = {} vcgl_params['lam_l2d'] = 5e-2 VCGL = VCGLoop(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, Xt=Xt, i_net=IN, \ g_net=GN, d_net=DN, chain_len=6, data_dim=data_dim, \ prior_dim=PRIOR_DIM, params=vcgl_params) VCGL.set_lam_l2w(1e-4) out_file = open(RESULT_PATH+"pt_walk_results.txt", 'wb') #################################################### # Train the VCGLoop by unrolling and applying 
BPTT # #################################################### learn_rate = 0.0002 cost_1 = [0. for i in range(10)] for i in range(1000000): scale = float(min((i+1), 25000)) / 25000.0 if ((i+1 % 50000) == 0): learn_rate = learn_rate * 0.8 ######################################## # TRAIN THE CHAIN IN FREE-RUNNING MODE # ######################################## VCGL.set_all_sgd_params(learn_rate=(scale*learn_rate), \ mom_1=0.9, mom_2=0.999) VCGL.set_disc_weights(dweight_gn=20.0, dweight_dn=4.0) VCGL.set_lam_chain_nll(1.0) VCGL.set_lam_chain_kld(1.0 + extra_lam_kld) VCGL.set_lam_chain_vel(0.0) VCGL.set_lam_mask_nll(0.0) VCGL.set_lam_mask_kld(0.0) # get some data to train with tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do 5 repetitions of the batch Xd_batch = np.repeat(Xd_batch, batch_reps, axis=0) Xc_batch = np.repeat(Xc_batch, batch_reps, axis=0) Xm_batch = np.repeat(Xm_batch, batch_reps, axis=0) # examples from the target distribution, to train discriminator tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_reps*batch_size,)) Xt_batch = Xtr.take(tr_idx, axis=0) # do a minibatch update of the model, and compute some costs outputs = VCGL.train_joint(Xd_batch, Xc_batch, Xm_batch, Xt_batch) cost_1 = [(cost_1[k] + 1.*outputs[k]) for k in range(len(outputs))] if ((i % 1000) == 0): cost_1 = [(v / 1000.0) for v in cost_1] o_str_1 = "batch: {0:d}, joint_cost: {1:.4f}, chain_nll_cost: {2:.4f}, chain_kld_cost: {3:.4f}, disc_cost_gn: {4:.4f}, disc_cost_dn: {5:.4f}".format( \ i, cost_1[0], cost_1[1], cost_1[2], cost_1[6], cost_1[7]) print(o_str_1) out_file.write(o_str_1+"\n") out_file.flush() cost_1 = [0. for v in cost_1] if ((i % 5000) == 0): tr_idx = npr.randint(low=0,high=Xtr.shape[0],size=(5,)) va_idx = npr.randint(low=0,high=Xva.shape[0],size=(5,)) Xd_batch = np.vstack([Xtr.take(tr_idx, axis=0), Xva.take(va_idx, axis=0)]) # draw some chains of samples from the VAE loop file_name = RESULT_PATH+"pt_walk_chain_samples_b{0:d}.png".format(i) Xd_samps = np.repeat(Xd_batch, 3, axis=0) sample_lists = VCGL.GIP.sample_from_chain(Xd_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw some masked chains of samples from the VAE loop file_name = RESULT_PATH+"pt_walk_mask_samples_b{0:d}.png".format(i) Xd_samps = np.repeat(Xc_mean[0:Xd_batch.shape[0],:], 3, axis=0) Xc_samps = np.repeat(Xd_batch, 3, axis=0) Xm_rand = sample_masks(Xc_samps, drop_prob=0.2) Xm_patch = sample_patch_masks(Xc_samps, (32,32), (16,16)) Xm_samps = Xm_rand * Xm_patch sample_lists = VCGL.GIP.sample_from_chain(Xd_samps, \ X_c=Xc_samps, X_m=Xm_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw some samples independently from the GenNet's prior file_name = RESULT_PATH+"pt_walk_prior_samples_b{0:d}.png".format(i) Xs = VCGL.sample_from_prior(20*20) utils.visualize_samples(Xs, file_name, num_rows=20) # draw discriminator network's weights file_name = RESULT_PATH+"pt_walk_dis_weights_b{0:d}.png".format(i) utils.visualize_net_layer(VCGL.DN.proto_nets[0][0], file_name) # draw inference net first layer weights file_name = RESULT_PATH+"pt_walk_inf_weights_b{0:d}.png".format(i) utils.visualize_net_layer(VCGL.IN.shared_layers[0], file_name) # draw generator net final layer weights file_name = RESULT_PATH+"pt_walk_gen_weights_b{0:d}.png".format(i) if GN.out_type == 'sigmoid': 
utils.visualize_net_layer(VCGL.GN.mlp_layers[-1], file_name, use_transpose=True) else: utils.visualize_net_layer(VCGL.GN.mlp_layers[-2], file_name, use_transpose=True) ######################### # Check posterior KLds. # ######################### post_klds = posterior_klds(IN, Xtr, 5000, 5) file_name = RESULT_PATH+"pt_walk_post_klds_b{0:d}.png".format(i) utils.plot_kde_histogram2( \ np.asarray(post_klds), np.asarray(post_klds), file_name, bins=30) # DUMP PARAMETERS FROM TIME-TO-TIME if (i % 10000 == 0): DN.save_to_file(f_name=RESULT_PATH+"pt_walk_params_b{0:d}_DN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH+"pt_walk_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH+"pt_walk_params_b{0:d}_GN.pkl".format(i)) return
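# --- Sketch: dropout and patch occlusion masks ---------------------------------
# The masked-chain visualization above multiplies a random per-pixel dropout
# mask (sample_masks) with a rectangular patch mask (sample_patch_masks), so
# both scattered pixels and a contiguous block get hidden. The two helpers
# below sketch that behavior; their signatures are assumptions based on how the
# repo's functions are called above.
import numpy as np

def random_pixel_masks(X, drop_prob=0.2, rng=None):
    rng = rng if rng is not None else np.random.RandomState()
    return (rng.rand(*X.shape) > drop_prob).astype(X.dtype)

def random_patch_masks(X, im_shape=(32, 32), patch_shape=(16, 16), rng=None):
    rng = rng if rng is not None else np.random.RandomState()
    rows, cols = im_shape
    p_rows, p_cols = patch_shape
    masks = np.ones((X.shape[0], rows, cols), dtype=X.dtype)
    for i in range(X.shape[0]):
        r = rng.randint(0, rows - p_rows + 1)
        c = rng.randint(0, cols - p_cols + 1)
        masks[i, r:r + p_rows, c:c + p_cols] = 0.0
    return masks.reshape((X.shape[0], rows * cols))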
def main(): # 导入高分辨和低分辨的图片 lr_batch, hr_batch = batch_queue_for_training(TRAIN_DATA_PATH) coord = tf.train.Coordinator() # ======================================== # Create Network # ======================================== lr_holders = tf.placeholder( dtype=tf.float32, shape=[BATCH_SIZE, INPUT_SIZE, INPUT_SIZE, NUM_CHENNELS]) hr_holders = tf.placeholder( dtype=tf.float32, shape=[BATCH_SIZE, PATCH_SIZE, PATCH_SIZE, NUM_CHENNELS]) real_data = hr_holders # ---------------------------------------- # Generator with tf.variable_scope('generator', reuse=tf.AUTO_REUSE): fake_data = generator(lr_holders) real_hsv_data = tf.image.rgb_to_hsv(real_data) fake_hsv_data = tf.image.rgb_to_hsv(fake_data) interpolation = tf.image.resize_bicubic(lr_holders, [PATCH_SIZE, PATCH_SIZE]) # ---------------------------------------- # Discriminator with tf.variable_scope('discriminator', reuse=tf.AUTO_REUSE): y_fake = discriminator(fake_data) with tf.variable_scope('discriminator', reuse=tf.AUTO_REUSE): y_real = discriminator(real_data) # ---------------------------------------- # Encoder with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE): real_code = encoder(real_data) with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE): fake_code = encoder(fake_data) # ---------------------------------------- # Code Discriminator with tf.variable_scope('code_discriminator', reuse=tf.AUTO_REUSE): c_fake = code_discriminator(fake_code) with tf.variable_scope('code_discriminator', reuse=tf.AUTO_REUSE): c_real = code_discriminator(real_code) # ======================================== # Create Optimizer # ======================================== variables = tf.trainable_variables() encoder_vars = [var for var in variables if 'encoder/' in var.name] generator_vars = [var for var in variables if 'generator/' in var.name] discriminator_vars = [ var for var in variables if 'discriminator/' in var.name ] code_discriminator_vars = [ var for var in variables if 'code_discriminator/' in var.name ] # ======================================== # Define Loss(WGAN/DCGAN有问题) # ======================================== generator_loss = tf.reduce_mean(y_fake) discriminator_loss = tf.reduce_mean(y_real) - tf.reduce_mean(y_fake) code_generator_loss = tf.reduce_mean(c_fake) code_discriminator_loss = tf.reduce_mean(c_real) - tf.reduce_mean(c_fake) # generator_loss = tf.reduce_mean(-tf.log(y_fake + EPS)) # discriminator_loss = tf.reduce_mean( # -tf.log(y_real + EPS)) - tf.reduce_mean(-tf.log(1 - y_fake + EPS)) # code_generator_loss = tf.reduce_mean(-tf.log(c_fake + EPS)) # code_discriminator_loss = tf.reduce_mean( # -tf.log(c_real + EPS)) - tf.reduce_mean(-tf.log(1 - c_fake + EPS)) reconstruction_loss = tf.reduce_mean( tf.squared_difference(real_data, fake_data)) # 保证图片颜色不变 s_loss = tf.reduce_mean( tf.abs(real_hsv_data[:, :, :, 1] - fake_hsv_data[:, :, :, 1])) h_loss = tf.reduce_mean( tf.abs(real_hsv_data[:, :, :, 0] - fake_hsv_data[:, :, :, 0])) generator_encoder_loss = generator_loss + code_generator_loss \ + reconstruction_loss_weight * reconstruction_loss + s_loss + h_loss # ---------------------------------------- # Generator generator_encoder_opt = tf.train.RMSPropOptimizer(LEARN_RATE).minimize( generator_encoder_loss, var_list=generator_vars + encoder_vars) # ---------------------------------------- # Discriminator discriminator_opt = tf.train.RMSPropOptimizer(LEARN_RATE).minimize( discriminator_loss, var_list=discriminator_vars) # ---------------------------------------- # Code Discriminator code_discriminator_opt = 
tf.train.RMSPropOptimizer(LEARN_RATE).minimize( code_discriminator_loss, var_list=code_discriminator_vars) # ======================================== # Important # ======================================== # c=0.01 clip_bounds = [-0.01, 0.01] d_clip = [] for var in discriminator_vars: d_clip.append( tf.assign(var, tf.clip_by_value(var, clip_bounds[0], clip_bounds[1]))) clip_disc_weight = tf.group(*d_clip) c_clip = [] for var in code_discriminator_vars: c_clip.append( tf.assign(var, tf.clip_by_value(var, clip_bounds[0], clip_bounds[1]))) clip_code_disc_weight = tf.group(*c_clip) # for summaries with tf.name_scope('Summary'): tf.summary.image('inputs', lr_holders, max_outputs=4) tf.summary.image('generator', fake_data, max_outputs=4) tf.summary.image('interpolation', interpolation, max_outputs=4) tf.summary.image('targets', hr_holders, max_outputs=4) tf.summary.scalar('generator_loss', generator_loss) tf.summary.scalar('s_loss', s_loss) tf.summary.scalar('h_loss', h_loss) tf.summary.scalar('discriminator_loss', discriminator_loss) tf.summary.scalar('code_generator_loss', code_generator_loss) tf.summary.scalar('code_discriminator_loss', code_discriminator_loss) tf.summary.scalar('reconstruction_loss', reconstruction_loss) tf.summary.scalar('generator_encoder_loss', generator_encoder_loss) # 初始化tensorflow sess = tf.Session() init = [ tf.local_variables_initializer(), tf.global_variables_initializer() ] sess.run(init) # the saver will restore all model's variables during training saver = tf.train.Saver(tf.global_variables(), max_to_keep=MAX_CKPT_TO_KEEP) try: saved_global_step = load(saver, sess, CHECKPOINTS_PATH) if saved_global_step is None: saved_global_step = 0 except: raise ValueError( "You have changed the model, Please Delete CheckPoints!") # Start the queue runners (make batches). 
threads = tf.train.start_queue_runners(sess=sess, coord=coord) # Merage all the summaries and write them out to TRAINING_DIR merged_summary = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(TRAIN_SUMMARY_PATH, sess.graph) num_item_per_epoch = len(os.listdir(TRAIN_DATA_PATH)) // BATCH_SIZE time_i = time.time() step = saved_global_step for epoch in range(NUM_EPOCH): for item in range(num_item_per_epoch): lr_images, hr_images = sess.run([lr_batch, hr_batch]) feed_dict = {lr_holders: lr_images, hr_holders: hr_images} # ------------------train G twice------------------- _, gene_loss = sess.run( [generator_encoder_opt, generator_encoder_loss], feed_dict=feed_dict) _, gene_loss = sess.run( [generator_encoder_opt, generator_encoder_loss], feed_dict=feed_dict) # ------------------train D ------------------------ _, d_loss = sess.run( [discriminator_opt, discriminator_loss], feed_dict=feed_dict) # ------------------train code_D ------------------- _, c_loss = sess.run( [code_discriminator_opt, code_discriminator_loss], feed_dict=feed_dict) sess.run( [clip_disc_weight, clip_code_disc_weight], feed_dict=feed_dict) sr_img = sess.run(fake_data, feed_dict=feed_dict) interpolation_img = sess.run(interpolation, feed_dict=feed_dict) summary = sess.run(merged_summary, feed_dict=feed_dict) summary_writer.add_summary(summary, global_step=step) message = 'Epoch [{:3d}/{:3d}]'.format(epoch + 1, NUM_EPOCH) \ + '[{:4d}/{:4d}]'.format(item + 1, num_item_per_epoch) \ + 'gene_loss={:6.8f}, '.format(gene_loss) \ + 'd_loss={:6.8f}, '.format(d_loss) \ + 'c_loss={:6.8f}, '.format(c_loss) \ + 'Time={:.2f}.'.format(time.time() - time_i) print(message) step += 1 visualize_samples( sess, hr_images, sr_img, interpolation_img, filename=os.path.join(INFERENCES_SAVE_PATH, 'trian-epoch-{:03d}.png'.format(epoch + 1))) save(saver, sess, CHECKPOINTS_PATH, step) coord.request_stop() coord.join(threads=threads)
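# --- Sketch: HSV hue/saturation consistency loss -------------------------------
# The super-resolution GAN above adds h_loss and s_loss terms that compare the
# hue and saturation channels of the real and generated images, which keeps the
# generator from drifting in color. The TF1-style graph function below restates
# those two terms; it assumes RGB inputs scaled to [0, 1], and the function name
# is illustrative.
import tensorflow as tf

def hsv_color_losses(real_rgb, fake_rgb):
    real_hsv = tf.image.rgb_to_hsv(real_rgb)
    fake_hsv = tf.image.rgb_to_hsv(fake_rgb)
    h_loss = tf.reduce_mean(tf.abs(real_hsv[..., 0] - fake_hsv[..., 0]))
    s_loss = tf.reduce_mean(tf.abs(real_hsv[..., 1] - fake_hsv[..., 1]))
    return h_loss, s_loss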
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr_shared = datasets[0][0] Xva_shared = datasets[1][0] Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX) Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX) tr_samples = Xtr.shape[0] batch_size = 200 batch_reps = 1 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 20 h_dim = 100 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from X_sym = T.matrix('X_sym') ######################## # p_s0_obs_given_z_obs # ######################## params = {} shared_config = [z_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 1e-3 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_obs_given_z_obs = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) p_s0_obs_given_z_obs.init_biases(0.2) ################# # p_hi_given_si # ################# params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [h_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.2) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim + obs_dim), 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun 
params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModel(rng=rng, x_in=X_sym, \ p_s0_obs_given_z_obs=p_s0_obs_given_z_obs, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \ model_init_obs=True, ir_steps=2, \ params=msm_params) obs_mean = (0.9 * np.mean(Xtr, axis=0)) + 0.05 obs_mean_logit = np.log(obs_mean / (1.0 - obs_mean)) MSM.set_input_bias(-obs_mean) MSM.set_obs_bias(0.1*obs_mean_logit) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ costs = [0. for i in range(10)] learn_rate = 0.0003 momentum = 0.8 for i in range(300000): scale = min(1.0, ((i+1) / 10000.0)) extra_kl = max(0.0, ((50000.0 - i) / 50000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # randomly sample a minibatch tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xb = binarize_data(Xtr.take(tr_idx, axis=0)) Xb = Xb.astype(theano.config.floatX) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=(scale*momentum), mom_2=0.98) MSM.set_train_switch(1.0) MSM.set_l1l2_weight(1.0) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_1=(1.0+extra_kl), lam_kld_2=(1.0+extra_kl)) MSM.set_lam_l2w(1e-6) MSM.set_kzg_weight(0.01) # perform a minibatch update and record the cost for this batch result = MSM.train_joint(Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] print("-- batch {0:d} --".format(i)) print(" joint_cost: {0:.4f}".format(costs[0])) print(" nll_cost : {0:.4f}".format(costs[1])) print(" kld_cost : {0:.4f}".format(costs[2])) print(" reg_cost : {0:.4f}".format(costs[3])) costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): Xva = row_shuffle(Xva) # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MX_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # visualize some important weights in the model file_name = "MX_INF_1_WEIGHTS_b{0:d}.png".format(i) W = MSM.inf_1_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_INF_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.inf_2_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_GEN_1_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_1_weights.get_value(borrow=False) 
utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_GEN_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_2_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_GEN_INF_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_inf_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) # compute information about posterior KLds on validation set post_klds = MSM.compute_post_klds(Xva[0:5000]) file_name = "MX_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[0].shape[1]), \ np.mean(post_klds[0], axis=0), file_name) file_name = "MX_HI_COND_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[1].shape[1]), \ np.mean(post_klds[1], axis=0), file_name) file_name = "MX_HI_GLOB_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[2].shape[1]), \ np.mean(post_klds[2], axis=0), file_name) # compute information about free-energy on validation set file_name = "MX_FREE_ENERGY_b{0:d}.png".format(i) fe_terms = MSM.compute_fe_terms(binarize_data(Xva[0:5000]), 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) print(" nll_bound : {0:.4f}".format(fe_mean)) utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') return
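# --- Sketch: logit-space bias initialization from the data mean ----------------
# Above (in test_with_model_init), the observation bias is initialized from the
# training-set pixel means: the means are squashed into (0.05, 0.95) and mapped
# through the logit (inverse sigmoid) before being scaled and handed to
# MSM.set_obs_bias, so the sigmoid outputs start near the mean digit. The helper
# below restates that computation; the name mean_logit_bias is illustrative.
import numpy as np

def mean_logit_bias(X, squash=0.9, eps=0.05):
    obs_mean = (squash * np.mean(X, axis=0)) + eps   # keep values inside (0, 1)
    return np.log(obs_mean / (1.0 - obs_mean))       # inverse sigmoid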
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = to_fX(datasets[0][0]) Xva = to_fX(datasets[1][0]) Ytr = datasets[0][1] Yva = datasets[1][1] Xtr_class_groups = make_class_groups(Xtr, Ytr) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 300 BD = lambda ary: binarize_data(ary) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 32 h_dim = 100 ir_steps = 2 init_scale = 1.0 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from x_in = T.matrix('x_in') x_pos = T.matrix('x_pos') y_in = T.lvector('y_in') ################# # p_hi_given_si # ################# params = {} shared_config = [obs_dim, 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [(h_dim + obs_dim), 500, 500] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ################ # p_s0_given_z # ################ params = {} shared_config = [z_dim, 500, 500] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_given_z = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) p_s0_given_z.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, (500, 4), (500, 4)] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.2 params['hid_drop'] = 0.5 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.0) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim + obs_dim), 800, 800] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = 
top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModelSS(rng=rng, \ x_in=x_in, x_pos=x_pos, y_in=y_in, \ p_s0_given_z=p_s0_given_z, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ class_count=10, \ obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \ ir_steps=ir_steps, params=msm_params) MSM.set_lam_class(lam_class=20.0) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.9, \ lam_kld_p2q=0.1) MSM.set_lam_l2w(1e-4) MSM.set_drop_rate(0.0) MSM.q_hi_given_x_si.set_bias_noise(0.0) MSM.p_hi_given_si.set_bias_noise(0.0) MSM.p_sip1_given_si_hi.set_bias_noise(0.0) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ out_file = open("MSS_A_RESULTS.txt", 'wb') costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 2000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 if (i > 20000): momentum = 0.90 else: momentum = 0.50 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr, Ytr = row_shuffle(Xtr, Ytr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.99) MSM.set_train_switch(1.0) # perform a minibatch update and record the cost for this batch Xi_tr = Xtr.take(batch_idx, axis=0) Yi_tr = Ytr.take(batch_idx, axis=0) Xp_tr, Xn_tr = sample_class_groups(Yi_tr, Xtr_class_groups) result = MSM.train_joint(BD(Xi_tr), BD(Xp_tr), Yi_tr) costs = [(costs[j] + result[j]) for j in range(len(result)-1)] # output useful information about training progress if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost : {0:.4f}".format(costs[0]) str3 = " class_cost : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): # Get some validation samples for computing diagnostics Xva, Yva = row_shuffle(Xva, Yva) Xb_va = Xva[0:2500] Yb_va = Yva[0:2500] # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, 
model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSS_A_SAMPLES_IND_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # draw some conditional random samples from the model Xs = Xb_va[0:50] # only use validation set samples Xs = np.repeat(Xs, 4, axis=0) samp_count = Xs.shape[0] utils.visualize_samples(seq_samps, file_name, num_rows=20) # draw some conditional random samples from the model model_samps = MSM.sample_from_input(BD(Xs), guided_decoding=False) model_samps.append(Xs) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSS_A_SAMPLES_CND_UD_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # compute information about posterior KLds on validation set raw_costs = MSM.compute_raw_costs(BD(Xb_va), BD(Xb_va)) init_nll, init_kld, q2p_kld, p2q_kld, step_nll, step_kld = raw_costs file_name = "MSS_A_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(init_kld.shape[1]), \ np.mean(init_kld, axis=0), file_name) file_name = "MSS_A_HI_Q2P_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(q2p_kld.shape[1]), \ np.mean(q2p_kld, axis=0), file_name) file_name = "MSS_A_HI_P2Q_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(p2q_kld.shape[1]), \ np.mean(p2q_kld, axis=0), file_name) # draw weights for the initial encoder/classifier file_name = "MSS_A_QZX_WEIGHTS_b{0:d}.png".format(i) W = q_z_given_x.shared_layers[0].W.get_value(borrow=False).T utils.visualize_samples(W, file_name, num_rows=20) # compute free-energy terms on training samples fe_terms = MSM.compute_fe_terms(BD(Xtr[0:2500]), BD(Xtr[0:2500]), 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # compute free-energy terms on validation samples fe_terms = MSM.compute_fe_terms(BD(Xb_va), BD(Xb_va), 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # compute multi-sample estimate of classification error err_rate, err_idx, y_preds = MSM.class_error(Xb_va, Yb_va, \ samples=30, prep_func=BD) joint_str = " va-class-error: {0:.4f}".format(err_rate) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # draw some conditional random samples from the model Xs = Xb_va[err_idx] # use validation samples with class errors if (Xs.shape[0] > 50): Xs = Xs[:50] Xs = np.repeat(Xs, 4, axis=0) if ((Xs.shape[0] % 20) != 0): # round-off the number of error examples, for nice display remainder = Xs.shape[0] % 20 Xs = Xs[:-remainder] samp_count = Xs.shape[0] # draw some conditional random samples from the model model_samps = MSM.sample_from_input(BD(Xs), guided_decoding=False) model_samps.append(Xs) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, 
model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSS_A_SAMPLES_CND_ERR_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20)
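# NOTE: the nested s1/s2 loops above (and in several other tests in this file)
# interleave a list of per-step sample arrays so that each display row of the
# image grid shows one trajectory; this is a vectorized numpy equivalent of
# that reshaping, included only as a sketch.
import numpy as np

def interleave_step_samples(model_samps):
    """model_samps: list of seq_len arrays, each with shape (samp_count, dim)."""
    stacked = np.stack(model_samps, axis=1)      # -> (samp_count, seq_len, dim)
    samp_count, seq_len, dim = stacked.shape
    # row order matches the loops above: all steps of sample 0, then sample 1, ...
    return stacked.reshape(samp_count * seq_len, dim)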
def train_walk_from_pretrained_osm(lam_kld=0.0): # Simple test code, to check that everything is basically functional. print("TESTING...") # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with data_file = 'data/tfd_data_48x48.pkl' dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all') Xtr_unlabeled = dataset[0] dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all') Xtr_train = dataset[0] Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all') Xva = dataset[0] print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape))) # get and set some basic dataset information tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] data_dim = Xtr.shape[1] batch_size = 400 batch_reps = 5 prior_sigma = 1.0 Xtr_mean = np.mean(Xtr, axis=0, keepdims=True) Xtr_mean = (0.0 * Xtr_mean) + np.mean(np.mean(Xtr, axis=1)) Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0) # Symbolic inputs Xd = T.matrix(name='Xd') Xc = T.matrix(name='Xc') Xm = T.matrix(name='Xm') Xt = T.matrix(name='Xt') ############################### # Setup discriminator network # ############################### # Set some reasonable mlp parameters dn_params = {} # Set up some proto-networks pc0 = [data_dim, (300, 4), (300, 4), 10] dn_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = { 'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True } #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} dn_params['spawn_configs'] = [sc0] dn_params['spawn_weights'] = [1.0] # Set remaining params dn_params['init_scale'] = 1.0 dn_params['lam_l2a'] = 1e-2 dn_params['vis_drop'] = 0.2 dn_params['hid_drop'] = 0.5 # Initialize a network object to use as the discriminator DN = PeaNet(rng=rng, Xd=Xd, params=dn_params) DN.init_biases(0.0) ####################################################### # Load inferencer and generator from saved parameters # ####################################################### gn_fname = RESULT_PATH + "pt_osm_params_b100000_GN.pkl" in_fname = RESULT_PATH + "pt_osm_params_b100000_IN.pkl" IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd) ######################################################## # Define parameters for the VCGLoop, and initialize it # ######################################################## print("Building the VCGLoop...") vcgl_params = {} vcgl_params['x_type'] = 'gaussian' vcgl_params['xt_transform'] = 'sigmoid' vcgl_params['logvar_bound'] = LOGVAR_BOUND vcgl_params['cost_decay'] = 0.1 vcgl_params['chain_type'] = 'walkout' vcgl_params['lam_l2d'] = 5e-2 VCGL = VCGLoop(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, Xt=Xt, \ i_net=IN, g_net=GN, d_net=DN, chain_len=5, \ data_dim=data_dim, prior_dim=PRIOR_DIM, params=vcgl_params) out_file = open(RESULT_PATH + "pt_walk_results.txt", 'wb') #################################################### # Train the VCGLoop by unrolling and applying BPTT # #################################################### learn_rate = 0.0005 cost_1 = [0. 
for i in range(10)] for i in range(100000): scale = float(min((i + 1), 5000)) / 5000.0 if ((i + 1 % 25000) == 0): learn_rate = learn_rate * 0.8 ######################################## # TRAIN THE CHAIN IN FREE-RUNNING MODE # ######################################## VCGL.set_all_sgd_params(learn_rate=(scale*learn_rate), \ mom_1=0.9, mom_2=0.99) VCGL.set_disc_weights(dweight_gn=25.0, dweight_dn=25.0) VCGL.set_lam_chain_nll(1.0) VCGL.set_lam_chain_kld(lam_kld) # get some data to train with tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, )) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # examples from the target distribution, to train discriminator tr_idx = npr.randint(low=0, high=tr_samples, size=(2 * batch_size, )) Xt_batch = Xtr.take(tr_idx, axis=0) # do a minibatch update of the model, and compute some costs outputs = VCGL.train_joint(Xd_batch, Xc_batch, Xm_batch, Xt_batch, batch_reps) cost_1 = [(cost_1[k] + 1. * outputs[k]) for k in range(len(outputs))] if ((i % 500) == 0): cost_1 = [(v / 500.0) for v in cost_1] o_str_1 = "batch: {0:d}, joint_cost: {1:.4f}, chain_nll_cost: {2:.4f}, chain_kld_cost: {3:.4f}, disc_cost_gn: {4:.4f}, disc_cost_dn: {5:.4f}".format( \ i, cost_1[0], cost_1[1], cost_1[2], cost_1[5], cost_1[6]) print(o_str_1) cost_1 = [0. for v in cost_1] if ((i % 1000) == 0): tr_idx = npr.randint(low=0, high=Xtr.shape[0], size=(5, )) va_idx = npr.randint(low=0, high=Xva.shape[0], size=(5, )) Xd_batch = np.vstack( [Xtr.take(tr_idx, axis=0), Xva.take(va_idx, axis=0)]) # draw some chains of samples from the VAE loop file_name = RESULT_PATH + "pt_walk_chain_samples_b{0:d}.png".format( i) Xd_samps = np.repeat(Xd_batch, 3, axis=0) sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw some masked chains of samples from the VAE loop file_name = RESULT_PATH + "pt_walk_mask_samples_b{0:d}.png".format( i) Xd_samps = np.repeat(Xc_mean[0:Xd_batch.shape[0], :], 3, axis=0) Xc_samps = np.repeat(Xd_batch, 3, axis=0) Xm_rand = sample_masks(Xc_samps, drop_prob=0.0) Xm_patch = sample_patch_masks(Xc_samps, (48, 48), (25, 25)) Xm_samps = Xm_rand * Xm_patch sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, \ X_c=Xc_samps, X_m=Xm_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw some samples independently from the GenNet's prior file_name = RESULT_PATH + "pt_walk_prior_samples_b{0:d}.png".format( i) Xs = VCGL.sample_from_prior(20 * 20) utils.visualize_samples(Xs, file_name, num_rows=20) # DUMP PARAMETERS FROM TIME-TO-TIME if (i % 5000 == 0): DN.save_to_file(f_name=RESULT_PATH + "pt_walk_params_b{0:d}_DN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH + "pt_walk_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH + "pt_walk_params_b{0:d}_GN.pkl".format(i)) return
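# NOTE: every training loop in this file accumulates per-batch costs in a list
# and then averages/resets it every few hundred updates; this small helper is
# just a sketch of that bookkeeping pattern and is not part of the VCGLoop or
# OneStageModel interfaces.
class CostTracker(object):
    def __init__(self, n_costs):
        self.sums = [0.0 for _ in range(n_costs)]
        self.count = 0

    def update(self, outputs):
        # accumulate one batch worth of cost values
        for k, v in enumerate(outputs):
            self.sums[k] += float(v)
        self.count += 1

    def average_and_reset(self):
        # return the running means and clear the accumulators
        means = [s / max(self.count, 1) for s in self.sums]
        self.sums = [0.0 for _ in self.sums]
        self.count = 0
        return means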
# configure a trajectory generator
num_samples = 100
traj_len = 64
x_range = [-0.8, 0.8]
y_range = [-0.8, 0.8]
max_speed = 0.15
TRAJ = TrajectoryGenerator(x_range=x_range, y_range=y_range,
                           max_speed=max_speed)
# test the writer function
start_time = time.time()
batch_count = 50
for i in range(batch_count):
    # generate a minibatch of trajectories
    traj_pos, traj_vel = TRAJ.generate_trajectories(num_samples, traj_len)
    traj_x = traj_pos[:, :, 0]
    traj_y = traj_pos[:, :, 1]
    # draw the trajectories
    center_x = to_fX(traj_x.T.ravel())
    center_y = to_fX(traj_y.T.ravel())
    delta = to_fX(np.ones(center_x.shape))
    sigma = to_fX(np.ones(center_x.shape))
    W = write_func(center_y, center_x, delta, 0.2 * sigma)
end_time = time.time()
render_time = end_time - start_time
render_bps = batch_count / render_time
print("RENDER BATCH/SECOND: {0:.2f}".format(render_bps))
W = W[:20 * traj_len]
utils.visualize_samples(W, "AAAAA.png", num_rows=20)
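# NOTE: a self-contained stand-in for the TrajectoryGenerator used above,
# assuming it integrates random constant velocities inside a box and reflects
# them at the walls; the real class's dynamics and return shapes are
# assumptions based only on how traj_pos is indexed above.
import numpy as np

def generate_trajectories_sketch(num_samples, traj_len,
                                 x_range=(-0.8, 0.8), y_range=(-0.8, 0.8),
                                 max_speed=0.15, rng=None):
    rng = np.random.RandomState() if rng is None else rng
    lo = np.array([x_range[0], y_range[0]])
    hi = np.array([x_range[1], y_range[1]])
    pos = rng.uniform(lo, hi, size=(num_samples, 2))
    vel = rng.uniform(-max_speed, max_speed, size=(num_samples, 2))
    traj_pos = np.zeros((num_samples, traj_len, 2))
    traj_vel = np.zeros((num_samples, traj_len, 2))
    for t in range(traj_len):
        pos = pos + vel
        bounce = (pos < lo) | (pos > hi)   # reflect velocity at the box walls
        vel = np.where(bounce, -vel, vel)
        pos = np.clip(pos, lo, hi)
        traj_pos[:, t, :] = pos
        traj_vel[:, t, :] = vel
    return traj_pos, traj_vel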
def test_two_stage_model2(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') Xtr = np.vstack((Xtr, Xva)) Xva = Xte #del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 128 batch_reps = 1 ############################################### # Setup some parameters for the TwoStageModel # ############################################### x_dim = Xtr.shape[1] z_dim = 50 h_dim = 100 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from xin_sym = T.matrix('xin_sym') xout_sym = T.matrix('xout_sym') ############### # p_h_given_z # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': z_dim, 'out_chans': 100, 'activation': tanh_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': 100, 'activation': tanh_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': h_dim, 'activation': tanh_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': h_dim, 'activation': tanh_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False p_h_given_z = HydraNet(rng=rng, Xd=xin_sym, params=params, shared_param_dicts=None) p_h_given_z.init_biases(0.0) ############### # p_x_given_h # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': h_dim, 'out_chans': 200, 'activation': tanh_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': 200, 'activation': tanh_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': x_dim, 'activation': tanh_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': x_dim, 'activation': tanh_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False p_x_given_h = HydraNet(rng=rng, Xd=xin_sym, params=params, shared_param_dicts=None) p_x_given_h.init_biases(0.0) ############### # q_h_given_x # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': x_dim, 'out_chans': 200, 'activation': tanh_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': 200, 'activation': tanh_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': h_dim, 'activation': tanh_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 200, 'out_chans': h_dim, 'activation': tanh_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False q_h_given_x = HydraNet(rng=rng, Xd=xin_sym, params=params, shared_param_dicts=None) q_h_given_x.init_biases(0.0) ############### # q_z_given_h # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': h_dim, 'out_chans': 100, 'activation': tanh_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': 100, 'activation': tanh_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': z_dim, 'activation': tanh_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 100, 'out_chans': z_dim, 'activation': tanh_actfun, 'apply_bn': False} ] params['shared_config'] = 
shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False q_z_given_h = HydraNet(rng=rng, Xd=xin_sym, params=params, shared_param_dicts=None) q_z_given_h.init_biases(0.0) ############################################################## # Define parameters for the TwoStageModel, and initialize it # ############################################################## print("Building the TwoStageModel...") tsm_params = {} tsm_params['x_type'] = x_type tsm_params['obs_transform'] = 'sigmoid' TSM = TwoStageModel2(rng=rng, x_in=xin_sym, x_out=xout_sym, x_dim=x_dim, z_dim=z_dim, h_dim=h_dim, q_h_given_x=q_h_given_x, q_z_given_h=q_z_given_h, p_h_given_z=p_h_given_z, p_x_given_h=p_x_given_h, params=tsm_params) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format("TSM2A_TEST") out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.001 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(500000): scale = min(1.0, ((i+1) / 5000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) Xb = to_fX( Xtr.take(batch_idx, axis=0) ) #Xb = binarize_data(Xtr.take(batch_idx, axis=0)) # set sgd and objective function hyperparams for this update TSM.set_sgd_params(lr=scale*learn_rate, mom_1=(scale*momentum), mom_2=0.98) TSM.set_train_switch(1.0) TSM.set_lam_nll(lam_nll=1.0) TSM.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0) TSM.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch result = TSM.train_joint(Xb, Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) str6 = " nll : {0:.4f}".format(np.mean(costs[4])) str7 = " kld_z : {0:.4f}".format(np.mean(costs[5])) str8 = " kld_h : {0:.4f}".format(np.mean(costs[6])) joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7, str8]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 5000) == 0) or ((i < 10000) and ((i % 1000) == 0))): # draw some independent random samples from the model samp_count = 300 model_samps = TSM.sample_from_prior(samp_count) file_name = "TSM2A_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=15) # compute free energy estimate for validation samples Xva = row_shuffle(Xva) fe_terms = TSM.compute_fe_terms(Xva[0:5000], Xva[0:5000], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) out_str = " nll_bound : {0:.4f}".format(fe_mean) print(out_str) out_file.write(out_str+"\n") out_file.flush() return
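# NOTE: the shared_config/output_config lists above repeat the same kind of
# fully-connected layer spec many times; this helper only assembles dicts in
# the exact format used above and assumes nothing further about HydraNet.
def fc_stack(dims, activation, apply_bn=True):
    """Build fc layer specs for consecutive (in, out) dimension pairs."""
    return [{'layer_type': 'fc',
             'in_chans': d_in,
             'out_chans': d_out,
             'activation': activation,
             'apply_bn': apply_bn}
            for d_in, d_out in zip(dims[:-1], dims[1:])]

# e.g. the p_h_given_z trunk above is fc_stack([z_dim, 100, 100], tanh_actfun),
# and each of its two output heads is fc_stack([100, h_dim], tanh_actfun, apply_bn=False).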
out_file.write("{}\n".format(o_str_su)) if ((i % 2000) == 0): # check classification error on training and validation set train_err = GIS.classification_error(Xtr_su, Ytr_su) va_err = GIS.classification_error(Xva, Yva) o_str = " tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 5000) == 0): file_name = "GIS_SAMPLES_b{0:d}.png".format(i) tr_idx = npr.randint(low=0,high=un_samples,size=(5,)) va_idx = npr.randint(low=0,high=va_samples,size=(5,)) Xd_samps = np.vstack([Xtr_un[tr_idx,:], Xva[va_idx,:]]) Xd_samps = np.repeat(Xd_samps, 3, axis=0) sample_lists = GIS.sample_gis_from_data(Xd_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) Ys = GIS.class_probs(Xs) Xs = mnist_prob_embed(Xs, Ys) utils.visualize_samples(Xs, file_name, num_rows=20) out_file.close() print("TESTING COMPLETE!") ############## # EYE BUFFER # ##############
outputs = GIP.train_joint(Xd_batch, Xc_batch, Xm_batch) nll_data = 1.0 * outputs[1] kld_data = 1.0 * outputs[2] if ((i + 1) % 5000 == 0): print(" nll_data: {0:.4f}, kld_data: {1:.4f}".format( \ nll_data, kld_data)) if ((i+1) % 100000 == 0): learn_rate = learn_rate * 0.8 if (i % 1000 == 0): print("batch: {0:d}, mom_match_cost: {1:.4f}, disc_dn: {2:.6f}, disc_gn: {3:.6f}, nll: {4:.4f}, kld: {5:.4f}".format( \ i, mom_match_cost, disc_dn, disc_gn, nll_cost, kld_cost)) if (i % 5000 == 0): # draw independent samples from generative model's prior file_name = "VCG_SAMPLES_b{0:d}.png".format(i) Xs = VCG.sample_from_gn(200) utils.visualize_samples(Xs, file_name) file_name = "VCG_WEIGHTS_b{0:d}.png".format(i) utils.visualize(VCG.DN, 0, 0, file_name) # draw "markov chain" samples initiated from training data file_name = "VCG_SAMPLES_a{0:d}.png".format(i) Xd_samps = np.repeat(Xd_batch[0:10,:], 3, axis=0) sample_lists = GIP.sample_gil_from_data(Xd_samps, loop_iters=10) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name) print("TESTING COMPLETE!") ############## # EYE BUFFER #
def test_gip_sigma_scale_tfd(): from LogPDFs import cross_validate_sigma # Simple test code, to check that everything is basically functional. print("TESTING...") # Initialize a source of randomness rng = np.random.RandomState(12345) # Load some data to train/validate/test with data_file = "data/tfd_data_48x48.pkl" dataset = load_tfd(tfd_pkl_name=data_file, which_set="unlabeled", fold="all") Xtr_unlabeled = dataset[0] dataset = load_tfd(tfd_pkl_name=data_file, which_set="train", fold="all") Xtr_train = dataset[0] Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) dataset = load_tfd(tfd_pkl_name=data_file, which_set="test", fold="all") Xva = dataset[0] tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape))) # get and set some basic dataset information tr_samples = Xtr.shape[0] data_dim = Xtr.shape[1] batch_size = 100 # Symbolic inputs Xd = T.matrix(name="Xd") Xc = T.matrix(name="Xc") Xm = T.matrix(name="Xm") Xt = T.matrix(name="Xt") # Load inferencer and generator from saved parameters gn_fname = "TFD_WALKOUT_TEST_KLD/pt_walk_params_b25000_GN.pkl" in_fname = "TFD_WALKOUT_TEST_KLD/pt_walk_params_b25000_IN.pkl" IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd) x_dim = IN.shared_layers[0].in_dim z_dim = IN.mu_layers[-1].out_dim # construct a GIPair with the loaded InfNet and GenNet osm_params = {} osm_params["x_type"] = "gaussian" osm_params["xt_transform"] = "sigmoid" osm_params["logvar_bound"] = LOGVAR_BOUND OSM = OneStageModel( rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, p_x_given_z=GN, q_z_given_x=IN, x_dim=x_dim, z_dim=z_dim, params=osm_params ) # # compute variational likelihood bound and its sub-components Xva = row_shuffle(Xva) Xb = Xva[0:5000] # file_name = "A_TFD_POST_KLDS.png" # post_klds = OSM.compute_post_klds(Xb) # post_dim_klds = np.mean(post_klds, axis=0) # utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ # file_name) # compute information about free-energy on validation set file_name = "A_TFD_KLD_FREE_ENERGY.png" fe_terms = OSM.compute_fe_terms(Xb, 20) utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, x_label="Posterior KLd", y_label="Negative Log-likelihood") # bound_results = OSM.compute_ll_bound(Xva) # ll_bounds = bound_results[0] # post_klds = bound_results[1] # log_likelihoods = bound_results[2] # max_lls = bound_results[3] # print("mean ll bound: {0:.4f}".format(np.mean(ll_bounds))) # print("mean posterior KLd: {0:.4f}".format(np.mean(post_klds))) # print("mean log-likelihood: {0:.4f}".format(np.mean(log_likelihoods))) # print("mean max log-likelihood: {0:.4f}".format(np.mean(max_lls))) # print("min ll bound: {0:.4f}".format(np.min(ll_bounds))) # print("max posterior KLd: {0:.4f}".format(np.max(post_klds))) # print("min log-likelihood: {0:.4f}".format(np.min(log_likelihoods))) # print("min max log-likelihood: {0:.4f}".format(np.min(max_lls))) # # compute some information about the approximate posteriors # post_stats = OSM.compute_post_stats(Xva, 0.0*Xva, 0.0*Xva) # all_post_klds = np.sort(post_stats[0].ravel()) # post KLds for each obs and dim # obs_post_klds = np.sort(post_stats[1]) # summed post KLds for each obs # post_dim_klds = post_stats[2] # average post KLds for each post dim # post_dim_vars = post_stats[3] # average squared mean for each post dim # utils.plot_line(np.arange(all_post_klds.shape[0]), all_post_klds, "AAA_ALL_POST_KLDS.png") # utils.plot_line(np.arange(obs_post_klds.shape[0]), obs_post_klds, 
"AAA_OBS_POST_KLDS.png") # utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, "AAA_POST_DIM_KLDS.png") # utils.plot_stem(np.arange(post_dim_vars.shape[0]), post_dim_vars, "AAA_POST_DIM_VARS.png") # draw many samples from the GIP for i in range(5): tr_idx = npr.randint(low=0, high=tr_samples, size=(100,)) Xd_batch = Xtr.take(tr_idx, axis=0) Xs = [] for row in range(3): Xs.append([]) for col in range(3): sample_lists = OSM.sample_from_chain(Xd_batch[0:10, :], loop_iters=100, sigma_scale=1.0) Xs[row].append(group_chains(sample_lists["data samples"])) Xs, block_im_dim = block_video(Xs, (48, 48), (3, 3)) to_video(Xs, block_im_dim, "A_TFD_KLD_CHAIN_VIDEO_{0:d}.avi".format(i), frame_rate=10) # sample_lists = GIP.sample_from_chain(Xd_batch[0,:].reshape((1,data_dim)), loop_iters=300, \ # sigma_scale=1.0) # Xs = np.vstack(sample_lists["data samples"]) # file_name = "TFD_TEST_{0:d}.png".format(i) # utils.visualize_samples(Xs, file_name, num_rows=15) file_name = "A_TFD_KLD_PRIOR_SAMPLE.png" Xs = OSM.sample_from_prior(20 * 20) utils.visualize_samples(Xs, file_name, num_rows=20) # test Parzen density estimator built from prior samples # Xs = OSM.sample_from_prior(10000) # [best_sigma, best_ll, best_lls] = \ # cross_validate_sigma(Xs, Xva, [0.09, 0.095, 0.1, 0.105, 0.11], 10) # sort_idx = np.argsort(best_lls) # sort_idx = sort_idx[0:400] # utils.plot_line(np.arange(sort_idx.shape[0]), best_lls[sort_idx], "A_TFD_BEST_LLS_1.png") # utils.visualize_samples(Xva[sort_idx], "A_TFD_BAD_FACES_1.png", num_rows=20) return
def test_svhn(occ_dim=15, drop_prob=0.0): RESULT_PATH = "IMP_SVHN_VAE/" ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}VAE_OD{}_DP{}".format(RESULT_PATH, occ_dim, dp_int) ########################## # Get some training data # ########################## tr_file = 'data/svhn_train_gray.pkl' te_file = 'data/svhn_test_gray.pkl' ex_file = 'data/svhn_extra_gray.pkl' data = load_svhn_gray(tr_file, te_file, ex_file=ex_file, ex_count=200000) Xtr = to_fX( shift_and_scale_into_01(np.vstack([data['Xtr'], data['Xex']])) ) Xva = to_fX( shift_and_scale_into_01(data['Xte']) ) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 250 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) ) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 100 imp_steps = 15 # we'll check for the best step count (found oracularly) init_scale = 1.0 x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ################# # p_zi_given_xi # ################# params = {} shared_config = [obs_dim, 1000, 1000] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.2) ################### # p_xip1_given_zi # ################### params = {} shared_config = [z_dim, 1000, 1000] output_config = [obs_dim, obs_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_xip1_given_zi.init_biases(0.2) ################### # q_zi_given_x_xi # ################### params = {} shared_config = [(obs_dim + obs_dim), 1000, 1000] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_zi_given_x_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_x_xi.init_biases(0.2) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['obs_dim'] = obs_dim gpsi_params['z_dim'] = z_dim gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = 'jump' gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 'sigmoid' 
gpsi_params['use_osm_mode'] = True GPSI = GPSImputer(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \ p_zi_given_xi=p_zi_given_xi, \ p_xip1_given_zi=p_xip1_given_zi, \ q_zi_given_x_xi=q_zi_given_x_xi, \ params=gpsi_params, \ shared_param_dicts=None) ######################################################################### # Define parameters for the underlying OneStageModel, and initialize it # ######################################################################### print("Building the OneStageModel...") osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' OSM = OneStageModel(rng=rng, \ x_in=x_in_sym, \ p_x_given_z=p_xip1_given_zi, \ q_z_given_x=p_zi_given_xi, \ x_dim=obs_dim, z_dim=z_dim, \ params=osm_params) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(200005): scale = min(1.0, ((i+1) / 5000.0)) if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.92 if (i > 10000): momentum = 0.90 else: momentum = 0.50 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update OSM.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.99) OSM.set_lam_nll(lam_nll=1.0) OSM.set_lam_kld(lam_kld_1=1.0, lam_kld_2=0.0) OSM.set_lam_l2w(1e-4) # perform a minibatch update and record the cost for this batch xb = to_fX( Xtr.take(batch_idx, axis=0) ) result = OSM.train_joint(xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) joint_str = "\n".join([str1, str2, str3, str4, str5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) step_nll, step_kld = GPSI.compute_per_step_cost(xi, xo, xm, sample_count=10) min_nll = np.min(step_nll) str1 = " va_nll_bound : {}".format(min_nll) str2 = " va_nll_min : {}".format(min_nll) str3 = " va_nll_final : {}".format(step_nll[-1]) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() if ((i % 10000) == 0): # Get some validation samples for evaluating model performance xb = to_fX( Xva[0:100] ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in 
range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{}_samples_ng_b{}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # get visualizations of policy parameters file_name = "{}_gen_gen_weights_b{}.png".format(result_tag, i) W = GPSI.gen_gen_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "{}_gen_inf_weights_b{}.png".format(result_tag, i) W = GPSI.gen_inf_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
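# NOTE: a minimal sketch of the shift_and_scale_into_01 preprocessing applied
# to the SVHN arrays above, assuming it simply maps the data's global min/max
# onto [0, 1]; the exact behavior of the helper in this repo is an assumption.
import numpy as np

def shift_and_scale_into_01_sketch(X):
    X = np.asarray(X, dtype=np.float64)
    x_min, x_max = X.min(), X.max()
    return (X - x_min) / max(x_max - x_min, 1e-8)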
def pretrain_osm(lam_kld=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0] Xtr = Xtr.get_value(borrow=False) Xva = datasets[2][0] Xva = Xva.get_value(borrow=False) print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape))) # get and set some basic dataset information Xtr_mean = np.mean(Xtr, axis=0) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # setup some symbolic variables and stuff Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} shared_config = [PRIOR_DIM, 1000, 1000] top_config = [shared_config[-1], data_dim] gn_params['shared_config'] = shared_config gn_params['mu_config'] = top_config gn_params['sigma_config'] = top_config gn_params['activation'] = relu_actfun gn_params['init_scale'] = 1.4 gn_params['lam_l2a'] = 0.0 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.0 gn_params['input_noise'] = 0.0 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 1000, 1000] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.4 in_params['lam_l2a'] = 0.0 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.0 in_params['input_noise'] = 0.0 # Initialize the base networks for this OneStageModel IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.2) GN.init_biases(0.2) ######################### # INITIALIZE THE GIPAIR # ######################### osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params) OSM.set_lam_l2w(1e-5) safe_mean = (0.9 * Xtr_mean) + 0.05 safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean)) OSM.set_output_bias(safe_mean_logit) OSM.set_input_bias(-Xtr_mean) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH + "pt_osm_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters obs_costs = np.zeros((batch_size, )) costs = [0. 
for i in range(10)] learn_rate = 0.0005 for i in range(150000): scale = min(1.0, float(i) / 10000.0) if ((i > 1) and ((i % 20000) == 0)): learn_rate = learn_rate * 0.9 # do a minibatch update of the model, and compute some costs tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, )) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs OSM.set_sgd_params(lr_1=(scale * learn_rate), mom_1=0.5, mom_2=0.98) OSM.set_lam_nll(1.0) OSM.set_lam_kld(lam_kld_1=(1.0 + (scale * (lam_kld - 1.0))), lam_kld_2=0.0) result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 1000) == 0): # record and then reset the cost trackers costs = [(v / 1000.0) for v in costs] str_1 = "-- batch {0:d} --".format(i) str_2 = " joint_cost: {0:.4f}".format(costs[0]) str_3 = " nll_cost : {0:.4f}".format(costs[1]) str_4 = " kld_cost : {0:.4f}".format(costs[2]) str_5 = " reg_cost : {0:.4f}".format(costs[3]) costs = [0.0 for v in costs] # print out some diagnostic information joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() if ((i % 2000) == 0): Xva = row_shuffle(Xva) model_samps = OSM.sample_from_prior(500) file_name = RESULT_PATH + "pt_osm_samples_b{0:d}_XG.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=20) # compute information about free-energy on validation set file_name = RESULT_PATH + "pt_osm_free_energy_b{0:d}.png".format(i) fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) fe_str = " nll_bound : {0:.4f}".format(fe_mean) print(fe_str) out_file.write(fe_str + "\n") utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # compute information about posterior KLds on validation set file_name = RESULT_PATH + "pt_osm_post_klds_b{0:d}.png".format(i) post_klds = OSM.compute_post_klds(Xva[0:2500]) post_dim_klds = np.mean(post_klds, axis=0) utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ file_name) if ((i % 5000) == 0): IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_GN.pkl") return
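# NOTE: row_shuffle is called throughout this file at epoch boundaries; this
# numpy sketch shows the assumed behavior (shuffle one array, or two arrays
# with the same permutation). The repo's actual helper may differ in detail.
import numpy as np

def row_shuffle_sketch(X, Y=None, rng=None):
    rng = np.random.RandomState() if rng is None else rng
    perm = rng.permutation(X.shape[0])
    if Y is None:
        return X[perm]
    return X[perm], Y[perm]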
def test_mnist(step_type='add', imp_steps=6, occ_dim=15, drop_prob=0.0): ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}GPSI_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, imp_steps, step_type) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') Xtr = np.vstack((Xtr, Xva)) Xva = Xte #del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] ########################## # Get some training data # ########################## # rng = np.random.RandomState(1234) # dataset = 'data/mnist.pkl.gz' # datasets = load_udm(dataset, as_shared=False, zero_mean=False) # Xtr = datasets[0][0] # Xva = datasets[1][0] # Xte = datasets[2][0] # # Merge validation set and training set, and test on test set. # #Xtr = np.concatenate((Xtr, Xva), axis=0) # #Xva = Xte # Xtr = to_fX(shift_and_scale_into_01(Xtr)) # Xva = to_fX(shift_and_scale_into_01(Xva)) # tr_samples = Xtr.shape[0] # va_samples = Xva.shape[0] batch_size = 200 batch_reps = 1 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) ) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] s_dim = x_dim h_dim = 50 z_dim = 100 init_scale = 0.6 x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ############### # p_h_given_x # ############### params = {} shared_config = [x_dim, 250] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = 'xg' #init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_h_given_x = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_h_given_x.init_biases(0.0) ################ # p_s0_given_h # ################ params = {} shared_config = [h_dim, 250] output_config = [s_dim, s_dim, s_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = 'xg' #init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_given_h = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_s0_given_h.init_biases(0.0) ################# # p_zi_given_xi # ################# params = {} shared_config = [(x_dim + x_dim), 500, 500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.0) ################### # p_sip1_given_zi # ################### params = {} shared_config = [z_dim, 500, 500] output_config = [s_dim, s_dim, s_dim] 
params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_sip1_given_zi.init_biases(0.0) ################ # p_x_given_si # ################ params = {} shared_config = [s_dim] output_config = [x_dim, x_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_x_given_si = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_x_given_si.init_biases(0.0) ############### # q_h_given_x # ############### params = {} shared_config = [x_dim, 250] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = 'xg' #init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_h_given_x = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_h_given_x.init_biases(0.0) ################# # q_zi_given_xi # ################# params = {} shared_config = [(x_dim + x_dim), 500, 500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_xi.init_biases(0.0) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['x_dim'] = x_dim gpsi_params['h_dim'] = h_dim gpsi_params['z_dim'] = z_dim gpsi_params['s_dim'] = s_dim # switch between direct construction and construction via p_x_given_si gpsi_params['use_p_x_given_si'] = False gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = step_type gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 'sigmoid' GPSI = GPSImputerWI(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \ p_h_given_x=p_h_given_x, \ p_s0_given_h=p_s0_given_h, \ p_zi_given_xi=p_zi_given_xi, \ p_sip1_given_zi=p_sip1_given_zi, \ p_x_given_si=p_x_given_si, \ q_h_given_x=q_h_given_x, \ q_zi_given_xi=q_zi_given_xi, \ params=gpsi_params, \ shared_param_dicts=None) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. 
for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 5000.0)) lam_scale = 1.0 - min(1.0, ((i+1) / 100000.0)) # decays from 1.0->0.0 if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.93 if (i > 10000): momentum = 0.90 else: momentum = 0.75 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update GPSI.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.98) GPSI.set_train_switch(1.0) GPSI.set_lam_nll(lam_nll=1.0) GPSI.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, \ lam_kld_g=(0.1 * lam_scale), lam_kld_s=(0.1 * lam_scale)) GPSI.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch xb = to_fX( Xtr.take(batch_idx, axis=0) ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) result = GPSI.train_joint(xi, xo, xm, batch_reps) # do diagnostics and general training tracking costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10) vfe = np.mean(nll) + np.mean(kld) str1 = " va_nll_bound : {}".format(vfe) str2 = " va_nll_term : {}".format(np.mean(nll)) str3 = " va_kld_q2p : {}".format(np.mean(kld)) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() if ((i % 2000) == 0): GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag)) # Get some validation samples for evaluating model performance xb = to_fX( Xva[0:100] ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # show KLds and NLLs on a step-by-step basis xb = to_fX( Xva[0:1000] ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) step_costs = GPSI.compute_per_step_cost(xi, xo, xm) step_nlls = step_costs[0] step_klds = step_costs[1] step_nums = np.arange(step_nlls.shape[0]) file_name = "{0:s}_NLL_b{1:d}.png".format(result_tag, i) utils.plot_stem(step_nums, step_nlls, file_name) 
file_name = "{0:s}_KLD_b{1:d}.png".format(result_tag, i) utils.plot_stem(step_nums, step_klds, file_name)
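# NOTE: a numpy sketch of the construct_masked_data(xb, drop_prob, occ_dim,
# data_mean) helper used above; the return convention (masked input, clean
# target, visibility mask with 1 = observed) and the mean-fill rule are
# assumptions based on how its outputs are consumed in these tests.
import numpy as np

def construct_masked_data_sketch(xb, drop_prob=0.0, occ_dim=0, data_mean=None,
                                 im_shape=(28, 28), rng=None):
    rng = np.random.RandomState() if rng is None else rng
    data_mean = np.zeros((xb.shape[1],)) if data_mean is None else data_mean
    rows, cols = im_shape
    xm = (rng.rand(*xb.shape) > drop_prob).astype(xb.dtype)  # drop pixels at random
    if occ_dim > 0:
        for k in range(xb.shape[0]):
            r0 = rng.randint(0, rows - occ_dim + 1)
            c0 = rng.randint(0, cols - occ_dim + 1)
            m = xm[k].reshape(rows, cols)
            m[r0:r0 + occ_dim, c0:c0 + occ_dim] = 0.0         # hide a square patch
    xi = (xm * xb) + ((1.0 - xm) * data_mean)                 # fill hidden pixels
    return xi, xb.copy(), xm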
def pretrain_osm(lam_kld=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with data_file = 'data/tfd_data_48x48.pkl' dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all') Xtr_unlabeled = dataset[0] dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all') Xtr_train = dataset[0] Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all') Xva = dataset[0] tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 400 batch_reps = 6 carry_frac = 0.25 carry_size = int(batch_size * carry_frac) reset_prob = 0.04 # setup some symbolic variables and stuff Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 Xtr_mean = np.mean(Xtr, axis=0) ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} shared_config = [PRIOR_DIM, 1500, 1500] top_config = [shared_config[-1], data_dim] gn_params['shared_config'] = shared_config gn_params['mu_config'] = top_config gn_params['sigma_config'] = top_config gn_params['activation'] = relu_actfun gn_params['init_scale'] = 1.4 gn_params['lam_l2a'] = 0.0 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.0 gn_params['input_noise'] = 0.0 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 1500, 1500] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.4 in_params['lam_l2a'] = 0.0 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.0 in_params['input_noise'] = 0.0 # Initialize the base networks for this OneStageModel IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.2) GN.init_biases(0.2) ###################################### # LOAD AND RESTART FROM SAVED PARAMS # ###################################### # gn_fname = RESULT_PATH+"pt_osm_params_b110000_GN.pkl" # in_fname = RESULT_PATH+"pt_osm_params_b110000_IN.pkl" # IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd, \ # new_params=None) # GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd, \ # new_params=None) # in_params = IN.params # gn_params = GN.params ######################### # INITIALIZE THE GIPAIR # ######################### osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params) OSM.set_lam_l2w(1e-5) safe_mean = (0.9 * Xtr_mean) + 0.05 safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean)) OSM.set_output_bias(safe_mean_logit) OSM.set_input_bias(-Xtr_mean) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH + "pt_osm_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters obs_costs = np.zeros((batch_size, )) costs = [0. 
for i in range(10)] learn_rate = 0.002 for i in range(200000): scale = min(1.0, float(i) / 5000.0) if ((i > 1) and ((i % 20000) == 0)): learn_rate = learn_rate * 0.8 if (i < 50000): momentum = 0.5 elif (i < 10000): momentum = 0.7 else: momentum = 0.9 if ((i == 0) or (npr.rand() < reset_prob)): # sample a fully random batch batch_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, )) else: # sample a partially random batch, which retains some portion of # the worst scoring examples from the previous batch fresh_idx = npr.randint(low=0, high=tr_samples, size=(batch_size - carry_size, )) batch_idx = np.concatenate((fresh_idx.ravel(), carry_idx.ravel())) # do a minibatch update of the model, and compute some costs tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, )) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs OSM.set_sgd_params(lr_1=(scale*learn_rate), \ mom_1=(scale*momentum), mom_2=0.98) OSM.set_lam_nll(1.0) OSM.set_lam_kld(lam_kld_1=scale * lam_kld, lam_kld_2=0.0, lam_kld_c=50.0) result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps) batch_costs = result[4] + result[5] obs_costs = collect_obs_costs(batch_costs, batch_reps) carry_idx = batch_idx[np.argsort(-obs_costs)[0:carry_size]] costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 1000) == 0): # record and then reset the cost trackers costs = [(v / 1000.0) for v in costs] str_1 = "-- batch {0:d} --".format(i) str_2 = " joint_cost: {0:.4f}".format(costs[0]) str_3 = " nll_cost : {0:.4f}".format(costs[1]) str_4 = " kld_cost : {0:.4f}".format(costs[2]) str_5 = " reg_cost : {0:.4f}".format(costs[3]) costs = [0.0 for v in costs] # print out some diagnostic information joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() if ((i % 2000) == 0): Xva = row_shuffle(Xva) model_samps = OSM.sample_from_prior(500) file_name = RESULT_PATH + "pt_osm_samples_b{0:d}_XG.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=20) file_name = RESULT_PATH + "pt_osm_inf_weights_b{0:d}.png".format(i) utils.visualize_samples(OSM.inf_weights.get_value(borrow=False).T, \ file_name, num_rows=30) file_name = RESULT_PATH + "pt_osm_gen_weights_b{0:d}.png".format(i) utils.visualize_samples(OSM.gen_weights.get_value(borrow=False), \ file_name, num_rows=30) # compute information about free-energy on validation set file_name = RESULT_PATH + "pt_osm_free_energy_b{0:d}.png".format(i) fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) fe_str = " nll_bound : {0:.4f}".format(fe_mean) print(fe_str) out_file.write(fe_str + "\n") utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # compute information about posterior KLds on validation set file_name = RESULT_PATH + "pt_osm_post_klds_b{0:d}.png".format(i) post_klds = OSM.compute_post_klds(Xva[0:2500]) post_dim_klds = np.mean(post_klds, axis=0) utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ file_name) if ((i % 5000) == 0): IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_GN.pkl") return
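# NOTE: the loop above occasionally re-uses the hardest examples from the
# previous minibatch ("carry" indices chosen by per-example cost); this is a
# compact numpy sketch of that selection step, assuming obs_costs holds one
# scalar cost per example of the current batch.
import numpy as np

def select_carry_idx(batch_idx, obs_costs, carry_size):
    """Return dataset indices of the carry_size highest-cost examples."""
    worst_first = np.argsort(-np.asarray(obs_costs))
    return np.asarray(batch_idx)[worst_first[:carry_size]]

# the next batch then mixes fresh random indices with these carried ones, e.g.
# np.concatenate((fresh_idx.ravel(), carry_idx.ravel()))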
def check_tfd_walkout(): # DERPA DERPA DOO KLD_PATH = "TFD_WALKOUT_TEST_KLD/" VAE_PATH = "TFD_WALKOUT_TEST_VAE/" RESULT_PATH = "TFD_WALKOUT_RESULTS/" # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with data_file = 'data/tfd_data_48x48.pkl' dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all') Xtr_unlabeled = dataset[0] dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all') Xtr_train = dataset[0] Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all') Xva = dataset[0] tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # Construct a GenNet and an InfNet, then test constructor for GIPair. # Do basic testing, to make sure classes aren't completely broken. Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 p_vals_kld, v_vals_kld, p_vals_vae, v_vals_vae = [], [], [], [] kl_vals_kld, ll_vals_kld, kl_vals_vae, ll_vals_vae = [], [], [], [] ######################################################## # CHECK MODEL BEHAVIOR AT DIFFERENT STAGES OF TRAINING # ######################################################## for i in range(10000,200000): if ((i % 10000) == 0): if (i <= 150000): net_type = 'gip' b = i else: net_type = 'walk' b = i - 150000 ############################################################# # Process the GIPair trained with strong KLd regularization # ############################################################# gn_fname = KLD_PATH + "pt_{0:s}_params_b{1:d}_GN.pkl".format(net_type, b) in_fname = KLD_PATH + "pt_{0:s}_params_b{1:d}_IN.pkl".format(net_type, b) IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) IN.set_sigma_scale(1.0) prior_dim = GN.latent_dim post_klds_kld = posterior_klds(IN, Xtr, 5000, 5) # Initialize the GIPair GIP_KLD = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, params=None) GIP_KLD.set_lam_l2w(1e-4) GIP_KLD.set_lam_nll(1.0) GIP_KLD.set_lam_kld(1.0) # draw samples freely from the generative model's prior Xs = GIP_KLD.sample_from_prior(20*20) file_name = RESULT_PATH + "prior_samples_b{0:d}_kld.png".format(i) utils.visualize_samples(Xs, file_name, num_rows=20) # test Parzen density estimator built from prior samples Xs = GIP_KLD.sample_from_prior(10000, sigma=1.0) parzen_vals_kld = cross_validate_sigma(Xs, Xva, [0.08, 0.09, 0.1, 0.11, 0.12, 0.15, 0.2], 20) # get variational bound info var_vals_kld = GIP_KLD.compute_ll_bound(Xva) # record info about variational and parzen bounds p_vals_kld.append(parzen_vals_kld[1]) v_vals_kld.append(np.mean(var_vals_kld[0])) ################################################################ # Process the GIPair trained with basic VAE KLd regularization # ################################################################ gn_fname = VAE_PATH + "pt_{0:s}_params_b{1:d}_GN.pkl".format(net_type, b) in_fname = VAE_PATH + "pt_{0:s}_params_b{1:d}_IN.pkl".format(net_type, b) IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) IN.set_sigma_scale(1.0) prior_dim = GN.latent_dim post_klds_vae = posterior_klds(IN, Xtr, 5000, 5) # Initialize the GIPair GIP_VAE = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, 
params=None) GIP_VAE.set_lam_l2w(1e-4) GIP_VAE.set_lam_nll(1.0) GIP_VAE.set_lam_kld(1.0) # draw samples freely from the generative model's prior Xs = GIP_VAE.sample_from_prior(20*20) file_name = RESULT_PATH + "prior_samples_b{0:d}_vae.png".format(i) utils.visualize_samples(Xs, file_name, num_rows=20) # test Parzen density estimator built from prior samples Xs = GIP_VAE.sample_from_prior(10000, sigma=1.0) parzen_vals_vae = cross_validate_sigma(Xs, Xva, [0.08, 0.09, 0.1, 0.11, 0.12, 0.15, 0.2], 20) # get variational bound info var_vals_vae = GIP_VAE.compute_ll_bound(Xva) # record info about variational and parzen bounds p_vals_vae.append(parzen_vals_vae[1]) v_vals_vae.append(np.mean(var_vals_vae[0])) ######################## # Plot posterior KLds. # ######################## file_name = RESULT_PATH + "post_klds_b{0:d}.pdf".format(i) draw_posterior_kld_hist( \ np.asarray(post_klds_kld), np.asarray(post_klds_vae), file_name, bins=30) if i in [20000, 50000, 80000, 110000, 150000, 190000]: # select random random indices into the validation set va_idx = npr.randint(0,high=va_samples,size=(150,)) # record information about their current variational bounds kl_vals_kld.extend([v for v in var_vals_kld[1][va_idx]]) ll_vals_kld.extend([v for v in var_vals_kld[2][va_idx]]) kl_vals_vae.extend([v for v in var_vals_vae[1][va_idx]]) ll_vals_vae.extend([v for v in var_vals_vae[2][va_idx]]) # do some plotting s1_name = RESULT_PATH + "parzen_vs_variational.pdf" s2_name = RESULT_PATH + "kld_vs_likelihood.pdf" draw_parzen_vs_variational_scatter(p_vals_kld, v_vals_kld, \ p_vals_vae, v_vals_vae, f_name=s1_name) draw_kld_vs_likelihood_scatter(kl_vals_kld, ll_vals_kld, \ kl_vals_vae, ll_vals_vae, f_name=s2_name) return
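# ---------------------------------------------------------------------------
# check_tfd_walkout scores prior samples with cross_validate_sigma, which
# (as used here) appears to fit an isotropic Gaussian Parzen-window density
# to the samples and pick the bandwidth with the best held-out log-likelihood.
# Below is a rough, unbatched numpy sketch of that estimator; it is an
# assumed reading of cross_validate_sigma, not its actual implementation.
# ---------------------------------------------------------------------------
import numpy as np

def parzen_log_likelihood(samples, data, sigma):
    # samples: (n, d) points defining the Parzen mixture; data: (m, d) points
    # to score. For large n, m this should be batched over `data`.
    n, d = samples.shape
    diffs = data[:, None, :] - samples[None, :, :]            # (m, n, d)
    sq = np.sum(diffs ** 2, axis=2) / (2.0 * sigma ** 2)      # (m, n)
    log_norm = np.log(n) + 0.5 * d * np.log(2.0 * np.pi * sigma ** 2)
    # stable log-sum-exp over the n mixture components
    mx = np.max(-sq, axis=1, keepdims=True)
    return mx[:, 0] + np.log(np.sum(np.exp(-sq - mx), axis=1)) - log_norm

def pick_parzen_sigma(samples, data, sigmas):
    mean_lls = [np.mean(parzen_log_likelihood(samples, data, s)) for s in sigmas]
    best = int(np.argmax(mean_lls))
    return sigmas[best], mean_lls[best], mean_lls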
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 batch_reps = 1 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 20 h_dim = 200 ir_steps = 6 init_scale = 1.0 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') ################# # p_hi_given_si # ################# params = {} shared_config = [obs_dim, 300, 300] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [h_dim, 300, 300] output_config = [obs_dim, obs_dim, obs_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ################ # p_s0_given_z # ################ params = {} shared_config = [z_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_given_z = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_s0_given_z.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.2) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim + obs_dim), 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 
params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModel(rng=rng, x_in=x_in_sym, x_out=x_out_sym, \ p_s0_given_z=p_s0_given_z, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \ ir_steps=ir_steps, params=msm_params) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ out_file = open("MSM_A_RESULTS.txt", 'wb') costs = [0. for i in range(10)] learn_rate = 0.0003 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 3000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.99) MSM.set_train_switch(1.0) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.8, lam_kld_p2q=0.2) MSM.set_lam_kld_l1l2(lam_kld_l1l2=1.0) MSM.set_lam_l2w(1e-4) MSM.set_drop_rate(0.0) MSM.q_hi_given_x_si.set_bias_noise(0.0) MSM.p_hi_given_si.set_bias_noise(0.0) MSM.p_sip1_given_si_hi.set_bias_noise(0.0) # perform a minibatch update and record the cost for this batch Xb_tr = to_fX( Xtr.take(batch_idx, axis=0) ) result = MSM.train_joint(Xb_tr, Xb_tr, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) joint_str = "\n".join([str1, str2, str3, str4, str5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): MSM.set_drop_rate(0.0) MSM.q_hi_given_x_si.set_bias_noise(0.0) MSM.p_hi_given_si.set_bias_noise(0.0) MSM.p_sip1_given_si_hi.set_bias_noise(0.0) # Get some validation samples for computing diagnostics Xva = row_shuffle(Xva) Xb_va = to_fX( Xva[0:2000] ) # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSM_A_SAMPLES_IND_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # draw some conditional random samples from the model samp_count = 200 Xs = np.vstack((Xb_tr[0:(samp_count/4)], Xb_va[0:(samp_count/4)])) 
Xs = np.repeat(Xs, 2, axis=0) # draw some conditional random samples from the model model_samps = MSM.sample_from_input(Xs, guided_decoding=False) model_samps.append(Xs) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSM_A_SAMPLES_CND_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # compute information about posterior KLds on validation set raw_klds = MSM.compute_raw_klds(Xb_va, Xb_va) init_kld, q2p_kld, p2q_kld = raw_klds file_name = "MSM_A_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(init_kld.shape[1]), \ np.mean(init_kld, axis=0), file_name) file_name = "MSM_A_HI_Q2P_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(q2p_kld.shape[1]), \ np.mean(q2p_kld, axis=0), file_name) file_name = "MSM_A_HI_P2Q_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(p2q_kld.shape[1]), \ np.mean(p2q_kld, axis=0), file_name) Xb_tr = to_fX( Xtr[0:2000] ) fe_terms = MSM.compute_fe_terms(Xb_tr, Xb_tr, 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() fe_terms = MSM.compute_fe_terms(Xb_va, Xb_va, 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush()
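# ---------------------------------------------------------------------------
# The nested s1/s2 loops above (and in several other functions in this file)
# flatten a list of per-step sample arrays into one image grid, ordered
# sample-major and step-minor. The small helper below is an equivalent
# vectorized form, included only to make that ordering explicit.
# ---------------------------------------------------------------------------
import numpy as np

def interleave_step_samples(model_samps):
    # model_samps: list (length seq_len) of (samp_count, obs_dim) arrays
    arr = np.stack(model_samps, axis=0)           # (seq_len, samp_count, obs_dim)
    # row k*seq_len + t holds step t of sample k, matching the s1/s2 loops
    return arr.transpose(1, 0, 2).reshape(-1, arr.shape[2])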
def check_tfd_recon(): # DERPA DERPA DOO KLD_PATH = "TFD_WALKOUT_TEST_KLD/" VAE_PATH = "TFD_WALKOUT_TEST_VAE/" RESULT_PATH = "TFD_WALKOUT_RESULTS/" # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with data_file = 'data/tfd_data_48x48.pkl' dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all') Xtr_unlabeled = dataset[0] dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all') Xtr_train = dataset[0] Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all') Xva = dataset[0] Xtr_mean = np.mean(Xtr, axis=0, keepdims=True) Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr_mean) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # Construct a GenNet and an InfNet, then test constructor for GIPair. # Do basic testing, to make sure classes aren't completely broken. Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 ############################################################# # Process the GIPair trained with strong KLd regularization # ############################################################# gn_fname = KLD_PATH + "pt_recon_params_b180000_GN.pkl" in_fname = KLD_PATH + "pt_recon_params_b180000_IN.pkl" IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) IN.set_sigma_scale(1.0) prior_dim = GN.latent_dim # Initialize the GIPair GIP_KLD = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, params=None) ################################################################ # Process the GIPair trained with basic VAE KLd regularization # ################################################################ gn_fname = VAE_PATH + "pt_walk_params_b50000_GN.pkl" in_fname = VAE_PATH + "pt_walk_params_b50000_IN.pkl" IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) IN.set_sigma_scale(1.0) prior_dim = GN.latent_dim # Initialize the GIPair GIP_VAE = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, params=None) for trial in range(15): ######################################################### # DRAW THE SAMPLE OBSERVATIONS AND MASKS FOR THIS TRIAL # ######################################################### va_idx = npr.randint(low=0, high=Xva.shape[0], size=(15,)) Xc_batch = Xva.take(va_idx, axis=0) Xd_batch = np.repeat(Xtr_mean, Xc_batch.shape[0], axis=0).astype(theano.config.floatX) Xm_rand = sample_masks(Xc_batch, drop_prob=0.001) Xm_patch = sample_patch_masks(Xc_batch, (48,48), (25,25)) Xm_batch = Xm_rand * Xm_patch ##################################### # COMPARE SAMPLES IN A NORMAL CHAIN # ##################################### # draw some chains of samples from the VAE loop result_kld = GIP_KLD.sample_from_chain(Xc_batch, loop_iters=19) result_vae = GIP_VAE.sample_from_chain(Xc_batch, loop_iters=19) chain_samples_kld = [] chain_samples_vae = [] for i in range(len(result_kld['data samples'])): if (((i % 3) == 0) or (i == 1)): chain_samples_kld.append(result_kld['data samples'][i]) chain_samples_vae.append(result_vae['data samples'][i]) # interleave the chain samples for beauteous display chain_samples_both = [] for i in range(len(chain_samples_kld)): Xs_kld = chain_samples_kld[i] Xs_vae = 
chain_samples_vae[i] joint_samples = np.zeros((2*Xs_kld.shape[0], Xs_kld.shape[1])) for j in range(Xs_kld.shape[0]): joint_samples[2*j] = Xs_kld[j] joint_samples[2*j + 1] = Xs_vae[j] chain_samples_both.append(joint_samples) chain_len = len(chain_samples_both) Xs = np.vstack(chain_samples_both) file_name = RESULT_PATH + "FIG_CHAIN_{0:d}.png".format(trial) utils.visualize_samples(Xs, file_name, num_rows=chain_len) ############################################# # COMPARE SAMPLES IN A RECONSTRUCTION CHAIN # ############################################# # draw some chains of samples from the VAE loop result_kld = GIP_KLD.sample_from_chain(Xd_batch, X_c=Xc_batch, \ X_m=Xm_batch, loop_iters=10) result_vae = GIP_VAE.sample_from_chain(Xd_batch, X_c=Xc_batch, \ X_m=Xm_batch, loop_iters=10) recon_samples_kld = [] recon_samples_vae = [] for i in range(len(result_kld['data samples'])): if (((i % 2) == 0) or (i == 1)): recon_samples_kld.append(result_kld['data samples'][i]) recon_samples_vae.append(result_vae['data samples'][i]) # interleave the recon samples for beauteous display recon_samples_both = [] for i in range(len(recon_samples_kld)): Xs_kld = recon_samples_kld[i] Xs_vae = recon_samples_vae[i] joint_samples = np.zeros((2*Xs_kld.shape[0], Xs_kld.shape[1])) for j in range(Xs_kld.shape[0]): joint_samples[2*j] = Xs_kld[j] joint_samples[2*j + 1] = Xs_vae[j] recon_samples_both.append(joint_samples) recon_len = len(recon_samples_both) Xs = np.vstack(recon_samples_both) file_name = RESULT_PATH + "FIG_RECON_{0:d}.png".format(trial) utils.visualize_samples(Xs, file_name, num_rows=recon_len) return
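# ---------------------------------------------------------------------------
# The reconstruction chains above combine a sparse random-drop mask with a
# square occlusion patch (Xm_batch = Xm_rand * Xm_patch). The sketch below
# shows plausible numpy versions of those two mask samplers, assuming the
# convention that 1 marks a visible pixel and 0 a hidden one; the real
# sample_masks / sample_patch_masks helpers may differ in detail.
# ---------------------------------------------------------------------------
import numpy as np

def sample_drop_masks_demo(x, drop_prob, rng):
    # keep each pixel visible with probability (1 - drop_prob)
    return (rng.rand(*x.shape) > drop_prob).astype(x.dtype)

def sample_patch_masks_demo(x, im_shape, patch_shape, rng):
    rows, cols = im_shape
    p_rows, p_cols = patch_shape
    masks = np.ones((x.shape[0], rows, cols), dtype=x.dtype)
    for k in range(x.shape[0]):
        r = rng.randint(0, rows - p_rows + 1)
        c = rng.randint(0, cols - p_cols + 1)
        masks[k, r:r + p_rows, c:c + p_cols] = 0.0
    return masks.reshape(x.shape[0], rows * cols)

# combined mask, analogous to Xm_batch = Xm_rand * Xm_patch above:
# rng = np.random.RandomState(0)
# Xm = sample_drop_masks_demo(Xc, 0.001, rng) * \
#      sample_patch_masks_demo(Xc, (48, 48), (25, 25), rng)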
def pretrain_osm(lam_kld=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0] Xtr = Xtr.get_value(borrow=False) Xva = datasets[2][0] Xva = Xva.get_value(borrow=False) print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),str(Xva.shape))) # get and set some basic dataset information Xtr_mean = np.mean(Xtr, axis=0) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # setup some symbolic variables and stuff Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} shared_config = [PRIOR_DIM, 1000, 1000] top_config = [shared_config[-1], data_dim] gn_params['shared_config'] = shared_config gn_params['mu_config'] = top_config gn_params['sigma_config'] = top_config gn_params['activation'] = relu_actfun gn_params['init_scale'] = 1.4 gn_params['lam_l2a'] = 0.0 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.0 gn_params['input_noise'] = 0.0 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 1000, 1000] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.4 in_params['lam_l2a'] = 0.0 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.0 in_params['input_noise'] = 0.0 # Initialize the base networks for this OneStageModel IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.2) GN.init_biases(0.2) ######################### # INITIALIZE THE GIPAIR # ######################### osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params) OSM.set_lam_l2w(1e-5) safe_mean = (0.9 * Xtr_mean) + 0.05 safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean)) OSM.set_output_bias(safe_mean_logit) OSM.set_input_bias(-Xtr_mean) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH+"pt_osm_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters obs_costs = np.zeros((batch_size,)) costs = [0. 
for i in range(10)] learn_rate = 0.0005 for i in range(150000): scale = min(1.0, float(i) / 10000.0) if ((i > 1) and ((i % 20000) == 0)): learn_rate = learn_rate * 0.9 # do a minibatch update of the model, and compute some costs tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs OSM.set_sgd_params(lr_1=(scale*learn_rate), mom_1=0.5, mom_2=0.98) OSM.set_lam_nll(1.0) OSM.set_lam_kld(lam_kld_1=(1.0 + (scale*(lam_kld-1.0))), lam_kld_2=0.0) result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 1000) == 0): # record and then reset the cost trackers costs = [(v / 1000.0) for v in costs] str_1 = "-- batch {0:d} --".format(i) str_2 = " joint_cost: {0:.4f}".format(costs[0]) str_3 = " nll_cost : {0:.4f}".format(costs[1]) str_4 = " kld_cost : {0:.4f}".format(costs[2]) str_5 = " reg_cost : {0:.4f}".format(costs[3]) costs = [0.0 for v in costs] # print out some diagnostic information joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() if ((i % 2000) == 0): Xva = row_shuffle(Xva) model_samps = OSM.sample_from_prior(500) file_name = RESULT_PATH+"pt_osm_samples_b{0:d}_XG.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=20) # compute information about free-energy on validation set file_name = RESULT_PATH+"pt_osm_free_energy_b{0:d}.png".format(i) fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) fe_str = " nll_bound : {0:.4f}".format(fe_mean) print(fe_str) out_file.write(fe_str+"\n") utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # compute information about posterior KLds on validation set file_name = RESULT_PATH+"pt_osm_post_klds_b{0:d}.png".format(i) post_klds = OSM.compute_post_klds(Xva[0:2500]) post_dim_klds = np.mean(post_klds, axis=0) utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ file_name) if ((i % 5000) == 0): IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_GN.pkl") return
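# ---------------------------------------------------------------------------
# The KLd weight above is annealed from 1.0 toward the target lam_kld over
# the first 10000 updates via lam_kld_1 = 1 + scale*(lam_kld - 1). A tiny
# helper making that warm-up schedule explicit (illustrative only):
# ---------------------------------------------------------------------------
def kld_warmup(i, lam_kld, warmup_iters=10000.0):
    # linear warm-up from 1.0 toward lam_kld, saturating at warmup_iters
    scale = min(1.0, float(i) / warmup_iters)
    return 1.0 + scale * (lam_kld - 1.0)

# e.g. with lam_kld = 4.0:
#   kld_warmup(0, 4.0)     -> 1.0
#   kld_warmup(5000, 4.0)  -> 2.5
#   kld_warmup(20000, 4.0) -> 4.0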
def test_gip_sigma_scale_tfd(): from LogPDFs import cross_validate_sigma # Simple test code, to check that everything is basically functional. print("TESTING...") # Initialize a source of randomness rng = np.random.RandomState(12345) # Load some data to train/validate/test with data_file = 'data/tfd_data_48x48.pkl' dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all') Xtr_unlabeled = dataset[0] dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all') Xtr_train = dataset[0] Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) dataset = load_tfd(tfd_pkl_name=data_file, which_set='test', fold='all') Xva = dataset[0] tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape))) # get and set some basic dataset information tr_samples = Xtr.shape[0] data_dim = Xtr.shape[1] batch_size = 100 # Symbolic inputs Xd = T.matrix(name='Xd') Xc = T.matrix(name='Xc') Xm = T.matrix(name='Xm') Xt = T.matrix(name='Xt') # Load inferencer and generator from saved parameters gn_fname = "TFD_WALKOUT_TEST_KLD/pt_walk_params_b25000_GN.pkl" in_fname = "TFD_WALKOUT_TEST_KLD/pt_walk_params_b25000_IN.pkl" IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd) x_dim = IN.shared_layers[0].in_dim z_dim = IN.mu_layers[-1].out_dim # construct a GIPair with the loaded InfNet and GenNet osm_params = {} osm_params['x_type'] = 'gaussian' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=x_dim, z_dim=z_dim, params=osm_params) # # compute variational likelihood bound and its sub-components Xva = row_shuffle(Xva) Xb = Xva[0:5000] # file_name = "A_TFD_POST_KLDS.png" # post_klds = OSM.compute_post_klds(Xb) # post_dim_klds = np.mean(post_klds, axis=0) # utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ # file_name) # compute information about free-energy on validation set file_name = "A_TFD_KLD_FREE_ENERGY.png" fe_terms = OSM.compute_fe_terms(Xb, 20) utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # bound_results = OSM.compute_ll_bound(Xva) # ll_bounds = bound_results[0] # post_klds = bound_results[1] # log_likelihoods = bound_results[2] # max_lls = bound_results[3] # print("mean ll bound: {0:.4f}".format(np.mean(ll_bounds))) # print("mean posterior KLd: {0:.4f}".format(np.mean(post_klds))) # print("mean log-likelihood: {0:.4f}".format(np.mean(log_likelihoods))) # print("mean max log-likelihood: {0:.4f}".format(np.mean(max_lls))) # print("min ll bound: {0:.4f}".format(np.min(ll_bounds))) # print("max posterior KLd: {0:.4f}".format(np.max(post_klds))) # print("min log-likelihood: {0:.4f}".format(np.min(log_likelihoods))) # print("min max log-likelihood: {0:.4f}".format(np.min(max_lls))) # # compute some information about the approximate posteriors # post_stats = OSM.compute_post_stats(Xva, 0.0*Xva, 0.0*Xva) # all_post_klds = np.sort(post_stats[0].ravel()) # post KLds for each obs and dim # obs_post_klds = np.sort(post_stats[1]) # summed post KLds for each obs # post_dim_klds = post_stats[2] # average post KLds for each post dim # post_dim_vars = post_stats[3] # average squared mean for each post dim # utils.plot_line(np.arange(all_post_klds.shape[0]), all_post_klds, "AAA_ALL_POST_KLDS.png") # utils.plot_line(np.arange(obs_post_klds.shape[0]), obs_post_klds, 
"AAA_OBS_POST_KLDS.png") # utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, "AAA_POST_DIM_KLDS.png") # utils.plot_stem(np.arange(post_dim_vars.shape[0]), post_dim_vars, "AAA_POST_DIM_VARS.png") # draw many samples from the GIP for i in range(5): tr_idx = npr.randint(low=0, high=tr_samples, size=(100, )) Xd_batch = Xtr.take(tr_idx, axis=0) Xs = [] for row in range(3): Xs.append([]) for col in range(3): sample_lists = OSM.sample_from_chain(Xd_batch[0:10,:], loop_iters=100, \ sigma_scale=1.0) Xs[row].append(group_chains(sample_lists['data samples'])) Xs, block_im_dim = block_video(Xs, (48, 48), (3, 3)) to_video(Xs, block_im_dim, "A_TFD_KLD_CHAIN_VIDEO_{0:d}.avi".format(i), frame_rate=10) #sample_lists = GIP.sample_from_chain(Xd_batch[0,:].reshape((1,data_dim)), loop_iters=300, \ # sigma_scale=1.0) #Xs = np.vstack(sample_lists["data samples"]) #file_name = "TFD_TEST_{0:d}.png".format(i) #utils.visualize_samples(Xs, file_name, num_rows=15) file_name = "A_TFD_KLD_PRIOR_SAMPLE.png" Xs = OSM.sample_from_prior(20 * 20) utils.visualize_samples(Xs, file_name, num_rows=20) # test Parzen density estimator built from prior samples # Xs = OSM.sample_from_prior(10000) # [best_sigma, best_ll, best_lls] = \ # cross_validate_sigma(Xs, Xva, [0.09, 0.095, 0.1, 0.105, 0.11], 10) # sort_idx = np.argsort(best_lls) # sort_idx = sort_idx[0:400] # utils.plot_line(np.arange(sort_idx.shape[0]), best_lls[sort_idx], "A_TFD_BEST_LLS_1.png") # utils.visualize_samples(Xva[sort_idx], "A_TFD_BAD_FACES_1.png", num_rows=20) return
def test_mnist(step_type='add', imp_steps=6, occ_dim=15, drop_prob=0.0): ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}GPSI_conv_bn_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, imp_steps, step_type) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = datasets[0][0] Xva = datasets[1][0] Xte = datasets[2][0] # Merge validation set and training set, and test on test set. Xtr = np.concatenate((Xtr, Xva), axis=0) Xva = Xte Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 batch_reps = 1 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) ) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] z_dim = 100 init_scale = 1.0 use_bn = True x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ################# # p_zi_given_xi # ################# params = {} shared_config = \ [ {'layer_type': 'conv', 'in_chans': 1, # in shape: (batch, 784) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_in': lambda x: T.reshape(x, (-1, 1, 28, 28))}, \ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 128, # out shape: (batch, 128, 7, 7) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'fc', 'in_chans': 128*7*7, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': use_bn} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = init_scale params['build_theano_funcs'] = False p_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.0) ################### # p_sip1_given_zi # ################### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': z_dim, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': use_bn}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': 7*7*128, 'activation': relu_actfun, 'apply_bn': use_bn, 'shape_func_out': lambda x: T.reshape(x, (-1, 128, 7, 7))}, \ {'layer_type': 'conv', 'in_chans': 128, # in shape: (batch, 128, 7, 7) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': use_bn} ] output_config = \ [ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 1, # out shape: (batch, 1, 28, 28) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'conv', 'in_chans': 64, 'out_chans': 1, 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 
'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'conv', 'in_chans': 64, 'out_chans': 1, 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = init_scale params['build_theano_funcs'] = False p_sip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_sip1_given_zi.init_biases(0.0) ################# # q_zi_given_xi # ################# params = {} shared_config = \ [ {'layer_type': 'conv', 'in_chans': 2, # in shape: (batch, 784+784) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_in': lambda x: T.reshape(x, (-1, 2, 28, 28))}, \ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 128, # out shape: (batch, 128, 7, 7) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'fc', 'in_chans': 128*7*7, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': use_bn} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = init_scale params['build_theano_funcs'] = False q_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_xi.init_biases(0.0) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['x_dim'] = x_dim gpsi_params['z_dim'] = z_dim # switch between direct construction and construction via p_x_given_si gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = step_type gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 'sigmoid' GPSI = GPSImputer(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, p_zi_given_xi=p_zi_given_xi, p_sip1_given_zi=p_sip1_given_zi, q_zi_given_xi=q_zi_given_xi, params=gpsi_params, shared_param_dicts=None) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. 
for i in range(10)] learn_rate = 0.0001 momentum = 0.90 batch_idx = np.arange(batch_size) + tr_samples for i in range(200000): scale = min(1.0, ((i+1) / 5000.0)) if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update GPSI.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.98) GPSI.set_train_switch(1.0) GPSI.set_lam_nll(lam_nll=1.0) GPSI.set_lam_kld(lam_kld_q=1.0, lam_kld_p=0.1, lam_kld_g=0.0) GPSI.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch xb = to_fX( Xtr.take(batch_idx, axis=0) ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) result = GPSI.train_joint(xi, xo, xm, batch_reps) # do diagnostics and general training tracking costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10) vfe = np.mean(nll) + np.mean(kld) str1 = " va_nll_bound : {}".format(vfe) str2 = " va_nll_term : {}".format(np.mean(nll)) str3 = " va_kld_q2p : {}".format(np.mean(kld)) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() if ((i % 2000) == 0): #GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag)) # Get some validation samples for evaluating model performance xb = to_fX( Xva[0:100] ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20)
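# ---------------------------------------------------------------------------
# construct_masked_data, used throughout the imputation tests above, builds
# (xi, xo, xm) triples from clean inputs. The sketch below shows one
# plausible implementation, assuming xo is the clean target, xm is 1 for
# visible pixels and 0 for pixels to impute, and xi fills hidden pixels
# with the mean pixel value; the real helper's conventions may differ.
# ---------------------------------------------------------------------------
import numpy as np

def construct_masked_data_demo(xb, drop_prob, occ_dim, data_mean, rng,
                               im_shape=(28, 28)):
    xo = xb.copy()
    # random pixel dropout
    xm = (rng.rand(*xb.shape) > drop_prob).astype(xb.dtype)
    if occ_dim > 0:
        # additionally occlude a random occ_dim x occ_dim square per image
        rows, cols = im_shape
        sq = xm.reshape((-1, rows, cols))
        for k in range(sq.shape[0]):
            r = rng.randint(0, rows - occ_dim + 1)
            c = rng.randint(0, cols - occ_dim + 1)
            sq[k, r:r + occ_dim, c:c + occ_dim] = 0.0
        xm = sq.reshape(xb.shape)
    # hidden pixels are replaced by the (broadcast) mean pixel value
    xi = (xm * xo) + ((1.0 - xm) * data_mean[None, :])
    return xi, xo, xm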
def test_svhn(step_type='add', occ_dim=15, drop_prob=0.0): ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}GPSI_OD{}_DP{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, step_type) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) tr_file = 'data/svhn_train_gray.pkl' te_file = 'data/svhn_test_gray.pkl' ex_file = 'data/svhn_extra_gray.pkl' data = load_svhn_gray(tr_file, te_file, ex_file=ex_file, ex_count=200000) Xtr = to_fX(shift_and_scale_into_01(np.vstack([data['Xtr'], data['Xex']]))) Xva = to_fX(shift_and_scale_into_01(data['Xte'])) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 250 batch_reps = 1 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1], ))) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] z_dim = 200 imp_steps = 6 init_scale = 1.0 x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ################# # p_zi_given_xi # ################# params = {} shared_config = [x_dim, 1500, 1500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.2) ################### # p_xip1_given_zi # ################### params = {} shared_config = [z_dim, 1500, 1500] output_config = [x_dim, x_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_xip1_given_zi.init_biases(0.2) ################### # q_zi_given_xi # ################### params = {} shared_config = [(x_dim + x_dim), 1500, 1500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_xi.init_biases(0.2) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['x_dim'] = x_dim gpsi_params['z_dim'] = z_dim gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = step_type gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 'sigmoid' GPSI = GPSImputer(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \ p_zi_given_xi=p_zi_given_xi, \ 
p_xip1_given_zi=p_xip1_given_zi, \ q_zi_given_xi=q_zi_given_xi, \ params=gpsi_params, \ shared_param_dicts=None) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(200005): scale = min(1.0, ((i + 1) / 5000.0)) if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.92 if (i > 10000): momentum = 0.90 else: momentum = 0.50 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update GPSI.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.98) GPSI.set_train_switch(1.0) GPSI.set_lam_nll(lam_nll=1.0) GPSI.set_lam_kld(lam_kld_p=0.1, lam_kld_q=0.9) GPSI.set_lam_l2w(1e-4) # perform a minibatch update and record the cost for this batch xb = to_fX(Xtr.take(batch_idx, axis=0)) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) result = GPSI.train_joint(xi, xo, xm, batch_reps) # do diagnostics and general training tracking costs = [(costs[j] + result[j]) for j in range(len(result) - 1)] if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10) vfe = np.mean(nll) + np.mean(kld) str1 = " va_nll_bound : {}".format(vfe) str2 = " va_nll_term : {}".format(np.mean(nll)) str3 = " va_kld_q2p : {}".format(np.mean(kld)) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag)) if ((i % 20000) == 0): # Get some validation samples for evaluating model performance xb = to_fX(Xva[0:100]) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros( (seq_len * samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20)
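# ---------------------------------------------------------------------------
# The SVHN loop above warms the effective learning rate and momentum up over
# the first 5000 updates and decays the base rate by 0.92 every 15000
# updates. The helper below reproduces those effective values for a given
# update index; it is approximate, since the real loop mutates learn_rate
# in place rather than recomputing it.
# ---------------------------------------------------------------------------
def svhn_sgd_schedule(i, base_lr=0.0002, warmup=5000.0,
                      decay_every=15000, decay=0.92):
    lr = base_lr * (decay ** ((i + 1) // decay_every))
    scale = min(1.0, (i + 1) / warmup)
    momentum = 0.90 if (i > 10000) else 0.50
    # set_sgd_params is called with the scaled values, as in the loop above
    return scale * lr, scale * momentum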
def pretrain_osm(lam_kld=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with data_file = 'data/tfd_data_48x48.pkl' dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all') Xtr_unlabeled = dataset[0] dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all') Xtr_train = dataset[0] Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all') Xva = dataset[0] tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 400 batch_reps = 6 carry_frac = 0.25 carry_size = int(batch_size * carry_frac) reset_prob = 0.04 # setup some symbolic variables and stuff Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 Xtr_mean = np.mean(Xtr, axis=0) ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} shared_config = [PRIOR_DIM, 1500, 1500] top_config = [shared_config[-1], data_dim] gn_params['shared_config'] = shared_config gn_params['mu_config'] = top_config gn_params['sigma_config'] = top_config gn_params['activation'] = relu_actfun gn_params['init_scale'] = 1.4 gn_params['lam_l2a'] = 0.0 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.0 gn_params['input_noise'] = 0.0 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 1500, 1500] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.4 in_params['lam_l2a'] = 0.0 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.0 in_params['input_noise'] = 0.0 # Initialize the base networks for this OneStageModel IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.2) GN.init_biases(0.2) ###################################### # LOAD AND RESTART FROM SAVED PARAMS # ###################################### # gn_fname = RESULT_PATH+"pt_osm_params_b110000_GN.pkl" # in_fname = RESULT_PATH+"pt_osm_params_b110000_IN.pkl" # IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd, \ # new_params=None) # GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd, \ # new_params=None) # in_params = IN.params # gn_params = GN.params ######################### # INITIALIZE THE GIPAIR # ######################### osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params) OSM.set_lam_l2w(1e-5) safe_mean = (0.9 * Xtr_mean) + 0.05 safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean)) OSM.set_output_bias(safe_mean_logit) OSM.set_input_bias(-Xtr_mean) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH+"pt_osm_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters obs_costs = np.zeros((batch_size,)) costs = [0. 
for i in range(10)] learn_rate = 0.002 for i in range(200000): scale = min(1.0, float(i) / 5000.0) if ((i > 1) and ((i % 20000) == 0)): learn_rate = learn_rate * 0.8 if (i < 50000): momentum = 0.5 elif (i < 100000): momentum = 0.7 else: momentum = 0.9 if ((i == 0) or (npr.rand() < reset_prob)): # sample a fully random batch batch_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) else: # sample a partially random batch, which retains some portion of # the worst scoring examples from the previous batch fresh_idx = npr.randint(low=0,high=tr_samples,size=(batch_size-carry_size,)) batch_idx = np.concatenate((fresh_idx.ravel(), carry_idx.ravel())) # do a minibatch update of the model on this batch, and compute some costs Xd_batch = Xtr.take(batch_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch OSM.set_sgd_params(lr_1=(scale*learn_rate), \ mom_1=(scale*momentum), mom_2=0.98) OSM.set_lam_nll(1.0) OSM.set_lam_kld(lam_kld_1=scale*lam_kld, lam_kld_2=0.0, lam_kld_c=50.0) result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps) batch_costs = result[4] + result[5] obs_costs = collect_obs_costs(batch_costs, batch_reps) carry_idx = batch_idx[np.argsort(-obs_costs)[0:carry_size]] costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 1000) == 0): # record and then reset the cost trackers costs = [(v / 1000.0) for v in costs] str_1 = "-- batch {0:d} --".format(i) str_2 = " joint_cost: {0:.4f}".format(costs[0]) str_3 = " nll_cost : {0:.4f}".format(costs[1]) str_4 = " kld_cost : {0:.4f}".format(costs[2]) str_5 = " reg_cost : {0:.4f}".format(costs[3]) costs = [0.0 for v in costs] # print out some diagnostic information joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() if ((i % 2000) == 0): Xva = row_shuffle(Xva) model_samps = OSM.sample_from_prior(500) file_name = RESULT_PATH+"pt_osm_samples_b{0:d}_XG.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=20) file_name = RESULT_PATH+"pt_osm_inf_weights_b{0:d}.png".format(i) utils.visualize_samples(OSM.inf_weights.get_value(borrow=False).T, \ file_name, num_rows=30) file_name = RESULT_PATH+"pt_osm_gen_weights_b{0:d}.png".format(i) utils.visualize_samples(OSM.gen_weights.get_value(borrow=False), \ file_name, num_rows=30) # compute information about free-energy on validation set file_name = RESULT_PATH+"pt_osm_free_energy_b{0:d}.png".format(i) fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) fe_str = " nll_bound : {0:.4f}".format(fe_mean) print(fe_str) out_file.write(fe_str+"\n") utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # compute information about posterior KLds on validation set file_name = RESULT_PATH+"pt_osm_post_klds_b{0:d}.png".format(i) post_klds = OSM.compute_post_klds(Xva[0:2500]) post_dim_klds = np.mean(post_klds, axis=0) utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ file_name) if ((i % 5000) == 0): IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_GN.pkl") return
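# ---------------------------------------------------------------------------
# Both pretrain_osm variants initialize the generator's output bias at the
# logit of a "squashed" pixel mean (safe_mean = 0.9*mean + 0.05), which keeps
# the logit finite for pixels that are always 0 or always 1 in the training
# data. The helper below just isolates that computation for clarity.
# ---------------------------------------------------------------------------
import numpy as np

def safe_mean_logit(x_mean, squeeze=0.9, floor=0.05):
    # squash the empirical means into [floor, squeeze + floor] before the logit
    safe_mean = (squeeze * x_mean) + floor
    return np.log(safe_mean / (1.0 - safe_mean))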
def test_one_stage_model(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') Xtr = np.vstack((Xtr, Xva)) Xva = Xte #del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 128 batch_reps = 1 ############################################### # Setup some parameters for the OneStageModel # ############################################### x_dim = Xtr.shape[1] z_dim = 64 x_type = 'bernoulli' xin_sym = T.matrix('xin_sym') ############### # p_x_given_z # ############### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': z_dim, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': True}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': 7*7*128, 'activation': relu_actfun, 'apply_bn': True, 'shape_func_out': lambda x: T.reshape(x, (-1, 128, 7, 7))}, \ {'layer_type': 'conv', 'in_chans': 128, # in shape: (batch, 128, 7, 7) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': True} ] output_config = \ [ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 1, # out shape: (batch, 1, 28, 28) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'conv', 'in_chans': 64, 'out_chans': 1, 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False p_x_given_z = HydraNet(rng=rng, Xd=xin_sym, \ params=params, shared_param_dicts=None) p_x_given_z.init_biases(0.0) ############### # q_z_given_x # ############### params = {} shared_config = \ [ {'layer_type': 'conv', 'in_chans': 1, # in shape: (batch, 784) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': True, 'shape_func_in': lambda x: T.reshape(x, (-1, 1, 28, 28))}, \ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 128, # out shape: (batch, 128, 7, 7) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': True, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'fc', 'in_chans': 128*7*7, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': True} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = 1.0 params['build_theano_funcs'] = False q_z_given_x = HydraNet(rng=rng, Xd=xin_sym, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.0) ############################################################## # Define parameters for the TwoStageModel, and initialize it # ############################################################## print("Building the OneStageModel...") osm_params = {} osm_params['x_type'] = x_type osm_params['obs_transform'] = 'sigmoid' OSM = OneStageModel(rng=rng, x_in=xin_sym, x_dim=x_dim, z_dim=z_dim, p_x_given_z=p_x_given_z, q_z_given_x=q_z_given_x, params=osm_params) ################################################################ # Apply 
some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format("OSM_TEST") out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.0005 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(500000): scale = min(0.5, ((i+1) / 5000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) Xb = to_fX( Xtr.take(batch_idx, axis=0) ) #Xb = binarize_data(Xtr.take(batch_idx, axis=0)) # set sgd and objective function hyperparams for this update OSM.set_sgd_params(lr=scale*learn_rate, \ mom_1=(scale*momentum), mom_2=0.98) OSM.set_lam_nll(lam_nll=1.0) OSM.set_lam_kld(lam_kld=1.0) OSM.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch result = OSM.train_joint(Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) joint_str = "\n".join([str1, str2, str3, str4, str5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 5000) == 0) or ((i < 10000) and ((i % 1000) == 0))): # draw some independent random samples from the model samp_count = 300 model_samps = OSM.sample_from_prior(samp_count) file_name = "OSM_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=15) # compute free energy estimate for validation samples Xva = row_shuffle(Xva) fe_terms = OSM.compute_fe_terms(Xva[0:5000], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) out_str = " nll_bound : {0:.4f}".format(fe_mean) print(out_str) out_file.write(out_str+"\n") out_file.flush() return
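# ---------------------------------------------------------------------------
# The conv configs above rely on stride-2 'double' (downsampling) and 'half'
# (upsampling) layers to move between 28x28, 14x14, and 7x7 feature maps, as
# noted in the per-layer shape comments. The tiny helper below spells out
# that spatial-size arithmetic; it is illustrative only, and the actual
# HydraNet layer math may handle padding differently.
# ---------------------------------------------------------------------------
def conv_stack_shapes(spatial=28, n_layers=2, mode='double'):
    # spatial sizes produced by a stack of stride-2 layers
    sizes = [spatial]
    for _ in range(n_layers):
        spatial = spatial // 2 if mode == 'double' else spatial * 2
        sizes.append(spatial)
    return sizes

# conv_stack_shapes(28, 2, 'double') -> [28, 14, 7]   (encoder path)
# conv_stack_shapes(7, 2, 'half')    -> [7, 14, 28]   (decoder path)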
def test_with_model_init():
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    #dataset = 'data/mnist.pkl.gz'
    #datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    #Xtr = datasets[0][0]
    #Xva = datasets[1][0]
    Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/')
    del Xte
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    z_dim = 20
    h_dim = 50
    s_dim = 50
    init_scale = 1.0
    x_type = 'bernoulli'

    # some InfNet instances to build the TwoStageModel from
    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')

    ###############
    # p_h_given_s #
    ###############
    params = {}
    shared_config = [s_dim, 250, 250]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_h_given_s = InfNet(rng=rng, Xd=x_in_sym, \
                         params=params, shared_param_dicts=None)
    p_h_given_s.init_biases(0.2)

    #################
    # p_x_given_s_h #
    #################
    params = {}
    shared_config = [(s_dim + h_dim), 250, 250]
    top_config = [shared_config[-1], x_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_x_given_s_h = InfNet(rng=rng, Xd=x_in_sym, \
                           params=params, shared_param_dicts=None)
    p_x_given_s_h.init_biases(0.2)

    ###############
    # p_s_given_z #
    ###############
    params = {}
    shared_config = [z_dim, 250]
    top_config = [shared_config[-1], s_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_s_given_z = InfNet(rng=rng, Xd=x_in_sym, \
                         params=params, shared_param_dicts=None)
    p_s_given_z.init_biases(0.2)

    ###############
    # q_z_given_x #
    ###############
    params = {}
    shared_config = [x_dim, 250, 250]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_z_given_x = InfNet(rng=rng, Xd=x_in_sym, \
                         params=params, shared_param_dicts=None)
    q_z_given_x.init_biases(0.2)

    #################
    # q_h_given_x_s #
    #################
    params = {}
    shared_config = [(x_dim + s_dim), 500, 500]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_h_given_x_s = InfNet(rng=rng, Xd=x_in_sym, \
                           params=params, shared_param_dicts=None)
    q_h_given_x_s.init_biases(0.2)

    ##############################################################
    # Define parameters for the TwoStageModel, and initialize it #
    ##############################################################
    print("Building the TwoStageModel...")
    msm_params = {}
    msm_params['x_type'] = x_type
    msm_params['obs_transform'] = 'sigmoid'
    TSM = TwoStageModel(rng=rng, \
            x_in=x_in_sym, x_out=x_out_sym, \
            p_s_given_z=p_s_given_z, \
            p_h_given_s=p_h_given_s, \
            p_x_given_s_h=p_x_given_s_h, \
            q_z_given_x=q_z_given_x, \
            q_h_given_x_s=q_h_given_x_s, \
            x_dim=x_dim, \
            z_dim=z_dim, s_dim=s_dim, h_dim=h_dim, \
            params=msm_params)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    out_file = open("TSM_A_RESULTS.txt", 'w')
    costs = [0. for i in range(10)]
    learn_rate = 0.0003
    momentum = 0.5
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i+1) / 3000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        if (i > 50000):
            momentum = 0.90
        else:
            momentum = 0.50
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # train on the training set
        lam_kld = 1.0
        # set sgd and objective function hyperparams for this update
        TSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \
                           mom_1=scale*momentum, mom_2=0.99)
        TSM.set_train_switch(1.0)
        TSM.set_lam_nll(lam_nll=1.0)
        TSM.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.8, lam_kld_p2q=0.2)
        TSM.set_lam_kld_l1l2(lam_kld_l1l2=scale)
        TSM.set_lam_l2w(1e-4)
        TSM.set_drop_rate(0.0)
        TSM.q_h_given_x_s.set_bias_noise(0.0)
        TSM.p_h_given_s.set_bias_noise(0.0)
        TSM.p_x_given_s_h.set_bias_noise(0.0)
        # perform a minibatch update and record the cost for this batch
        Xb_tr = to_fX( Xtr.take(batch_idx, axis=0) )
        result = TSM.train_joint(Xb_tr, Xb_tr, batch_reps)
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_cost  : {0:.4f}".format(costs[1])
            str4 = "    kld_cost  : {0:.4f}".format(costs[2])
            str5 = "    reg_cost  : {0:.4f}".format(costs[3])
            joint_str = "\n".join([str1, str2, str3, str4, str5])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))):
            TSM.set_drop_rate(0.0)
            TSM.q_h_given_x_s.set_bias_noise(0.0)
            TSM.p_h_given_s.set_bias_noise(0.0)
            TSM.p_x_given_s_h.set_bias_noise(0.0)
            # Get some validation samples for computing diagnostics
            Xva = row_shuffle(Xva)
            Xb_va = to_fX( Xva[0:2000] )
            # draw some independent random samples from the model
            samp_count = 500
            model_samps = TSM.sample_from_prior(samp_count)
            file_name = "TSM_A_SAMPLES_IND_b{0:d}.png".format(i)
            utils.visualize_samples(model_samps, file_name, num_rows=20)
            # compute free-energy diagnostics on some training samples
            Xb_tr = to_fX( Xtr[0:2000] )
            fe_terms = TSM.compute_fe_terms(Xb_tr, Xb_tr, 30)
            fe_nll = np.mean(fe_terms[0])
            fe_kld = np.mean(fe_terms[1])
            fe_joint = fe_nll + fe_kld
            joint_str = "    vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \
                    fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), \
                    fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1]))
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            # ...and on some validation samples
            fe_terms = TSM.compute_fe_terms(Xb_va, Xb_va, 30)
            fe_nll = np.mean(fe_terms[0])
            fe_kld = np.mean(fe_terms[1])
            fe_joint = fe_nll + fe_kld
            joint_str = "    vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \
                    fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), \
                    fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1]))
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
    return
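# The training loop above anneals its hyperparameters with a linear warm-up plus
# a stepwise decay. Pulled out as a standalone helper for clarity; the function
# name and the closed-form decay are illustrative, not part of the model code:
def tsm_anneal_schedule(i, base_lr=0.0003):
    """Return (lr, momentum, kld_l1l2_scale) as set at update step i above."""
    scale = min(1.0, ((i + 1) / 3000.0))          # warm-up over the first 3k updates
    lr = base_lr * (0.95 ** ((i + 1) // 10000))   # multiply by 0.95 every 10k updates
    momentum = 0.90 if (i > 50000) else 0.50      # momentum steps up mid-training
    return (scale * lr, scale * momentum, scale)

# e.g. tsm_anneal_schedule(0)      -> (1.0e-7, 1.67e-4, 3.33e-4), roughly
#      tsm_anneal_schedule(100000) -> (~1.8e-4, 0.9, 1.0)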
def test_with_model_init():
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr_shared = datasets[0][0]
    Xva_shared = datasets[1][0]
    Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX)
    Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX)
    tr_samples = Xtr.shape[0]
    batch_size = 500
    batch_reps = 1

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    obs_dim = Xtr.shape[1]
    z_rnn_dim = 25
    z_obs_dim = 5
    jnt_dim = obs_dim + z_rnn_dim
    h_dim = 100
    x_type = 'bernoulli'
    prior_sigma = 1.0

    # some InfNet instances to build the MultiStageModel from
    X_sym = T.matrix('X_sym')

    ########################
    # p_s0_obs_given_z_obs #
    ########################
    params = {}
    shared_config = [z_obs_dim, 250, 250]
    top_config = [shared_config[-1], obs_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 1e-3
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_s0_obs_given_z_obs = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
                                  params=params, shared_param_dicts=None)
    p_s0_obs_given_z_obs.init_biases(0.2)

    #################
    # p_hi_given_si #
    #################
    params = {}
    shared_config = [jnt_dim, 500, 500]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_hi_given_si = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
                           params=params, shared_param_dicts=None)
    p_hi_given_si.init_biases(0.2)

    ######################
    # p_sip1_given_si_hi #
    ######################
    params = {}
    shared_config = [(h_dim + z_rnn_dim), 500, 500]
    top_config = [shared_config[-1], obs_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_sip1_given_si_hi = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
                                params=params, shared_param_dicts=None)
    p_sip1_given_si_hi.init_biases(0.2)

    ###############
    # q_z_given_x #
    ###############
    params = {}
    shared_config = [obs_dim, 250, 250]
    top_config = [shared_config[-1], (z_rnn_dim + z_obs_dim)]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_z_given_x = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
                         params=params, shared_param_dicts=None)
    q_z_given_x.init_biases(0.2)

    ###################
    # q_hi_given_x_si #
    ###################
    params = {}
    shared_config = [(obs_dim + jnt_dim), 500, 500]
    top_config = [shared_config[-1], h_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = softplus_actfun
    params['init_scale'] = 1.2
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_hi_given_x_si = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \
                             params=params, shared_param_dicts=None)
    q_hi_given_x_si.init_biases(0.2)

    ################################################################
    # Define parameters for the MultiStageModel, and initialize it #
    ################################################################
    print("Building the MultiStageModel...")
    msm_params = {}
    msm_params['x_type'] = x_type
    msm_params['obs_transform'] = 'sigmoid'
    MSM = MultiStageModel(rng=rng, x_in=X_sym, \
            p_s0_obs_given_z_obs=p_s0_obs_given_z_obs, \
            p_hi_given_si=p_hi_given_si, \
            p_sip1_given_si_hi=p_sip1_given_si_hi, \
            q_z_given_x=q_z_given_x, \
            q_hi_given_x_si=q_hi_given_x_si, \
            obs_dim=obs_dim, z_rnn_dim=z_rnn_dim, z_obs_dim=z_obs_dim, \
            h_dim=h_dim, model_init_obs=False, model_init_rnn=True, \
            ir_steps=3, params=msm_params)
    obs_mean = (0.9 * np.mean(Xtr, axis=0)) + 0.05
    obs_mean_logit = np.log(obs_mean / (1.0 - obs_mean))
    MSM.set_input_bias(-obs_mean)
    MSM.set_obs_bias(0.1*obs_mean_logit)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    costs = [0. for i in range(10)]
    learn_rate = 0.003
    momentum = 0.5
    for i in range(300000):
        scale = min(1.0, ((i+1) / 5000.0))
        l1l2_weight = 1.0 #min(1.0, ((i+1) / 2500.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.92
        # momentum ramps up as training progresses
        if (i > 100000):
            momentum = 0.80
        elif (i > 50000):
            momentum = 0.65
        else:
            momentum = 0.50
        # randomly sample a minibatch
        tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size,))
        Xb = binarize_data(Xtr.take(tr_idx, axis=0))
        Xb = Xb.astype(theano.config.floatX)
        # set sgd and objective function hyperparams for this update
        MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \
                           mom_1=(scale*momentum), mom_2=0.99)
        MSM.set_train_switch(1.0)
        MSM.set_l1l2_weight(l1l2_weight)
        MSM.set_lam_nll(lam_nll=1.0)
        MSM.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0)
        MSM.set_lam_l2w(1e-5)
        MSM.set_kzg_weight(0.01)
        # perform a minibatch update and record the cost for this batch
        result = MSM.train_joint(Xb, batch_reps)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            print("-- batch {0:d} --".format(i))
            print("    joint_cost: {0:.4f}".format(costs[0]))
            print("    nll_cost  : {0:.4f}".format(costs[1]))
            print("    kld_cost  : {0:.4f}".format(costs[2]))
            print("    reg_cost  : {0:.4f}".format(costs[3]))
            costs = [0.0 for v in costs]
        if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))):
            Xva = row_shuffle(Xva)
            # draw some independent random samples from the model
            samp_count = 200
            model_samps = MSM.sample_from_prior(samp_count)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "MZ_SAMPLES_b{0:d}.png".format(i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
            # visualize some important weights in the model
            file_name = "MZ_INF_1_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.inf_1_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MZ_INF_2_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.inf_2_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MZ_GEN_1_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.gen_1_weights.get_value(borrow=False)
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MZ_GEN_2_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.gen_2_weights.get_value(borrow=False)
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            file_name = "MZ_GEN_INF_WEIGHTS_b{0:d}.png".format(i)
            W = MSM.gen_inf_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
            # compute information about posterior KLds on validation set
            post_klds = MSM.compute_post_klds(Xva[0:5000])
            file_name = "MZ_H0_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(post_klds[0].shape[1]), \
                            np.mean(post_klds[0], axis=0), file_name)
            file_name = "MZ_HI_COND_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(post_klds[1].shape[1]), \
                            np.mean(post_klds[1], axis=0), file_name)
            file_name = "MZ_HI_GLOB_KLDS_b{0:d}.png".format(i)
            utils.plot_stem(np.arange(post_klds[2].shape[1]), \
                            np.mean(post_klds[2], axis=0), file_name)
            # compute information about free-energy on validation set
            file_name = "MZ_FREE_ENERGY_b{0:d}.png".format(i)
            fe_terms = MSM.compute_fe_terms(binarize_data(Xva[0:5000]), 20)
            fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1])
            print("    nll_bound : {0:.4f}".format(fe_mean))
            utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \
                               x_label='Posterior KLd', y_label='Negative Log-likelihood')
    return
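# The nested loop used above to interleave a list of per-step sample arrays into
# one (samp_count * seq_len, dim) grid can be replaced by a single
# stack-and-reshape. A small self-checking sketch with dummy data (not the
# model's actual samples):
def _check_seq_samps_reshape():
    seq_len, samp_count, obs_dim = 4, 3, 5
    model_samps = [np.random.rand(samp_count, obs_dim) for _ in range(seq_len)]
    # loop version, as written in the training code
    seq_samps = np.zeros((seq_len * samp_count, obs_dim))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = model_samps[s2][s1]
            idx += 1
    # vectorized version: stack steps along axis 1, then flatten sample-major
    seq_samps_fast = np.stack(model_samps, axis=1).reshape(-1, obs_dim)
    assert np.allclose(seq_samps, seq_samps_fast)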
# configure a trajectory generator
num_samples = 100
traj_len = 64
x_range = [-0.8, 0.8]
y_range = [-0.8, 0.8]
max_speed = 0.15
TRAJ = TrajectoryGenerator(x_range=x_range, y_range=y_range, \
                           max_speed=max_speed)

# test the writer function
start_time = time.time()
batch_count = 50
for i in range(batch_count):
    # generate a minibatch of trajectories
    traj_pos, traj_vel = TRAJ.generate_trajectories(num_samples, traj_len)
    traj_x = traj_pos[:, :, 0]
    traj_y = traj_pos[:, :, 1]
    # draw the trajectories
    center_x = to_fX(traj_x.T.ravel())
    center_y = to_fX(traj_y.T.ravel())
    delta = to_fX(np.ones(center_x.shape))
    sigma = to_fX(np.ones(center_x.shape))
    W = write_func(center_y, center_x, delta, 0.2 * sigma)
end_time = time.time()
render_time = end_time - start_time
render_bps = batch_count / render_time
print("RENDER BATCH/SECOND: {0:.2f}".format(render_bps))
W = W[:20 * traj_len]
utils.visualize_samples(W, "AAAAA.png", num_rows=20)
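# The TrajectoryGenerator class used above is defined elsewhere. Purely as an
# illustration of the interface this snippet assumes (generate_trajectories()
# returning position and velocity arrays of shape (num_samples, traj_len, 2)),
# here is a hypothetical stand-in that bounces constant-velocity points around
# the box -- an assumption for illustration, not the actual generator:
class SimpleTrajectoryGenerator(object):
    def __init__(self, x_range, y_range, max_speed):
        self.x_range = x_range
        self.y_range = y_range
        self.max_speed = max_speed

    def generate_trajectories(self, num_samples, traj_len):
        lo = np.array([self.x_range[0], self.y_range[0]])
        hi = np.array([self.x_range[1], self.y_range[1]])
        pos = np.zeros((num_samples, traj_len, 2))
        vel = np.zeros((num_samples, traj_len, 2))
        p = lo + ((hi - lo) * np.random.rand(num_samples, 2))
        v = self.max_speed * ((2.0 * np.random.rand(num_samples, 2)) - 1.0)
        for t in range(traj_len):
            pos[:, t, :] = p
            vel[:, t, :] = v
            p = p + v
            # reflect positions (and flip velocities) at the box boundaries
            for d in range(2):
                over = (p[:, d] > hi[d])
                under = (p[:, d] < lo[d])
                p[over, d] = (2.0 * hi[d]) - p[over, d]
                p[under, d] = (2.0 * lo[d]) - p[under, d]
                v[over | under, d] = -v[over | under, d]
        return pos, vel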
def evaluate_lenet5(learning_rate=0.05, n_epochs=500,
                    dataset='./data/mnist.pkl.gz',
                    nkerns=[48, 64], batch_size=256):
    """Demonstrates LeNet on the MNIST dataset.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training/testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of [int] labels

    ishape = (28, 28)  # this is the size of MNIST images

    # keep the learning rate in a shared variable, so it can be decayed later
    start_rate = numpy.asarray([0.05]).astype(theano.config.floatX)
    learning_rate = theano.shared(value=start_rate, name='learning_rate')

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    tanh = lambda vals: T.tanh(vals)
    relu = lambda vals: relu_actfun(vals)

    # Reshape matrix of rasterized images of shape (batch_size, 28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_prep = Reshape2D4DLayer(input=x, out_shape=(1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-7+1, 28-7+1) = (22, 22)
    # maxpooling reduces this further to (22/2, 22/2) = (11, 11)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 11, 11)
    layer0 = ConvPoolLayer(rng, input=layer0_prep.output, \
            filt_def=(nkerns[0], 1, 7, 7), pool_def=(2, 2), \
            activation=relu, drop_rate=0.0, input_noise=0.1, bias_noise=0.05, \
            W=None, b=None, name="layer0", W_scale=2.0)

    # Construct the second convolutional pooling layer:
    # filtering reduces the image size to (11-4+1, 11-4+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = ConvPoolLayer(rng, input=layer0.output, \
            filt_def=(nkerns[1], nkerns[0], 4, 4), pool_def=(2, 2), \
            activation=relu, drop_rate=0.0, input_noise=0.0, bias_noise=0.05, \
            W=None, b=None, name="layer1", W_scale=2.0)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1]*4*4).
    layer2_prep = Reshape4D2DLayer(layer1.output)

    # construct a fully-connected relu layer
    layer2 = HiddenLayer(rng, layer2_prep.output, nkerns[1]*4*4, 512, \
            activation=relu, pool_size=0, \
            drop_rate=0.0, input_noise=0.0, bias_noise=0.05, \
            W=None, b=None, name="layer2", W_scale=2.0)

    # construct an output layer to predict classes
    layer3 = HiddenLayer(rng, layer2.output, 512, 10, \
            activation=relu, pool_size=0, \
            drop_rate=0.5, input_noise=0.0, bias_noise=0.0, \
            W=None, b=None, name="layer3", W_scale=2.0)

    # get a loss function to apply to the output layer
    loss_func = LogisticRegression(layer3)

    # the cost we minimize during training is the NLL of the model
    cost = loss_func.loss_func(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([index], loss_func.errors(y),
            givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function([index], loss_func.errors(y),
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent,
    # along with a momentum buffer for each parameter
    params = layer3.params + layer2.params + layer1.params + layer0.params
    moms = OrderedDict()
    for p in params:
        moms[p] = theano.shared(value=numpy.zeros( \
                p.get_value(borrow=True).shape).astype(theano.config.floatX))

    # create a list of gradients for all model parameters
    grads = OrderedDict()
    for p in params:
        grads[p] = T.grad(cost, p)

    # train_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each model parameter. We thus create the
    # updates list by automatically looping over all (params[i], grads[i]) pairs.
    updates = []
    for p in params:
        mom_update = (moms[p], (0.8 * moms[p]) + (0.2 * grads[p]))
        param_update = (p, p - learning_rate[0] * moms[p])
        updates.append(mom_update)
        updates.append(param_update)

    train_model = theano.function([index], cost, updates=updates,
            givens={
                x: train_set_x[index * batch_size: (index + 1) * batch_size],
                y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation set;
                                   # in this case we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 100 == 0:
                print('training @ iter = {}'.format(iter))
            cost_ij = train_model(minibatch_index)
            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses) / batch_size
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))
                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # test it on the test set
                    test_losses = [test_model(i) for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses) / batch_size
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))
        # halve the learning rate every 30 epochs
        if (((epoch + 1) % 30) == 0):
            new_rate = 0.5 * learning_rate.get_value(borrow=True)
            learning_rate.set_value(new_rate)
        # periodically visualize the first-layer convolution filters
        if ((epoch % 10) == 0):
            W_l0 = layer0.W.get_value(borrow=False)
            W_l0 = W_l0.reshape((W_l0.shape[0], numpy.prod(W_l0.shape[1:])))
            visualize_samples(W_l0, 'A1_CONV_FILTS.png')

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm\n' % ((end_time - start_time) / 60.))
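# A plain-numpy restatement of the update rule assembled in the `updates` list
# above: the momentum buffer tracks an exponential moving average of the
# gradient, and each parameter steps along that average. Dummy shapes, purely
# to make the rule concrete (not part of the Theano training graph):
def momentum_sgd_step(param, mom, grad, lr=0.05, mom_decay=0.8):
    """One update: mom <- 0.8*mom + 0.2*grad; param <- param - lr*mom."""
    new_mom = (mom_decay * mom) + ((1.0 - mom_decay) * grad)
    new_param = param - (lr * new_mom)
    return new_param, new_mom

# e.g., for one weight matrix:
#   W, M = numpy.random.randn(512, 10), numpy.zeros((512, 10))
#   G = some_gradient_estimate   # stand-in for T.grad(cost, W)
#   W, M = momentum_sgd_step(W, M, G)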