def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = to_fX(datasets[0][0]) Xva = to_fX(datasets[1][0]) Ytr = datasets[0][1] Yva = datasets[1][1] tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 BD = lambda ary: binarize_data(ary) ####################################### # Setup some parameters for the model # ####################################### obs_dim = Xtr.shape[1] z_dim = 64 init_scale = 0.2 # some InfNet instances to build the TwoStageModel from x_in = T.matrix('x_in') y_in = T.lvector('y_in') ############### # q_z_given_x # ############### print("Building q_z_given_x...") params = {} shared_config = [obs_dim, 1000, 1000] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.2 params['hid_drop'] = 0.5 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.2) ########################################################### # Define parameters for the ClassModel, and initialize it # ########################################################### print("Building the ClassModel...") CM = ClassModel(rng=rng, \ x_in=x_in, y_in=y_in, \ q_z_given_x=q_z_given_x, \ class_count=10, \ z_dim=z_dim, \ use_samples=False) CM.set_drop_rate(0.5) CM.set_lam_nll(lam_nll=1.0) CM.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0) CM.set_lam_l2w(lam_l2w=1e-5) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ out_file = open("CM_RESULTS.txt", 'wb') costs = [0. 
for i in range(10)] learn_rate = 0.0002 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 1000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr, Ytr = row_shuffle(Xtr, Ytr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update CM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.99) # perform a minibatch update and record the cost for this batch Xi_tr = Xtr.take(batch_idx, axis=0) Yi_tr = Ytr.take(batch_idx, axis=0) result = CM.train_joint(Xi_tr, Yi_tr) costs = [(costs[j] + result[j]) for j in range(len(result)-1)] # output useful information about training progress if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost : {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) joint_str = "\n".join([str1, str2, str3, str4, str5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): ##################################################### # compute multi-sample estimates of the free-energy # ##################################################### # training set... fe_terms = CM.compute_fe_terms(Xtr[0:2500],Ytr[0:2500], 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # validation set... Xva, Yva = row_shuffle(Xva, Yva) fe_terms = CM.compute_fe_terms(Xva[0:2500], Yva[0:2500], 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() ########################################################## # compute multi-sample estimates of classification error # ########################################################## # training set... va_error, va_preds = CM.class_error(Xtr[:2500], Ytr[:2500], samples=30) joint_str = " tr-class-error: {0:.4f}".format(va_error) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # validation set... va_error, va_preds = CM.class_error(Xva[:2500], Yva[:2500], samples=30) joint_str = " va-class-error: {0:.4f}".format(va_error) print(joint_str) out_file.write(joint_str+"\n") out_file.flush()
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr_shared = datasets[0][0] Xva_shared = datasets[1][0] Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX) Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX) tr_samples = Xtr.shape[0] batch_size = 200 batch_reps = 1 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 20 h_dim = 100 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from X_sym = T.matrix('X_sym') ######################## # p_s0_obs_given_z_obs # ######################## params = {} shared_config = [z_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 1e-3 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_obs_given_z_obs = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) p_s0_obs_given_z_obs.init_biases(0.2) ################# # p_hi_given_si # ################# params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [h_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.2) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim + obs_dim), 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun 
params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModel(rng=rng, x_in=X_sym, \ p_s0_obs_given_z_obs=p_s0_obs_given_z_obs, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \ model_init_obs=True, ir_steps=2, \ params=msm_params) obs_mean = (0.9 * np.mean(Xtr, axis=0)) + 0.05 obs_mean_logit = np.log(obs_mean / (1.0 - obs_mean)) MSM.set_input_bias(-obs_mean) MSM.set_obs_bias(0.1*obs_mean_logit) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ costs = [0. for i in range(10)] learn_rate = 0.0003 momentum = 0.8 for i in range(300000): scale = min(1.0, ((i+1) / 10000.0)) extra_kl = max(0.0, ((50000.0 - i) / 50000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # randomly sample a minibatch tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xb = binarize_data(Xtr.take(tr_idx, axis=0)) Xb = Xb.astype(theano.config.floatX) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=(scale*momentum), mom_2=0.98) MSM.set_train_switch(1.0) MSM.set_l1l2_weight(1.0) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_1=(1.0+extra_kl), lam_kld_2=(1.0+extra_kl)) MSM.set_lam_l2w(1e-6) MSM.set_kzg_weight(0.01) # perform a minibatch update and record the cost for this batch result = MSM.train_joint(Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] print("-- batch {0:d} --".format(i)) print(" joint_cost: {0:.4f}".format(costs[0])) print(" nll_cost : {0:.4f}".format(costs[1])) print(" kld_cost : {0:.4f}".format(costs[2])) print(" reg_cost : {0:.4f}".format(costs[3])) costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): Xva = row_shuffle(Xva) # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MX_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # visualize some important weights in the model file_name = "MX_INF_1_WEIGHTS_b{0:d}.png".format(i) W = MSM.inf_1_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_INF_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.inf_2_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_GEN_1_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_1_weights.get_value(borrow=False) 
utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_GEN_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_2_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_GEN_INF_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_inf_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) # compute information about posterior KLds on validation set post_klds = MSM.compute_post_klds(Xva[0:5000]) file_name = "MX_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[0].shape[1]), \ np.mean(post_klds[0], axis=0), file_name) file_name = "MX_HI_COND_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[1].shape[1]), \ np.mean(post_klds[1], axis=0), file_name) file_name = "MX_HI_GLOB_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[2].shape[1]), \ np.mean(post_klds[2], axis=0), file_name) # compute information about free-energy on validation set file_name = "MX_FREE_ENERGY_b{0:d}.png".format(i) fe_terms = MSM.compute_fe_terms(binarize_data(Xva[0:5000]), 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) print(" nll_bound : {0:.4f}".format(fe_mean)) utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') return
def test_gi_trip(hyper_params=None, sup_count=600, rng_seed=1234): assert(not (hyper_params is None)) # Initialize a source of randomness rng = np.random.RandomState(rng_seed) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False) Xtr_su = datasets[0][0].get_value(borrow=False) Ytr_su = datasets[0][1].get_value(borrow=False).astype(np.int32) Xtr_un = datasets[1][0].get_value(borrow=False) Ytr_un = datasets[1][1].get_value(borrow=False).astype(np.int32) # get the joint labeled and unlabeled data Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX) Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]]) Ytr_un = 0 * Ytr_un # KEEP CATS FIXED OR FREE? YES/NO? # get the labeled data Xtr_su = Xtr_su.astype(theano.config.floatX) Ytr_su = Ytr_su[:,np.newaxis] # get observations and labels for the validation set Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX) Yva = datasets[2][1].get_value(borrow=False).astype(np.int32) Yva = Yva[:,np.newaxis] # numpy is dumb # get size information for the data un_samples = Xtr_un.shape[0] su_samples = Xtr_su.shape[0] va_samples = Xva.shape[0] # set up some symbolic variables for input to the GITrip Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') Yd = T.icol('Yd_base') # set some "shape" parameters for the networks data_dim = Xtr_un.shape[1] label_dim = 10 prior_dim = 50 prior_sigma = 1.0 batch_size = 150 # set parameters for the generator network gn_params = {} gn_config = [(prior_dim + label_dim), 500, 500, data_dim] gn_params['mlp_config'] = gn_config gn_params['activation'] = softplus_actfun gn_params['lam_l2a'] = 1e-3 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 500, 500] top_config = [shared_config[-1], prior_dim] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = softplus_actfun in_params['init_scale'] = 1.0 in_params['lam_l2a'] = 1e-3 in_params['vis_drop'] = 0.2 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.1 in_params['input_noise'] = 0.1 # choose some parameters for the categorical inferencer pn_params = {} pc0 = [data_dim, (200, 4), (200, 4), label_dim] pn_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} pn_params['spawn_configs'] = [sc0, sc1] pn_params['spawn_weights'] = [0.5, 0.5] # Set remaining params pn_params['activation'] = relu_actfun pn_params['ear_type'] = 6 pn_params['lam_l2a'] = 1e-3 pn_params['vis_drop'] = 0.2 pn_params['hid_drop'] = 0.5 # Initialize the base networks for this GITrip GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) PN = PeaNet(rng=rng, Xd=Xd, params=pn_params) # Initialize biases in GN, IN, and PN GN.init_biases(0.0) IN.init_biases(0.0) PN.init_biases(0.1) # Initialize the GITrip git_params = {} GIT = GITrip(rng=rng, \ Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \ g_net=GN, i_net=IN, p_net=PN, \ data_dim=data_dim, prior_dim=prior_dim, \ label_dim=label_dim, batch_size=batch_size, \ 
params=git_params, shared_param_dicts=None) # set weighting parameters for the various costs... GIT.set_lam_nll(1.0) GIT.set_lam_kld(1.0) GIT.set_lam_cat(0.0) GIT.set_lam_pea(0.0) GIT.set_lam_ent(0.0) # Set initial learning rate and basic SGD hyper parameters num_updates = hyper_params['num_updates'] learn_rate = hyper_params['learn_rate'] lam_cat = hyper_params['lam_cat'] lam_pea = hyper_params['lam_pea'] cat_prior = hyper_params['cat_prior'] lam_l2w = hyper_params['lam_l2w'] out_name = hyper_params['out_name'] out_file = open(out_name, 'wb') out_file.write("**TODO: More informative output, and maybe a real log**\n") out_file.write("sup_count: {0:d}\n".format(sup_count)) out_file.write("learn_rate: {0:.4f}\n".format(learn_rate)) out_file.write("lam_pea: {0:.4f}\n".format(lam_pea)) out_file.write("lam_cat: {0:.4f}\n".format(lam_cat)) out_file.write("lam_l2w: {0:.4f}\n".format(lam_l2w)) out_file.write("cat_prior: {0:s}\n".format(str(cat_prior))) out_file.flush() GIT.set_lam_l2w(lam_l2w) GIT.set_all_sgd_params(learn_rate=learn_rate, momentum=0.98) for i in range(num_updates): if i < 75000: scale = float(i + 1) / 75000.0 lam_ent = -1.0 lam_dir = 0.0 else: scale = 1.0 lam_ent = cat_prior['lam_ent'] lam_dir = cat_prior['lam_dir'] if ((i+1 % 100000) == 0): learn_rate = learn_rate * 0.75 # do a minibatch update using unlabeled data if True: # get some data to train with un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) Xd_un = binarize_data(Xtr_un.take(un_idx, axis=0)) Yd_un = Ytr_un.take(un_idx, axis=0) Xc_un = 0.0 * Xd_un Xm_un = 0.0 * Xd_un # do a minibatch update of the model, and compute some costs GIT.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIT.set_lam_nll(1.0) GIT.set_lam_kld(0.1 + (0.9 * scale)) GIT.set_lam_cat(0.0) GIT.set_lam_pea(lam_pea) GIT.set_lam_ent(lam_ent) GIT.set_lam_dir(lam_dir) outputs = GIT.train_joint(Xd_un, Xc_un, Xm_un, Yd_un) joint_cost = 1.0 * outputs[0] data_nll_cost = 1.0 * outputs[1] post_kld_cost = 1.0 * outputs[2] post_cat_cost = 1.0 * outputs[3] post_pea_cost = 1.0 * outputs[4] post_ent_cost = 1.0 * outputs[5] post_dir_cost = 1.0 * outputs[6] other_reg_cost = 1.0 * outputs[7] # do another minibatch update incorporating label information if True: # get some data to train with su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,)) Xd_su = binarize_data(Xtr_su.take(su_idx, axis=0)) Yd_su = Ytr_su.take(su_idx, axis=0) Xc_su = 0.0 * Xd_su Xm_su = 0.0 * Xd_su # update only based on the label-based classification cost GIT.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIT.set_lam_nll(0.0) GIT.set_lam_kld(0.0) GIT.set_lam_cat(lam_cat) GIT.set_lam_pea(lam_pea) GIT.set_lam_ent(0.0) GIT.set_lam_dir(0.0) outputs = GIT.train_joint(Xd_su, Xc_su, Xm_su, Yd_su) joint_2 = 1.0 * outputs[0] data_nll_2 = 1.0 * outputs[1] post_kld_2 = 1.0 * outputs[2] post_cat_cost = 1.0 * outputs[3] post_pea_2 = 1.0 * outputs[4] post_ent_2 = 1.0 * outputs[5] other_reg_cost = 1.0 * outputs[6] assert(not (np.isnan(joint_cost))) if ((i % 500) == 0): o_str = "batch: {0:d}, joint_cost: {1:.4f}, nll: {2:.4f}, kld: {3:.4f}, cat: {4:.4f}, pea: {5:.4f}, ent: {6:.4f}, dir: {7:.4f}, other_reg: {8:.4f}".format( \ i, joint_cost, data_nll_cost, post_kld_cost, post_cat_cost, post_pea_cost, post_ent_cost, post_dir_cost, other_reg_cost) print(o_str) out_file.write("{}\n".format(o_str)) if ((i % 1000) == 0): # check classification error on training and validation set train_err = GIT.classification_error(Xtr_su, Ytr_su) va_err = 
GIT.classification_error(Xva, Yva) o_str = " tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 5000) == 0): # sample the VAE loop freely file_name = "GIT_CHAIN_SAMPLES_b{0:d}.png".format(i) va_idx = npr.randint(low=0,high=va_samples,size=(5,)) Xd_samps = np.vstack([Xd_un[0:5,:], binarize_data(Xva[va_idx,:])]) Xd_samps = np.repeat(Xd_samps, 3, axis=0) sample_lists = GIT.sample_git_from_data(Xd_samps, loop_iters=15) Xs = np.vstack(sample_lists["data samples"]) Ys = GIT.class_probs(Xs) Xs = mnist_prob_embed(Xs, Ys) utils.visualize_samples(Xs, file_name, num_rows=15) # sample the VAE loop with some labels held fixed file_name = "GIT_SYNTH_SAMPLES_b{0:d}.png".format(i) Xd_samps = Xd_su[0:10,:] Xd_samps = np.repeat(Xd_samps, 3, axis=0) Yd_samps = Yd_su[0:10,:].reshape((10,1)) Yd_samps = np.repeat(Yd_samps, 3, axis=0) SAMPS = GIT.sample_synth_labels(Xd_samps, Yd_samps, loop_iters=15, binarize=True) Xs = np.vstack(SAMPS["X_syn"]) Ys = one_hot_np(np.vstack(SAMPS["Y_syn"]), cat_dim=11) Ys = Ys[:,1:] Xs = mnist_prob_embed(Xs, Ys) utils.visualize_samples(Xs, file_name, num_rows=15) # draw samples freely from the generative model's prior file_name = "GIT_PRIOR_SAMPLES_b{0:d}.png".format(i) Xs = GIT.sample_from_prior(20*15) utils.visualize_samples(Xs, file_name, num_rows=15) # draw categorical inferencer's weights file_name = "GIT_PN_WEIGHTS_b{0:d}.png".format(i) utils.visualize_net_layer(GIT.PN.proto_nets[0][0], file_name) # draw continuous inferencer's weights file_name = "GIT_IN_WEIGHTS_b{0:d}.png".format(i) utils.visualize_net_layer(GIT.IN.shared_layers[0], file_name) # draw generator net final layer weights file_name = "GIT_GN_WEIGHTS_b{0:d}.png".format(i) utils.visualize_net_layer(GIT.GN.mlp_layers[-1], file_name, use_transpose=True) print("TESTING COMPLETE!") out_file.close() return
def test_tfd(step_type='add', occ_dim=15, drop_prob=0.0): ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}GPSI_OD{}_DP{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, step_type) ########################## # Get some training data # ########################## data_file = 'data/tfd_data_48x48.pkl' dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all') Xtr_unlabeled = dataset[0] dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all') Xtr_train = dataset[0] Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all') Xva = dataset[0] Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 250 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) ) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 200 imp_steps = 6 init_scale = 1.0 x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ################# # p_zi_given_xi # ################# params = {} shared_config = [obs_dim, 1500, 1500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.2) ################### # p_xip1_given_zi # ################### params = {} shared_config = [z_dim, 1500, 1500] output_config = [obs_dim, obs_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_xip1_given_zi.init_biases(0.2) ################### # q_zi_given_x_xi # ################### params = {} shared_config = [(obs_dim + obs_dim), 1500, 1500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_zi_given_x_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_x_xi.init_biases(0.2) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['obs_dim'] = obs_dim gpsi_params['z_dim'] = z_dim gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = step_type gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 
'sigmoid' GPSI = GPSImputer(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \ p_zi_given_xi=p_zi_given_xi, \ p_xip1_given_zi=p_xip1_given_zi, \ q_zi_given_x_xi=q_zi_given_x_xi, \ params=gpsi_params, \ shared_param_dicts=None) # # test model saving # print("Testing model save to file...") # GPSI.save_to_file("AAA_GPSI_SAVE_TEST.pkl") # # test model loading # print("Testing model load from file...") # GPSI = load_gpsimputer_from_file(f_name="AAA_GPSI_SAVE_TEST.pkl", rng=rng) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(200005): scale = min(1.0, ((i+1) / 5000.0)) if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.92 if (i > 10000): momentum = 0.90 else: momentum = 0.50 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update GPSI.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.98) GPSI.set_train_switch(1.0) GPSI.set_lam_nll(lam_nll=1.0) GPSI.set_lam_kld(lam_kld_p=0.1, lam_kld_q=0.9) GPSI.set_lam_l2w(1e-4) # perform a minibatch update and record the cost for this batch xb = to_fX( Xtr.take(batch_idx, axis=0) ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) result = GPSI.train_joint(xi, xo, xm, batch_reps) # do diagnostics and general training tracking costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10) vfe = np.mean(nll) + np.mean(kld) str1 = " va_nll_bound : {}".format(vfe) str2 = " va_nll_term : {}".format(np.mean(nll)) str3 = " va_kld_q2p : {}".format(np.mean(kld)) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag)) if ((i % 20000) == 0): # Get some validation samples for evaluating model performance xb = to_fX( Xva[0:100] ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 
in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # get visualizations of policy parameters file_name = "{0:s}_gen_gen_weights_b{1:d}.png".format(result_tag, i) W = GPSI.gen_gen_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "{0:s}_gen_inf_weights_b{1:d}.png".format(result_tag, i) W = GPSI.gen_inf_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
def pretrain_osm(lam_kld=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0] Xtr = Xtr.get_value(borrow=False) Xva = datasets[2][0] Xva = Xva.get_value(borrow=False) print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),str(Xva.shape))) # get and set some basic dataset information Xtr_mean = np.mean(Xtr, axis=0) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # setup some symbolic variables and stuff Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} shared_config = [PRIOR_DIM, 1000, 1000] top_config = [shared_config[-1], data_dim] gn_params['shared_config'] = shared_config gn_params['mu_config'] = top_config gn_params['sigma_config'] = top_config gn_params['activation'] = relu_actfun gn_params['init_scale'] = 1.4 gn_params['lam_l2a'] = 0.0 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.0 gn_params['input_noise'] = 0.0 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 1000, 1000] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.4 in_params['lam_l2a'] = 0.0 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.0 in_params['input_noise'] = 0.0 # Initialize the base networks for this OneStageModel IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.2) GN.init_biases(0.2) ######################### # INITIALIZE THE GIPAIR # ######################### osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params) OSM.set_lam_l2w(1e-5) safe_mean = (0.9 * Xtr_mean) + 0.05 safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean)) OSM.set_output_bias(safe_mean_logit) OSM.set_input_bias(-Xtr_mean) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH+"pt_osm_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters obs_costs = np.zeros((batch_size,)) costs = [0. 
for i in range(10)] learn_rate = 0.0005 for i in range(150000): scale = min(1.0, float(i) / 10000.0) if ((i > 1) and ((i % 20000) == 0)): learn_rate = learn_rate * 0.9 # do a minibatch update of the model, and compute some costs tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs OSM.set_sgd_params(lr_1=(scale*learn_rate), mom_1=0.5, mom_2=0.98) OSM.set_lam_nll(1.0) OSM.set_lam_kld(lam_kld_1=(1.0 + (scale*(lam_kld-1.0))), lam_kld_2=0.0) result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 1000) == 0): # record and then reset the cost trackers costs = [(v / 1000.0) for v in costs] str_1 = "-- batch {0:d} --".format(i) str_2 = " joint_cost: {0:.4f}".format(costs[0]) str_3 = " nll_cost : {0:.4f}".format(costs[1]) str_4 = " kld_cost : {0:.4f}".format(costs[2]) str_5 = " reg_cost : {0:.4f}".format(costs[3]) costs = [0.0 for v in costs] # print out some diagnostic information joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() if ((i % 2000) == 0): Xva = row_shuffle(Xva) model_samps = OSM.sample_from_prior(500) file_name = RESULT_PATH+"pt_osm_samples_b{0:d}_XG.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=20) # compute information about free-energy on validation set file_name = RESULT_PATH+"pt_osm_free_energy_b{0:d}.png".format(i) fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) fe_str = " nll_bound : {0:.4f}".format(fe_mean) print(fe_str) out_file.write(fe_str+"\n") utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # compute information about posterior KLds on validation set file_name = RESULT_PATH+"pt_osm_post_klds_b{0:d}.png".format(i) post_klds = OSM.compute_post_klds(Xva[0:2500]) post_dim_klds = np.mean(post_klds, axis=0) utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ file_name) if ((i % 5000) == 0): IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_GN.pkl") return
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) #dataset = 'data/mnist.pkl.gz' #datasets = load_udm(dataset, as_shared=False, zero_mean=False) #Xtr = datasets[0][0] #Xva = datasets[1][0] Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 batch_reps = 1 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] z_dim = 20 h_dim = 50 s_dim = 50 init_scale = 1.0 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') ############### # p_h_given_s # ############### params = {} shared_config = [s_dim, 250, 250] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_h_given_s = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_h_given_s.init_biases(0.2) ################# # p_x_given_s_h # ################# params = {} shared_config = [(s_dim + h_dim), 250, 250] top_config = [shared_config[-1], x_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_x_given_s_h = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_x_given_s_h.init_biases(0.2) ############### # p_s_given_z # ############### params = {} shared_config = [z_dim, 250] top_config = [shared_config[-1], s_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s_given_z = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_s_given_z.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [x_dim, 250, 250] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.2) ################# # q_h_given_x_s # ################# params = {} shared_config = [(x_dim + s_dim), 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 
init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_h_given_x_s = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_h_given_x_s.init_biases(0.2) ############################################################## # Define parameters for the TwoStageModel, and initialize it # ############################################################## print("Building the TwoStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' TSM = TwoStageModel(rng=rng, \ x_in=x_in_sym, x_out=x_out_sym, \ p_s_given_z=p_s_given_z, \ p_h_given_s=p_h_given_s, \ p_x_given_s_h=p_x_given_s_h, \ q_z_given_x=q_z_given_x, \ q_h_given_x_s=q_h_given_x_s, \ x_dim=x_dim, \ z_dim=z_dim, s_dim=s_dim, h_dim=h_dim, \ params=msm_params) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ out_file = open("TSM_A_RESULTS.txt", 'wb') costs = [0. for i in range(10)] learn_rate = 0.0003 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 3000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 if (i > 50000): momentum = 0.90 else: momentum = 0.50 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # train on the training set lam_kld = 1.0 # set sgd and objective function hyperparams for this update TSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.99) TSM.set_train_switch(1.0) TSM.set_lam_nll(lam_nll=1.0) TSM.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.8, lam_kld_p2q=0.2) TSM.set_lam_kld_l1l2(lam_kld_l1l2=scale) TSM.set_lam_l2w(1e-4) TSM.set_drop_rate(0.0) TSM.q_h_given_x_s.set_bias_noise(0.0) TSM.p_h_given_s.set_bias_noise(0.0) TSM.p_x_given_s_h.set_bias_noise(0.0) # perform a minibatch update and record the cost for this batch Xb_tr = to_fX( Xtr.take(batch_idx, axis=0) ) result = TSM.train_joint(Xb_tr, Xb_tr, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) joint_str = "\n".join([str1, str2, str3, str4, str5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): TSM.set_drop_rate(0.0) TSM.q_h_given_x_s.set_bias_noise(0.0) TSM.p_h_given_s.set_bias_noise(0.0) TSM.p_x_given_s_h.set_bias_noise(0.0) # Get some validation samples for computing diagnostics Xva = row_shuffle(Xva) Xb_va = to_fX( Xva[0:2000] ) # draw some independent random samples from the model samp_count = 500 model_samps = TSM.sample_from_prior(samp_count) file_name = "TSM_A_SAMPLES_IND_b{0:d}.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=20) Xb_tr = to_fX( Xtr[0:2000] ) fe_terms = TSM.compute_fe_terms(Xb_tr, Xb_tr, 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-tr: 
{0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() fe_terms = TSM.compute_fe_terms(Xb_va, Xb_va, 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush()
def test_mnist(step_type='add', imp_steps=6, occ_dim=15, drop_prob=0.0): ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}GPSI_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, imp_steps, step_type) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') Xtr = np.vstack((Xtr, Xva)) Xva = Xte #del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] ########################## # Get some training data # ########################## # rng = np.random.RandomState(1234) # dataset = 'data/mnist.pkl.gz' # datasets = load_udm(dataset, as_shared=False, zero_mean=False) # Xtr = datasets[0][0] # Xva = datasets[1][0] # Xte = datasets[2][0] # # Merge validation set and training set, and test on test set. # #Xtr = np.concatenate((Xtr, Xva), axis=0) # #Xva = Xte # Xtr = to_fX(shift_and_scale_into_01(Xtr)) # Xva = to_fX(shift_and_scale_into_01(Xva)) # tr_samples = Xtr.shape[0] # va_samples = Xva.shape[0] batch_size = 200 batch_reps = 1 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) ) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] s_dim = x_dim h_dim = 50 z_dim = 100 init_scale = 0.6 x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ############### # p_h_given_x # ############### params = {} shared_config = [x_dim, 250] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = 'xg' #init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_h_given_x = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_h_given_x.init_biases(0.0) ################ # p_s0_given_h # ################ params = {} shared_config = [h_dim, 250] output_config = [s_dim, s_dim, s_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = 'xg' #init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_given_h = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_s0_given_h.init_biases(0.0) ################# # p_zi_given_xi # ################# params = {} shared_config = [(x_dim + x_dim), 500, 500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.0) ################### # p_sip1_given_zi # ################### params = {} shared_config = [z_dim, 500, 500] output_config = [s_dim, s_dim, s_dim] 
params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_sip1_given_zi.init_biases(0.0) ################ # p_x_given_si # ################ params = {} shared_config = [s_dim] output_config = [x_dim, x_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_x_given_si = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_x_given_si.init_biases(0.0) ############### # q_h_given_x # ############### params = {} shared_config = [x_dim, 250] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = 'xg' #init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_h_given_x = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_h_given_x.init_biases(0.0) ################# # q_zi_given_xi # ################# params = {} shared_config = [(x_dim + x_dim), 500, 500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun #relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_xi.init_biases(0.0) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['x_dim'] = x_dim gpsi_params['h_dim'] = h_dim gpsi_params['z_dim'] = z_dim gpsi_params['s_dim'] = s_dim # switch between direct construction and construction via p_x_given_si gpsi_params['use_p_x_given_si'] = False gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = step_type gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 'sigmoid' GPSI = GPSImputerWI(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \ p_h_given_x=p_h_given_x, \ p_s0_given_h=p_s0_given_h, \ p_zi_given_xi=p_zi_given_xi, \ p_sip1_given_zi=p_sip1_given_zi, \ p_x_given_si=p_x_given_si, \ q_h_given_x=q_h_given_x, \ q_zi_given_xi=q_zi_given_xi, \ params=gpsi_params, \ shared_param_dicts=None) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. 
for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 5000.0)) lam_scale = 1.0 - min(1.0, ((i+1) / 100000.0)) # decays from 1.0->0.0 if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.93 if (i > 10000): momentum = 0.90 else: momentum = 0.75 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update GPSI.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.98) GPSI.set_train_switch(1.0) GPSI.set_lam_nll(lam_nll=1.0) GPSI.set_lam_kld(lam_kld_p=0.05, lam_kld_q=0.95, \ lam_kld_g=(0.1 * lam_scale), lam_kld_s=(0.1 * lam_scale)) GPSI.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch xb = to_fX( Xtr.take(batch_idx, axis=0) ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) result = GPSI.train_joint(xi, xo, xm, batch_reps) # do diagnostics and general training tracking costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10) vfe = np.mean(nll) + np.mean(kld) str1 = " va_nll_bound : {}".format(vfe) str2 = " va_nll_term : {}".format(np.mean(nll)) str3 = " va_kld_q2p : {}".format(np.mean(kld)) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() if ((i % 2000) == 0): GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag)) # Get some validation samples for evaluating model performance xb = to_fX( Xva[0:100] ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # show KLds and NLLs on a step-by-step basis xb = to_fX( Xva[0:1000] ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) step_costs = GPSI.compute_per_step_cost(xi, xo, xm) step_nlls = step_costs[0] step_klds = step_costs[1] step_nums = np.arange(step_nlls.shape[0]) file_name = "{0:s}_NLL_b{1:d}.png".format(result_tag, i) utils.plot_stem(step_nums, step_nlls, file_name) 
file_name = "{0:s}_KLD_b{1:d}.png".format(result_tag, i) utils.plot_stem(step_nums, step_klds, file_name)
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 batch_reps = 1 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 20 h_dim = 200 ir_steps = 6 init_scale = 1.0 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') ################# # p_hi_given_si # ################# params = {} shared_config = [obs_dim, 300, 300] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [h_dim, 300, 300] output_config = [obs_dim, obs_dim, obs_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ################ # p_s0_given_z # ################ params = {} shared_config = [z_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_given_z = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_s0_given_z.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.2) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim + obs_dim), 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 
params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModel(rng=rng, x_in=x_in_sym, x_out=x_out_sym, \ p_s0_given_z=p_s0_given_z, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \ ir_steps=ir_steps, params=msm_params) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ out_file = open("MSM_A_RESULTS.txt", 'wb') costs = [0. for i in range(10)] learn_rate = 0.0003 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 3000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.99) MSM.set_train_switch(1.0) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.8, lam_kld_p2q=0.2) MSM.set_lam_kld_l1l2(lam_kld_l1l2=1.0) MSM.set_lam_l2w(1e-4) MSM.set_drop_rate(0.0) MSM.q_hi_given_x_si.set_bias_noise(0.0) MSM.p_hi_given_si.set_bias_noise(0.0) MSM.p_sip1_given_si_hi.set_bias_noise(0.0) # perform a minibatch update and record the cost for this batch Xb_tr = to_fX( Xtr.take(batch_idx, axis=0) ) result = MSM.train_joint(Xb_tr, Xb_tr, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) joint_str = "\n".join([str1, str2, str3, str4, str5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): MSM.set_drop_rate(0.0) MSM.q_hi_given_x_si.set_bias_noise(0.0) MSM.p_hi_given_si.set_bias_noise(0.0) MSM.p_sip1_given_si_hi.set_bias_noise(0.0) # Get some validation samples for computing diagnostics Xva = row_shuffle(Xva) Xb_va = to_fX( Xva[0:2000] ) # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSM_A_SAMPLES_IND_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # draw some conditional random samples from the model samp_count = 200 Xs = np.vstack((Xb_tr[0:(samp_count/4)], Xb_va[0:(samp_count/4)])) 
Xs = np.repeat(Xs, 2, axis=0) # draw some conditional random samples from the model model_samps = MSM.sample_from_input(Xs, guided_decoding=False) model_samps.append(Xs) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSM_A_SAMPLES_CND_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # compute information about posterior KLds on validation set raw_klds = MSM.compute_raw_klds(Xb_va, Xb_va) init_kld, q2p_kld, p2q_kld = raw_klds file_name = "MSM_A_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(init_kld.shape[1]), \ np.mean(init_kld, axis=0), file_name) file_name = "MSM_A_HI_Q2P_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(q2p_kld.shape[1]), \ np.mean(q2p_kld, axis=0), file_name) file_name = "MSM_A_HI_P2Q_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(p2q_kld.shape[1]), \ np.mean(p2q_kld, axis=0), file_name) Xb_tr = to_fX( Xtr[0:2000] ) fe_terms = MSM.compute_fe_terms(Xb_tr, Xb_tr, 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() fe_terms = MSM.compute_fe_terms(Xb_va, Xb_va, 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush()
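# The MultiStageModel loop above (and most loops in this file) shares the same
# schedule: a linear warm-up factor scale = min(1.0, (i+1)/K) that multiplies
# both the learning rate and the first momentum, plus a multiplicative decay
# applied every fixed number of batches. A closed-form sketch of that schedule
# is below; the names are illustrative, not part of the original code.
def schedule_sketch(i, base_lr, warmup=3000.0, decay_every=10000, decay=0.95):
    scale = min(1.0, (i + 1) / warmup)     # ramps from ~0 up to 1, then stays
    n_decays = (i + 1) // decay_every      # how many decay steps have passed
    lr = base_lr * (decay ** n_decays)     # same effect as lr *= 0.95 each time
    return scale * lr, scale               # effective lr and momentum scale
# e.g. MSM.set_sgd_params(lr_1=lr, lr_2=lr, mom_1=scale*0.9, mom_2=0.99)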
def test_svhn(occ_dim=15, drop_prob=0.0): RESULT_PATH = "IMP_SVHN_VAE/" ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}VAE_OD{}_DP{}".format(RESULT_PATH, occ_dim, dp_int) ########################## # Get some training data # ########################## tr_file = 'data/svhn_train_gray.pkl' te_file = 'data/svhn_test_gray.pkl' ex_file = 'data/svhn_extra_gray.pkl' data = load_svhn_gray(tr_file, te_file, ex_file=ex_file, ex_count=200000) Xtr = to_fX( shift_and_scale_into_01(np.vstack([data['Xtr'], data['Xex']])) ) Xva = to_fX( shift_and_scale_into_01(data['Xte']) ) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 250 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) ) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 100 imp_steps = 15 # we'll check for the best step count (found oracularly) init_scale = 1.0 x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ################# # p_zi_given_xi # ################# params = {} shared_config = [obs_dim, 1000, 1000] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.2) ################### # p_xip1_given_zi # ################### params = {} shared_config = [z_dim, 1000, 1000] output_config = [obs_dim, obs_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_xip1_given_zi.init_biases(0.2) ################### # q_zi_given_x_xi # ################### params = {} shared_config = [(obs_dim + obs_dim), 1000, 1000] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_zi_given_x_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_x_xi.init_biases(0.2) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['obs_dim'] = obs_dim gpsi_params['z_dim'] = z_dim gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = 'jump' gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 'sigmoid' 
gpsi_params['use_osm_mode'] = True GPSI = GPSImputer(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \ p_zi_given_xi=p_zi_given_xi, \ p_xip1_given_zi=p_xip1_given_zi, \ q_zi_given_x_xi=q_zi_given_x_xi, \ params=gpsi_params, \ shared_param_dicts=None) ######################################################################### # Define parameters for the underlying OneStageModel, and initialize it # ######################################################################### print("Building the OneStageModel...") osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' OSM = OneStageModel(rng=rng, \ x_in=x_in_sym, \ p_x_given_z=p_xip1_given_zi, \ q_z_given_x=p_zi_given_xi, \ x_dim=obs_dim, z_dim=z_dim, \ params=osm_params) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(200005): scale = min(1.0, ((i+1) / 5000.0)) if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.92 if (i > 10000): momentum = 0.90 else: momentum = 0.50 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update OSM.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.99) OSM.set_lam_nll(lam_nll=1.0) OSM.set_lam_kld(lam_kld_1=1.0, lam_kld_2=0.0) OSM.set_lam_l2w(1e-4) # perform a minibatch update and record the cost for this batch xb = to_fX( Xtr.take(batch_idx, axis=0) ) result = OSM.train_joint(xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) joint_str = "\n".join([str1, str2, str3, str4, str5]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) step_nll, step_kld = GPSI.compute_per_step_cost(xi, xo, xm, sample_count=10) min_nll = np.min(step_nll) str1 = " va_nll_bound : {}".format(min_nll) str2 = " va_nll_min : {}".format(min_nll) str3 = " va_nll_final : {}".format(step_nll[-1]) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() if ((i % 10000) == 0): # Get some validation samples for evaluating model performance xb = to_fX( Xva[0:100] ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in 
range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{}_samples_ng_b{}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # get visualizations of policy parameters file_name = "{}_gen_gen_weights_b{}.png".format(result_tag, i) W = GPSI.gen_gen_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "{}_gen_inf_weights_b{}.png".format(result_tag, i) W = GPSI.gen_inf_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
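# The training loops above walk a sliding window of batch_size indices through
# the training set and reshuffle the rows whenever the window runs off the end.
# A self-contained sketch of that idiom follows; row_shuffle is assumed to
# apply one random row permutation (to a single array, or jointly to a pair).
import numpy as np

def row_shuffle_sketch(X, Y=None):
    perm = np.random.permutation(X.shape[0])
    if Y is None:
        return X[perm]
    return X[perm], Y[perm]

def minibatches_sketch(Xtr, batch_size):
    batch_idx = np.arange(batch_size) + Xtr.shape[0]   # forces a shuffle first
    while True:
        batch_idx += batch_size
        if np.max(batch_idx) >= Xtr.shape[0]:
            Xtr = row_shuffle_sketch(Xtr)               # finished an "epoch"
            batch_idx = np.arange(batch_size)
        yield Xtr.take(batch_idx, axis=0)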
def pretrain_gip(extra_lam_kld=0.0, kld2_scale=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with tr_file = 'data/svhn_train_gray.pkl' te_file = 'data/svhn_test_gray.pkl' ex_file = 'data/svhn_extra_gray.pkl' data = load_svhn_gray(tr_file, te_file, ex_file=ex_file, ex_count=200000) #all_file = 'data/svhn_all_gray_zca.pkl' #data = load_svhn_all_gray_zca(all_file) Xtr = np.vstack([data['Xtr'], data['Xex']]) Xtr = Xtr - np.mean(Xtr, axis=1, keepdims=True) Xtr = Xtr / np.std(Xtr, axis=1, keepdims=True) Xtr = shift_and_scale_into_01(Xtr) Xtr, Xva = train_valid_split(Xtr, valid_count=5000) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # setup some symbolic variables and stuff Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} gn_config = [PRIOR_DIM, 2400, 2400, data_dim] gn_params['mlp_config'] = gn_config gn_params['activation'] = relu_actfun gn_params['out_type'] = 'gaussian' gn_params['mean_transform'] = 'sigmoid' gn_params['logvar_type'] = 'single_shared' gn_params['init_scale'] = 1.2 gn_params['lam_l2a'] = 1e-2 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 2400, 2400] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.2 in_params['lam_l2a'] = 1e-2 in_params['vis_drop'] = 0.2 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.1 in_params['input_noise'] = 0.0 in_params['kld2_scale'] = kld2_scale # Initialize the base networks for this GIPair IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.1) GN.init_biases(0.1) ###################################### # LOAD AND RESTART FROM SAVED PARAMS # ###################################### # new_in_params = {'kld2_scale': kld2_scale, 'bias_noise': 0.2} # new_gn_params = {'bias_noise': 0.2} # # Load inferencer and generator from saved parameters # gn_fname = "TMS_RESULTS_DROPLESS/pt_params_b50000_GN.pkl" # in_fname = "TMS_RESULTS_DROPLESS/pt_params_b50000_IN.pkl" # IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd, \ # Xc=Xc, Xm=Xm, new_params=new_in_params) # GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp, \ # new_params=new_gn_params) # in_params = IN.params # gn_params = GN.params ######################### # INITIALIZE THE GIPAIR # ######################### GIP = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=PRIOR_DIM, params=None) GIP.set_lam_l2w(1e-4) #################### # RICA PRETRAINING # #################### IN.W_rica.set_value(0.05 * IN.W_rica.get_value(borrow=False)) GN.W_rica.set_value(0.05 * GN.W_rica.get_value(borrow=False)) for i in range(6000): scale = min(1.0, (float(i+1) / 6000.0)) l_rate = 0.0001 * scale lam_l1 = 0.025 tr_idx = npr.randint(low=0,high=tr_samples,size=(1000,)) Xd_batch = Xtr.take(tr_idx, axis=0) inr_out = IN.train_rica(Xd_batch, l_rate, 
lam_l1) gnr_out = GN.train_rica(Xd_batch, l_rate, lam_l1) inr_out = [v for v in gnr_out] if ((i % 1000) == 0): print("rica batch {0:d}: in_recon={1:.4f}, in_spars={2:.4f}, gn_recon={3:.4f}, gn_spars={4:.4f}".format( \ i, 1.*inr_out[1], 1.*inr_out[2], 1.*gnr_out[1], 1.*gnr_out[2])) # draw inference net first layer weights file_name = RESULT_PATH+"pt_rica_inf_weights.png".format(i) utils.visualize_samples(IN.W_rica.get_value(borrow=False).T, file_name, num_rows=20) # draw generator net final layer weights file_name = RESULT_PATH+"pt_rica_gen_weights.png".format(i) if ('gaussian' in gn_params['out_type']): lay_num = -2 else: lay_num = -1 utils.visualize_samples(GN.W_rica.get_value(borrow=False), file_name, num_rows=20) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH+"pt_gip_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters cost_1 = [0. for i in range(10)] learn_rate = 0.0002 for i in range(300000): scale = min(1.0, float(i) / 40000.0) if ((i + 1) % 100000 == 0): learn_rate = learn_rate * 0.8 # do a minibatch update of the model, and compute some costs tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xd_batch = Xtr.take(tr_idx, axis=0) Xd_batch = np.repeat(Xd_batch, batch_reps, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs GIP.set_all_sgd_params(lr_gn=(scale*learn_rate), \ lr_in=(scale*learn_rate), mom_1=0.9, mom_2=0.999) GIP.set_lam_nll(1.0) GIP.set_lam_kld(1.0 + extra_lam_kld*scale) outputs = GIP.train_joint(Xd_batch, Xc_batch, Xm_batch) cost_1 = [(cost_1[k] + 1.*outputs[k]) for k in range(len(outputs))] if ((i % 1000) == 0): cost_1 = [(v / 1000.) for v in cost_1] o_str = "batch: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \ i, cost_1[0], cost_1[1], cost_1[2], cost_1[3]) print(o_str) out_file.write(o_str+"\n") out_file.flush() cost_1 = [0. for v in cost_1] if ((i % 5000) == 0): cost_2 = GIP.compute_costs(Xva, 0.*Xva, 0.*Xva) o_str = "--val: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \ i, 1.*cost_2[0], 1.*cost_2[1], 1.*cost_2[2], 1.*cost_2[3]) print(o_str) out_file.write(o_str+"\n") out_file.flush() if ((i % 5000) == 0): tr_idx = npr.randint(low=0,high=va_samples,size=(100,)) Xd_batch = Xva.take(tr_idx, axis=0) file_name = RESULT_PATH+"pt_gip_chain_samples_b{0:d}.png".format(i) Xd_samps = np.repeat(Xd_batch[0:10,:], 3, axis=0) sample_lists = GIP.sample_from_chain(Xd_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw samples freely from the generative model's prior file_name = RESULT_PATH+"pt_gip_prior_samples_b{0:d}.png".format(i) Xs = GIP.sample_from_prior(20*20) utils.visualize_samples(Xs, file_name, num_rows=20) # draw inference net first layer weights file_name = RESULT_PATH+"pt_gip_inf_weights_b{0:d}.png".format(i) utils.visualize_net_layer(GIP.IN.shared_layers[0], file_name) # draw generator net final layer weights file_name = RESULT_PATH+"pt_gip_gen_weights_b{0:d}.png".format(i) if (gn_params['out_type'] == 'gaussian'): lay_num = -2 else: lay_num = -1 utils.visualize_net_layer(GIP.GN.mlp_layers[lay_num], file_name, \ colorImg=False, use_transpose=True) ######################### # Check posterior KLds. 
# ######################### post_klds = posterior_klds(IN, Xtr, 5000, 5) file_name = RESULT_PATH+"pt_gip_post_klds_b{0:d}.png".format(i) utils.plot_kde_histogram2( \ np.asarray(post_klds), np.asarray(post_klds), file_name, bins=30) if ((i % 10000) == 0): IN.save_to_file(f_name=RESULT_PATH+"pt_gip_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH+"pt_gip_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH+"pt_gip_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH+"pt_gip_params_GN.pkl") return
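# pretrain_gip above standardizes each grayscale SVHN image (per-row mean and
# std) before squashing everything into [0, 1]. The sketch below restates that
# preprocessing; shift_and_scale_into_01 is assumed to be a plain affine
# rescale using the global min and max, which may differ from the real helper.
import numpy as np

def shift_and_scale_into_01_sketch(X):
    X = X - np.min(X)
    X = X / (np.max(X) + 1e-8)
    return X

def preprocess_svhn_gray_sketch(Xtr):
    Xtr = Xtr - np.mean(Xtr, axis=1, keepdims=True)   # zero-mean each image
    Xtr = Xtr / np.std(Xtr, axis=1, keepdims=True)    # unit-variance each image
    return shift_and_scale_into_01_sketch(Xtr)        # squash into [0, 1]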
def test_mnist(step_type='add', \ rev_sched=None): ######################################### # Format the result tag more thoroughly # ######################################### result_tag = "{}AAA_SRRM_ST{}".format(RESULT_PATH, step_type) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') Xtr = np.vstack((Xtr, Xva)) Xva = Xte #del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] s_dim = x_dim #s_dim = 300 z_dim = 100 init_scale = 0.66 x_out_sym = T.matrix('x_out_sym') ################# # p_zi_given_xi # ################# params = {} shared_config = [(x_dim + x_dim), 500, 500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_zi_given_xi = InfNet(rng=rng, Xd=x_out_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.0) ################### # p_sip1_given_zi # ################### params = {} shared_config = [z_dim, 500, 500] output_config = [s_dim, s_dim, s_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = tanh_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_zi = HydraNet(rng=rng, Xd=x_out_sym, \ params=params, shared_param_dicts=None) p_sip1_given_zi.init_biases(0.0) ################ # p_x_given_si # ################ params = {} shared_config = [s_dim, 500] output_config = [x_dim, x_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = tanh_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_x_given_si = HydraNet(rng=rng, Xd=x_out_sym, \ params=params, shared_param_dicts=None) p_x_given_si.init_biases(0.0) ################### # q_zi_given_xi # ################### params = {} shared_config = [(x_dim + x_dim), 500, 500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_zi_given_xi = InfNet(rng=rng, Xd=x_out_sym, \ params=params, shared_param_dicts=None) q_zi_given_xi.init_biases(0.0) ################################################# # Setup a revelation schedule if none was given # ################################################# # if rev_sched is None: # rev_sched = [(10, 1.0)] # rev_masks = None p_masks = np.zeros((16,x_dim)) p_masks[7] = npr.uniform(size=(1,x_dim)) < 0.25 p_masks[-1] = np.ones((1,x_dim)) p_masks = p_masks.astype(theano.config.floatX) q_masks = np.ones(p_masks.shape).astype(theano.config.floatX) rev_masks = 
[p_masks, q_masks] ######################################################### # Define parameters for the SRRModel, and initialize it # ######################################################### print("Building the SRRModel...") srrm_params = {} srrm_params['x_dim'] = x_dim srrm_params['z_dim'] = z_dim srrm_params['s_dim'] = s_dim srrm_params['use_p_x_given_si'] = False srrm_params['rev_sched'] = rev_sched srrm_params['rev_masks'] = rev_masks srrm_params['step_type'] = step_type srrm_params['x_type'] = 'bernoulli' srrm_params['obs_transform'] = 'sigmoid' SRRM = SRRModel(rng=rng, x_out=x_out_sym, \ p_zi_given_xi=p_zi_given_xi, \ p_sip1_given_zi=p_sip1_given_zi, \ p_x_given_si=p_x_given_si, \ q_zi_given_xi=q_zi_given_xi, \ params=srrm_params, \ shared_param_dicts=None) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.00015 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 5000.0)) lam_scale = 1.0 - min(1.0, ((i+1) / 50000.0)) # decays from 1.0->0.0 if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.93 if (i > 10000): momentum = 0.95 else: momentum = 0.80 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update SRRM.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.98) SRRM.set_train_switch(1.0) SRRM.set_lam_kld(lam_kld_p=0.0, lam_kld_q=1.0, \ lam_kld_g=0.0, lam_kld_s=0.0) SRRM.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch xb = to_fX( Xtr.take(batch_idx, axis=0) ) result = SRRM.train_joint(xb) # do diagnostics and general training tracking costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xb = Xva[0:5000] nll, kld = SRRM.compute_fe_terms(xb, sample_count=10) vfe = np.mean(nll) + np.mean(kld) str1 = " va_nll_bound : {}".format(vfe) str2 = " va_nll_term : {}".format(np.mean(nll)) str3 = " va_kld_q2p : {}".format(np.mean(kld)) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # draw some sample imputations from the model xo = Xva[0:100] samp_count = xo.shape[0] xm_seq, xi_seq, mi_seq = SRRM.sequence_sampler(xo, use_guide_policy=True) seq_len = len(xm_seq) seq_samps = np.zeros((seq_len*samp_count, xm_seq[0].shape[1])) ###### # xm # ###### idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = xm_seq[s2,s1,:] idx += 1 file_name = "{0:s}_xm_samples_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, 
file_name, num_rows=20) ###### # xi # ###### idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = xi_seq[s2,s1,:] idx += 1 file_name = "{0:s}_xi_samples_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20) ###### # mi # ###### idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = mi_seq[s2,s1,:] idx += 1 file_name = "{0:s}_mi_samples_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20)
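# The sampling diagnostics above interleave a sequence of per-step sample
# arrays (each of shape (samp_count, x_dim)) so that consecutive rows of the
# saved image show one trajectory across refinement steps. A vectorized
# equivalent of those nested loops, for reference:
import numpy as np

def interleave_step_samples_sketch(step_samples):
    # step_samples: sequence of length seq_len, each entry (samp_count, x_dim)
    stacked = np.stack(step_samples, axis=0)         # (seq_len, samp_count, x_dim)
    swapped = np.swapaxes(stacked, 0, 1)             # (samp_count, seq_len, x_dim)
    return swapped.reshape((-1, stacked.shape[2]))   # sample-major, step-minor rows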
def test_git_on_gip(hyper_params=None, rng_seed=1234): assert(not (hyper_params is None)) # Initialize a source of randomness rng = np.random.RandomState(rng_seed) sup_count = 100 # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False) Xtr_su = datasets[0][0].get_value(borrow=False) Ytr_su = datasets[0][1].get_value(borrow=False).astype(np.int32) Xtr_un = datasets[1][0].get_value(borrow=False) Ytr_un = datasets[1][1].get_value(borrow=False).astype(np.int32) # get the joint labeled and unlabeled data Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX) Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]]) # get the labeled data Xtr_su = Xtr_su.astype(theano.config.floatX) Ytr_su = Ytr_su[:,np.newaxis] # get observations and labels for the validation set Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX) Yva = datasets[2][1].get_value(borrow=False).astype(np.int32) Yva = Yva[:,np.newaxis] # numpy is dumb # get size information for the data un_samples = Xtr_un.shape[0] su_samples = Xtr_su.shape[0] va_samples = Xva.shape[0] # set up some symbolic variables for input/output Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') Yd = T.icol('Yd_base') # set some "shape" parameters for the networks data_dim = Xtr_un.shape[1] label_dim = 10 prior_1_dim = 50 prior_2_dim = 50 prior_sigma = 1.0 batch_size = 100 ################## # SETUP A GIPAIR # ################## gn1_params = {} gn1_config = [prior_1_dim, 600, 600, data_dim] gn1_params['mlp_config'] = gn1_config gn1_params['activation'] = softplus_actfun gn1_params['out_type'] = 'bernoulli' gn1_params['lam_l2a'] = 1e-3 gn1_params['vis_drop'] = 0.0 gn1_params['hid_drop'] = 0.0 gn1_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in1_params = {} shared_config = [data_dim, 600, 600] top_config = [shared_config[-1], prior_1_dim] in1_params['shared_config'] = shared_config in1_params['mu_config'] = top_config in1_params['sigma_config'] = top_config in1_params['activation'] = softplus_actfun in1_params['lam_l2a'] = 1e-3 in1_params['vis_drop'] = 0.0 in1_params['hid_drop'] = 0.0 in1_params['bias_noise'] = 0.1 in1_params['input_noise'] = 0.0 # Initialize the base networks for this GIPair IN1 = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in1_params, shared_param_dicts=None) GN1 = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn1_params, shared_param_dicts=None) # Initialize biases in IN and GN IN1.init_biases(0.0) GN1.init_biases(0.0) # Initialize the GIPair GIP = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN1, i_net=IN1, \ data_dim=data_dim, prior_dim=prior_1_dim, \ params=None, shared_param_dicts=None) # Set cost weighting parameters GIP.set_lam_nll(1.0) GIP.set_lam_kld(1.0) GIP.set_lam_l2w(1e-4) ################## # SETUP A GITRIP # ################## # set parameters for the generator network gn2_params = {} gn2_config = [(prior_2_dim + label_dim), 300, prior_1_dim] gn2_params['mlp_config'] = gn2_config gn2_params['activation'] = softplus_actfun gn2_params['out_type'] = 'gaussian' gn2_params['lam_l2a'] = 1e-3 gn2_params['vis_drop'] = 0.0 gn2_params['hid_drop'] = 0.0 gn2_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in2_params = {} shared_config = [prior_1_dim, 300] top_config = [shared_config[-1], prior_2_dim] in2_params['shared_config'] = shared_config 
in2_params['mu_config'] = top_config in2_params['sigma_config'] = top_config in2_params['activation'] = softplus_actfun in2_params['lam_l2a'] = 1e-3 in2_params['vis_drop'] = 0.0 in2_params['hid_drop'] = 0.0 in2_params['bias_noise'] = 0.1 in2_params['input_noise'] = 0.0 # choose some parameters for the categorical inferencer pn2_params = {} pc0 = [prior_1_dim, 300, label_dim] pn2_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = {'proto_key': 0, 'input_noise': 0.0, 'bias_noise': 0.1, 'do_dropout': False} #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} pn2_params['spawn_configs'] = [sc0] #[sc0, sc1] pn2_params['spawn_weights'] = [1.0] #[0.5, 0.5] # Set remaining params pn2_params['activation'] = softplus_actfun pn2_params['ear_type'] = 6 pn2_params['lam_l2a'] = 1e-3 pn2_params['vis_drop'] = 0.0 pn2_params['hid_drop'] = 0.0 # Initialize the base networks for this GITrip GN2 = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn2_params, shared_param_dicts=None) IN2 = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in2_params, shared_param_dicts=None) PN2 = PeaNet(rng=rng, Xd=Xd, params=pn2_params) # Initialize biases in GN, IN, and PN GN2.init_biases(0.0) IN2.init_biases(0.0) PN2.init_biases(0.0) # Initialize the GITrip GIT = GITrip(rng=rng, \ Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \ g_net=GN2, i_net=IN2, p_net=PN2, \ data_dim=prior_1_dim, prior_dim=prior_2_dim, \ label_dim=label_dim, batch_size=batch_size, \ params=None, shared_param_dicts=None) # Set cost weighting parameters GIT.set_lam_nll(1.0) GIT.set_lam_kld(1.0) GIT.set_lam_cat(0.0) GIT.set_lam_pea(0.0) GIT.set_lam_ent(0.0) GIT.set_lam_l2w(1e-4) ##################################################### # CONSTRUCT A GITonGIP STACKED, SEMI-SUPERVISED VAE # ##################################################### GOG = GITonGIP(rng=rng, \ Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \ gip_vae=GIP, git_vae=GIT, \ data_dim=data_dim, prior_1_dim=prior_1_dim, \ prior_2_dim=prior_2_dim, label_dim=label_dim, \ batch_size=batch_size, \ params=None, shared_param_dicts=None) ################################# # WRITE SOME INFO TO "LOG" FILE # ################################# learn_rate_git = hyper_params['learn_rate_git'] lam_pea_git = hyper_params['lam_pea_git'] lam_cat_git = hyper_params['lam_cat_git'] lam_ent_git = hyper_params['lam_ent_git'] lam_l2w_git = hyper_params['lam_l2w_git'] out_name = hyper_params['out_name'] out_file = open(out_name, 'wb') out_file.write("**TODO: More informative output, and maybe a real log**\n") out_file.write("learn_rate_git: {0:.4f}\n".format(learn_rate_git)) out_file.write("lam_pea_git: {0:.4f}\n".format(lam_pea_git)) out_file.write("lam_cat_git: {0:.4f}\n".format(lam_cat_git)) out_file.write("lam_ent_git: {0:.4f}\n".format(lam_ent_git)) out_file.write("lam_l2w_git: {0:.4f}\n".format(lam_l2w_git)) out_file.flush() ################################################## # TRAIN THE GIPair FOR SOME NUMBER OF ITERATIONS # ################################################## learn_rate = 0.002 for i in range(250000): if ((i+1 % 100000) == 0): learn_rate = learn_rate * 0.8 scale = min(1.0, (float(i+1) / 50000.0)) GIP.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIP.set_lam_nll(lam_nll=1.0) GIP.set_lam_kld(lam_kld=scale) # sample some unlabeled data to train with tr_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) Xd_batch = binarize_data(Xtr_un.take(tr_idx, axis=0)) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of 
the model, and compute some costs outputs = GOG.train_gip(Xd_batch, Xc_batch, Xm_batch) joint_cost = 1.0 * outputs[0] data_nll_cost = 1.0 * outputs[1] post_kld_cost = 1.0 * outputs[2] other_reg_cost = 1.0 * outputs[3] if ((i % 1000) == 0): o_str = "batch: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \ i, joint_cost, data_nll_cost, post_kld_cost, other_reg_cost) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 5000) == 0): file_name = "GOG_GIP_SAMPLES_b{0:d}.png".format(i) Xd_samps = np.repeat(Xd_batch[0:10,:], 3, axis=0) sample_lists = GIP.sample_gil_from_data(Xd_samps, loop_iters=10) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name) ######################################################## # REMOVE (SORT OF) UNUSED DIMENSIONS FROM LATENT SPACE # ######################################################## #tr_idx = npr.randint(low=0,high=un_samples,size=(10000,)) #Xd_batch = binarize_data(Xtr_un.take(tr_idx, axis=0)) #Xp_batch = GIP.IN.mean_posterior(Xd_batch, 0.0*Xd_batch, 0.0*Xd_batch) #Xp_std = np.std(Xp_batch, axis=0, keepdims=True) #dim_mask = 1.0 * (Xp_std > 0.1) #GIT.set_input_mask(dim_mask) #print("MASK NNZ: {0:.4f}".format(np.sum(dim_mask))) ################################################## # TRAIN THE GITrip FOR SOME NUMBER OF ITERATIONS # ################################################## GIT.set_lam_l2w(lam_l2w=lam_l2w_git) learn_rate = learn_rate_git GIT.set_all_sgd_params(learn_rate=learn_rate, momentum=0.98) for i in range(250000): scale = 1.0 if (i < 25000): scale = float(i+1) / 25000.0 if ((i+1 % 50000) == 0): learn_rate = learn_rate * 0.8 # do a minibatch update using unlabeled data if True: # get some data to train with un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) Xd_un = binarize_data(Xtr_un.take(un_idx, axis=0)) Yd_un = Ytr_un.take(un_idx, axis=0) Xc_un = 0.0 * Xd_un Xm_un = 0.0 * Xd_un # do a minibatch update of the model, and compute some costs GIT.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIT.set_lam_nll(1.0) GIT.set_lam_kld(scale * 1.0) GIT.set_lam_cat(0.0) GIT.set_lam_pea(scale * lam_pea_git) GIT.set_lam_ent(scale * lam_ent_git) outputs = GOG.train_git(Xd_un, Xc_un, Xm_un, Yd_un) joint_cost = 1.0 * outputs[0] data_nll_cost = 1.0 * outputs[1] post_kld_cost = 1.0 * outputs[2] post_cat_cost = 1.0 * outputs[3] post_pea_cost = 1.0 * outputs[4] post_ent_cost = 1.0 * outputs[5] other_reg_cost = 1.0 * outputs[6] if True: # get some data to train with su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,)) Xd_su = binarize_data(Xtr_su.take(su_idx, axis=0)) Yd_su = Ytr_su.take(su_idx, axis=0) Xc_su = 0.0 * Xd_su Xm_su = 0.0 * Xd_su # update only based on the label-based classification cost GIT.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIT.set_lam_nll(0.0) GIT.set_lam_kld(0.0) GIT.set_lam_cat(scale * lam_cat_git) GIT.set_lam_pea(scale * lam_pea_git) GIT.set_lam_ent(0.0) outputs = GOG.train_git(Xd_su, Xc_su, Xm_su, Yd_su) joint_2 = 1.0 * outputs[0] data_nll_2 = 1.0 * outputs[1] post_kld_2 = 1.0 * outputs[2] post_cat_cost = 1.0 * outputs[3] post_pea_2 = 1.0 * outputs[4] post_ent_2 = 1.0 * outputs[5] other_reg_cost = 1.0 * outputs[6] if ((i % 500) == 0): o_str = "batch: {0:d}, joint_cost: {1:.4f}, nll: {2:.4f}, kld: {3:.4f}, cat: {4:.4f}, pea: {5:.4f}, ent: {6:.4f}, other_reg: {7:.4f}".format( \ i, joint_cost, data_nll_cost, post_kld_cost, post_cat_cost, post_pea_cost, post_ent_cost, 
other_reg_cost) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 2500) == 0): # check classification error on training and validation set train_err = GOG.classification_error(Xtr_su, Ytr_su) va_err = GOG.classification_error(Xva, Yva) o_str = " tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 5000) == 0): file_name = "GoG_GIT_SAMPLES_b{0:d}.png".format(i) va_idx = npr.randint(low=0,high=va_samples,size=(5,)) Xd_samps = np.vstack([Xd_un[0:5,:], binarize_data(Xva[va_idx,:])]) Xd_samps = np.repeat(Xd_samps, 3, axis=0) sample_lists = GOG.sample_git_from_data(Xd_samps, loop_iters=10) Xs = np.vstack(sample_lists["data samples"]) Ys = GOG.class_probs(Xs) Xs = mnist_prob_embed(Xs, Ys) utils.visualize_samples(Xs, file_name)
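# The GITrip loop above alternates two updates per iteration: an unlabeled
# update driven by the NLL/KLD (plus PEA/ENT) terms, and a labeled update with
# only the categorical and PEA costs switched on. The sketch below condenses
# that control flow, omitting the SGD-parameter setting; the setter names
# mirror GIT and the training call mirrors GOG.train_git as used above.
def semi_supervised_step_sketch(git, gog, Xd_un, Yd_un, Xd_su, Yd_su,
                                scale, lam_cat, lam_pea, lam_ent):
    # unlabeled step: generative terms on, categorical term off
    git.set_lam_nll(1.0)
    git.set_lam_kld(scale * 1.0)
    git.set_lam_cat(0.0)
    git.set_lam_pea(scale * lam_pea)
    git.set_lam_ent(scale * lam_ent)
    un_out = gog.train_git(Xd_un, 0.0 * Xd_un, 0.0 * Xd_un, Yd_un)
    # labeled step: only the label-driven costs are switched on
    git.set_lam_nll(0.0)
    git.set_lam_kld(0.0)
    git.set_lam_cat(scale * lam_cat)
    git.set_lam_pea(scale * lam_pea)
    git.set_lam_ent(0.0)
    su_out = gog.train_git(Xd_su, 0.0 * Xd_su, 0.0 * Xd_su, Yd_su)
    return un_out, su_out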
def test_gi_stack(hyper_params=None, sup_count=600, rng_seed=1234): assert(not (hyper_params is None)) # Initialize a source of randomness rng = np.random.RandomState(rng_seed) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False) Xtr_su = datasets[0][0].get_value(borrow=False) Ytr_su = datasets[0][1].get_value(borrow=False) Xtr_un = datasets[1][0].get_value(borrow=False) Ytr_un = datasets[1][1].get_value(borrow=False) # get the unlabeled data Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX) Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]]).astype(np.int32) Ytr_un = 0 * Ytr_un # get the labeled data Xtr_su = Xtr_su.astype(theano.config.floatX) Ytr_su = Ytr_su[:,np.newaxis].astype(np.int32) # get observations and labels for the validation set Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX) Yva = datasets[2][1].get_value(borrow=False).astype(np.int32) Yva = Yva[:,np.newaxis] # numpy is dumb # get size information for the data un_samples = Xtr_un.shape[0] su_samples = Xtr_su.shape[0] va_samples = Xva.shape[0] # Construct a GenNet and an InfNet, then test constructor for GIPair. # Do basic testing, to make sure classes aren't completely broken. Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') Yd = T.icol('Yd_base') data_dim = Xtr_un.shape[1] label_dim = 10 prior_dim = 50 prior_sigma = 1.0 batch_size = 150 # Choose some parameters for the generator network gn_params = {} gn_config = [prior_dim, 600, 600, data_dim] gn_params['mlp_config'] = gn_config gn_params['activation'] = softplus_actfun gn_params['lam_l2a'] = 1e-3 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 600, 600] top_config = [shared_config[-1], prior_dim] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = softplus_actfun in_params['init_scale'] = 2.0 in_params['lam_l2a'] = 1e-3 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.1 in_params['input_noise'] = 0.1 # choose some parameters for the categorical inferencer pn_params = {} pc0 = [prior_dim, 800, 800, label_dim] pn_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} pn_params['spawn_configs'] = [sc0, sc1] pn_params['spawn_weights'] = [0.5, 0.5] # Set remaining params pn_params['activation'] = relu_actfun pn_params['init_scale'] = 2.0 pn_params['ear_type'] = 6 pn_params['lam_l2a'] = 1e-3 pn_params['vis_drop'] = 0.0 pn_params['hid_drop'] = 0.5 # Initialize the base networks for this GIPair GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) PN = PeaNet(rng=rng, Xd=Xd, params=pn_params) # Initialize biases in GN, IN, and PN GN.init_biases(0.0) IN.init_biases(0.0) PN.init_biases(0.1) # Initialize the GIStack GIS = GIStack(rng=rng, \ Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \ g_net=GN, i_net=IN, p_net=PN, \ data_dim=data_dim, prior_dim=prior_dim, \ label_dim=label_dim, batch_size=batch_size, \ params=None, 
shared_param_dicts=None) # set weighting parameters for the various costs... GIS.set_lam_nll(1.0) GIS.set_lam_kld(1.0) GIS.set_lam_cat(0.0) GIS.set_lam_pea(0.0) GIS.set_lam_ent(0.0) # Set initial learning rate and basic SGD hyper parameters num_updates = hyper_params['num_updates'] learn_rate = hyper_params['learn_rate'] lam_pea = hyper_params['lam_pea'] lam_cat = hyper_params['lam_cat'] lam_ent = hyper_params['lam_ent'] lam_l2w = hyper_params['lam_l2w'] out_name = hyper_params['out_name'] out_file = open(out_name, 'wb') out_file.write("**TODO: More informative output, and maybe a real log**\n") out_file.write("sup_count: {0:d}\n".format(sup_count)) out_file.write("learn_rate: {0:.4f}\n".format(learn_rate)) out_file.write("lam_pea: {0:.4f}\n".format(lam_pea)) out_file.write("lam_cat: {0:.4f}\n".format(lam_cat)) out_file.write("lam_ent: {0:.4f}\n".format(lam_ent)) out_file.write("lam_l2w: {0:.4f}\n".format(lam_l2w)) out_file.flush() GIS.set_lam_l2w(lam_l2w) GIS.set_all_sgd_params(learn_rate=learn_rate, momentum=0.98) for i in range(num_updates): if (i < 100000): # start with some updates only for the VAE (InfNet and GenNet) scale = float(min(i+1, 50000)) / 50000.0 lam_cat = 0.0 lam_pea = 0.0 lam_ent = 0.0 learn_rate_pn = 0.0 else: # move on to updates that include loss from the PeaNet scale = 1.0 lam_cat = hyper_params['lam_cat'] lam_pea = hyper_params['lam_pea'] if i < 150000: lam_ent = float(i - 99999) * hyper_params['lam_ent'] else: lam_ent = hyper_params['lam_ent'] learn_rate_pn = learn_rate if ((i+1 % 100000) == 0): learn_rate = learn_rate * 0.7 # do a minibatch update using unlabeled data if True: # get some data to train with un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) Xd_un = binarize_data(Xtr_un.take(un_idx, axis=0)) Yd_un = Ytr_un.take(un_idx, axis=0) Xc_un = 0.0 * Xd_un Xm_un = 0.0 * Xd_un # do a minibatch update of the model, and compute some costs GIS.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIS.set_pn_sgd_params(learn_rate=(scale*learn_rate_pn), momentum=0.98) GIS.set_lam_nll(1.0) GIS.set_lam_kld(0.01 + (0.99*scale)) GIS.set_lam_cat(0.0) GIS.set_lam_pea(lam_pea) GIS.set_lam_ent(lam_ent) outputs = GIS.train_joint(Xd_un, Xc_un, Xm_un, Yd_un) joint_cost = 1.0 * outputs[0] data_nll_cost = 1.0 * outputs[1] post_kld_cost = 1.0 * outputs[2] post_cat_cost = 1.0 * outputs[3] post_pea_cost = 1.0 * outputs[4] post_ent_cost = 1.0 * outputs[5] other_reg_cost = 1.0 * outputs[6] # do another minibatch update incorporating label information if (i >= 100000): # get some data to train with su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,)) Xd_su = binarize_data(Xtr_su.take(su_idx, axis=0)) Yd_su = Ytr_su.take(su_idx, axis=0) Xc_su = 0.0 * Xd_su Xm_su = 0.0 * Xd_su # update only based on the label-based classification cost GIS.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIS.set_pn_sgd_params(learn_rate=(scale*learn_rate_pn), momentum=0.98) GIS.set_lam_nll(0.0) GIS.set_lam_kld(0.0) GIS.set_lam_cat(lam_cat) GIS.set_lam_pea(lam_pea) GIS.set_lam_ent(0.0) outputs = GIS.train_joint(Xd_su, Xc_su, Xm_su, Yd_su) post_cat_cost = 1.0 * outputs[3] assert(not (np.isnan(joint_cost))) if ((i % 500) == 0): o_str = "batch: {0:d}, joint_cost: {1:.4f}, nll: {2:.4f}, kld: {3:.4f}, cat: {4:.4f}, pea: {5:.4f}, ent: {6:.4f}, other_reg: {7:.4f}".format( \ i, joint_cost, data_nll_cost, post_kld_cost, post_cat_cost, post_pea_cost, post_ent_cost, other_reg_cost) print(o_str) out_file.write("{}\n".format(o_str)) if ((i % 1000) == 0): # check 
classification error on training and validation set train_err = GIS.classification_error(Xtr_su, Ytr_su) va_err = GIS.classification_error(Xva, Yva) o_str = " tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 5000) == 0): file_name = "GIS_SAMPLES_b{0:d}.png".format(i) va_idx = npr.randint(low=0,high=va_samples,size=(5,)) Xd_samps = np.vstack([Xd_un[0:5,:], binarize_data(Xva[va_idx,:])]) Xd_samps = np.repeat(Xd_samps, 3, axis=0) sample_lists = GIS.sample_gis_from_data(Xd_samps, loop_iters=10) Xs = np.vstack(sample_lists["data samples"]) Ys = GIS.class_probs(Xs) Xs = mnist_prob_embed(Xs, Ys) utils.visualize_samples(Xs, file_name) print("TESTING COMPLETE!") out_file.close() return
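# test_gi_stack runs its first 100000 updates with only the VAE costs active,
# then enables the PeaNet costs and ramps lam_ent in. The helper below sketches
# the per-iteration weights; the ramp is normalized by 50000.0 so it reaches
# the full hyper_params value at iteration 150000, which is an assumption about
# the intended schedule rather than a copy of the expression used above.
def gi_stack_weights_sketch(i, hp, learn_rate):
    if i < 100000:
        scale = float(min(i + 1, 50000)) / 50000.0
        return {'scale': scale, 'lam_cat': 0.0, 'lam_pea': 0.0,
                'lam_ent': 0.0, 'lr_pn': 0.0}
    ramp = min(1.0, float(i - 99999) / 50000.0)
    return {'scale': 1.0, 'lam_cat': hp['lam_cat'], 'lam_pea': hp['lam_pea'],
            'lam_ent': ramp * hp['lam_ent'], 'lr_pn': learn_rate}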
def pretrain_osm(lam_kld=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0] Xtr = Xtr.get_value(borrow=False) Xva = datasets[2][0] Xva = Xva.get_value(borrow=False) print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape))) # get and set some basic dataset information Xtr_mean = np.mean(Xtr, axis=0) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # setup some symbolic variables and stuff Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} shared_config = [PRIOR_DIM, 1000, 1000] top_config = [shared_config[-1], data_dim] gn_params['shared_config'] = shared_config gn_params['mu_config'] = top_config gn_params['sigma_config'] = top_config gn_params['activation'] = relu_actfun gn_params['init_scale'] = 1.4 gn_params['lam_l2a'] = 0.0 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.0 gn_params['input_noise'] = 0.0 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 1000, 1000] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.4 in_params['lam_l2a'] = 0.0 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.0 in_params['input_noise'] = 0.0 # Initialize the base networks for this OneStageModel IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.2) GN.init_biases(0.2) ######################### # INITIALIZE THE GIPAIR # ######################### osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params) OSM.set_lam_l2w(1e-5) safe_mean = (0.9 * Xtr_mean) + 0.05 safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean)) OSM.set_output_bias(safe_mean_logit) OSM.set_input_bias(-Xtr_mean) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH + "pt_osm_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters obs_costs = np.zeros((batch_size, )) costs = [0. 
for i in range(10)] learn_rate = 0.0005 for i in range(150000): scale = min(1.0, float(i) / 10000.0) if ((i > 1) and ((i % 20000) == 0)): learn_rate = learn_rate * 0.9 # do a minibatch update of the model, and compute some costs tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, )) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs OSM.set_sgd_params(lr_1=(scale * learn_rate), mom_1=0.5, mom_2=0.98) OSM.set_lam_nll(1.0) OSM.set_lam_kld(lam_kld_1=(1.0 + (scale * (lam_kld - 1.0))), lam_kld_2=0.0) result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 1000) == 0): # record and then reset the cost trackers costs = [(v / 1000.0) for v in costs] str_1 = "-- batch {0:d} --".format(i) str_2 = " joint_cost: {0:.4f}".format(costs[0]) str_3 = " nll_cost : {0:.4f}".format(costs[1]) str_4 = " kld_cost : {0:.4f}".format(costs[2]) str_5 = " reg_cost : {0:.4f}".format(costs[3]) costs = [0.0 for v in costs] # print out some diagnostic information joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() if ((i % 2000) == 0): Xva = row_shuffle(Xva) model_samps = OSM.sample_from_prior(500) file_name = RESULT_PATH + "pt_osm_samples_b{0:d}_XG.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=20) # compute information about free-energy on validation set file_name = RESULT_PATH + "pt_osm_free_energy_b{0:d}.png".format(i) fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) fe_str = " nll_bound : {0:.4f}".format(fe_mean) print(fe_str) out_file.write(fe_str + "\n") utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # compute information about posterior KLds on validation set file_name = RESULT_PATH + "pt_osm_post_klds_b{0:d}.png".format(i) post_klds = OSM.compute_post_klds(Xva[0:2500]) post_dim_klds = np.mean(post_klds, axis=0) utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ file_name) if ((i % 5000) == 0): IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_GN.pkl") return
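# pretrain_osm initializes the decoder's output bias from the training data:
# the per-pixel mean is pulled slightly away from {0, 1} and mapped through a
# logit, so the sigmoid-transformed output starts near the empirical pixel
# means before any training. A short restatement of that initialization:
import numpy as np

def safe_mean_logit_sketch(Xtr):
    Xtr_mean = np.mean(Xtr, axis=0)
    safe_mean = (0.9 * Xtr_mean) + 0.05           # keep values in (0.05, 0.95)
    return np.log(safe_mean / (1.0 - safe_mean))  # logit, inverse of sigmoid

# usage: OSM.set_output_bias(safe_mean_logit_sketch(Xtr))
#        OSM.set_input_bias(-np.mean(Xtr, axis=0))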
def test_svhn(step_type='add', occ_dim=15, drop_prob=0.0): ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}GPSI_OD{}_DP{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, step_type) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) tr_file = 'data/svhn_train_gray.pkl' te_file = 'data/svhn_test_gray.pkl' ex_file = 'data/svhn_extra_gray.pkl' data = load_svhn_gray(tr_file, te_file, ex_file=ex_file, ex_count=200000) Xtr = to_fX(shift_and_scale_into_01(np.vstack([data['Xtr'], data['Xex']]))) Xva = to_fX(shift_and_scale_into_01(data['Xte'])) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 250 batch_reps = 1 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1], ))) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] z_dim = 200 imp_steps = 6 init_scale = 1.0 x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ################# # p_zi_given_xi # ################# params = {} shared_config = [x_dim, 1500, 1500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.2) ################### # p_xip1_given_zi # ################### params = {} shared_config = [z_dim, 1500, 1500] output_config = [x_dim, x_dim] params['shared_config'] = shared_config params['output_config'] = output_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_xip1_given_zi.init_biases(0.2) ################### # q_zi_given_xi # ################### params = {} shared_config = [(x_dim + x_dim), 1500, 1500] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_xi.init_biases(0.2) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['x_dim'] = x_dim gpsi_params['z_dim'] = z_dim gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = step_type gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 'sigmoid' GPSI = GPSImputer(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \ p_zi_given_xi=p_zi_given_xi, \ 
p_xip1_given_zi=p_xip1_given_zi, \ q_zi_given_xi=q_zi_given_xi, \ params=gpsi_params, \ shared_param_dicts=None) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(200005): scale = min(1.0, ((i + 1) / 5000.0)) if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.92 if (i > 10000): momentum = 0.90 else: momentum = 0.50 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update GPSI.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.98) GPSI.set_train_switch(1.0) GPSI.set_lam_nll(lam_nll=1.0) GPSI.set_lam_kld(lam_kld_p=0.1, lam_kld_q=0.9) GPSI.set_lam_l2w(1e-4) # perform a minibatch update and record the cost for this batch xb = to_fX(Xtr.take(batch_idx, axis=0)) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) result = GPSI.train_joint(xi, xo, xm, batch_reps) # do diagnostics and general training tracking costs = [(costs[j] + result[j]) for j in range(len(result) - 1)] if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10) vfe = np.mean(nll) + np.mean(kld) str1 = " va_nll_bound : {}".format(vfe) str2 = " va_nll_term : {}".format(np.mean(nll)) str3 = " va_kld_q2p : {}".format(np.mean(kld)) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag)) if ((i % 20000) == 0): # Get some validation samples for evaluating model performance xb = to_fX(Xva[0:100]) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros( (seq_len * samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20)
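# The GPSImputer tests build (xi, xo, xm) triples with construct_masked_data.
# The sketch below is inferred from the argument names only and may differ
# from the real helper: xo is the clean target, xm marks visible entries
# (1 = visible, 0 = missing), and xi is the input with missing entries filled
# from data_mean. Missingness comes from i.i.d. dropout with probability
# drop_prob and/or a random square occlusion of side occ_dim on the image grid.
import numpy as np

def construct_masked_data_sketch(xb, drop_prob=0.0, occ_dim=0, data_mean=None,
                                 im_side=32):
    xo = xb.copy()
    xm = np.ones(xb.shape, dtype=xb.dtype)
    if drop_prob > 0.0:
        xm *= (np.random.rand(*xb.shape) > drop_prob)
    if occ_dim > 0:
        for r in range(xb.shape[0]):
            mask = xm[r].reshape((im_side, im_side))
            top = np.random.randint(0, im_side - occ_dim + 1)
            left = np.random.randint(0, im_side - occ_dim + 1)
            mask[top:top + occ_dim, left:left + occ_dim] = 0.0
    xi = (xm * xo) + ((1.0 - xm) * data_mean)
    return xi, xo, xm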
def pretrain_osm(lam_kld=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with data_file = 'data/tfd_data_48x48.pkl' dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all') Xtr_unlabeled = dataset[0] dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all') Xtr_train = dataset[0] Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all') Xva = dataset[0] tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 400 batch_reps = 6 carry_frac = 0.25 carry_size = int(batch_size * carry_frac) reset_prob = 0.04 # setup some symbolic variables and stuff Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 Xtr_mean = np.mean(Xtr, axis=0) ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} shared_config = [PRIOR_DIM, 1500, 1500] top_config = [shared_config[-1], data_dim] gn_params['shared_config'] = shared_config gn_params['mu_config'] = top_config gn_params['sigma_config'] = top_config gn_params['activation'] = relu_actfun gn_params['init_scale'] = 1.4 gn_params['lam_l2a'] = 0.0 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.0 gn_params['input_noise'] = 0.0 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 1500, 1500] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.4 in_params['lam_l2a'] = 0.0 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.0 in_params['input_noise'] = 0.0 # Initialize the base networks for this OneStageModel IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.2) GN.init_biases(0.2) ###################################### # LOAD AND RESTART FROM SAVED PARAMS # ###################################### # gn_fname = RESULT_PATH+"pt_osm_params_b110000_GN.pkl" # in_fname = RESULT_PATH+"pt_osm_params_b110000_IN.pkl" # IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd, \ # new_params=None) # GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd, \ # new_params=None) # in_params = IN.params # gn_params = GN.params ######################### # INITIALIZE THE GIPAIR # ######################### osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params) OSM.set_lam_l2w(1e-5) safe_mean = (0.9 * Xtr_mean) + 0.05 safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean)) OSM.set_output_bias(safe_mean_logit) OSM.set_input_bias(-Xtr_mean) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH + "pt_osm_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters obs_costs = np.zeros((batch_size, )) costs = [0. 
for i in range(10)] learn_rate = 0.002 for i in range(200000): scale = min(1.0, float(i) / 5000.0) if ((i > 1) and ((i % 20000) == 0)): learn_rate = learn_rate * 0.8 if (i < 50000): momentum = 0.5 elif (i < 10000): momentum = 0.7 else: momentum = 0.9 if ((i == 0) or (npr.rand() < reset_prob)): # sample a fully random batch batch_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, )) else: # sample a partially random batch, which retains some portion of # the worst scoring examples from the previous batch fresh_idx = npr.randint(low=0, high=tr_samples, size=(batch_size - carry_size, )) batch_idx = np.concatenate((fresh_idx.ravel(), carry_idx.ravel())) # do a minibatch update of the model, and compute some costs tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, )) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs OSM.set_sgd_params(lr_1=(scale*learn_rate), \ mom_1=(scale*momentum), mom_2=0.98) OSM.set_lam_nll(1.0) OSM.set_lam_kld(lam_kld_1=scale * lam_kld, lam_kld_2=0.0, lam_kld_c=50.0) result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps) batch_costs = result[4] + result[5] obs_costs = collect_obs_costs(batch_costs, batch_reps) carry_idx = batch_idx[np.argsort(-obs_costs)[0:carry_size]] costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 1000) == 0): # record and then reset the cost trackers costs = [(v / 1000.0) for v in costs] str_1 = "-- batch {0:d} --".format(i) str_2 = " joint_cost: {0:.4f}".format(costs[0]) str_3 = " nll_cost : {0:.4f}".format(costs[1]) str_4 = " kld_cost : {0:.4f}".format(costs[2]) str_5 = " reg_cost : {0:.4f}".format(costs[3]) costs = [0.0 for v in costs] # print out some diagnostic information joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() if ((i % 2000) == 0): Xva = row_shuffle(Xva) model_samps = OSM.sample_from_prior(500) file_name = RESULT_PATH + "pt_osm_samples_b{0:d}_XG.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=20) file_name = RESULT_PATH + "pt_osm_inf_weights_b{0:d}.png".format(i) utils.visualize_samples(OSM.inf_weights.get_value(borrow=False).T, \ file_name, num_rows=30) file_name = RESULT_PATH + "pt_osm_gen_weights_b{0:d}.png".format(i) utils.visualize_samples(OSM.gen_weights.get_value(borrow=False), \ file_name, num_rows=30) # compute information about free-energy on validation set file_name = RESULT_PATH + "pt_osm_free_energy_b{0:d}.png".format(i) fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) fe_str = " nll_bound : {0:.4f}".format(fe_mean) print(fe_str) out_file.write(fe_str + "\n") utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # compute information about posterior KLds on validation set file_name = RESULT_PATH + "pt_osm_post_klds_b{0:d}.png".format(i) post_klds = OSM.compute_post_klds(Xva[0:2500]) post_dim_klds = np.mean(post_klds, axis=0) utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ file_name) if ((i % 5000) == 0): IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_GN.pkl") return
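# pretrain_osm() mixes fresh and "carried" examples when forming minibatches:
# with probability reset_prob a fully random batch is drawn, otherwise the
# carry_size worst-scoring examples from the previous batch are kept and the
# remainder is refreshed at random. A minimal sketch of that sampling scheme;
# the *_sketch names are introduced here and only NumPy is assumed.
import numpy as np
import numpy.random as npr

def next_batch_idx_sketch(tr_samples, batch_size, carry_size,
                          carry_idx, reset_prob):
    if (carry_idx is None) or (npr.rand() < reset_prob):
        # fully random batch
        return npr.randint(low=0, high=tr_samples, size=(batch_size,))
    # partially random batch that retains the carried examples
    fresh_idx = npr.randint(low=0, high=tr_samples,
                            size=(batch_size - carry_size,))
    return np.concatenate((fresh_idx.ravel(), carry_idx.ravel()))

def worst_k_idx_sketch(batch_idx, obs_costs, carry_size):
    # indices of the carry_size highest-cost observations in the batch
    return batch_idx[np.argsort(-obs_costs)[0:carry_size]]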
def test_two_stage_model1(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') Xtr = np.vstack((Xtr, Xva)) Xva = Xte #del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 2500 batch_reps = 1 ############################################### # Setup some parameters for the TwoStageModel # ############################################### x_dim = Xtr.shape[1] z_dim = 50 h_dim = 100 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from xin_sym = T.matrix('xin_sym') xout_sym = T.matrix('xout_sym') ############### # p_h_given_z # ############### params = {} shared_config = [z_dim, 100, 100] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun params['init_scale'] = 'xg' params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_h_given_z = InfNet(rng=rng, Xd=xin_sym, \ params=params, shared_param_dicts=None) p_h_given_z.init_biases(0.0) ############### # p_x_given_h # ############### params = {} shared_config = [h_dim, 200, 200] top_config = [shared_config[-1], x_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun params['init_scale'] = 'xg' params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_x_given_h = InfNet(rng=rng, Xd=xin_sym, \ params=params, shared_param_dicts=None) p_x_given_h.init_biases(0.0) ############### # q_z_given_x # ############### params = {} shared_config = [x_dim, 200, 200] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun params['init_scale'] = 'xg' params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=xin_sym, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.0) ################# # q_h_given_z_x # ################# params = {} shared_config = [(2*h_dim + x_dim), 200, 200] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = tanh_actfun params['init_scale'] = 'xg' params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_h_given_z_x = InfNet(rng=rng, Xd=xin_sym, \ params=params, shared_param_dicts=None) q_h_given_z_x.init_biases(0.0) ############################################################## # Define parameters for the TwoStageModel, and initialize it # ############################################################## print("Building the TwoStageModel...") tsm_params = {} tsm_params['x_type'] = x_type tsm_params['obs_transform'] = 'sigmoid' TSM = TwoStageModel1(rng=rng, x_in=xin_sym, x_out=xout_sym, \ x_dim=x_dim, z_dim=z_dim, h_dim=h_dim, \ q_z_given_x=q_z_given_x, \ q_h_given_z_x=q_h_given_z_x, \ p_h_given_z=p_h_given_z, \ p_x_given_h=p_x_given_h, \ 
params=tsm_params) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format("TSM1_TEST") out_file = open(log_name, 'wb') costs = [0. for i in range(10)] learn_rate = 0.00015 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(500000): scale = min(0.5, ((i+1) / 10000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) Xb = to_fX( Xtr.take(batch_idx, axis=0) ) #Xb = binarize_data(Xtr.take(batch_idx, axis=0)) # set sgd and objective function hyperparams for this update TSM.set_sgd_params(lr=scale*learn_rate, \ mom_1=(scale*momentum), mom_2=0.98) TSM.set_train_switch(1.0) TSM.set_lam_nll(lam_nll=1.0) TSM.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0) TSM.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch result = TSM.train_joint(Xb, Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_cost : {0:.4f}".format(costs[1]) str4 = " kld_cost : {0:.4f}".format(costs[2]) str5 = " reg_cost : {0:.4f}".format(costs[3]) str6 = " nll : {0:.4f}".format(np.mean(costs[4])) str7 = " kld_z : {0:.4f}".format(np.mean(costs[5])) str8 = " kld_h : {0:.4f}".format(np.mean(costs[6])) joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7, str8]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 5000) == 0) or ((i < 10000) and ((i % 1000) == 0))): # draw some independent random samples from the model samp_count = 300 model_samps = TSM.sample_from_prior(samp_count) file_name = "TSM1_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=15) # compute free energy estimate for validation samples Xva = row_shuffle(Xva) fe_terms = TSM.compute_fe_terms(Xva[0:5000], Xva[0:5000], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) out_str = " nll_bound : {0:.4f}".format(fe_mean) print(out_str) out_file.write(out_str+"\n") out_file.flush() return
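# The diagnostics above report a multi-sample free-energy bound by averaging
# the per-example NLL and KLd terms returned by compute_fe_terms(). A minimal
# sketch of that summary step, assuming fe_terms is a (nll_per_example,
# kld_per_example) pair of 1-d arrays; summarize_free_energy_sketch is a name
# introduced here for illustration.
import numpy as np

def summarize_free_energy_sketch(fe_terms):
    nll, kld = fe_terms
    return {'nll_bound': np.mean(nll) + np.mean(kld),   # mean variational bound
            'nll_term': np.mean(nll),
            'kld_term': np.mean(kld),
            'nll_range': (np.min(nll), np.max(nll)),
            'kld_range': (np.min(kld), np.max(kld))}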
# choose some parameters for the continuous inferencer
in_params = {}
shared_config = [data_dim, (300, 4), (300, 4)]
top_config = [shared_config[-1], (150, 4), prior_dim]
in_params['shared_config'] = shared_config
in_params['mu_config'] = top_config
in_params['sigma_config'] = top_config
in_params['activation'] = relu_actfun
in_params['init_scale'] = 2.0
in_params['lam_l2a'] = 1e-2
in_params['vis_drop'] = 0.0
in_params['hid_drop'] = 0.0
in_params['bias_noise'] = 0.1
in_params['out_noise'] = 0.1
in_params['input_noise'] = 0.0
IN = InfNet(rng=rng, Xd=Xd_sym, Xc=Xc_sym, Xm=Xm_sym, \
        prior_sigma=prior_sigma, params=in_params)
IN.init_biases(0.0)
########################################################################
# Initialize the joint controller for the generator/discriminator pair #
########################################################################
vcg_params = {}
vcg_params['lam_l2d'] = 1e-2
vcg_params['mom_mix_rate'] = 0.05
vcg_params['mom_match_weight'] = 0.05
vcg_params['mom_match_proj'] = P
vcg_params['target_mean'] = target_mean
vcg_params['target_cov'] = target_cov
batch_idx = T.lvector('batch_idx')
batch_sample = theano.function(inputs=[ batch_idx ], \
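# The truncated call above appears to compile a Theano function that maps a
# vector of row indices to the corresponding minibatch of a shared dataset.
# A minimal self-contained sketch of that pattern; Xtr_shared and its shape
# are assumptions introduced here for illustration.
import numpy as np
import theano
import theano.tensor as T

Xtr_shared = theano.shared(
        np.random.rand(1000, 784).astype(theano.config.floatX))
batch_idx_sketch = T.lvector('batch_idx_sketch')
batch_sample_sketch = theano.function(inputs=[batch_idx_sketch],
                                      outputs=Xtr_shared[batch_idx_sketch])
# e.g. batch_sample_sketch(np.arange(100)) returns rows 0..99 as one minibatch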
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr_shared = datasets[0][0] Xva_shared = datasets[1][0] Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX) Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX) tr_samples = Xtr.shape[0] batch_size = 500 batch_reps = 1 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_rnn_dim = 25 z_obs_dim = 5 jnt_dim = obs_dim + z_rnn_dim h_dim = 100 x_type = 'bernoulli' prior_sigma = 1.0 # some InfNet instances to build the TwoStageModel from X_sym = T.matrix('X_sym') ######################## # p_s0_obs_given_z_obs # ######################## params = {} shared_config = [z_obs_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 1e-3 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_obs_given_z_obs = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) p_s0_obs_given_z_obs.init_biases(0.2) ################# # p_hi_given_si # ################# params = {} shared_config = [jnt_dim, 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [(h_dim + z_rnn_dim), 500, 500] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], (z_rnn_dim + z_obs_dim)] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.2) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim + 
jnt_dim), 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModel(rng=rng, x_in=X_sym, \ p_s0_obs_given_z_obs=p_s0_obs_given_z_obs, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ obs_dim=obs_dim, z_rnn_dim=z_rnn_dim, z_obs_dim=z_obs_dim, \ h_dim=h_dim, model_init_obs=False, model_init_rnn=True, \ ir_steps=3, params=msm_params) obs_mean = (0.9 * np.mean(Xtr, axis=0)) + 0.05 obs_mean_logit = np.log(obs_mean / (1.0 - obs_mean)) MSM.set_input_bias(-obs_mean) MSM.set_obs_bias(0.1*obs_mean_logit) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ costs = [0. for i in range(10)] learn_rate = 0.003 momentum = 0.5 for i in range(300000): scale = min(1.0, ((i+1) / 5000.0)) l1l2_weight = 1.0 #min(1.0, ((i+1) / 2500.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.92 if (i > 100000): momentum = 0.80 if (i > 50000): momentum = 0.65 else: momentum = 0.50 # randomly sample a minibatch tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xb = binarize_data(Xtr.take(tr_idx, axis=0)) Xb = Xb.astype(theano.config.floatX) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=(scale*momentum), mom_2=0.99) MSM.set_train_switch(1.0) MSM.set_l1l2_weight(l1l2_weight) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0) MSM.set_lam_l2w(1e-5) MSM.set_kzg_weight(0.01) # perform a minibatch update and record the cost for this batch result = MSM.train_joint(Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] print("-- batch {0:d} --".format(i)) print(" joint_cost: {0:.4f}".format(costs[0])) print(" nll_cost : {0:.4f}".format(costs[1])) print(" kld_cost : {0:.4f}".format(costs[2])) print(" reg_cost : {0:.4f}".format(costs[3])) costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): Xva = row_shuffle(Xva) # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MZ_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # visualize some important weights in the model file_name = "MZ_INF_1_WEIGHTS_b{0:d}.png".format(i) W = 
MSM.inf_1_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MZ_INF_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.inf_2_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MZ_GEN_1_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_1_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MZ_GEN_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_2_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MZ_GEN_INF_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_inf_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) # compute information about posterior KLds on validation set post_klds = MSM.compute_post_klds(Xva[0:5000]) file_name = "MZ_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[0].shape[1]), \ np.mean(post_klds[0], axis=0), file_name) file_name = "MZ_HI_COND_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[1].shape[1]), \ np.mean(post_klds[1], axis=0), file_name) file_name = "MZ_HI_GLOB_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[2].shape[1]), \ np.mean(post_klds[2], axis=0), file_name) # compute information about free-energy on validation set file_name = "MZ_FREE_ENERGY_b{0:d}.png".format(i) fe_terms = MSM.compute_fe_terms(binarize_data(Xva[0:5000]), 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) print(" nll_bound : {0:.4f}".format(fe_mean)) utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') return
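# The visualization blocks above repeatedly interleave a list of per-step
# sample arrays (one (samp_count, obs_dim) array per refinement step) so that
# each example's full trajectory occupies consecutive rows of the image grid.
# A minimal sketch of that reshuffling as a reusable helper; the name
# interleave_step_samples_sketch is introduced here for illustration.
import numpy as np

def interleave_step_samples_sketch(model_samps):
    seq_len = len(model_samps)
    samp_count, obs_dim = model_samps[0].shape
    seq_samps = np.zeros((seq_len * samp_count, obs_dim))
    idx = 0
    for s1 in range(samp_count):      # one example at a time...
        for s2 in range(seq_len):     # ...across all refinement steps
            seq_samps[idx] = model_samps[s2][s1]
            idx += 1
    return seq_samps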
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = to_fX(datasets[0][0]) Xva = to_fX(datasets[1][0]) Ytr = datasets[0][1] Yva = datasets[1][1] Xtr_class_groups = make_class_groups(Xtr, Ytr) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 300 BD = lambda ary: binarize_data(ary) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 32 h_dim = 100 ir_steps = 2 init_scale = 1.0 x_type = 'bernoulli' # some InfNet instances to build the TwoStageModel from x_in = T.matrix('x_in') x_pos = T.matrix('x_pos') y_in = T.lvector('y_in') ################# # p_hi_given_si # ################# params = {} shared_config = [obs_dim, 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [(h_dim + obs_dim), 500, 500] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ################ # p_s0_given_z # ################ params = {} shared_config = [z_dim, 500, 500] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_given_z = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) p_s0_given_z.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, (500, 4), (500, 4)] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.2 params['hid_drop'] = 0.5 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.0) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim + obs_dim), 800, 800] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = 
top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModelSS(rng=rng, \ x_in=x_in, x_pos=x_pos, y_in=y_in, \ p_s0_given_z=p_s0_given_z, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ class_count=10, \ obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \ ir_steps=ir_steps, params=msm_params) MSM.set_lam_class(lam_class=20.0) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.9, \ lam_kld_p2q=0.1) MSM.set_lam_l2w(1e-4) MSM.set_drop_rate(0.0) MSM.q_hi_given_x_si.set_bias_noise(0.0) MSM.p_hi_given_si.set_bias_noise(0.0) MSM.p_sip1_given_si_hi.set_bias_noise(0.0) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ out_file = open("MSS_A_RESULTS.txt", 'wb') costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 2000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 if (i > 20000): momentum = 0.90 else: momentum = 0.50 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr, Ytr = row_shuffle(Xtr, Ytr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.99) MSM.set_train_switch(1.0) # perform a minibatch update and record the cost for this batch Xi_tr = Xtr.take(batch_idx, axis=0) Yi_tr = Ytr.take(batch_idx, axis=0) Xp_tr, Xn_tr = sample_class_groups(Yi_tr, Xtr_class_groups) result = MSM.train_joint(BD(Xi_tr), BD(Xp_tr), Yi_tr) costs = [(costs[j] + result[j]) for j in range(len(result)-1)] # output useful information about training progress if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost : {0:.4f}".format(costs[0]) str3 = " class_cost : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): # Get some validation samples for computing diagnostics Xva, Yva = row_shuffle(Xva, Yva) Xb_va = Xva[0:2500] Yb_va = Yva[0:2500] # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, 
model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSS_A_SAMPLES_IND_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # draw some conditional random samples from the model Xs = Xb_va[0:50] # only use validation set samples Xs = np.repeat(Xs, 4, axis=0) samp_count = Xs.shape[0] utils.visualize_samples(seq_samps, file_name, num_rows=20) # draw some conditional random samples from the model model_samps = MSM.sample_from_input(BD(Xs), guided_decoding=False) model_samps.append(Xs) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSS_A_SAMPLES_CND_UD_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # compute information about posterior KLds on validation set raw_costs = MSM.compute_raw_costs(BD(Xb_va), BD(Xb_va)) init_nll, init_kld, q2p_kld, p2q_kld, step_nll, step_kld = raw_costs file_name = "MSS_A_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(init_kld.shape[1]), \ np.mean(init_kld, axis=0), file_name) file_name = "MSS_A_HI_Q2P_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(q2p_kld.shape[1]), \ np.mean(q2p_kld, axis=0), file_name) file_name = "MSS_A_HI_P2Q_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(p2q_kld.shape[1]), \ np.mean(p2q_kld, axis=0), file_name) # draw weights for the initial encoder/classifier file_name = "MSS_A_QZX_WEIGHTS_b{0:d}.png".format(i) W = q_z_given_x.shared_layers[0].W.get_value(borrow=False).T utils.visualize_samples(W, file_name, num_rows=20) # compute free-energy terms on training samples fe_terms = MSM.compute_fe_terms(BD(Xtr[0:2500]), BD(Xtr[0:2500]), 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # compute free-energy terms on validation samples fe_terms = MSM.compute_fe_terms(BD(Xb_va), BD(Xb_va), 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # compute multi-sample estimate of classification error err_rate, err_idx, y_preds = MSM.class_error(Xb_va, Yb_va, \ samples=30, prep_func=BD) joint_str = " va-class-error: {0:.4f}".format(err_rate) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # draw some conditional random samples from the model Xs = Xb_va[err_idx] # use validation samples with class errors if (Xs.shape[0] > 50): Xs = Xs[:50] Xs = np.repeat(Xs, 4, axis=0) if ((Xs.shape[0] % 20) != 0): # round-off the number of error examples, for nice display remainder = Xs.shape[0] % 20 Xs = Xs[:-remainder] samp_count = Xs.shape[0] # draw some conditional random samples from the model model_samps = MSM.sample_from_input(BD(Xs), guided_decoding=False) model_samps.append(Xs) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, 
model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSS_A_SAMPLES_CND_ERR_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20)
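# Inputs to the MultiStageModelSS above are binarized on the fly via
# BD / binarize_data before every update and evaluation. A minimal sketch of
# that preprocessing, assuming "binarize" means sampling each pixel from a
# Bernoulli whose probability is the pixel intensity; binarize_data_sketch is
# an illustrative stand-in, not the repo's actual helper.
import numpy as np

def binarize_data_sketch(ary, rng=None):
    rng = np.random.RandomState() if rng is None else rng
    return (rng.rand(*ary.shape) < ary).astype('float32')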
# choose some parameters for the continuous inferencer
in_params = {}
shared_config = [data_dim, 800, 800]
top_config = [shared_config[-1], prior_dim]
in_params['shared_config'] = shared_config
in_params['mu_config'] = top_config
in_params['sigma_config'] = top_config
in_params['activation'] = relu_actfun
in_params['init_scale'] = 1.0
in_params['lam_l2a'] = 1e-3
in_params['vis_drop'] = 0.0
in_params['hid_drop'] = 0.0
in_params['bias_noise'] = 0.0
in_params['input_noise'] = 0.0
# Initialize the base networks for this ADPair
IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \
        params=in_params, shared_param_dicts=None)
GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \
        params=gn_params, shared_param_dicts=None)
# Initialize biases in IN and GN
IN.init_biases(0.1)
GN.init_biases(0.1)
##################################################
# Initialize and train a PeaNetSeq to antagonize #
##################################################
# choose some parameters for the categorical inferencer
pn_params = {}
pc0 = [data_dim, 800, 800, label_dim]
pn_params['proto_configs'] = [pc0]
# Set up some spawn networks
def test_gi_pair(): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0].get_value(borrow=False).astype(theano.config.floatX) tr_samples = Xtr.shape[0] # Construct a GenNet and an InfNet, then test constructor for GIPair. # Do basic testing, to make sure classes aren't completely broken. Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_dim = 64 prior_sigma = 1.0 # Choose some parameters for the generator network gn_params = {} gn_config = [prior_dim, 1000, 1000, data_dim] gn_params['mlp_config'] = gn_config gn_params['activation'] = relu_actfun gn_params['out_type'] = 'bernoulli' gn_params['init_scale'] = 2.0 gn_params['lam_l2a'] = 1e-2 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, (250, 4), (250, 4)] top_config = [shared_config[-1], (125, 4), prior_dim] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 2.0 in_params['lam_l2a'] = 1e-2 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.1 in_params['input_noise'] = 0.1 # Initialize the base networks for this GIPair IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.0) GN.init_biases(0.1) # Initialize the GIPair GIP = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, params=None) GIP.set_lam_l2w(1e-4) # Set initial learning rate and basic SGD hyper parameters learn_rate = 0.001 for i in range(750000): scale = min(1.0, float(i) / 25000.0) if ((i+1 % 100000) == 0): learn_rate = learn_rate * 0.75 GIP.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.95) GIP.set_lam_nll(lam_nll=1.0) GIP.set_lam_kld(lam_kld=(1.0 * scale)) # get some data to train with tr_idx = npr.randint(low=0,high=tr_samples,size=(100,)) Xd_batch = Xtr.take(tr_idx, axis=0) #binarize_data(Xtr.take(tr_idx, axis=0)) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs outputs = GIP.train_joint(Xd_batch, Xc_batch, Xm_batch) joint_cost = 1.0 * outputs[0] data_nll_cost = 1.0 * outputs[1] post_kld_cost = 1.0 * outputs[2] other_reg_cost = 1.0 * outputs[3] if ((i % 1000) == 0): print("batch: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \ i, joint_cost, data_nll_cost, post_kld_cost, other_reg_cost)) if ((i % 5000) == 0): file_name = "GIP_CHAIN_SAMPLES_b{0:d}.png".format(i) Xd_samps = np.repeat(Xd_batch[0:10,:], 3, axis=0) sample_lists = GIP.sample_gil_from_data(Xd_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw inference net first layer weights file_name = "GIP_INF_WEIGHTS_b{0:d}.png".format(i) utils.visualize_net_layer(GIP.IN.shared_layers[0], file_name) # draw generator net final layer weights file_name = "GIP_GEN_WEIGHTS_b{0:d}.png".format(i) utils.visualize_net_layer(GIP.GN.mlp_layers[-1], 
file_name, use_transpose=True) print("TESTING COMPLETE!") return
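# test_gi_pair() anneals its SGD hyperparameters with a linear warm-up on the
# learning rate / KLd weight plus a periodic step decay of the base learning
# rate. A minimal sketch of that intended schedule as a pure function; the
# helper name sgd_schedule_sketch and its closed-form decay count are
# introduced here for illustration, with defaults copied from the constants above.
def sgd_schedule_sketch(i, base_lr=0.001, warmup=25000.0,
                        decay_every=100000, decay_rate=0.75):
    scale = min(1.0, float(i) / warmup)   # linear warm-up factor in [0, 1]
    n_decays = (i + 1) // decay_every     # decays fire when ((i + 1) % decay_every) == 0
    lr = base_lr * (decay_rate ** n_decays)
    return (scale * lr), scale            # (effective learning rate, KLd scale)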