def train_dae(NET, dae_layer, mlp_params, sgd_params):
    """Run DAE training test."""
    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset)
    # Run denoising autoencoder training on the given layer of NET
    NT.train_dae(NET=NET, \
                 dae_layer=dae_layer, \
                 mlp_params=mlp_params, \
                 sgd_params=sgd_params, \
                 datasets=datasets)
    return
def test_mnist_img(occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = RESULT_PATH + "TM_OD{}_DP{}".format(occ_dim, dp_int)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    TM = TemplateMatchImputer(x_train=Xtr, x_type='bernoulli')

    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    xi, xo, xm = construct_masked_data(Xva[:500], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    img_match_on_known, img_match_on_unknown = TM.best_match_img(xo, xm)

    display_count = 100
    # visualize matches on known elements
    Xs = np.zeros((2*display_count, Xva.shape[1]))
    for idx in range(display_count):
        Xs[2*idx] = xi[idx]
        Xs[(2*idx)+1] = img_match_on_known[idx]
    file_name = "{0:s}_SAMPLES_MOK.png".format(result_tag)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    # visualize matches on unknown elements
    Xs = np.zeros((2*display_count, Xva.shape[1]))
    for idx in range(display_count):
        Xs[2*idx] = xi[idx]
        Xs[(2*idx)+1] = img_match_on_unknown[idx]
    file_name = "{0:s}_SAMPLES_MOU.png".format(result_tag)
    utils.visualize_samples(Xs, file_name, num_rows=20)
    return
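# These tests lean heavily on `utils.visualize_samples` to dump image grids.
# The repo's own implementation is not included in this section, so the
# following is only a rough, self-contained sketch of the assumed behavior
# (tile flat row-vector images into a `num_rows`-row grid and save a PNG).
# `visualize_samples_sketch` is a hypothetical name, not repo code.
import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend, so saving works without a display
import matplotlib.pyplot as plt

def visualize_samples_sketch(X, file_name, num_rows=10):
    """Tile the rows of X (flattened square images) into a grid and save it."""
    im_dim = int(np.sqrt(X.shape[1]))  # assumes square images
    num_cols = int(np.ceil(float(X.shape[0]) / num_rows))
    grid = np.zeros((num_rows * im_dim, num_cols * im_dim))
    for idx in range(X.shape[0]):
        r, c = (idx % num_rows), (idx // num_rows)
        grid[r*im_dim:(r+1)*im_dim, c*im_dim:(c+1)*im_dim] = \
                X[idx].reshape((im_dim, im_dim))
    plt.imsave(file_name, grid, cmap='gray')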
def test_mnist_nll(occ_dim=15, drop_prob=0.0):
    RESULT_PATH = "IMP_MNIST_TM/"
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = RESULT_PATH + "TM_OD{}_DP{}".format(occ_dim, dp_int)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    TM = TemplateMatchImputer(x_train=Xtr, x_type='bernoulli')

    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')

    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    xi, xo, xm = construct_masked_data(Xva, drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    result = TM.best_match_nll(xo, xm)
    match_on_known = np.mean(result[0])
    match_on_unknown = np.mean(result[1])
    str0 = "Test 1:"
    str1 = "    match on known   : {}".format(match_on_known)
    str2 = "    match on unknown : {}".format(match_on_unknown)
    joint_str = "\n".join([str0, str1, str2])
    print(joint_str)
    out_file.write(joint_str+"\n")
    out_file.flush()
    out_file.close()
    return
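# `construct_masked_data` appears in nearly every test in this section. Its
# implementation is not shown here, so the sketch below is an assumption about
# its contract: it returns (xi, xo, xm), where xo is the complete target, xm
# is a binary mask (1 = known, 0 = unknown) built from i.i.d. drops plus a
# random square occlusion, and xi is xo with unknown values replaced by
# `data_mean` (or 0). `construct_masked_data_sketch` is an illustrative
# stand-in, not the repo's actual code.
import numpy as np
import numpy.random as npr

def construct_masked_data_sketch(xs, drop_prob=0.0, occ_dim=None, data_mean=None):
    """Illustrative sketch of masking inputs for the imputation tests."""
    im_dim = int(np.sqrt(xs.shape[1]))  # assumes square images (28x28 MNIST)
    xo = xs.copy()
    # mask elements i.i.d. with probability drop_prob
    xm = (npr.rand(*xs.shape) > drop_prob).astype(xs.dtype)
    if (occ_dim is not None) and (occ_dim > 0):
        # additionally occlude a random occ_dim x occ_dim square in each image
        for j in range(xs.shape[0]):
            r = npr.randint(0, (im_dim - occ_dim) + 1)
            c = npr.randint(0, (im_dim - occ_dim) + 1)
            sq = np.ones((im_dim, im_dim), dtype=xs.dtype)
            sq[r:(r + occ_dim), c:(c + occ_dim)] = 0.0
            xm[j] = xm[j] * sq.ravel()
    fill = 0.0 if (data_mean is None) else data_mean
    xi = (xm * xo) + ((1.0 - xm) * fill)  # fill unknown entries
    return xi, xo, xm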
from PeaNet import PeaNet
from InfNet import InfNet
from GenNet import GenNet
from GIPair import GIPair
from NetLayers import relu_actfun, softplus_actfun, \
                      safe_softmax, safe_log

# Simple test code, to check that everything is basically functional.
print("TESTING...")

# Initialize a source of randomness
rng = np.random.RandomState(1234)

# Load some data to train/validate/test with
dataset = 'data/mnist.pkl.gz'
datasets = load_udm(dataset, zero_mean=False)
Xtr = datasets[0][0]

# get and set some basic dataset information
tr_samples = Xtr.get_value(borrow=True).shape[0]
data_dim = Xtr.get_value(borrow=True).shape[1]
prior_dim = 100
prior_sigma = 1.0

# Do moment matching in some transformed space
mm_proj_dim = 250
#P = np.identity(data_dim)
P = npr.randn(data_dim, mm_proj_dim) / np.sqrt(float(mm_proj_dim))
P = theano.shared(value=P.astype(theano.config.floatX), name='P_proj')
target_mean, target_cov = projected_moments(Xtr, P, ary_type='theano')
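# `projected_moments` supplies the target statistics for moment matching. Its
# source lives elsewhere in the repo; the numpy sketch below is an assumption
# about what it computes (the ary_type='theano' path presumably builds
# symbolic/shared versions of the same quantities). Hypothetical name, shown
# for reference only.
import numpy as np

def projected_moments_sketch(X, P):
    """Mean and covariance of the data after projection through P."""
    Xp = np.dot(X, P)                        # project into the matching space
    mu = np.mean(Xp, axis=0)                 # target mean
    Xc = Xp - mu                             # center before taking covariance
    sigma = np.dot(Xc.T, Xc) / Xp.shape[0]   # target covariance
    return mu, sigma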
def test_imocld_mnist(step_type="add", attention=False): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = "data/mnist.pkl.gz" datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = datasets[0][0] Xva = datasets[1][0] Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 250 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] write_dim = 300 enc_dim = 300 dec_dim = 300 mix_dim = 20 z_dim = 100 n_iter = 16 rnninits = {"weights_init": IsotropicGaussian(0.01), "biases_init": Constant(0.0)} inits = {"weights_init": IsotropicGaussian(0.01), "biases_init": Constant(0.0)} att_tag = "NA" # attention not implemented yet # setup the reader and writer (shared by primary and guide policies) read_dim = 2 * x_dim # dimension of output from reader_mlp reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits) writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim], name="writer_mlp", **inits) # mlps for setting conditionals over z_mix mix_var_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], name="mix_var_mlp", **inits) mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], name="mix_enc_mlp", **inits) # mlp for decoding z_mix into a distribution over initial LSTM states mix_dec_mlp = MLP( [Tanh(), Tanh()], [mix_dim, 250, (2 * enc_dim + 2 * dec_dim + 2 * enc_dim + mix_dim)], name="mix_dec_mlp", **inits ) # mlps for processing inputs to LSTMs var_mlp_in = MLP([Identity()], [(read_dim + dec_dim + mix_dim), 4 * enc_dim], name="var_mlp_in", **inits) enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim + mix_dim), 4 * enc_dim], name="enc_mlp_in", **inits) dec_mlp_in = MLP([Identity()], [z_dim, 4 * dec_dim], name="dec_mlp_in", **inits) # mlps for turning LSTM outputs into conditionals over z_gen var_mlp_out = CondNet([], [enc_dim, z_dim], name="var_mlp_out", **inits) enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits) # LSTMs for the actual LSTMs (obviously, perhaps) var_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, name="var_rnn", **rnninits) enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, name="enc_rnn", **rnninits) dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, name="dec_rnn", **rnninits) draw = IMoCLDrawModels( n_iter, step_type=step_type, # step_type can be 'add' or 'jump' reader_mlp=reader_mlp, writer_mlp=writer_mlp, mix_enc_mlp=mix_enc_mlp, mix_dec_mlp=mix_dec_mlp, mix_var_mlp=mix_var_mlp, enc_mlp_in=enc_mlp_in, enc_mlp_out=enc_mlp_out, enc_rnn=enc_rnn, dec_mlp_in=dec_mlp_in, dec_rnn=dec_rnn, var_mlp_in=var_mlp_in, var_mlp_out=var_mlp_out, var_rnn=var_rnn, ) draw.initialize() # build the cost gradients, training function, samplers, etc. 
draw.build_model_funcs() # sample several interchangeable versions of the model conditions = [{"occ_dim": 0, "drop_prob": 0.8}, {"occ_dim": 16, "drop_prob": 0.0}] for cond_dict in conditions: occ_dim = cond_dict["occ_dim"] drop_prob = cond_dict["drop_prob"] dp_int = int(100.0 * drop_prob) draw.load_model_params( f_name="TBCLM_IMP_MNIST_PARAMS_OD{}_DP{}_{}_{}.pkl".format(occ_dim, dp_int, step_type, att_tag) ) # draw some independent samples from the model Xva = row_shuffle(Xva) Xb = to_fX(Xva[:128]) _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, occ_dim=occ_dim, data_mean=None) Xb = np.repeat(Xb, 2, axis=0) Mb = np.repeat(Mb, 2, axis=0) samples = draw.do_sample(Xb, Mb) # save the samples to a pkl file, in their numpy array form sample_pkl_name = "IMP-MNIST-OD{0:d}-DP{1:d}-{2:s}.pkl".format(occ_dim, dp_int, step_type) f_handle = file(sample_pkl_name, "wb") cPickle.dump(samples, f_handle, protocol=-1) f_handle.close() print("Saved some samples in: {}".format(sample_pkl_name)) return
def train_walk_from_pretrained_osm(lam_kld=0.0):
    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape)))

    # get and set some basic dataset information
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    data_dim = Xtr.shape[1]
    batch_size = 100
    batch_reps = 5
    prior_sigma = 1.0
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(np.mean(Xtr, axis=1))
    Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0)

    # Symbolic inputs
    Xd = T.matrix(name='Xd')
    Xc = T.matrix(name='Xc')
    Xm = T.matrix(name='Xm')
    Xt = T.matrix(name='Xt')

    ###############################
    # Setup discriminator network #
    ###############################
    # Set some reasonable mlp parameters
    dn_params = {}
    # Set up some proto-networks
    pc0 = [data_dim, (300, 4), (300, 4), 10]
    dn_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {'proto_key': 0, 'input_noise': 0.0, 'bias_noise': 0.1, 'do_dropout': True}
    #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    dn_params['spawn_configs'] = [sc0]
    dn_params['spawn_weights'] = [1.0]
    # Set remaining params
    dn_params['init_scale'] = 0.75
    dn_params['lam_l2a'] = 1e-2
    dn_params['vis_drop'] = 0.2
    dn_params['hid_drop'] = 0.5
    # Initialize a network object to use as the discriminator
    DN = PeaNet(rng=rng, Xd=Xd, params=dn_params)
    DN.init_biases(0.0)

    #######################################################
    # Load inferencer and generator from saved parameters #
    #######################################################
    gn_fname = RESULT_PATH + "pt_osm_params_b80000_GN.pkl"
    in_fname = RESULT_PATH + "pt_osm_params_b80000_IN.pkl"
    IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd)

    ########################################################
    # Define parameters for the VCGLoop, and initialize it #
    ########################################################
    print("Building the VCGLoop...")
    vcgl_params = {}
    vcgl_params['x_type'] = 'bernoulli'
    vcgl_params['xt_transform'] = 'sigmoid'
    vcgl_params['logvar_bound'] = LOGVAR_BOUND
    vcgl_params['cost_decay'] = 0.0
    vcgl_params['chain_type'] = 'walkout'
    vcgl_params['lam_l2d'] = 5e-2
    VCGL = VCGLoop(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, Xt=Xt, \
                   i_net=IN, g_net=GN, d_net=DN, chain_len=6, \
                   data_dim=data_dim, prior_dim=PRIOR_DIM, params=vcgl_params)

    out_file = open(RESULT_PATH + "pt_walk_results.txt", 'wb')
    ####################################################
    # Train the VCGLoop by unrolling and applying BPTT #
    ####################################################
    learn_rate = 0.0004
    cost_1 = [0. for i in range(10)]
    for i in range(50000):
        scale = float(min((i + 1), 5000)) / 5000.0
        # NOTE: fixed an operator-precedence bug here; the original
        # "(i + 1 % 25000)" parsed as "i + (1 % 25000)", so the learning
        # rate decay never triggered.
        if (((i + 1) % 25000) == 0):
            learn_rate = learn_rate * 0.9
        ########################################
        # TRAIN THE CHAIN IN FREE-RUNNING MODE #
        ########################################
        VCGL.set_all_sgd_params(learn_rate=(scale*learn_rate), \
                                mom_1=0.5, mom_2=0.99)
        VCGL.set_disc_weights(dweight_gn=25.0, dweight_dn=25.0)
        VCGL.set_lam_chain_nll(1.0)
        VCGL.set_lam_chain_kld(lam_kld)
        # get some data to train with
        tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size,))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xc_batch = 0.0 * Xd_batch
        Xm_batch = 0.0 * Xd_batch
        # examples from the target distribution, to train discriminator
        tr_idx = npr.randint(low=0, high=tr_samples, size=(2 * batch_size,))
        Xt_batch = Xtr.take(tr_idx, axis=0)
        # do a minibatch update of the model, and compute some costs
        outputs = VCGL.train_joint(Xd_batch, Xc_batch, Xm_batch, Xt_batch, batch_reps)
        cost_1 = [(cost_1[k] + 1. * outputs[k]) for k in range(len(outputs))]
        if ((i % 500) == 0):
            cost_1 = [(v / 500.0) for v in cost_1]
            o_str_1 = "batch: {0:d}, joint_cost: {1:.4f}, chain_nll_cost: {2:.4f}, chain_kld_cost: {3:.4f}, disc_cost_gn: {4:.4f}, disc_cost_dn: {5:.4f}".format( \
                    i, cost_1[0], cost_1[1], cost_1[2], cost_1[5], cost_1[6])
            print(o_str_1)
            cost_1 = [0. for v in cost_1]
        if ((i % 1000) == 0):
            tr_idx = npr.randint(low=0, high=Xtr.shape[0], size=(5,))
            va_idx = npr.randint(low=0, high=Xva.shape[0], size=(5,))
            Xd_batch = np.vstack([Xtr.take(tr_idx, axis=0), Xva.take(va_idx, axis=0)])
            # draw some chains of samples from the VAE loop
            file_name = RESULT_PATH + "pt_walk_chain_samples_b{0:d}.png".format(i)
            Xd_samps = np.repeat(Xd_batch, 3, axis=0)
            sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, loop_iters=20)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # draw some masked chains of samples from the VAE loop
            file_name = RESULT_PATH + "pt_walk_mask_samples_b{0:d}.png".format(i)
            Xd_samps = np.repeat(Xc_mean[0:Xd_batch.shape[0], :], 3, axis=0)
            Xc_samps = np.repeat(Xd_batch, 3, axis=0)
            Xm_rand = sample_masks(Xc_samps, drop_prob=0.0)
            Xm_patch = sample_patch_masks(Xc_samps, (28, 28), (16, 16))
            Xm_samps = Xm_rand * Xm_patch
            sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, \
                    X_c=Xc_samps, X_m=Xm_samps, loop_iters=20)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name, num_rows=20)
            # draw some samples independently from the GenNet's prior
            file_name = RESULT_PATH + "pt_walk_prior_samples_b{0:d}.png".format(i)
            Xs = VCGL.sample_from_prior(20 * 20)
            utils.visualize_samples(Xs, file_name, num_rows=20)
        # DUMP PARAMETERS FROM TIME-TO-TIME
        if (i % 10000 == 0):
            DN.save_to_file(f_name=RESULT_PATH + "pt_walk_params_b{0:d}_DN.pkl".format(i))
            IN.save_to_file(f_name=RESULT_PATH + "pt_walk_params_b{0:d}_IN.pkl".format(i))
            GN.save_to_file(f_name=RESULT_PATH + "pt_walk_params_b{0:d}_GN.pkl".format(i))
    return
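# A hypothetical entry point for this script -- the original driver (and its
# choice of lam_kld) is not shown in this section, so treat this as an assumed
# usage example rather than the repo's actual configuration.
if __name__ == "__main__":
    train_walk_from_pretrained_osm(lam_kld=0.0)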
def test_gip_sigma_scale_mnist():
    from LogPDFs import cross_validate_sigma
    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(12345)

    # Load some data to train/validate/test with
    dataset = "data/mnist.pkl.gz"
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape)))

    # get and set some basic dataset information
    tr_samples = Xtr.shape[0]
    batch_size = 100
    Xtr_mean = np.mean(Xtr, axis=0, keepdims=True)
    Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr)
    Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0).astype(theano.config.floatX)

    # Symbolic inputs
    Xd = T.matrix(name="Xd")
    Xc = T.matrix(name="Xc")
    Xm = T.matrix(name="Xm")
    Xt = T.matrix(name="Xt")

    # Load inferencer and generator from saved parameters
    gn_fname = "MNIST_WALKOUT_TEST_MED_KLD/pt_osm_params_b80000_GN.pkl"
    in_fname = "MNIST_WALKOUT_TEST_MED_KLD/pt_osm_params_b80000_IN.pkl"
    IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
    GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd)
    x_dim = IN.shared_layers[0].in_dim
    z_dim = IN.mu_layers[-1].out_dim
    # construct a GIPair with the loaded InfNet and GenNet
    osm_params = {}
    osm_params["x_type"] = "gaussian"
    osm_params["xt_transform"] = "sigmoid"
    osm_params["logvar_bound"] = LOGVAR_BOUND
    OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm,
                        p_x_given_z=GN, q_z_given_x=IN,
                        x_dim=x_dim, z_dim=z_dim, params=osm_params)

    # compute variational likelihood bound and its sub-components
    Xva = row_shuffle(Xva)
    Xb = Xva[0:5000]
    file_name = "AX_MNIST_MAX_KLD_POST_KLDS.png"
    post_klds = OSM.compute_post_klds(Xb)
    post_dim_klds = np.mean(post_klds, axis=0)
    utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, file_name)
    # compute information about free-energy on validation set
    file_name = "AX_MNIST_MAX_KLD_FREE_ENERGY.png"
    fe_terms = OSM.compute_fe_terms(Xb, 20)
    utils.plot_scatter(fe_terms[1], fe_terms[0], file_name,
                       x_label="Posterior KLd", y_label="Negative Log-likelihood")

    # bound_results = OSM.compute_ll_bound(Xva)
    # ll_bounds = bound_results[0]
    # post_klds = bound_results[1]
    # log_likelihoods = bound_results[2]
    # max_lls = bound_results[3]
    # print("mean ll bound: {0:.4f}".format(np.mean(ll_bounds)))
    # print("mean posterior KLd: {0:.4f}".format(np.mean(post_klds)))
    # print("mean log-likelihood: {0:.4f}".format(np.mean(log_likelihoods)))
    # print("mean max log-likelihood: {0:.4f}".format(np.mean(max_lls)))
    # print("min ll bound: {0:.4f}".format(np.min(ll_bounds)))
    # print("max posterior KLd: {0:.4f}".format(np.max(post_klds)))
    # print("min log-likelihood: {0:.4f}".format(np.min(log_likelihoods)))
    # print("min max log-likelihood: {0:.4f}".format(np.min(max_lls)))

    # # compute some information about the approximate posteriors
    # post_stats = OSM.compute_post_stats(Xva, 0.0*Xva, 0.0*Xva)
    # all_post_klds = np.sort(post_stats[0].ravel())  # post KLds for each obs and dim
    # obs_post_klds = np.sort(post_stats[1])          # summed post KLds for each obs
    # post_dim_klds = post_stats[2]                   # average post KLds for each post dim
    # post_dim_vars = post_stats[3]                   # average squared mean for each post dim
    # utils.plot_line(np.arange(all_post_klds.shape[0]), all_post_klds, "AAA_ALL_POST_KLDS.png")
    # utils.plot_line(np.arange(obs_post_klds.shape[0]), obs_post_klds, "AAA_OBS_POST_KLDS.png")
    # utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, "AAA_POST_DIM_KLDS.png")
    # utils.plot_stem(np.arange(post_dim_vars.shape[0]), post_dim_vars, "AAA_POST_DIM_VARS.png")

    # draw many samples from the GIP
    for i in range(5):
        tr_idx = npr.randint(low=0, high=tr_samples, size=(100,))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xs = []
        for row in range(3):
            Xs.append([])
            for col in range(3):
                sample_lists = OSM.sample_from_chain(Xd_batch[0:10, :],
                                                     loop_iters=100, sigma_scale=1.0)
                Xs[row].append(group_chains(sample_lists["data samples"]))
        Xs, block_im_dim = block_video(Xs, (28, 28), (3, 3))
        to_video(Xs, block_im_dim,
                 "AX_MNIST_MAX_KLD_CHAIN_VIDEO_{0:d}.avi".format(i), frame_rate=10)
    file_name = "AX_MNIST_MAX_KLD_PRIOR_SAMPLE.png"
    Xs = OSM.sample_from_prior(20 * 20)
    utils.visualize_samples(Xs, file_name, num_rows=20)

    # # test Parzen density estimator built from prior samples
    # Xs = OSM.sample_from_prior(10000)
    # [best_sigma, best_ll, best_lls] = \
    #     cross_validate_sigma(Xs, Xva, [0.12, 0.14, 0.15, 0.16, 0.18], 20)
    # sort_idx = np.argsort(best_lls)
    # sort_idx = sort_idx[0:400]
    # utils.plot_line(np.arange(sort_idx.shape[0]), best_lls[sort_idx], "A_MNIST_MAX_KLD_BEST_LLS_1.png")
    # utils.visualize_samples(Xva[sort_idx], "A_MNIST_MAX_KLD_BAD_DIGITS_1.png", num_rows=20)
    return
def test_lstm_structpred(step_type='add', use_pol=True, use_binary=False):
    ###########################################
    # Make a tag for identifying result files #
    ###########################################
    pol_tag = "P1" if use_pol else "P0"
    bin_tag = "B1" if use_binary else "B0"
    res_tag = "STRUCT_PRED_RESULTS/SP_LSTM_{}_{}_{}".format(step_type, pol_tag, bin_tag)

    if use_binary:
        ############################
        # Get binary training data #
        ############################
        rng = np.random.RandomState(1234)
        Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/')
        #Xtr = np.vstack((Xtr, Xva))
        #Xva = Xte
    else:
        ################################
        # Get continuous training data #
        ################################
        rng = np.random.RandomState(1234)
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm(dataset, as_shared=False, zero_mean=False)
        Xtr = datasets[0][0]
        Xva = datasets[1][0]
        Xte = datasets[2][0]
        #Xtr = np.concatenate((Xtr, Xva), axis=0)
        #Xva = Xte
        Xtr = to_fX(shift_and_scale_into_01(Xtr))
        Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200

    ########################################################
    # Split data into "observation" and "prediction" parts #
    ########################################################
    obs_cols = 14             # number of columns to observe
    pred_cols = 28 - obs_cols # number of columns to predict
    x_dim = obs_cols * 28     # dimensionality of observations
    y_dim = pred_cols * 28    # dimensionality of predictions
    Xtr, Ytr = img_split(Xtr, im_dim=(28, 28), split_col=obs_cols, transposed=True)
    Xva, Yva = img_split(Xva, im_dim=(28, 28), split_col=obs_cols, transposed=True)

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    read_dim = 128
    write_dim = 128
    mlp_dim = 128
    rnn_dim = 128
    z_dim = 64
    n_iter = 15

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # setup reader/writer models
    reader_mlp = MLP([Rectifier(), Tanh()], [x_dim, mlp_dim, read_dim],
                     name="reader_mlp", **inits)
    writer_mlp = MLP([Rectifier(), None], [rnn_dim, mlp_dim, y_dim],
                     name="writer_mlp", **inits)

    # setup submodels for processing LSTM inputs
    pol_inp_dim = y_dim + read_dim + rnn_dim
    var_inp_dim = y_dim + y_dim + read_dim + rnn_dim
    pol_mlp_in = MLP([Identity()], [pol_inp_dim, 4 * rnn_dim],
                     name="pol_mlp_in", **inits)
    var_mlp_in = MLP([Identity()], [var_inp_dim, 4 * rnn_dim],
                     name="var_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [z_dim, 4 * rnn_dim],
                     name="dec_mlp_in", **inits)
    # setup submodels for turning LSTM states into conditionals over z
    pol_mlp_out = CondNet([], [rnn_dim, z_dim], name="pol_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)
    dec_mlp_out = CondNet([], [rnn_dim, z_dim], name="dec_mlp_out", **inits)
    # setup the LSTMs for primary policy, guide policy, and shared dynamics
    pol_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="pol_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="dec_rnn", **rnninits)

    model = IRStructPredModel(
        n_iter,
        step_type=step_type,
        use_pol=use_pol,
        reader_mlp=reader_mlp,
        writer_mlp=writer_mlp,
        pol_mlp_in=pol_mlp_in,
        pol_mlp_out=pol_mlp_out,
        pol_rnn=pol_rnn,
        var_mlp_in=var_mlp_in,
        var_mlp_out=var_mlp_out,
        var_rnn=var_rnn,
        dec_mlp_in=dec_mlp_in,
        dec_mlp_out=dec_mlp_out,
        dec_rnn=dec_rnn)
    model.initialize()

    compile_start_time = time.time()

    # build the cost gradients, training function, samplers, etc.
    model.build_sampling_funcs()
    print("Testing model sampler...")
    # draw some independent samples from the model
    samp_count = 10
    samp_reps = 3
    x_in = Xtr[:10, :].repeat(samp_reps, axis=0)
    y_in = Ytr[:10, :].repeat(samp_reps, axis=0)
    x_samps, y_samps = model.sample_model(x_in, y_in, sample_source='p')
    # TODO: visualize sample prediction trajectories
    img_seq = seq_img_join(x_samps, y_samps, im_dim=(28, 28), transposed=True)
    seq_len = len(img_seq)
    samp_count = img_seq[0].shape[0]
    seq_samps = np.zeros((seq_len * samp_count, img_seq[0].shape[1]))
    idx = 0
    for s1 in range(samp_count):
        for s2 in range(seq_len):
            seq_samps[idx] = img_seq[s2][s1]
            idx += 1
    file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, 0)
    utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)

    model.build_model_funcs()

    compile_end_time = time.time()
    compile_minutes = (compile_end_time - compile_start_time) / 60.0
    print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes))

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("{}_results.txt".format(res_tag), 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.9
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(300000):
        scale = min(1.0, ((i + 1) / 5000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr, Ytr = row_shuffle(Xtr, Ytr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        model.set_sgd_params(lr=scale*learn_rate, mom_1=scale*momentum, mom_2=0.98)
        model.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.1)
        model.set_grad_noise(grad_noise=0.02)
        # perform a minibatch update and record the cost for this batch
        Xb = to_fX(Xtr.take(batch_idx, axis=0))
        Yb = to_fX(Ytr.take(batch_idx, axis=0))
        result = model.train_joint(Xb, Yb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        # diagnostics
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    total_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_term  : {0:.4f}".format(costs[2])
            str5 = "    kld_q2p   : {0:.4f}".format(costs[3])
            str6 = "    kld_p2q   : {0:.4f}".format(costs[4])
            str7 = "    reg_term  : {0:.4f}".format(costs[5])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            model.save_model_params("{}_params.pkl".format(res_tag))
            # compute a small-sample estimate of NLL bound on validation set
            Xva, Yva = row_shuffle(Xva, Yva)
            Xb = to_fX(Xva[:5000])
            Yb = to_fX(Yva[:5000])
            va_costs = model.compute_nll_bound(Xb, Yb)
            str1 = "    va_nll_bound : {}".format(va_costs[1])
            str2 = "    va_nll_term  : {}".format(va_costs[2])
            str3 = "    va_kld_q2p   : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            # draw some independent samples from the model
            samp_count = 10
            samp_reps = 3
            x_in = Xva[:samp_count, :].repeat(samp_reps, axis=0)
            y_in = Yva[:samp_count, :].repeat(samp_reps, axis=0)
            x_samps, y_samps = model.sample_model(x_in, y_in, sample_source='p')
            # visualize sample prediction trajectories
            img_seq = seq_img_join(x_samps, y_samps, im_dim=(28, 28), transposed=True)
            seq_len = len(img_seq)
            samp_count = img_seq[0].shape[0]
            seq_samps = np.zeros((seq_len * samp_count, img_seq[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    if use_binary:
                        seq_samps[idx] = binarize_data(img_seq[s2][s1])
                    else:
                        seq_samps[idx] = img_seq[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_b{1:d}.png".format(res_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
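# `row_shuffle` is used both with one argument (shuffle a matrix) and with two
# (shuffle paired matrices with the same permutation). A minimal sketch of the
# assumed behavior, under a hypothetical name:
import numpy as np
import numpy.random as npr

def row_shuffle_sketch(X, Y=None):
    """Shuffle rows of X (and Y, if given) with a shared permutation."""
    idx = np.arange(X.shape[0])
    npr.shuffle(idx)
    if Y is None:
        return X[idx]
    return X[idx], Y[idx]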
def test_mnist_results(step_type='add', imp_steps=6, occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int,
                                                      imp_steps, step_type)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # Merge validation set and training set, and test on test set.
    Xtr = np.concatenate((Xtr, Xva), axis=0)
    Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    # Load parameters from a previously trained model
    print("Testing model load from file...")
    GPSI = load_gpsimputer_from_file(f_name="{}_PARAMS.pkl".format(result_tag), \
                                     rng=rng)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_FINAL_RESULTS_NEW.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    str0 = "GUIDED SAMPLE BOUND:"
    print(str0)
    xi, xo, xm = construct_masked_data(Xva[:5000], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_0, kld_0 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=True)
    xi, xo, xm = construct_masked_data(Xva[5000:], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_1, kld_1 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=True)
    nll = np.concatenate((nll_0, nll_1))
    kld = np.concatenate((kld_0, kld_1))
    vfe = np.mean(nll) + np.mean(kld)
    str1 = "    va_nll_bound : {}".format(vfe)
    str2 = "    va_nll_term  : {}".format(np.mean(nll))
    str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
    joint_str = "\n".join([str0, str1, str2, str3])
    print(joint_str)
    out_file.write(joint_str + "\n")
    out_file.flush()
    # record an estimate of performance on the test set
    str0 = "UNGUIDED SAMPLE BOUND:"
    print(str0)
    xi, xo, xm = construct_masked_data(Xva[:5000], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_0, kld_0 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=False)
    xi, xo, xm = construct_masked_data(Xva[5000:], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_1, kld_1 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=False)
    nll = np.concatenate((nll_0, nll_1))
    kld = np.concatenate((kld_0, kld_1))
    str1 = "    va_nll_bound : {}".format(np.mean(nll))
    str2 = "    va_nll_term  : {}".format(np.mean(nll))
    str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
    joint_str = "\n".join([str0, str1, str2, str3])
    print(joint_str)
    out_file.write(joint_str + "\n")
    out_file.flush()
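# Two small helpers recur throughout: `to_fX`, which casts arrays to theano's
# configured float dtype, and `shift_and_scale_into_01`, which rescales data
# into [0, 1]. Their real definitions live elsewhere in the repo; these are
# assumed behaviors, sketched under hypothetical names for reference.
import numpy as np
import theano

def to_fX_sketch(np_ary):
    """Cast a numpy array to theano.config.floatX (assumed helper behavior)."""
    return np_ary.astype(theano.config.floatX)

def shift_and_scale_into_01_sketch(X):
    """Affinely map X into [0, 1] (assumed: global min/max rescaling)."""
    X = X - np.min(X)
    X = X / np.max(X)
    return X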
def test_mnist(step_type='add', imp_steps=6, occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int,
                                                      imp_steps, step_type)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # Merge validation set and training set, and test on test set.
    Xtr = np.concatenate((Xtr, Xva), axis=0)
    Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    z_dim = 100
    init_scale = 1.0
    use_bn = False

    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')
    x_mask_sym = T.matrix('x_mask_sym')

    #################
    # p_zi_given_xi #
    #################
    params = {}
    shared_config = \
        [{'layer_type': 'fc', 'in_chans': x_dim, 'out_chans': 800,
          'activation': relu_actfun, 'apply_bn': use_bn}, \
         {'layer_type': 'fc', 'in_chans': 800, 'out_chans': 800,
          'activation': relu_actfun, 'apply_bn': use_bn}]
    out_layer = {'layer_type': 'fc', 'in_chans': 800, 'out_chans': z_dim,
                 'activation': relu_actfun, 'apply_bn': False}
    output_config = [out_layer, out_layer]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['init_scale'] = 1.0
    params['build_theano_funcs'] = False
    p_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \
                             params=params, shared_param_dicts=None)
    p_zi_given_xi.init_biases(0.0)
    ###################
    # p_sip1_given_zi #
    ###################
    params = {}
    shared_config = \
        [{'layer_type': 'fc', 'in_chans': z_dim, 'out_chans': 800,
          'activation': relu_actfun, 'apply_bn': use_bn}, \
         {'layer_type': 'fc', 'in_chans': 800, 'out_chans': 800,
          'activation': relu_actfun, 'apply_bn': use_bn}]
    out_layer = {'layer_type': 'fc', 'in_chans': 800, 'out_chans': x_dim,
                 'activation': relu_actfun, 'apply_bn': False}
    output_config = [out_layer, out_layer, out_layer]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['init_scale'] = 1.0
    params['build_theano_funcs'] = False
    p_sip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \
                               params=params, shared_param_dicts=None)
    p_sip1_given_zi.init_biases(0.0)
    #################
    # q_zi_given_xi #
    #################
    params = {}
    shared_config = \
        [{'layer_type': 'fc', 'in_chans': (x_dim + x_dim), 'out_chans': 800,
          'activation': relu_actfun, 'apply_bn': use_bn}, \
         {'layer_type': 'fc', 'in_chans': 800, 'out_chans': 800,
          'activation': relu_actfun, 'apply_bn': use_bn}]
    out_layer = {'layer_type': 'fc', 'in_chans': 800, 'out_chans': z_dim,
                 'activation': relu_actfun, 'apply_bn': False}
    output_config = [out_layer, out_layer]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['init_scale'] = 1.0
    params['build_theano_funcs'] = False
    q_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \
                             params=params, shared_param_dicts=None)
    q_zi_given_xi.init_biases(0.0)

    ###########################################################
    # Define parameters for the GPSImputer, and initialize it #
    ###########################################################
    print("Building the GPSImputer...")
    gpsi_params = {}
    gpsi_params['x_dim'] = x_dim
    gpsi_params['z_dim'] = z_dim
    # switch between direct construction and construction via p_x_given_si
    gpsi_params['imp_steps'] = imp_steps
    gpsi_params['step_type'] = step_type
    gpsi_params['x_type'] = 'bernoulli'
    gpsi_params['obs_transform'] = 'sigmoid'
    GPSI = GPSImputer(rng=rng,
                      x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym,
                      p_zi_given_xi=p_zi_given_xi,
                      p_sip1_given_zi=p_sip1_given_zi,
                      q_zi_given_xi=q_zi_given_xi,
                      params=gpsi_params,
                      shared_param_dicts=None)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0001
    momentum = 0.90
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(200000):
        scale = min(1.0, ((i + 1) / 5000.0))
        if (((i + 1) % 15000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        GPSI.set_sgd_params(lr=scale*learn_rate, \
                            mom_1=scale*momentum, mom_2=0.98)
        GPSI.set_train_switch(1.0)
        GPSI.set_lam_nll(lam_nll=1.0)
        GPSI.set_lam_kld(lam_kld_q=1.0, lam_kld_p=0.1, lam_kld_g=0.0)
        GPSI.set_lam_l2w(1e-5)
        # perform a minibatch update and record the cost for this batch
        xb = to_fX(Xtr.take(batch_idx, axis=0))
        xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                           occ_dim=occ_dim, data_mean=data_mean)
        result = GPSI.train_joint(xi, xo, xm, batch_reps)
        # do diagnostics and general training tracking
        costs = [(costs[j] + result[j]) for j in range(len(result) - 1)]
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_cost  : {0:.4f}".format(costs[2])
            str5 = "    kld_cost  : {0:.4f}".format(costs[3])
            str6 = "    reg_cost  : {0:.4f}".format(costs[4])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            Xva = row_shuffle(Xva)
            # record an estimate of performance on the test set
            xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10)
            vfe = np.mean(nll) + np.mean(kld)
            str1 = "    va_nll_bound : {}".format(vfe)
            str2 = "    va_nll_term  : {}".format(np.mean(nll))
            str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
        if ((i % 2000) == 0):
            GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag))
            # Get some validation samples for evaluating model performance
            xb = to_fX(Xva[0:100])
            xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            xi = np.repeat(xi, 2, axis=0)
            xo = np.repeat(xo, 2, axis=0)
            xm = np.repeat(xm, 2, axis=0)
            # draw some sample imputations from the model
            samp_count = xi.shape[0]
            _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len * samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
def test_dA(learning_rate=0.1, training_epochs=30,
            dataset='./data/mnist.pkl.gz',
            batch_size=25, output_folder='dA_plots'):
    """Train denoising autoencoders on MNIST and visualize their filters."""
    datasets = load_udm(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################
    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_da = theano.function(inputs=[index], outputs=[cost], updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size, :]})

    ############
    # TRAINING #
    ############
    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        t1 = time.clock()
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))
        t2 = time.clock()
        print "Training epoch {0:d}, cost {1:.4f}, time {2:.4f}".format( \
                epoch, numpy.mean(c), (t2 - t1))

    image = PIL.Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_00.png')

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################
    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function([index], cost, updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    ############
    # TRAINING #
    ############
    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        t1 = time.clock()
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))
        t2 = time.clock()
        print "Training epoch {0:d}, cost {1:.4f}, time {2:.4f}".format( \
                epoch, numpy.mean(c), (t2 - t1))

    image = PIL.Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_30.png')

    os.chdir('../')
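# Assumed entry point, mirroring the usual Theano-tutorial style driver for
# this function (the defaults above already point at ./data/mnist.pkl.gz):
if __name__ == '__main__':
    test_dA()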
def test_mnist(step_type='add', occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_OD{}_DP{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, step_type)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    obs_dim = Xtr.shape[1]
    z_dim = 100
    imp_steps = 5
    init_scale = 1.0

    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')
    x_mask_sym = T.matrix('x_mask_sym')

    #################
    # p_zi_given_xi #
    #################
    params = {}
    shared_config = [obs_dim, 1000, 1000]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \
                           params=params, shared_param_dicts=None)
    p_zi_given_xi.init_biases(0.2)
    ###################
    # p_xip1_given_zi #
    ###################
    params = {}
    shared_config = [z_dim, 1000, 1000]
    output_config = [obs_dim, obs_dim]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \
                               params=params, shared_param_dicts=None)
    p_xip1_given_zi.init_biases(0.2)
    ###################
    # q_zi_given_x_xi #
    ###################
    params = {}
    shared_config = [(obs_dim + obs_dim), 1000, 1000]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_zi_given_x_xi = InfNet(rng=rng, Xd=x_in_sym, \
                             params=params, shared_param_dicts=None)
    q_zi_given_x_xi.init_biases(0.2)

    ###########################################################
    # Define parameters for the GPSImputer, and initialize it #
    ###########################################################
    print("Building the GPSImputer...")
    gpsi_params = {}
    gpsi_params['obs_dim'] = obs_dim
    gpsi_params['z_dim'] = z_dim
    gpsi_params['imp_steps'] = imp_steps
    gpsi_params['step_type'] = step_type
    gpsi_params['x_type'] = 'bernoulli'
    gpsi_params['obs_transform'] = 'sigmoid'
    GPSI = GPSImputer(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \
                      p_zi_given_xi=p_zi_given_xi, \
                      p_xip1_given_zi=p_xip1_given_zi, \
                      q_zi_given_x_xi=q_zi_given_x_xi, \
                      params=gpsi_params, \
                      shared_param_dicts=None)

    # # test model saving
    # print("Testing model save to file...")
    # GPSI.save_to_file("AAA_GPSI_SAVE_TEST.pkl")
    # # test model loading
    # print("Testing model load from file...")
    # GPSI = load_gpsimputer_from_file(f_name="AAA_GPSI_SAVE_TEST.pkl", rng=rng)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.5
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(200000):
        scale = min(1.0, ((i+1) / 5000.0))
        if (((i + 1) % 15000) == 0):
            learn_rate = learn_rate * 0.92
        if (i > 10000):
            momentum = 0.90
        else:
            momentum = 0.50
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        GPSI.set_sgd_params(lr=scale*learn_rate, \
                            mom_1=scale*momentum, mom_2=0.98)
        GPSI.set_train_switch(1.0)
        GPSI.set_lam_nll(lam_nll=1.0)
        GPSI.set_lam_kld(lam_kld_p=0.1, lam_kld_q=0.9)
        GPSI.set_lam_l2w(1e-4)
        # perform a minibatch update and record the cost for this batch
        xb = to_fX(Xtr.take(batch_idx, axis=0))
        xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                           occ_dim=occ_dim, data_mean=data_mean)
        result = GPSI.train_joint(xi, xo, xm, batch_reps)
        # do diagnostics and general training tracking
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = "    joint_cost: {0:.4f}".format(costs[0])
            str3 = "    nll_bound : {0:.4f}".format(costs[1])
            str4 = "    nll_cost  : {0:.4f}".format(costs[2])
            str5 = "    kld_cost  : {0:.4f}".format(costs[3])
            str6 = "    reg_cost  : {0:.4f}".format(costs[4])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            Xva = row_shuffle(Xva)
            # record an estimate of performance on the test set
            xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10)
            vfe = np.mean(nll) + np.mean(kld)
            str1 = "    va_nll_bound : {}".format(vfe)
            str2 = "    va_nll_term  : {}".format(np.mean(nll))
            str3 = "    va_kld_q2p   : {}".format(np.mean(kld))
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str+"\n")
            out_file.flush()
        if ((i % 5000) == 0):
            GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag))
            # Get some validation samples for evaluating model performance
            xb = to_fX(Xva[0:100])
            xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            xi = np.repeat(xi, 2, axis=0)
            xo = np.repeat(xo, 2, axis=0)
            xm = np.repeat(xm, 2, axis=0)
            # draw some sample imputations from the model
            samp_count = xi.shape[0]
            _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i)
            # the original text was truncated mid-call here; completed to match
            # the identical visualization call in the test above
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
file_name, num_rows=20) # get visualizations of policy parameters file_name = "{0:s}_gen_gen_weights_b{1:d}.png".format(result_tag, i) W = GPSI.gen_gen_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "{0:s}_gen_inf_weights_b{1:d}.png".format(result_tag, i) W = GPSI.gen_inf_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20)
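# The (xi, xo, xm) triples used throughout these tests come from
# construct_masked_data. A standalone numpy sketch of what that contract
# looks like, under the assumption that xm marks known pixels with 1 and
# missing pixels with 0, and that missing inputs are filled with data_mean;
# this is an illustration, not the project's actual helper.
import numpy as np

def masked_data_sketch(xb, drop_prob, occ_dim, data_mean, rng):
    """Make (xi, xo, xm): occluded input, clean target, knownness mask."""
    obs_count, obs_dim = xb.shape
    im_dim = int(np.sqrt(obs_dim))                # e.g. 28 for MNIST
    # independent pixel dropout...
    xm = (rng.rand(obs_count, obs_dim) > drop_prob).astype(xb.dtype)
    # ...plus a random occ_dim x occ_dim occlusion square per image
    if occ_dim > 0:
        sq = np.ones((obs_count, im_dim, im_dim), dtype=xb.dtype)
        for k in range(obs_count):
            r = rng.randint(0, im_dim - occ_dim + 1)
            c = rng.randint(0, im_dim - occ_dim + 1)
            sq[k, r:(r + occ_dim), c:(c + occ_dim)] = 0.0
        xm = xm * sq.reshape((obs_count, obs_dim))
    xo = xb.copy()                                # complete target
    xi = (xm * xb) + ((1.0 - xm) * data_mean)     # fill unknowns with mean
    return xi, xo, xm

rng_demo = np.random.RandomState(0)
xi_d, xo_d, xm_d = masked_data_sketch(rng_demo.rand(4, 784), drop_prob=0.0,
                                      occ_dim=15,
                                      data_mean=np.full((784,), 0.13),
                                      rng=rng_demo)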
def test_imocld_imp_mnist(step_type='add', occ_dim=14, drop_prob=0.0, attention=False): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = datasets[0][0] Xva = datasets[1][0] Xte = datasets[2][0] # Merge validation set and training set, and test on test set. Xtr = np.concatenate((Xtr, Xva), axis=0) Xva = Xte Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] write_dim = 300 enc_dim = 300 dec_dim = 300 mix_dim = 20 z_dim = 100 n_iter = 25 dp_int = int(100.0 * drop_prob) rnninits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } att_tag = "NA" # attention not implemented yet # setup the reader and writer (shared by primary and guide policies) read_dim = 2 * x_dim # dimension of output from reader_mlp reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits) writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim], \ name="writer_mlp", **inits) # mlps for setting conditionals over z_mix mix_var_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \ name="mix_var_mlp", **inits) mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \ name="mix_enc_mlp", **inits) # mlp for decoding z_mix into a distribution over initial LSTM states mix_dec_mlp = MLP([Tanh(), Tanh()], \ [mix_dim, 250, (2*enc_dim + 2*dec_dim + 2*enc_dim)], \ name="mix_dec_mlp", **inits) # mlps for processing inputs to LSTMs var_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim], \ name="var_mlp_in", **inits) enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim], \ name="enc_mlp_in", **inits) dec_mlp_in = MLP([Identity()], [ z_dim, 4*dec_dim], \ name="dec_mlp_in", **inits) # mlps for turning LSTM outputs into conditionals over z_gen var_mlp_out = CondNet([], [enc_dim, z_dim], name="var_mlp_out", **inits) enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits) # LSTMs for the actual LSTMs (obviously, perhaps) var_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \ name="var_rnn", **rnninits) enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \ name="enc_rnn", **rnninits) dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, \ name="dec_rnn", **rnninits) draw = IMoCLDrawModels( n_iter, step_type=step_type, # step_type can be 'add' or 'jump' reader_mlp=reader_mlp, writer_mlp=writer_mlp, mix_enc_mlp=mix_enc_mlp, mix_dec_mlp=mix_dec_mlp, mix_var_mlp=mix_var_mlp, enc_mlp_in=enc_mlp_in, enc_mlp_out=enc_mlp_out, enc_rnn=enc_rnn, dec_mlp_in=dec_mlp_in, dec_rnn=dec_rnn, var_mlp_in=var_mlp_in, var_mlp_out=var_mlp_out, var_rnn=var_rnn) draw.initialize() # build the cost gradients, training function, samplers, etc. 
draw.build_model_funcs() #draw.load_model_params(f_name="TBCLM_IMP_MNIST_PARAMS_OD{}_DP{}_{}_{}.pkl".format(occ_dim, dp_int, step_type, att_tag)) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ print("Beginning to train the model...") out_file = open( "TBCLM_IMP_MNIST_RESULTS_OD{}_DP{}_{}_{}.txt".format( occ_dim, dp_int, step_type, att_tag), 'wb') out_file.flush() costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i + 1) / 1000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update zero_ary = np.zeros((1, )) draw.lr.set_value(to_fX(zero_ary + learn_rate)) draw.mom_1.set_value(to_fX(zero_ary + momentum)) draw.mom_2.set_value(to_fX(zero_ary + 0.99)) # perform a minibatch update and record the cost for this batch Xb = to_fX(Xtr.take(batch_idx, axis=0)) _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=None) result = draw.train_joint(Xb, Mb) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 200) == 0): costs = [(v / 200.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " total_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_term : {0:.4f}".format(costs[2]) str5 = " kld_q2p : {0:.4f}".format(costs[3]) str6 = " kld_p2q : {0:.4f}".format(costs[4]) str7 = " reg_term : {0:.4f}".format(costs[5]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): draw.save_model_params( "TBCLM_IMP_MNIST_PARAMS_OD{}_DP{}_{}_{}.pkl".format( occ_dim, dp_int, step_type, att_tag)) # compute a small-sample estimate of NLL bound on validation set Xva = row_shuffle(Xva) Xb = to_fX(Xva[:5000]) _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=None) va_costs = draw.compute_nll_bound(Xb, Mb) str1 = " va_nll_bound : {}".format(va_costs[1]) str2 = " va_nll_term : {}".format(va_costs[2]) str3 = " va_kld_q2p : {}".format(va_costs[3]) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() # draw some independent samples from the model Xb = to_fX(Xva[:100]) _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=None) samples, _ = draw.do_sample(Xb, Mb) n_iter, N, D = samples.shape samples = samples.reshape((n_iter, N, 28, 28)) for j in xrange(n_iter): img = img_grid(samples[j, :, :, :]) img.save( "TBCLM-IMP-MNIST-OD{0:d}-DP{1:d}-{2:s}-samples-{3:03d}.png" .format(occ_dim, dp_int, step_type, j))
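# The learning rate actually applied in the loop above is the product of a
# linear warm-up ramp and a periodic multiplicative decay. A small helper
# that mirrors those constants (warm-up over the first 1000 batches, decay
# by 0.95 every 10000); a sketch for clarity, not code the model requires.
def lr_schedule_sketch(i, base_lr=0.0002, warmup=1000.0,
                       decay_every=10000, decay=0.95):
    """Effective learning rate at batch i: linear warm-up, step decay."""
    scale = min(1.0, (i + 1) / warmup)
    return scale * base_lr * (decay ** ((i + 1) // decay_every))

assert abs(lr_schedule_sketch(499) - (0.5 * 0.0002)) < 1e-12
assert lr_schedule_sketch(25000) < lr_schedule_sketch(5000)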
def pretrain_osm(lam_kld=0.0):
    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    Xtr = Xtr.get_value(borrow=False)
    Xva = datasets[2][0]
    Xva = Xva.get_value(borrow=False)
    print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape)))

    # get and set some basic dataset information
    Xtr_mean = np.mean(Xtr, axis=0)
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 100
    batch_reps = 5

    # setup some symbolic variables and stuff
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    data_dim = Xtr.shape[1]
    prior_sigma = 1.0

    ##########################
    # NETWORK CONFIGURATIONS #
    ##########################
    # choose some parameters for the generator network
    gn_params = {}
    shared_config = [PRIOR_DIM, 1000, 1000]
    top_config = [shared_config[-1], data_dim]
    gn_params['shared_config'] = shared_config
    gn_params['mu_config'] = top_config
    gn_params['sigma_config'] = top_config
    gn_params['activation'] = relu_actfun
    gn_params['init_scale'] = 1.4
    gn_params['lam_l2a'] = 0.0
    gn_params['vis_drop'] = 0.0
    gn_params['hid_drop'] = 0.0
    gn_params['bias_noise'] = 0.0
    gn_params['input_noise'] = 0.0
    # choose some parameters for the continuous inferencer
    in_params = {}
    shared_config = [data_dim, 1000, 1000]
    top_config = [shared_config[-1], PRIOR_DIM]
    in_params['shared_config'] = shared_config
    in_params['mu_config'] = top_config
    in_params['sigma_config'] = top_config
    in_params['activation'] = relu_actfun
    in_params['init_scale'] = 1.4
    in_params['lam_l2a'] = 0.0
    in_params['vis_drop'] = 0.0
    in_params['hid_drop'] = 0.0
    in_params['bias_noise'] = 0.0
    in_params['input_noise'] = 0.0
    # Initialize the base networks for this OneStageModel
    IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma,
                params=in_params, shared_param_dicts=None)
    GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma,
                params=gn_params, shared_param_dicts=None)
    # Initialize biases in IN and GN
    IN.init_biases(0.2)
    GN.init_biases(0.2)

    ##################################
    # INITIALIZE THE ONE-STAGE MODEL #
    ##################################
    osm_params = {}
    osm_params['x_type'] = 'bernoulli'
    osm_params['xt_transform'] = 'sigmoid'
    osm_params['logvar_bound'] = LOGVAR_BOUND
    OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm,
                        p_x_given_z=GN, q_z_given_x=IN,
                        x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params)
    OSM.set_lam_l2w(1e-5)
    # point the output bias at the (squashed) logit of the training mean,
    # and center the inputs on the training mean
    safe_mean = (0.9 * Xtr_mean) + 0.05
    safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean))
    OSM.set_output_bias(safe_mean_logit)
    OSM.set_input_bias(-Xtr_mean)

    ######################
    # BASIC VAE TRAINING #
    ######################
    out_file = open(RESULT_PATH + "pt_osm_results.txt", 'wb')
    # Set initial learning rate and basic SGD hyper parameters
    obs_costs = np.zeros((batch_size,))
    costs = [0. for i in range(10)]
    learn_rate = 0.0005
    for i in range(150000):
        scale = min(1.0, float(i) / 10000.0)
        if ((i > 1) and ((i % 20000) == 0)):
            learn_rate = learn_rate * 0.9
        # sample the training examples for this minibatch update
        tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size,))
        Xd_batch = Xtr.take(tr_idx, axis=0)
        Xc_batch = 0.0 * Xd_batch
        Xm_batch = 0.0 * Xd_batch
        # do a minibatch update of the model, and compute some costs
        OSM.set_sgd_params(lr_1=(scale * learn_rate), mom_1=0.5, mom_2=0.98)
        OSM.set_lam_nll(1.0)
        OSM.set_lam_kld(lam_kld_1=(1.0 + (scale * (lam_kld - 1.0))), lam_kld_2=0.0)
        result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 1000) == 0):
            # record and then reset the cost trackers
            costs = [(v / 1000.0) for v in costs]
            str_1 = "-- batch {0:d} --".format(i)
            str_2 = "    joint_cost: {0:.4f}".format(costs[0])
            str_3 = "    nll_cost  : {0:.4f}".format(costs[1])
            str_4 = "    kld_cost  : {0:.4f}".format(costs[2])
            str_5 = "    reg_cost  : {0:.4f}".format(costs[3])
            costs = [0.0 for v in costs]
            # print out some diagnostic information
            joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
        if ((i % 2000) == 0):
            Xva = row_shuffle(Xva)
            # draw some independent samples from the model's prior
            model_samps = OSM.sample_from_prior(500)
            file_name = RESULT_PATH + "pt_osm_samples_b{0:d}_XG.png".format(i)
            utils.visualize_samples(model_samps, file_name, num_rows=20)
            # compute information about free-energy on validation set
            file_name = RESULT_PATH + "pt_osm_free_energy_b{0:d}.png".format(i)
            fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20)
            fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1])
            fe_str = "    nll_bound : {0:.4f}".format(fe_mean)
            print(fe_str)
            out_file.write(fe_str + "\n")
            utils.plot_scatter(fe_terms[1], fe_terms[0], file_name,
                               x_label='Posterior KLd',
                               y_label='Negative Log-likelihood')
            # compute information about posterior KLds on validation set
            file_name = RESULT_PATH + "pt_osm_post_klds_b{0:d}.png".format(i)
            post_klds = OSM.compute_post_klds(Xva[0:2500])
            post_dim_klds = np.mean(post_klds, axis=0)
            utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds,
                            file_name)
        if ((i % 5000) == 0):
            IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_IN.pkl".format(i))
            GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_GN.pkl".format(i))
    IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_IN.pkl")
    GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_GN.pkl")
    return
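# pretrain_osm points the decoder's output biases at the logit of a "safe"
# version of the training mean, so the untrained model already reproduces
# average pixel intensities. Squashing the mean into [0.05, 0.95] first
# keeps the logit finite for pixels that are always 0 or always 1. A
# standalone numpy sketch of that initialization:
import numpy as np

def safe_mean_logit_sketch(x_mean):
    """Logit of the per-pixel means, squashed away from 0 and 1."""
    safe_mean = (0.9 * x_mean) + 0.05      # maps [0, 1] into [0.05, 0.95]
    return np.log(safe_mean / (1.0 - safe_mean))

print(safe_mean_logit_sketch(np.array([0.0, 0.13, 1.0])))  # finite at extremes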
def test_seq_cond_gen_sequence(step_type='add'): ############################## # File tag, for output stuff # ############################## result_tag = "{}BBB_SCG".format(RESULT_PATH) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) # get training/validation/test images Xtr = datasets[0][0] Xva = datasets[1][0] Xte = datasets[2][0] Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) Xte = to_fX(shift_and_scale_into_01(Xte)) obs_dim = Xtr.shape[1] # get label representations tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 step_reps = 3 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ total_steps = step_reps * 28 init_steps = step_reps exit_rate = 0.0 nll_weight = 1.0 / step_reps x_dim = 28 y_dim = 28 z_dim = 100 rnn_dim = 300 write_dim = 250 mlp_dim = 250 def visualize_attention(sampler_result, pre_tag="AAA", post_tag="AAA"): # get generated predictions seq_len = sampler_result[0].shape[0] samp_count = sampler_result[0].shape[1] x_dim = sampler_result[0].shape[2] seq_samps = np.zeros((samp_count, 28*28)) for samp in range(samp_count): step = 0 samp_vals = np.zeros((28,28)) for col in range(28): col_vals = np.zeros((28,)) for rep in range(step_reps): if (rep == (step_reps-1)): col_vals = sampler_result[0][step,samp,:] step += 1 samp_vals[:,col] = col_vals seq_samps[samp,:] = samp_vals.ravel() file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag) utils.visualize_samples(seq_samps, file_name, num_rows=10) # get sequential attention maps seq_samps = np.zeros((samp_count, 28*28)) for samp in range(samp_count): step = 0 samp_vals = np.zeros((28,28)) for col in range(28): col_vals = np.zeros((28,)) for rep in range(step_reps): col_vals = col_vals + sampler_result[1][step,samp,:x_dim] col_vals = col_vals + sampler_result[1][step,samp,x_dim:] step += 1 samp_vals[:,col] = col_vals / (2.0*step_reps) seq_samps[samp,:] = samp_vals.ravel() file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag) utils.visualize_samples(seq_samps, file_name, num_rows=10) # get sequential attention maps (read out values) seq_samps = np.zeros((samp_count, 28*28)) for samp in range(samp_count): step = 0 samp_vals = np.zeros((28,28)) for col in range(28): col_vals = np.zeros((28,)) for rep in range(step_reps): col_vals = col_vals + sampler_result[2][step,samp,:x_dim] col_vals = col_vals + sampler_result[2][step,samp,x_dim:] step += 1 samp_vals[:,col] = col_vals / (2.0*step_reps) seq_samps[samp,:] = samp_vals.ravel() file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag) utils.visualize_samples(seq_samps, file_name, num_rows=10) return def batch_reshape(Xb, reps=step_reps): # reshape for stuff bs = Xb.shape[0] xb = Xb.reshape((bs, 28, 28)).swapaxes(0,2).swapaxes(1,2) xb = xb.repeat(reps, axis=0) return xb rnninits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } read_N = 2 # inner/outer grid dimension for reader read_dim = 2*read_N # total number of "pixels" read by reader reader_mlp = SimpleAttentionReader1d(x_dim=x_dim, con_dim=rnn_dim, N=read_N, init_scale=2.0, **inits) writer_mlp = MLP([None, None], [rnn_dim, write_dim, y_dim], \ 
name="writer_mlp", **inits) # mlps for processing inputs to LSTMs con_mlp_in = MLP([Identity()], [ z_dim, 4*rnn_dim], \ name="con_mlp_in", **inits) var_mlp_in = MLP([Identity()], [(y_dim + read_dim + rnn_dim), 4*rnn_dim], \ name="var_mlp_in", **inits) gen_mlp_in = MLP([Identity()], [ (read_dim + rnn_dim), 4*rnn_dim], \ name="gen_mlp_in", **inits) # mlps for turning LSTM outputs into conditionals over z_gen con_mlp_out = CondNet([], [rnn_dim, z_dim], name="con_mlp_out", **inits) gen_mlp_out = CondNet([], [rnn_dim, z_dim], name="gen_mlp_out", **inits) var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits) # LSTMs for the actual LSTMs (obviously, perhaps) con_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="con_rnn", **rnninits) gen_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="gen_rnn", **rnninits) var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="var_rnn", **rnninits) SCG = SeqCondGen( x_and_y_are_seqs=True, # this test uses sequential x/y total_steps=total_steps, init_steps=init_steps, exit_rate=exit_rate, nll_weight=nll_weight, step_type=step_type, x_dim=x_dim, y_dim=y_dim, reader_mlp=reader_mlp, writer_mlp=writer_mlp, con_mlp_in=con_mlp_in, con_rnn=con_rnn, gen_mlp_in=gen_mlp_in, gen_mlp_out=gen_mlp_out, gen_rnn=gen_rnn, var_mlp_in=var_mlp_in, var_mlp_out=var_mlp_out, var_rnn=var_rnn) SCG.initialize() compile_start_time = time.time() # build the cost gradients, training function, samplers, etc. SCG.build_attention_funcs() ########################################### # Sample and draw attention trajectories. # ########################################### samp_count = 100 Xb = Xva[:samp_count,:] Xb = batch_reshape(Xb, reps=step_reps) print("Xb.shape: {}".format(Xb.shape)) result = SCG.sample_attention(Xb, Xb) visualize_attention(result, pre_tag=result_tag, post_tag="b0") print("TESTED SAMPLER!") Xva = row_shuffle(Xva) Xb = Xva[:500] Xb = batch_reshape(Xb, reps=step_reps) va_costs = SCG.simple_nll_bound(Xb, Xb) print("nll_bound : {}".format(va_costs[0])) print("nll_term : {}".format(va_costs[1])) print("kld_q2p : {}".format(va_costs[2])) print("TESTED NLL BOUND!") SCG.build_model_funcs() compile_end_time = time.time() compile_minutes = (compile_end_time - compile_start_time) / 60.0 print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes)) #SCG.load_model_params(f_name="SCG_params.pkl") ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ print("Beginning to train the model...") out_file = open("{}_results.txt".format(result_tag), 'wb') out_file.flush() costs = [0. 
for i in range(10)] learn_rate = 0.0001 momentum = 0.75 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 2500.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 if (i > 10000): momentum = 0.95 else: momentum = 0.75 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update SCG.set_sgd_params(lr=learn_rate, mom_1=momentum, mom_2=0.99) SCG.set_lam_kld(lam_kld_q2p=0.95, lam_kld_p2q=0.05, lam_kld_p2g=0.0) # perform a minibatch update and record the cost for this batch Xb = Xtr.take(batch_idx, axis=0) Xb = batch_reshape(Xb, reps=step_reps) result = SCG.train_joint(Xb, Xb) costs = [(costs[j] + result[j]) for j in range(len(result))] # output diagnostic information and checkpoint parameters, etc. if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " total_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_term : {0:.4f}".format(costs[2]) str5 = " kld_q2p : {0:.4f}".format(costs[3]) str6 = " kld_p2q : {0:.4f}".format(costs[4]) str7 = " kld_p2g : {0:.4f}".format(costs[5]) str8 = " reg_term : {0:.4f}".format(costs[6]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7, str8]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 500) == 0): #((i % 1000) == 0): SCG.save_model_params("{}_params.pkl".format(result_tag)) # compute a small-sample estimate of NLL bound on validation set Xva = row_shuffle(Xva) Xb = Xva[:500] Xb = batch_reshape(Xb, reps=step_reps) va_costs = SCG.compute_nll_bound(Xb, Xb) str1 = " va_nll_bound : {}".format(va_costs[1]) str2 = " va_nll_term : {}".format(va_costs[2]) str3 = " va_kld_q2p : {}".format(va_costs[3]) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() ########################################### # Sample and draw attention trajectories. # ########################################### post_tag = "b{}".format(i) Xb = Xva[:100,:] Xb = batch_reshape(Xb, reps=step_reps) result = SCG.sample_attention(Xb, Xb) visualize_attention(result, pre_tag=result_tag, post_tag=post_tag)
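# batch_reshape above turns a batch of flat MNIST images into a sequence of
# 28-pixel image slices, repeating each slice step_reps times so the model
# gets several glimpses per step. A standalone shape check of the same
# transform:
import numpy as np

def batch_reshape_sketch(Xb, reps, im_dim=28):
    """(batch, im_dim**2) -> (im_dim * reps, batch, im_dim) sequence."""
    bs = Xb.shape[0]
    xb = Xb.reshape((bs, im_dim, im_dim)).swapaxes(0, 2).swapaxes(1, 2)
    return xb.repeat(reps, axis=0)

seq = batch_reshape_sketch(np.arange(2 * 784).reshape((2, 784)), reps=3)
assert seq.shape == (28 * 3, 2, 28)
assert np.all(seq[0] == seq[1]) and np.all(seq[1] == seq[2])  # repeats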
def test_mnist(step_type='add', imp_steps=6, occ_dim=15, drop_prob=0.0): ######################################### # Format the result tag more thoroughly # ######################################### dp_int = int(100.0 * drop_prob) result_tag = "{}GPSI_conv_bn_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim, dp_int, imp_steps, step_type) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = datasets[0][0] Xva = datasets[1][0] Xte = datasets[2][0] # Merge validation set and training set, and test on test set. Xtr = np.concatenate((Xtr, Xva), axis=0) Xva = Xte Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 batch_reps = 1 all_pix_mean = np.mean(np.mean(Xtr, axis=1)) data_mean = to_fX( all_pix_mean * np.ones((Xtr.shape[1],)) ) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] z_dim = 100 init_scale = 1.0 use_bn = True x_in_sym = T.matrix('x_in_sym') x_out_sym = T.matrix('x_out_sym') x_mask_sym = T.matrix('x_mask_sym') ################# # p_zi_given_xi # ################# params = {} shared_config = \ [ {'layer_type': 'conv', 'in_chans': 1, # in shape: (batch, 784) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_in': lambda x: T.reshape(x, (-1, 1, 28, 28))}, \ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 128, # out shape: (batch, 128, 7, 7) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'fc', 'in_chans': 128*7*7, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': use_bn} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = init_scale params['build_theano_funcs'] = False p_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_zi_given_xi.init_biases(0.0) ################### # p_sip1_given_zi # ################### params = {} shared_config = \ [ {'layer_type': 'fc', 'in_chans': z_dim, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': use_bn}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': 7*7*128, 'activation': relu_actfun, 'apply_bn': use_bn, 'shape_func_out': lambda x: T.reshape(x, (-1, 128, 7, 7))}, \ {'layer_type': 'conv', 'in_chans': 128, # in shape: (batch, 128, 7, 7) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': use_bn} ] output_config = \ [ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 1, # out shape: (batch, 1, 28, 28) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'conv', 'in_chans': 64, 'out_chans': 1, 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 
'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'conv', 'in_chans': 64, 'out_chans': 1, 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'half', 'apply_bn': False, 'shape_func_out': lambda x: T.flatten(x, 2)} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = init_scale params['build_theano_funcs'] = False p_sip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) p_sip1_given_zi.init_biases(0.0) ################# # q_zi_given_xi # ################# params = {} shared_config = \ [ {'layer_type': 'conv', 'in_chans': 2, # in shape: (batch, 784+784) 'out_chans': 64, # out shape: (batch, 64, 14, 14) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_in': lambda x: T.reshape(x, (-1, 2, 28, 28))}, \ {'layer_type': 'conv', 'in_chans': 64, # in shape: (batch, 64, 14, 14) 'out_chans': 128, # out shape: (batch, 128, 7, 7) 'activation': relu_actfun, 'filt_dim': 5, 'conv_stride': 'double', 'apply_bn': use_bn, 'shape_func_out': lambda x: T.flatten(x, 2)}, \ {'layer_type': 'fc', 'in_chans': 128*7*7, 'out_chans': 256, 'activation': relu_actfun, 'apply_bn': use_bn} ] output_config = \ [ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False}, \ {'layer_type': 'fc', 'in_chans': 256, 'out_chans': z_dim, 'activation': relu_actfun, 'apply_bn': False} ] params['shared_config'] = shared_config params['output_config'] = output_config params['init_scale'] = init_scale params['build_theano_funcs'] = False q_zi_given_xi = HydraNet(rng=rng, Xd=x_in_sym, \ params=params, shared_param_dicts=None) q_zi_given_xi.init_biases(0.0) ########################################################### # Define parameters for the GPSImputer, and initialize it # ########################################################### print("Building the GPSImputer...") gpsi_params = {} gpsi_params['x_dim'] = x_dim gpsi_params['z_dim'] = z_dim # switch between direct construction and construction via p_x_given_si gpsi_params['imp_steps'] = imp_steps gpsi_params['step_type'] = step_type gpsi_params['x_type'] = 'bernoulli' gpsi_params['obs_transform'] = 'sigmoid' GPSI = GPSImputer(rng=rng, x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, p_zi_given_xi=p_zi_given_xi, p_sip1_given_zi=p_sip1_given_zi, q_zi_given_xi=q_zi_given_xi, params=gpsi_params, shared_param_dicts=None) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ log_name = "{}_RESULTS.txt".format(result_tag) out_file = open(log_name, 'wb') costs = [0. 
for i in range(10)] learn_rate = 0.0001 momentum = 0.90 batch_idx = np.arange(batch_size) + tr_samples for i in range(200000): scale = min(1.0, ((i+1) / 5000.0)) if (((i + 1) % 15000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update GPSI.set_sgd_params(lr=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.98) GPSI.set_train_switch(1.0) GPSI.set_lam_nll(lam_nll=1.0) GPSI.set_lam_kld(lam_kld_q=1.0, lam_kld_p=0.1, lam_kld_g=0.0) GPSI.set_lam_l2w(1e-5) # perform a minibatch update and record the cost for this batch xb = to_fX( Xtr.take(batch_idx, axis=0) ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) result = GPSI.train_joint(xi, xo, xm, batch_reps) # do diagnostics and general training tracking costs = [(costs[j] + result[j]) for j in range(len(result)-1)] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): Xva = row_shuffle(Xva) # record an estimate of performance on the test set xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) nll, kld = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10) vfe = np.mean(nll) + np.mean(kld) str1 = " va_nll_bound : {}".format(vfe) str2 = " va_nll_term : {}".format(np.mean(nll)) str3 = " va_kld_q2p : {}".format(np.mean(kld)) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() if ((i % 2000) == 0): #GPSI.save_to_file("{}_PARAMS.pkl".format(result_tag)) # Get some validation samples for evaluating model performance xb = to_fX( Xva[0:100] ) xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \ occ_dim=occ_dim, data_mean=data_mean) xi = np.repeat(xi, 2, axis=0) xo = np.repeat(xo, 2, axis=0) xm = np.repeat(xm, 2, axis=0) # draw some sample imputations from the model samp_count = xi.shape[0] _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "{0:s}_samples_ng_b{1:d}.png".format(result_tag, i) utils.visualize_samples(seq_samps, file_name, num_rows=20)
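# All of these training loops share the same minibatch indexing idiom:
# batch_idx starts past the end of the data so the first update forces a
# shuffle, then a fixed-size window slides forward and the data is
# reshuffled whenever the window runs off the end. The same pattern as a
# standalone generator (row_shuffle is the project's helper; a permutation
# stands in for it here):
import numpy as np

def minibatch_sketch(X, batch_size, rng):
    """Yield minibatches forever, reshuffling at each epoch boundary."""
    tr_samples = X.shape[0]
    batch_idx = np.arange(batch_size) + tr_samples   # force initial shuffle
    while True:
        batch_idx += batch_size
        if np.max(batch_idx) >= tr_samples:
            X = X[rng.permutation(tr_samples)]       # "rejumble" the data
            batch_idx = np.arange(batch_size)
        yield X.take(batch_idx, axis=0)

rng_demo = np.random.RandomState(0)
batches = minibatch_sketch(np.arange(1000).reshape((100, 10)), 32, rng_demo)
assert next(batches).shape == (32, 10)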
def test_seq_cond_gen_static(step_type='add'): ############################## # File tag, for output stuff # ############################## result_tag = "{}AAA_SCG".format(RESULT_PATH) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) # get training/validation/test images Xtr = datasets[0][0] Xva = datasets[1][0] Xte = datasets[2][0] Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) Xte = to_fX(shift_and_scale_into_01(Xte)) obs_dim = Xtr.shape[1] # get label representations y_reps = 10 Ytr = one_hot_np(datasets[0][1]-1, cat_dim=10).repeat(y_reps, axis=1) Yva = one_hot_np(datasets[1][1]-1, cat_dim=10).repeat(y_reps, axis=1) Yte = one_hot_np(datasets[2][1]-1, cat_dim=10).repeat(y_reps, axis=1) label_dim = Ytr.shape[1] # merge image and lagel representations print("Xtr.shape: {}".format(Xtr.shape)) print("Ytr.shape: {}".format(Ytr.shape)) XYtr = to_fX( np.hstack( [Xtr, Ytr] ) ) XYva = to_fX( np.hstack( [Xva, Yva] ) ) tr_samples = XYtr.shape[0] va_samples = XYva.shape[0] batch_size = 200 def split_xy(xy_ary): x_ary = xy_ary[:,:obs_dim] y_ary = xy_ary[:,obs_dim:] return x_ary, y_ary ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ total_steps = 10 init_steps = 3 exit_rate = 0.2 x_dim = obs_dim y_dim = obs_dim + label_dim z_dim = 100 rnn_dim = 400 write_dim = 400 mlp_dim = 400 def visualize_attention(result, pre_tag="AAA", post_tag="AAA"): seq_len = result[0].shape[0] samp_count = result[0].shape[1] # get generated predictions x_samps = np.zeros((seq_len*samp_count, obs_dim)) y_samps = np.zeros((seq_len*samp_count, label_dim)) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): x_samps[idx] = result[0][s2,s1,:obs_dim] y_samps[idx] = result[0][s2,s1,obs_dim:] # add ticks at the corners of label predictions, to make them # easier to parse visually. 
max_val = np.mean(result[0][s2,s1,obs_dim:]) y_samps[idx][0] = max_val y_samps[idx][9] = max_val y_samps[idx][-1] = max_val y_samps[idx][-10] = max_val idx += 1 file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag) utils.visualize_samples(x_samps, file_name, num_rows=20) file_name = "{0:s}_traj_ys_{1:s}.png".format(pre_tag, post_tag) utils.visualize_samples(y_samps, file_name, num_rows=20) # get sequential attention maps seq_samps = np.zeros((seq_len*samp_count, x_dim)) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = result[1][s2,s1,:x_dim] + result[1][s2,s1,x_dim:] idx += 1 file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag) utils.visualize_samples(seq_samps, file_name, num_rows=20) # get sequential attention maps (read out values) seq_samps = np.zeros((seq_len*samp_count, x_dim)) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = result[2][s2,s1,:x_dim] + result[2][s2,s1,x_dim:] idx += 1 file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag) utils.visualize_samples(seq_samps, file_name, num_rows=20) return rnninits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } read_N = 2 # inner/outer grid dimension for reader reader_mlp = SimpleAttentionReader2d(x_dim=x_dim, con_dim=rnn_dim, width=28, height=28, N=read_N, img_scale=0.2, att_scale=0.5, **inits) read_dim = reader_mlp.read_dim # total number of "pixels" read by reader writer_mlp = MLP([None, None], [rnn_dim, write_dim, y_dim], \ name="writer_mlp", **inits) # mlps for processing inputs to LSTMs con_mlp_in = MLP([Identity()], [ z_dim, 4*rnn_dim], \ name="con_mlp_in", **inits) var_mlp_in = MLP([Identity()], [(y_dim + read_dim + rnn_dim), 4*rnn_dim], \ name="var_mlp_in", **inits) gen_mlp_in = MLP([Identity()], [ (read_dim + rnn_dim), 4*rnn_dim], \ name="gen_mlp_in", **inits) # mlps for turning LSTM outputs into conditionals over z_gen con_mlp_out = CondNet([Rectifier(), Rectifier()], \ [rnn_dim, mlp_dim, mlp_dim, z_dim], \ name="con_mlp_out", **inits) gen_mlp_out = CondNet([], [rnn_dim, z_dim], name="gen_mlp_out", **inits) var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits) # LSTMs for the actual LSTMs (obviously, perhaps) con_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="con_rnn", **rnninits) gen_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="gen_rnn", **rnninits) var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="var_rnn", **rnninits) SeqCondGen_doc_str = \ """ SeqCondGen -- constructs conditional densities under time constraints. This model sequentially constructs a conditional density estimate by taking repeated glimpses at the input x, and constructing a hypothesis about the output y. The objective is maximum likelihood for (x,y) pairs drawn from some training set. We learn a proper generative model, using variational inference -- which can be interpreted as a sort of guided policy search. The input pairs (x, y) can be either "static" or "sequential". In the static case, the same x and y are used at every step of the hypothesis construction loop. In the sequential case, x and y can change at each step of the loop. Parameters: x_and_y_are_seqs: boolean telling whether the conditioning information and prediction targets are sequential. 
total_steps: total number of steps in sequential estimation process init_steps: number of steps prior to first NLL measurement exit_rate: probability of exiting following each non "init" step **^^ THIS IS SET TO 0 WHEN USING SEQUENTIAL INPUT ^^** nll_weight: weight for the prediction NLL term at each step. **^^ THIS IS IGNORED WHEN USING STATIC INPUT ^^** step_type: whether to use "additive" steps or "jump" steps -- jump steps predict directly from the controller LSTM's "hidden" state (a.k.a. its memory cells). x_dim: dimension of inputs on which to condition y_dim: dimension of outputs to predict reader_mlp: used for reading from the input writer_mlp: used for writing to the output prediction con_mlp_in: preprocesses input to the "controller" LSTM con_rnn: the "controller" LSTM con_mlp_out: CondNet for distribution over z given con_rnn gen_mlp_in: preprocesses input to the "generator" LSTM gen_rnn: the "generator" LSTM gen_mlp_out: CondNet for distribution over z given gen_rnn var_mlp_in: preprocesses input to the "variational" LSTM var_rnn: the "variational" LSTM var_mlp_out: CondNet for distribution over z given gen_rnn """ SCG = SeqCondGen( x_and_y_are_seqs=False, # this test doesn't use sequential x/y total_steps=total_steps, init_steps=init_steps, exit_rate=exit_rate, nll_weight=0.0, # ignored, because x_and_y_are_seqs == False step_type=step_type, x_dim=x_dim, y_dim=y_dim, reader_mlp=reader_mlp, writer_mlp=writer_mlp, con_mlp_in=con_mlp_in, con_mlp_out=con_mlp_out, con_rnn=con_rnn, gen_mlp_in=gen_mlp_in, gen_mlp_out=gen_mlp_out, gen_rnn=gen_rnn, var_mlp_in=var_mlp_in, var_mlp_out=var_mlp_out, var_rnn=var_rnn) SCG.initialize() compile_start_time = time.time() # build the attention trajectory sampler SCG.build_attention_funcs() # quick test of attention trajectory sampler samp_count = 100 XYb = XYva[:samp_count,:] Xb, Yb = split_xy(XYb) #Xb = Xva[:samp_count] result = SCG.sample_attention(Xb, XYb) visualize_attention(result, pre_tag=result_tag, post_tag="b0") # build the main model functions (i.e. training and cost functions) SCG.build_model_funcs() compile_end_time = time.time() compile_minutes = (compile_end_time - compile_start_time) / 60.0 print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes)) #SCG.load_model_params(f_name="SCG_params.pkl") ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ print("Beginning to train the model...") out_file = open("{}_results.txt".format(result_tag), 'wb') out_file.flush() costs = [0. 
for i in range(10)] learn_rate = 0.0001 momentum = 0.8 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 2500.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 if (i > 10000): momentum = 0.95 else: momentum = 0.8 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set XYtr = row_shuffle(XYtr) #Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update SCG.set_sgd_params(lr=learn_rate, mom_1=momentum, mom_2=0.99) SCG.set_lam_kld(lam_kld_q2p=0.95, lam_kld_p2q=0.05, lam_kld_p2g=0.1) # perform a minibatch update and record the cost for this batch XYb = XYtr.take(batch_idx, axis=0) Xb, Yb = split_xy(XYb) #Xb = Xtr.take(batch_idx, axis=0) result = SCG.train_joint(Xb, XYb) costs = [(costs[j] + result[j]) for j in range(len(result))] # output diagnostic information and checkpoint parameters, etc. if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " total_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_term : {0:.4f}".format(costs[2]) str5 = " kld_q2p : {0:.4f}".format(costs[3]) str6 = " kld_p2q : {0:.4f}".format(costs[4]) str7 = " kld_p2g : {0:.4f}".format(costs[5]) str8 = " reg_term : {0:.4f}".format(costs[6]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7, str8]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 500) == 0): #((i % 1000) == 0): SCG.save_model_params("{}_params.pkl".format(result_tag)) # compute a small-sample estimate of NLL bound on validation set XYva = row_shuffle(XYva) XYb = XYva[:1000] Xb, Yb = split_xy(XYb) #Xva = row_shuffle(Xva) #Xb = Xva[:1000] va_costs = SCG.compute_nll_bound(Xb, XYb) str1 = " va_nll_bound : {}".format(va_costs[1]) str2 = " va_nll_term : {}".format(va_costs[2]) str3 = " va_kld_q2p : {}".format(va_costs[3]) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() ########################################### # Sample and draw attention trajectories. # ########################################### samp_count = 100 XYb = XYva[:samp_count,:] Xb, Yb = split_xy(XYb) #Xb = Xva[:samp_count] result = SCG.sample_attention(Xb, XYb) post_tag = "b{0:d}".format(i) visualize_attention(result, pre_tag=result_tag, post_tag=post_tag)
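# test_seq_cond_gen_static widens each one-hot label by repeating it y_reps
# times, so the label block carries as much NLL weight as a small image
# patch, then concatenates image and label into one target vector that
# split_xy can cut back apart. A numpy sketch of the representation
# (one_hot_sketch is a hypothetical stand-in for the project's one_hot_np):
import numpy as np

def one_hot_sketch(y, cat_dim):
    """Integer labels -> one-hot rows."""
    oh = np.zeros((y.shape[0], cat_dim))
    oh[np.arange(y.shape[0]), y] = 1.0
    return oh

Y_demo = one_hot_sketch(np.array([0, 3, 9]), cat_dim=10).repeat(10, axis=1)
X_demo = np.random.rand(3, 784)
XY_demo = np.hstack([X_demo, Y_demo])            # merged (image, label) target
Xb_demo, Yb_demo = XY_demo[:, :784], XY_demo[:, 784:]  # what split_xy recovers
assert Y_demo.shape == (3, 100) and np.allclose(Yb_demo, Y_demo)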
def test_oi_seq_cond_gen(attention=False): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = datasets[0][0] Xva = datasets[1][0] Xte = datasets[2][0] # Merge validation set and training set, and test on test set. #Xtr = np.concatenate((Xtr, Xva), axis=0) #Xva = Xte Xtr = to_fX(shift_and_scale_into_01(Xtr)) Xva = to_fX(shift_and_scale_into_01(Xva)) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = 28 outer_steps = 27 inner_steps = 5 rnn_dim = 128 write_dim = 64 mlp_dim = 128 z_dim = 50 rnninits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } # setup the reader and writer if attention: read_N = 3 # inner/outer grid dimension for reader reader_mlp = SimpleAttentionReader1d(x_dim=x_dim, con_dim=rnn_dim, N=read_N, init_scale=2.0, **inits) read_dim = reader_mlp.read_dim att_tag = "YA" else: read_dim = 2*x_dim reader_mlp = Reader(x_dim=x_dim, dec_dim=rnn_dim, **inits) att_tag = "NA" writer_mlp = MLP([None, None], [rnn_dim, write_dim, x_dim], \ name="writer_mlp", **inits) # mlps for processing inputs to LSTMs con_mlp_in = MLP([Identity()], [ z_dim, 4*rnn_dim], \ name="con_mlp_in", **inits) gen_mlp_in = MLP([Identity()], [ (read_dim + rnn_dim), 4*rnn_dim], \ name="gen_mlp_in", **inits) var_mlp_in = MLP([Identity()], [(x_dim + read_dim + rnn_dim), 4*rnn_dim], \ name="var_mlp_in", **inits) mem_mlp_in = MLP([Identity()], [ 2*rnn_dim, 4*rnn_dim], \ name="mem_mlp_in", **inits) # mlps for turning LSTM outputs into conditionals over z_gen gen_mlp_out = CondNet([], [rnn_dim, z_dim], name="gen_mlp_out", **inits) var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits) mem_mlp_out = MLP([Identity()], [rnn_dim, 2*rnn_dim], \ name="mem_mlp_out", **inits) # LSTMs for the actual LSTMs (obviously, perhaps) con_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="con_rnn", **rnninits) gen_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="gen_rnn", **rnninits) var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="var_rnn", **rnninits) mem_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="mem_rnn", **rnninits) OISeqCondGen_doc_str = \ """ OISeqCondGen -- a model for predicting inputs, given previous inputs. For each input in a sequence, this model sequentially builds a prediction for the next input. Each of these predictions conditions directly on the previous input, and indirectly on even earlier inputs. Conditioning on the current input is either "fully informed" or "attention based". Conditioning on even earlier inputs is through state that is carried across predictions using, e.g., an LSTM. 
Parameters: obs_dim: dimension of inputs to observe and predict outer_steps: #predictions to make inner_steps: #steps when constructing each prediction reader_mlp: used for reading from the current input writer_mlp: used for writing to prediction of the next input con_mlp_in: preprocesses input to the "controller" LSTM con_rnn: the "controller" LSTM gen_mlp_in: preprocesses input to the "generator" LSTM gen_rnn: the "generator" LSTM gen_mlp_out: CondNet for distribution over z given gen_rnn var_mlp_in: preprocesses input to the "variational" LSTM var_rnn: the "variational" LSTM var_mlp_out: CondNet for distribution over z given gen_rnn mem_mlp_in: preprocesses input to the "memory" LSTM mem_rnn: the "memory" LSTM (this stores inter-prediction state) mem_mlp_out: emits initial controller state for each prediction """ IMS = OISeqCondGen( obs_dim=x_dim, outer_steps=outer_steps, inner_steps=inner_steps, reader_mlp=reader_mlp, writer_mlp=writer_mlp, con_mlp_in=con_mlp_in, con_rnn=con_rnn, gen_mlp_in=gen_mlp_in, gen_mlp_out=gen_mlp_out, gen_rnn=gen_rnn, var_mlp_in=var_mlp_in, var_mlp_out=var_mlp_out, var_rnn=var_rnn, mem_mlp_in=mem_mlp_in, mem_mlp_out=mem_mlp_out, mem_rnn=mem_rnn ) IMS.initialize() # build the cost gradients, training function, samplers, etc. IMS.build_model_funcs() #IMS.load_model_params(f_name="SRRM_params.pkl") ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ print("Beginning to train the model...") out_file = open("IMS_results.txt", 'wb') out_file.flush() costs = [0. for i in range(10)] learn_rate = 0.0001 momentum = 0.75 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 2500.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 if (i > 10000): momentum = 0.95 else: momentum = 0.75 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update zero_ary = np.zeros((1,)) IMS.lr.set_value(to_fX(zero_ary + learn_rate)) IMS.mom_1.set_value(to_fX(zero_ary + momentum)) IMS.mom_2.set_value(to_fX(zero_ary + 0.99)) # perform a minibatch update and record the cost for this batch Xb = to_fX(Xtr.take(batch_idx, axis=0)) Xb = Xb.reshape(batch_size, x_dim, x_dim).swapaxes(0,2).swapaxes(1,2) result = IMS.train_joint(Xb) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 100) == 0): costs = [(v / 100.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " total_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_term : {0:.4f}".format(costs[2]) str5 = " kld_q2p : {0:.4f}".format(costs[3]) str6 = " kld_p2q : {0:.4f}".format(costs[4]) str7 = " reg_term : {0:.4f}".format(costs[5]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): IMS.save_model_params("IMS_params.pkl") # compute a small-sample estimate of NLL bound on validation set Xva = row_shuffle(Xva) Xb = to_fX(Xva[:5000]) Xb = Xb.reshape(batch_size, x_dim, x_dim).swapaxes(0,2).swapaxes(1,2) va_costs = IMS.compute_nll_bound(Xb) str1 = " va_nll_bound : {}".format(va_costs[1]) str2 = " va_nll_term : 
{}".format(va_costs[2]) str3 = " va_kld_q2p : {}".format(va_costs[3]) joint_str = "\n".join([str1, str2, str3]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush()
def check_mnist_walkout(): # DERPA DERPA DOO KLD_PATH = "MNIST_WALKOUT_TEST_KLD/" VAE_PATH = "MNIST_WALKOUT_TEST_VAE/" RESULT_PATH = "MNIST_WALKOUT_RESULTS/" # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr_shared = datasets[0][0] Xva_shared = datasets[1][0] Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX) Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # Construct a GenNet and an InfNet, then test constructor for GIPair. # Do basic testing, to make sure classes aren't completely broken. Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 p_vals_kld, v_vals_kld, p_vals_vae, v_vals_vae = [], [], [], [] kl_vals_kld, ll_vals_kld, kl_vals_vae, ll_vals_vae = [], [], [], [] ######################################################## # CHECK MODEL BEHAVIOR AT DIFFERENT STAGES OF TRAINING # ######################################################## for i in range(10000,200000): if ((i % 10000) == 0): if (i <= 80000): net_type = 'gip' b = i else: net_type = 'walk' b = i - 80000 ############################################################# # Process the GIPair trained with strong KLd regularization # ############################################################# gn_fname = KLD_PATH + "pt_{0:s}_params_b{1:d}_GN.pkl".format(net_type, b) in_fname = KLD_PATH + "pt_{0:s}_params_b{1:d}_IN.pkl".format(net_type, b) IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) IN.set_sigma_scale(1.0) prior_dim = GN.latent_dim post_klds_kld = posterior_klds(IN, Xtr, 5000, 5) # Initialize the GIPair GIP_KLD = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, params=None) GIP_KLD.set_lam_l2w(1e-4) GIP_KLD.set_lam_nll(1.0) GIP_KLD.set_lam_kld(1.0) # draw samples freely from the generative model's prior Xs = GIP_KLD.sample_from_prior(20*20) file_name = RESULT_PATH + "prior_samples_b{0:d}_kld.png".format(i) utils.visualize_samples(Xs, file_name, num_rows=20) # test Parzen density estimator built from prior samples Xs = GIP_KLD.sample_from_prior(10000, sigma=1.0) parzen_vals_kld = cross_validate_sigma(Xs, Xva, [0.1, 0.13, 0.15, 0.18, 0.2], 50) # get variational bound info var_vals_kld = GIP_KLD.compute_ll_bound(Xva) # record info about variational and parzen bounds p_vals_kld.append(parzen_vals_kld[1]) v_vals_kld.append(np.mean(var_vals_kld[0])) ################################################################ # Process the GIPair trained with basic VAE KLd regularization # ################################################################ gn_fname = VAE_PATH + "pt_{0:s}_params_b{1:d}_GN.pkl".format(net_type, b) in_fname = VAE_PATH + "pt_{0:s}_params_b{1:d}_IN.pkl".format(net_type, b) IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) IN.set_sigma_scale(1.0) prior_dim = GN.latent_dim post_klds_vae = posterior_klds(IN, Xtr, 5000, 5) # Initialize the GIPair GIP_VAE = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, params=None) GIP_VAE.set_lam_l2w(1e-4) GIP_VAE.set_lam_nll(1.0) GIP_VAE.set_lam_kld(1.0) # draw samples freely from 
the generative model's prior Xs = GIP_VAE.sample_from_prior(20*20) file_name = RESULT_PATH + "prior_samples_b{0:d}_vae.png".format(i) utils.visualize_samples(Xs, file_name, num_rows=20) # test Parzen density estimator built from prior samples Xs = GIP_VAE.sample_from_prior(10000, sigma=1.0) parzen_vals_vae = cross_validate_sigma(Xs, Xva, [0.12, 0.15, 0.18, 0.20, 0.25], 50) # get variational bound info var_vals_vae = GIP_VAE.compute_ll_bound(Xva) # record info about variational and parzen bounds p_vals_vae.append(parzen_vals_vae[1]) v_vals_vae.append(np.mean(var_vals_vae[0])) ######################## # Plot posterior KLds. # ######################## file_name = RESULT_PATH + "post_klds_b{0:d}.pdf".format(i) draw_posterior_kld_hist( \ np.asarray(post_klds_kld), np.asarray(post_klds_vae), file_name, bins=30) if i in [20000, 50000, 80000, 110000, 150000, 190000]: # select random random indices into the validation set va_idx = npr.randint(0,high=va_samples,size=(150,)) # record information about their current variational bounds kl_vals_kld.extend([v for v in var_vals_kld[1][va_idx]]) ll_vals_kld.extend([v for v in var_vals_kld[2][va_idx]]) kl_vals_vae.extend([v for v in var_vals_vae[1][va_idx]]) ll_vals_vae.extend([v for v in var_vals_vae[2][va_idx]]) # do some plotting s1_name = RESULT_PATH + "parzen_vs_variational.pdf" s2_name = RESULT_PATH + "kld_vs_likelihood.pdf" draw_parzen_vs_variational_scatter(p_vals_kld, v_vals_kld, \ p_vals_vae, v_vals_vae, f_name=s1_name) draw_kld_vs_likelihood_scatter(kl_vals_kld, ll_vals_kld, \ kl_vals_vae, ll_vals_vae, f_name=s2_name) return
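# check_mnist_walkout scores prior samples with a cross-validated Parzen
# (kernel density) estimator, the usual trick for extracting a rough
# log-likelihood from a model that can only be sampled. cross_validate_sigma
# is the project's helper; the estimator it evaluates can be sketched in
# numpy with a log-sum-exp over isotropic Gaussian kernels:
import numpy as np

def parzen_ll_sketch(samples, X, sigma):
    """Mean log-density of rows of X under a Parzen estimate built
    from `samples`, with kernel width sigma."""
    n, d = samples.shape
    ll = np.zeros(X.shape[0])
    for i, x in enumerate(X):
        e = -np.sum((x - samples) ** 2, axis=1) / (2.0 * sigma ** 2)
        m = np.max(e)
        log_mean_kernel = m + np.log(np.mean(np.exp(e - m)))
        ll[i] = log_mean_kernel - (0.5 * d * np.log(2.0 * np.pi * sigma ** 2))
    return np.mean(ll)

rng_demo = np.random.RandomState(0)
samples_demo = rng_demo.randn(500, 10)         # stand-in for model samples
X_check = rng_demo.randn(50, 10)               # stand-in for validation data
best_sigma = max([0.1, 0.2, 0.5, 1.0],
                 key=lambda s: parzen_ll_sketch(samples_demo, X_check, s))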
def test_cA(start_rate=0.01, decay_rate=1.0, training_epochs=20,
            dataset='./data/mnist.pkl.gz', batch_size=50,
            output_folder='CAE_plots', contraction_level=0.1):
    """
    This demo is tested on MNIST

    :type start_rate: float
    :param start_rate: initial learning rate used for training the
                       contracting autoencoder

    :type decay_rate: float
    :param decay_rate: multiplicative learning rate decay, applied once
                       per epoch (1.0 means no decay)

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset

    :type contraction_level: float
    :param contraction_level: weight on the Jacobian penalty
    """
    datasets = load_udm(dataset, as_shared=True)
    train_set_x, train_set_y = datasets[0]
    learning_rate = theano.shared(numpy.asarray(start_rate,
                                                dtype=theano.config.floatX))

    # compute number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    ####################################
    #       BUILDING THE MODEL         #
    ####################################
    rng = numpy.random.RandomState(12345)
    cae = cA(numpy_rng=rng, input=x, n_visible=(28 * 28), n_hidden=500,
             n_batchsize=batch_size)
    cost, updates = cae.get_cost_updates(contraction_level=contraction_level,
                                         learning_rate=learning_rate)
    print '... building Theano functions:',
    # Build a function for updating the CAE parameters
    train_cae = theano.function([index], [T.mean(cae.L_rec), cae.L_jacob],
        updates=updates,
        givens={x: train_set_x[index * batch_size: (index + 1) * batch_size]})
    # Build a function for decaying the learning rate
    set_learning_rate = theano.function(inputs=[], outputs=learning_rate,
        updates={learning_rate: learning_rate * decay_rate})
    print 'done'

    start_time = time.clock()
    ############
    # TRAINING #
    ############
    # go through training epochs
    for epoch in xrange(training_epochs):
        c = []
        # Cycle over all training minibatches for this epoch...
        print "Epoch {0:d}:".format(epoch),
        stdout.flush()
        for batch_index in xrange(n_train_batches):
            c.append(train_cae(batch_index))
            if ((batch_index % (n_train_batches // 40)) == 0):
                print ".",
                stdout.flush()
        print " "
        update_the_rate = set_learning_rate()
        # Display diagnostics for the most recent epoch of training
        c_array = numpy.vstack(c)
        print "-- reconstruction: {0:.4f}, jacobian: {1:.4f}".format(
                numpy.mean(c_array[0]), numpy.mean(numpy.sqrt(c_array[1])))

    # Visualize filters in their current state
    image = PIL.Image.fromarray(tile_raster_images(
        X=cae.W.get_value(borrow=True).T,
        img_shape=(28, 28), tile_shape=(10, 10),
        tile_spacing=(1, 1)))
    image.save('cae_filters.png')

    # Save the CAE parameters to disk: weights, encoder bias, decoder bias
    W = cae.W.get_value(borrow=False)
    b_encode = cae.b.get_value(borrow=False)
    b_decode = cae.b_prime.get_value(borrow=False)
    np.save('cae_W.npy', W)
    np.save('cae_b_encode.npy', b_encode)
    np.save('cae_b_decode.npy', b_decode)

    # Record total training time, just for kicks
    end_time = time.clock()
    training_time = (end_time - start_time)
    # Print some jibber-jabber
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((training_time) / 60.))
    # Eject, eJect, EJECT!
    os.chdir('../')
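# test_cA keeps its learning rate in a Theano shared variable so it can be
# decayed by calling a compiled function instead of rebuilding the graph.
# The same pattern in isolation (a sketch; assumes Theano is importable):
import numpy
import theano

lr_shared = theano.shared(numpy.asarray(0.01, dtype=theano.config.floatX))
decay_step = theano.function(inputs=[], outputs=lr_shared,
                             updates=[(lr_shared, lr_shared * 0.95)])
for _ in range(3):
    decay_step()
# lr_shared.get_value() is now 0.01 * 0.95 ** 3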
def check_mnist_recon(): # set up paths for loading model params and saving results KLD_PATH = "MNIST_WALKOUT_TEST_KLD/" VAE_PATH = "MNIST_WALKOUT_TEST_VAE/" RESULT_PATH = "MNIST_WALKOUT_RESULTS/" # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr_shared = datasets[0][0] Xva_shared = datasets[1][0] Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX) Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX) Xtr_mean = np.mean(Xtr, axis=0, keepdims=True) Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr_mean) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # Construct a GenNet and an InfNet, then test constructor for GIPair. # Do basic testing, to make sure classes aren't completely broken. Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 ############################################################# # Process the GIPair trained with strong KLd regularization # ############################################################# gn_fname = KLD_PATH + "pt_walk_params_b120000_GN.pkl" in_fname = KLD_PATH + "pt_walk_params_b120000_IN.pkl" IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) IN.set_sigma_scale(1.0) prior_dim = GN.latent_dim # Initialize the GIPair GIP_KLD = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, params=None) ################################################################ # Process the GIPair trained with basic VAE KLd regularization # ################################################################ gn_fname = VAE_PATH + "pt_walk_params_b120000_GN.pkl" in_fname = VAE_PATH + "pt_walk_params_b120000_IN.pkl" IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) IN.set_sigma_scale(1.0) prior_dim = GN.latent_dim # Initialize the GIPair GIP_VAE = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, params=None) for trial in range(15): ######################################################### # DRAW THE SAMPLE OBSERVATIONS AND MASKS FOR THIS TRIAL # ######################################################### va_idx = npr.randint(low=0, high=Xva.shape[0], size=(15,)) Xc_batch = Xva.take(va_idx, axis=0) Xd_batch = np.repeat(Xtr_mean, Xc_batch.shape[0], axis=0).astype(theano.config.floatX) Xm_rand = sample_masks(Xc_batch, drop_prob=0.2) Xm_patch = sample_patch_masks(Xc_batch, (28,28), (14,14)) Xm_batch = Xm_rand * Xm_patch ##################################### # COMPARE SAMPLES IN A NORMAL CHAIN # ##################################### # draw some chains of samples from the VAE loop result_kld = GIP_KLD.sample_from_chain(Xc_batch, loop_iters=51) result_vae = GIP_VAE.sample_from_chain(Xc_batch, loop_iters=51) chain_samples_kld = [] chain_samples_vae = [] for i in range(len(result_kld['data samples'])): if (((i % 10) == 0) or (i == 1)): chain_samples_kld.append(result_kld['data samples'][i]) chain_samples_vae.append(result_vae['data samples'][i]) # interleave the chain samples for beauteous display chain_samples_both = [] for i in range(len(chain_samples_kld)): Xs_kld = chain_samples_kld[i] Xs_vae = chain_samples_vae[i] joint_samples = np.zeros((2*Xs_kld.shape[0], Xs_kld.shape[1])) for j in
range(Xs_kld.shape[0]): joint_samples[2*j] = Xs_kld[j] joint_samples[2*j + 1] = Xs_vae[j] chain_samples_both.append(joint_samples) chain_len = len(chain_samples_both) Xs = np.vstack(chain_samples_both) file_name = RESULT_PATH + "FIG_CHAIN_{0:d}.png".format(trial) utils.visualize_samples(Xs, file_name, num_rows=chain_len) ############################################# # COMPARE SAMPLES IN A RECONSTRUCTION CHAIN # ############################################# # draw some chains of samples from the VAE loop result_kld = GIP_KLD.sample_from_chain(Xd_batch, X_c=Xc_batch, \ X_m=Xm_batch, loop_iters=10) result_vae = GIP_VAE.sample_from_chain(Xd_batch, X_c=Xc_batch, \ X_m=Xm_batch, loop_iters=10) recon_samples_kld = [] recon_samples_vae = [] for i in range(len(result_kld['data samples'])): if (((i % 2) == 0) or (i == 1)): recon_samples_kld.append(result_kld['data samples'][i]) recon_samples_vae.append(result_vae['data samples'][i]) # interleave the recon samples for beauteous display recon_samples_both = [] for i in range(len(recon_samples_kld)): Xs_kld = recon_samples_kld[i] Xs_vae = recon_samples_vae[i] joint_samples = np.zeros((2*Xs_kld.shape[0], Xs_kld.shape[1])) for j in range(Xs_kld.shape[0]): joint_samples[2*j] = Xs_kld[j] joint_samples[2*j + 1] = Xs_vae[j] recon_samples_both.append(joint_samples) recon_len = len(recon_samples_both) Xs = np.vstack(recon_samples_both) file_name = RESULT_PATH + "FIG_RECON_{0:d}.png".format(trial) utils.visualize_samples(Xs, file_name, num_rows=recon_len) return
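# The KLD/VAE interleaving loop above appears twice in check_mnist_recon;
# a small helper (hypothetical, not part of the codebase) would remove
# the duplication:

import numpy as np

def interleave_rows(Xa, Xb):
    """Alternate rows of Xa and Xb for side-by-side visualization."""
    assert Xa.shape == Xb.shape
    joint = np.zeros((2 * Xa.shape[0], Xa.shape[1]), dtype=Xa.dtype)
    joint[0::2] = Xa
    joint[1::2] = Xb
    return joint

# e.g. chain_samples_both = [interleave_rows(a, b) for (a, b) in
#                            zip(chain_samples_kld, chain_samples_vae)]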
def batch_test_ss_mlp_pt(test_count=10, su_count=1000): """Setup basic test for semisupervised EAR-regularized MLP.""" # Set some reasonable sgd parameters sgd_params = {} sgd_params['start_rate'] = 0.01 sgd_params['decay_rate'] = 0.998 sgd_params['wt_norm_bound'] = 3.5 sgd_params['epochs'] = 1000 sgd_params['batch_size'] = 100 sgd_params['result_tag'] = '---' # Set some reasonable mlp parameters mlp_params = {} # Set up some proto-networks pc0 = [28*28, 800, 800, 11] mlp_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} mlp_params['spawn_configs'] = [sc0, sc1] mlp_params['spawn_weights'] = [0.0, 1.0] # Set remaining params mlp_params['ear_type'] = 5 mlp_params['ear_lam'] = 1.0 mlp_params['lam_l2a'] = 1e-2 mlp_params['reg_all_obs'] = True for test_num in range(test_count): rng_seed = test_num sgd_params['result_tag'] = "test_{0:d}".format(test_num) # Initialize a random number generator for this test rng = np.random.RandomState(rng_seed) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) # Construct the EarNet object that we will be training x_in = T.matrix('x_in') NET = EarNet(rng=rng, input=x_in, params=mlp_params) init_biases(NET, b_init=0.05) ########################################## # First, pretrain each layer in the mlp. # ########################################## sgd_params['result_tag'] = "ss_ear_pt_s{0:d}_t{1:d}".format(su_count,test_num) sgd_params['batch_size'] = 25 sgd_params['start_rate'] = 0.02 sgd_params['epochs'] = 40 for i in range(len(NET.dae_costs)): print("==================================================") print("Pretraining hidden layer(s) at depth {0:d}".format(i+1)) print("==================================================") train_dae(NET, i, mlp_params, sgd_params) # Load some data to train/validate/test with rng = np.random.RandomState(rng_seed) dataset = 'data/mnist.pkl.gz' datasets = load_udm_ss(dataset, su_count, rng, zero_mean=False) # Run semisupervised training on the given MLP sgd_params['batch_size'] = 100 sgd_params['start_rate'] = 0.04 # Train with EAR regularization disabled, updating only the top layer sgd_params['top_only'] = True sgd_params['epochs'] = 5 NET.set_ear_lam(0.0) train_ss_mlp(NET, sgd_params, datasets) COMMENT=""" # Train with no EAR regularization sgd_params['top_only'] = False sgd_params['epochs'] = 100 NET.set_ear_lam(0.0) train_ss_mlp(NET, sgd_params, datasets) """ # Train with weak EAR regularization sgd_params['top_only'] = False sgd_params['epochs'] = 5 NET.set_ear_lam(0.5) train_ss_mlp(NET, sgd_params, datasets) # Train with moderate EAR regularization sgd_params['epochs'] = 10 NET.set_ear_lam(1.0) train_ss_mlp(NET, sgd_params, datasets) # Train with more EAR regularization sgd_params['epochs'] = 15 NET.set_ear_lam(1.5) train_ss_mlp(NET, sgd_params, datasets) # Train with more EAR regularization sgd_params['epochs'] = 20 NET.set_ear_lam(2.0) train_ss_mlp(NET, sgd_params, datasets) # Train with the strongest EAR regularization for the long final stage sgd_params['epochs'] = 100 NET.set_ear_lam(3.0) train_ss_mlp(NET, sgd_params, datasets) return
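# The staged increase of the EAR penalty above could be driven by a single
# schedule; a sketch of that refactor, assuming the train_ss_mlp and
# set_ear_lam interfaces used above (this is a suggestion, not code from
# the repo):

def run_ear_schedule(net, sgd_params, datasets, schedule):
    """Run train_ss_mlp once per (epochs, ear_lam) stage."""
    for epochs, ear_lam in schedule:
        sgd_params['epochs'] = epochs
        net.set_ear_lam(ear_lam)
        train_ss_mlp(net, sgd_params, datasets)

# after the top_only warm-up stage, the staged calls above are equivalent to:
# run_ear_schedule(NET, sgd_params, datasets,
#                  [(5, 0.5), (10, 1.0), (15, 1.5), (20, 2.0), (100, 3.0)])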
def test_gip_sigma_scale_mnist(): from LogPDFs import cross_validate_sigma # Simple test code, to check that everything is basically functional. print("TESTING...") # Initialize a source of randomness rng = np.random.RandomState(12345) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0] Xtr = Xtr.get_value(borrow=False) Xva = datasets[2][0] Xva = Xva.get_value(borrow=False) print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape))) # get and set some basic dataset information tr_samples = Xtr.shape[0] batch_size = 100 Xtr_mean = np.mean(Xtr, axis=0, keepdims=True) Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr) Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0).astype(theano.config.floatX) # Symbolic inputs Xd = T.matrix(name='Xd') Xc = T.matrix(name='Xc') Xm = T.matrix(name='Xm') Xt = T.matrix(name='Xt') # Load inferencer and generator from saved parameters gn_fname = "MNIST_WALKOUT_TEST_MED_KLD/pt_osm_params_b80000_GN.pkl" in_fname = "MNIST_WALKOUT_TEST_MED_KLD/pt_osm_params_b80000_IN.pkl" IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd) x_dim = IN.shared_layers[0].in_dim z_dim = IN.mu_layers[-1].out_dim # construct a OneStageModel with the loaded InfNet and GenNet osm_params = {} osm_params['x_type'] = 'gaussian' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=x_dim, z_dim=z_dim, params=osm_params) # compute variational likelihood bound and its sub-components Xva = row_shuffle(Xva) Xb = Xva[0:5000] file_name = "AX_MNIST_MAX_KLD_POST_KLDS.png" post_klds = OSM.compute_post_klds(Xb) post_dim_klds = np.mean(post_klds, axis=0) utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ file_name) # compute information about free-energy on validation set file_name = "AX_MNIST_MAX_KLD_FREE_ENERGY.png" fe_terms = OSM.compute_fe_terms(Xb, 20) utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # bound_results = OSM.compute_ll_bound(Xva) # ll_bounds = bound_results[0] # post_klds = bound_results[1] # log_likelihoods = bound_results[2] # max_lls = bound_results[3] # print("mean ll bound: {0:.4f}".format(np.mean(ll_bounds))) # print("mean posterior KLd: {0:.4f}".format(np.mean(post_klds))) # print("mean log-likelihood: {0:.4f}".format(np.mean(log_likelihoods))) # print("mean max log-likelihood: {0:.4f}".format(np.mean(max_lls))) # print("min ll bound: {0:.4f}".format(np.min(ll_bounds))) # print("max posterior KLd: {0:.4f}".format(np.max(post_klds))) # print("min log-likelihood: {0:.4f}".format(np.min(log_likelihoods))) # print("min max log-likelihood: {0:.4f}".format(np.min(max_lls))) # # compute some information about the approximate posteriors # post_stats = OSM.compute_post_stats(Xva, 0.0*Xva, 0.0*Xva) # all_post_klds = np.sort(post_stats[0].ravel()) # post KLds for each obs and dim # obs_post_klds = np.sort(post_stats[1]) # summed post KLds for each obs # post_dim_klds = post_stats[2] # average post KLds for each post dim # post_dim_vars = post_stats[3] # average squared mean for each post dim # utils.plot_line(np.arange(all_post_klds.shape[0]), all_post_klds, "AAA_ALL_POST_KLDS.png") # utils.plot_line(np.arange(obs_post_klds.shape[0]), obs_post_klds, "AAA_OBS_POST_KLDS.png") # utils.plot_stem(np.arange(post_dim_klds.shape[0]),
post_dim_klds, "AAA_POST_DIM_KLDS.png") # utils.plot_stem(np.arange(post_dim_vars.shape[0]), post_dim_vars, "AAA_POST_DIM_VARS.png") # draw many samples from the GIP for i in range(5): tr_idx = npr.randint(low=0, high=tr_samples, size=(100, )) Xd_batch = Xtr.take(tr_idx, axis=0) Xs = [] for row in range(3): Xs.append([]) for col in range(3): sample_lists = OSM.sample_from_chain(Xd_batch[0:10,:], loop_iters=100, \ sigma_scale=1.0) Xs[row].append(group_chains(sample_lists['data samples'])) Xs, block_im_dim = block_video(Xs, (28, 28), (3, 3)) to_video(Xs, block_im_dim, "AX_MNIST_MAX_KLD_CHAIN_VIDEO_{0:d}.avi".format(i), frame_rate=10) file_name = "AX_MNIST_MAX_KLD_PRIOR_SAMPLE.png" Xs = OSM.sample_from_prior(20 * 20) utils.visualize_samples(Xs, file_name, num_rows=20) # # test Parzen density estimator built from prior samples # Xs = OSM.sample_from_prior(10000) # [best_sigma, best_ll, best_lls] = \ # cross_validate_sigma(Xs, Xva, [0.12, 0.14, 0.15, 0.16, 0.18], 20) # sort_idx = np.argsort(best_lls) # sort_idx = sort_idx[0:400] # utils.plot_line(np.arange(sort_idx.shape[0]), best_lls[sort_idx], "A_MNIST_MAX_KLD_BEST_LLS_1.png") # utils.visualize_samples(Xva[sort_idx], "A_MNIST_MAX_KLD_BAD_DIGITS_1.png", num_rows=20) return
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr_shared = datasets[0][0] Xva_shared = datasets[1][0] Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX) Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX) tr_samples = Xtr.shape[0] batch_size = 200 batch_reps = 1 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 20 h_dim = 100 x_type = 'bernoulli' # some InfNet instances to build the MultiStageModel from X_sym = T.matrix('X_sym') ######################## # p_s0_obs_given_z_obs # ######################## params = {} shared_config = [z_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 1e-3 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_obs_given_z_obs = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) p_s0_obs_given_z_obs.init_biases(0.2) ################# # p_hi_given_si # ################# params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [h_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.2) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim + obs_dim), 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun
params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=X_sym, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModel(rng=rng, x_in=X_sym, \ p_s0_obs_given_z_obs=p_s0_obs_given_z_obs, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \ model_init_obs=True, ir_steps=2, \ params=msm_params) obs_mean = (0.9 * np.mean(Xtr, axis=0)) + 0.05 obs_mean_logit = np.log(obs_mean / (1.0 - obs_mean)) MSM.set_input_bias(-obs_mean) MSM.set_obs_bias(0.1*obs_mean_logit) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ costs = [0. for i in range(10)] learn_rate = 0.0003 momentum = 0.8 for i in range(300000): scale = min(1.0, ((i+1) / 10000.0)) extra_kl = max(0.0, ((50000.0 - i) / 50000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # randomly sample a minibatch tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xb = binarize_data(Xtr.take(tr_idx, axis=0)) Xb = Xb.astype(theano.config.floatX) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=(scale*momentum), mom_2=0.98) MSM.set_train_switch(1.0) MSM.set_l1l2_weight(1.0) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_1=(1.0+extra_kl), lam_kld_2=(1.0+extra_kl)) MSM.set_lam_l2w(1e-6) MSM.set_kzg_weight(0.01) # perform a minibatch update and record the cost for this batch result = MSM.train_joint(Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] print("-- batch {0:d} --".format(i)) print(" joint_cost: {0:.4f}".format(costs[0])) print(" nll_cost : {0:.4f}".format(costs[1])) print(" kld_cost : {0:.4f}".format(costs[2])) print(" reg_cost : {0:.4f}".format(costs[3])) costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): Xva = row_shuffle(Xva) # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MX_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # visualize some important weights in the model file_name = "MX_INF_1_WEIGHTS_b{0:d}.png".format(i) W = MSM.inf_1_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_INF_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.inf_2_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_GEN_1_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_1_weights.get_value(borrow=False) 
utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_GEN_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_2_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MX_GEN_INF_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_inf_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) # compute information about posterior KLds on validation set post_klds = MSM.compute_post_klds(Xva[0:5000]) file_name = "MX_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[0].shape[1]), \ np.mean(post_klds[0], axis=0), file_name) file_name = "MX_HI_COND_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[1].shape[1]), \ np.mean(post_klds[1], axis=0), file_name) file_name = "MX_HI_GLOB_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[2].shape[1]), \ np.mean(post_klds[2], axis=0), file_name) # compute information about free-energy on validation set file_name = "MX_FREE_ENERGY_b{0:d}.png".format(i) fe_terms = MSM.compute_fe_terms(binarize_data(Xva[0:5000]), 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) print(" nll_bound : {0:.4f}".format(fe_mean)) utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') return
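# The training loop above anneals two things: an SGD scale that warms up
# linearly and an extra KL weight that decays linearly to zero. Isolated
# as standalone functions (the same arithmetic as the loop, shown for
# clarity):

def sgd_scale(i, ramp=10000.0):
    """Linear warm-up from ~0 to 1 over the first `ramp` updates."""
    return min(1.0, (i + 1) / ramp)

def extra_kl_weight(i, decay=50000.0):
    """Extra KL weight decaying linearly from 1 to 0 over `decay` updates."""
    return max(0.0, (decay - i) / decay)

for i in [0, 10000, 25000, 50000, 100000]:
    print("i={0:d}: scale={1:.2f}, extra_kl={2:.2f}".format(
        i, sgd_scale(i), extra_kl_weight(i)))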
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr_shared = datasets[0][0] Xva_shared = datasets[1][0] Xtr = Xtr_shared.get_value(borrow=False).astype(theano.config.floatX) Xva = Xva_shared.get_value(borrow=False).astype(theano.config.floatX) tr_samples = Xtr.shape[0] batch_size = 500 batch_reps = 1 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_rnn_dim = 25 z_obs_dim = 5 jnt_dim = obs_dim + z_rnn_dim h_dim = 100 x_type = 'bernoulli' prior_sigma = 1.0 # some InfNet instances to build the MultiStageModel from X_sym = T.matrix('X_sym') ######################## # p_s0_obs_given_z_obs # ######################## params = {} shared_config = [z_obs_dim, 250, 250] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 1e-3 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_obs_given_z_obs = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) p_s0_obs_given_z_obs.init_biases(0.2) ################# # p_hi_given_si # ################# params = {} shared_config = [jnt_dim, 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [(h_dim + z_rnn_dim), 500, 500] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, 250, 250] top_config = [shared_config[-1], (z_rnn_dim + z_obs_dim)] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.2) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim +
jnt_dim), 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = softplus_actfun params['init_scale'] = 1.2 params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=X_sym, prior_sigma=prior_sigma, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModel(rng=rng, x_in=X_sym, \ p_s0_obs_given_z_obs=p_s0_obs_given_z_obs, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ obs_dim=obs_dim, z_rnn_dim=z_rnn_dim, z_obs_dim=z_obs_dim, \ h_dim=h_dim, model_init_obs=False, model_init_rnn=True, \ ir_steps=3, params=msm_params) obs_mean = (0.9 * np.mean(Xtr, axis=0)) + 0.05 obs_mean_logit = np.log(obs_mean / (1.0 - obs_mean)) MSM.set_input_bias(-obs_mean) MSM.set_obs_bias(0.1*obs_mean_logit) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ costs = [0. for i in range(10)] learn_rate = 0.003 momentum = 0.5 for i in range(300000): scale = min(1.0, ((i+1) / 5000.0)) l1l2_weight = 1.0 #min(1.0, ((i+1) / 2500.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.92 if (i > 100000): momentum = 0.80 elif (i > 50000): momentum = 0.65 else: momentum = 0.50 # randomly sample a minibatch tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xb = binarize_data(Xtr.take(tr_idx, axis=0)) Xb = Xb.astype(theano.config.floatX) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=(scale*momentum), mom_2=0.99) MSM.set_train_switch(1.0) MSM.set_l1l2_weight(l1l2_weight) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0) MSM.set_lam_l2w(1e-5) MSM.set_kzg_weight(0.01) # perform a minibatch update and record the cost for this batch result = MSM.train_joint(Xb, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] print("-- batch {0:d} --".format(i)) print(" joint_cost: {0:.4f}".format(costs[0])) print(" nll_cost : {0:.4f}".format(costs[1])) print(" kld_cost : {0:.4f}".format(costs[2])) print(" reg_cost : {0:.4f}".format(costs[3])) costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): Xva = row_shuffle(Xva) # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MZ_SAMPLES_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # visualize some important weights in the model file_name = "MZ_INF_1_WEIGHTS_b{0:d}.png".format(i) W =
MSM.inf_1_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MZ_INF_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.inf_2_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MZ_GEN_1_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_1_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MZ_GEN_2_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_2_weights.get_value(borrow=False) utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) file_name = "MZ_GEN_INF_WEIGHTS_b{0:d}.png".format(i) W = MSM.gen_inf_weights.get_value(borrow=False).T utils.visualize_samples(W[:,:obs_dim], file_name, num_rows=20) # compute information about posterior KLds on validation set post_klds = MSM.compute_post_klds(Xva[0:5000]) file_name = "MZ_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[0].shape[1]), \ np.mean(post_klds[0], axis=0), file_name) file_name = "MZ_HI_COND_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[1].shape[1]), \ np.mean(post_klds[1], axis=0), file_name) file_name = "MZ_HI_GLOB_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(post_klds[2].shape[1]), \ np.mean(post_klds[2], axis=0), file_name) # compute information about free-energy on validation set file_name = "MZ_FREE_ENERGY_b{0:d}.png".format(i) fe_terms = MSM.compute_fe_terms(binarize_data(Xva[0:5000]), 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) print(" nll_bound : {0:.4f}".format(fe_mean)) utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') return
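# The momentum schedule in the loop above is order-sensitive: the largest
# threshold must be checked first (via elif), or a later test silently
# overrides it. As a standalone function:

def momentum_for(i):
    """Piecewise momentum schedule; branches checked from largest down."""
    if i > 100000:
        return 0.80
    elif i > 50000:
        return 0.65
    else:
        return 0.50

assert momentum_for(150000) == 0.80
assert momentum_for(75000) == 0.65
assert momentum_for(10) == 0.50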
def pretrain_osm(lam_kld=0.0): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0] Xtr = Xtr.get_value(borrow=False) Xva = datasets[2][0] Xva = Xva.get_value(borrow=False) print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape))) # get and set some basic dataset information Xtr_mean = np.mean(Xtr, axis=0) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 100 batch_reps = 5 # setup some symbolic variables and stuff Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_sigma = 1.0 ########################## # NETWORK CONFIGURATIONS # ########################## gn_params = {} shared_config = [PRIOR_DIM, 1000, 1000] top_config = [shared_config[-1], data_dim] gn_params['shared_config'] = shared_config gn_params['mu_config'] = top_config gn_params['sigma_config'] = top_config gn_params['activation'] = relu_actfun gn_params['init_scale'] = 1.4 gn_params['lam_l2a'] = 0.0 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.0 gn_params['input_noise'] = 0.0 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 1000, 1000] top_config = [shared_config[-1], PRIOR_DIM] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 1.4 in_params['lam_l2a'] = 0.0 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.0 in_params['input_noise'] = 0.0 # Initialize the base networks for this OneStageModel IN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = InfNet(rng=rng, Xd=Xd, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.2) GN.init_biases(0.2) ################################## # INITIALIZE THE ONESTAGEMODEL # ################################## osm_params = {} osm_params['x_type'] = 'bernoulli' osm_params['xt_transform'] = 'sigmoid' osm_params['logvar_bound'] = LOGVAR_BOUND OSM = OneStageModel(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, \ p_x_given_z=GN, q_z_given_x=IN, \ x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params) OSM.set_lam_l2w(1e-5) safe_mean = (0.9 * Xtr_mean) + 0.05 safe_mean_logit = np.log(safe_mean / (1.0 - safe_mean)) OSM.set_output_bias(safe_mean_logit) OSM.set_input_bias(-Xtr_mean) ###################### # BASIC VAE TRAINING # ###################### out_file = open(RESULT_PATH + "pt_osm_results.txt", 'wb') # Set initial learning rate and basic SGD hyper parameters obs_costs = np.zeros((batch_size, )) costs = [0.
for i in range(10)] learn_rate = 0.0005 for i in range(150000): scale = min(1.0, float(i) / 10000.0) if ((i > 1) and ((i % 20000) == 0)): learn_rate = learn_rate * 0.9 # do a minibatch update of the model, and compute some costs tr_idx = npr.randint(low=0, high=tr_samples, size=(batch_size, )) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs OSM.set_sgd_params(lr_1=(scale * learn_rate), mom_1=0.5, mom_2=0.98) OSM.set_lam_nll(1.0) OSM.set_lam_kld(lam_kld_1=(1.0 + (scale * (lam_kld - 1.0))), lam_kld_2=0.0) result = OSM.train_joint(Xd_batch, Xc_batch, Xm_batch, batch_reps) costs = [(costs[j] + result[j]) for j in range(len(result))] if ((i % 1000) == 0): # record and then reset the cost trackers costs = [(v / 1000.0) for v in costs] str_1 = "-- batch {0:d} --".format(i) str_2 = " joint_cost: {0:.4f}".format(costs[0]) str_3 = " nll_cost : {0:.4f}".format(costs[1]) str_4 = " kld_cost : {0:.4f}".format(costs[2]) str_5 = " reg_cost : {0:.4f}".format(costs[3]) costs = [0.0 for v in costs] # print out some diagnostic information joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5]) print(joint_str) out_file.write(joint_str + "\n") out_file.flush() if ((i % 2000) == 0): Xva = row_shuffle(Xva) model_samps = OSM.sample_from_prior(500) file_name = RESULT_PATH + "pt_osm_samples_b{0:d}_XG.png".format(i) utils.visualize_samples(model_samps, file_name, num_rows=20) # compute information about free-energy on validation set file_name = RESULT_PATH + "pt_osm_free_energy_b{0:d}.png".format(i) fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20) fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) fe_str = " nll_bound : {0:.4f}".format(fe_mean) print(fe_str) out_file.write(fe_str + "\n") utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ x_label='Posterior KLd', y_label='Negative Log-likelihood') # compute information about posterior KLds on validation set file_name = RESULT_PATH + "pt_osm_post_klds_b{0:d}.png".format(i) post_klds = OSM.compute_post_klds(Xva[0:2500]) post_dim_klds = np.mean(post_klds, axis=0) utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ file_name) if ((i % 5000) == 0): IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_b{0:d}_GN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_IN.pkl") GN.save_to_file(f_name=RESULT_PATH + "pt_osm_params_GN.pkl") return
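# pretrain_osm initializes the output bias at the logit of a "safe" pixel
# mean: squashing the mean into [0.05, 0.95] first keeps the logit finite
# for pixels that are constant 0 or 1 in the training set. A standalone
# sketch of that computation:

import numpy as np

def safe_mean_logit(Xtr):
    """Logit of the per-pixel mean, squashed away from {0, 1}."""
    pix_mean = np.mean(Xtr, axis=0)
    safe_mean = (0.9 * pix_mean) + 0.05    # now in [0.05, 0.95]
    return np.log(safe_mean / (1.0 - safe_mean))

# a pixel that is always 0 gets logit(0.05) ~= -2.94 instead of -inf
X = np.zeros((10, 3)); X[:, 1] = 1.0; X[:, 2] = 0.5
print(safe_mean_logit(X))    # ~[-2.94, 2.94, 0.0]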
def test_with_model_init(): ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, as_shared=False, zero_mean=False) Xtr = to_fX(datasets[0][0]) Xva = to_fX(datasets[1][0]) Ytr = datasets[0][1] Yva = datasets[1][1] Xtr_class_groups = make_class_groups(Xtr, Ytr) tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 300 BD = lambda ary: binarize_data(ary) ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ obs_dim = Xtr.shape[1] z_dim = 32 h_dim = 100 ir_steps = 2 init_scale = 1.0 x_type = 'bernoulli' # some InfNet instances to build the MultiStageModelSS from x_in = T.matrix('x_in') x_pos = T.matrix('x_pos') y_in = T.lvector('y_in') ################# # p_hi_given_si # ################# params = {} shared_config = [obs_dim, 500, 500] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_hi_given_si = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) p_hi_given_si.init_biases(0.2) ###################### # p_sip1_given_si_hi # ###################### params = {} shared_config = [(h_dim + obs_dim), 500, 500] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_sip1_given_si_hi = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) p_sip1_given_si_hi.init_biases(0.2) ################ # p_s0_given_z # ################ params = {} shared_config = [z_dim, 500, 500] top_config = [shared_config[-1], obs_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False p_s0_given_z = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) p_s0_given_z.init_biases(0.2) ############### # q_z_given_x # ############### params = {} shared_config = [obs_dim, (500, 4), (500, 4)] top_config = [shared_config[-1], z_dim] params['shared_config'] = shared_config params['mu_config'] = top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.2 params['hid_drop'] = 0.5 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_z_given_x = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) q_z_given_x.init_biases(0.0) ################### # q_hi_given_x_si # ################### params = {} shared_config = [(obs_dim + obs_dim), 800, 800] top_config = [shared_config[-1], h_dim] params['shared_config'] = shared_config params['mu_config'] =
top_config params['sigma_config'] = top_config params['activation'] = relu_actfun params['init_scale'] = init_scale params['lam_l2a'] = 0.0 params['vis_drop'] = 0.0 params['hid_drop'] = 0.0 params['bias_noise'] = 0.0 params['input_noise'] = 0.0 params['build_theano_funcs'] = False q_hi_given_x_si = InfNet(rng=rng, Xd=x_in, \ params=params, shared_param_dicts=None) q_hi_given_x_si.init_biases(0.2) ################################################################ # Define parameters for the MultiStageModel, and initialize it # ################################################################ print("Building the MultiStageModel...") msm_params = {} msm_params['x_type'] = x_type msm_params['obs_transform'] = 'sigmoid' MSM = MultiStageModelSS(rng=rng, \ x_in=x_in, x_pos=x_pos, y_in=y_in, \ p_s0_given_z=p_s0_given_z, \ p_hi_given_si=p_hi_given_si, \ p_sip1_given_si_hi=p_sip1_given_si_hi, \ q_z_given_x=q_z_given_x, \ q_hi_given_x_si=q_hi_given_x_si, \ class_count=10, \ obs_dim=obs_dim, z_dim=z_dim, h_dim=h_dim, \ ir_steps=ir_steps, params=msm_params) MSM.set_lam_class(lam_class=20.0) MSM.set_lam_nll(lam_nll=1.0) MSM.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.9, \ lam_kld_p2q=0.1) MSM.set_lam_l2w(1e-4) MSM.set_drop_rate(0.0) MSM.q_hi_given_x_si.set_bias_noise(0.0) MSM.p_hi_given_si.set_bias_noise(0.0) MSM.p_sip1_given_si_hi.set_bias_noise(0.0) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ out_file = open("MSS_A_RESULTS.txt", 'wb') costs = [0. for i in range(10)] learn_rate = 0.0002 momentum = 0.5 batch_idx = np.arange(batch_size) + tr_samples for i in range(250000): scale = min(1.0, ((i+1) / 2000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 if (i > 20000): momentum = 0.90 else: momentum = 0.50 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr, Ytr = row_shuffle(Xtr, Ytr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update MSM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ mom_1=scale*momentum, mom_2=0.99) MSM.set_train_switch(1.0) # perform a minibatch update and record the cost for this batch Xi_tr = Xtr.take(batch_idx, axis=0) Yi_tr = Ytr.take(batch_idx, axis=0) Xp_tr, Xn_tr = sample_class_groups(Yi_tr, Xtr_class_groups) result = MSM.train_joint(BD(Xi_tr), BD(Xp_tr), Yi_tr) costs = [(costs[j] + result[j]) for j in range(len(result)-1)] # output useful information about training progress if ((i % 500) == 0): costs = [(v / 500.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " joint_cost : {0:.4f}".format(costs[0]) str3 = " class_cost : {0:.4f}".format(costs[1]) str4 = " nll_cost : {0:.4f}".format(costs[2]) str5 = " kld_cost : {0:.4f}".format(costs[3]) str6 = " reg_cost : {0:.4f}".format(costs[4]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): # Get some validation samples for computing diagnostics Xva, Yva = row_shuffle(Xva, Yva) Xb_va = Xva[0:2500] Yb_va = Yva[0:2500] # draw some independent random samples from the model samp_count = 200 model_samps = MSM.sample_from_prior(samp_count) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, 
model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSS_A_SAMPLES_IND_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # draw some conditional random samples from the model Xs = Xb_va[0:50] # only use validation set samples Xs = np.repeat(Xs, 4, axis=0) samp_count = Xs.shape[0] model_samps = MSM.sample_from_input(BD(Xs), guided_decoding=False) model_samps.append(Xs) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSS_A_SAMPLES_CND_UD_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20) # compute information about posterior KLds on validation set raw_costs = MSM.compute_raw_costs(BD(Xb_va), BD(Xb_va)) init_nll, init_kld, q2p_kld, p2q_kld, step_nll, step_kld = raw_costs file_name = "MSS_A_H0_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(init_kld.shape[1]), \ np.mean(init_kld, axis=0), file_name) file_name = "MSS_A_HI_Q2P_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(q2p_kld.shape[1]), \ np.mean(q2p_kld, axis=0), file_name) file_name = "MSS_A_HI_P2Q_KLDS_b{0:d}.png".format(i) utils.plot_stem(np.arange(p2q_kld.shape[1]), \ np.mean(p2q_kld, axis=0), file_name) # draw weights for the initial encoder/classifier file_name = "MSS_A_QZX_WEIGHTS_b{0:d}.png".format(i) W = q_z_given_x.shared_layers[0].W.get_value(borrow=False).T utils.visualize_samples(W, file_name, num_rows=20) # compute free-energy terms on training samples fe_terms = MSM.compute_fe_terms(BD(Xtr[0:2500]), BD(Xtr[0:2500]), 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # compute free-energy terms on validation samples fe_terms = MSM.compute_fe_terms(BD(Xb_va), BD(Xb_va), 30) fe_nll = np.mean(fe_terms[0]) fe_kld = np.mean(fe_terms[1]) fe_joint = fe_nll + fe_kld joint_str = " vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # compute multi-sample estimate of classification error err_rate, err_idx, y_preds = MSM.class_error(Xb_va, Yb_va, \ samples=30, prep_func=BD) joint_str = " va-class-error: {0:.4f}".format(err_rate) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # draw some conditional random samples from the model Xs = Xb_va[err_idx] # use validation samples with class errors if (Xs.shape[0] > 50): Xs = Xs[:50] Xs = np.repeat(Xs, 4, axis=0) if ((Xs.shape[0] % 20) != 0): # round-off the number of error examples, for nice display remainder = Xs.shape[0] % 20 Xs = Xs[:-remainder] samp_count = Xs.shape[0] # draw some conditional random samples from the model model_samps = MSM.sample_from_input(BD(Xs), guided_decoding=False) model_samps.append(Xs) seq_len = len(model_samps) seq_samps = np.zeros((seq_len*samp_count,
model_samps[0].shape[1])) idx = 0 for s1 in range(samp_count): for s2 in range(seq_len): seq_samps[idx] = model_samps[s2][s1] idx += 1 file_name = "MSS_A_SAMPLES_CND_ERR_b{0:d}.png".format(i) utils.visualize_samples(seq_samps, file_name, num_rows=20)
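# make_class_groups and sample_class_groups are defined elsewhere; from
# their use above they apparently index the training set by label and
# draw a random same-class partner for each example. A rough numpy sketch
# of that behavior (the real helpers may differ, e.g. they also return a
# different-class negative example):

import numpy as np

def make_class_groups_sketch(X, Y):
    """Map each class label to the training rows with that label."""
    return {c: X[Y == c] for c in np.unique(Y)}

def sample_same_class(Y_batch, groups, rng):
    """For each label in Y_batch, draw one random example of that class."""
    rows = []
    for y in Y_batch:
        grp = groups[y]
        rows.append(grp[rng.randint(grp.shape[0])])
    return np.vstack(rows)

# usage sketch: groups = make_class_groups_sketch(Xtr, Ytr)
#               Xp_tr = sample_same_class(Yi_tr, groups, rng)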
def test_gi_pair(): # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0].get_value(borrow=False).astype(theano.config.floatX) tr_samples = Xtr.shape[0] # Construct a GenNet and an InfNet, then test constructor for GIPair. # Do basic testing, to make sure classes aren't completely broken. Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') data_dim = Xtr.shape[1] prior_dim = 64 prior_sigma = 1.0 # Choose some parameters for the generator network gn_params = {} gn_config = [prior_dim, 1000, 1000, data_dim] gn_params['mlp_config'] = gn_config gn_params['activation'] = relu_actfun gn_params['out_type'] = 'bernoulli' gn_params['init_scale'] = 2.0 gn_params['lam_l2a'] = 1e-2 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, (250, 4), (250, 4)] top_config = [shared_config[-1], (125, 4), prior_dim] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = relu_actfun in_params['init_scale'] = 2.0 in_params['lam_l2a'] = 1e-2 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.1 in_params['input_noise'] = 0.1 # Initialize the base networks for this GIPair IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) # Initialize biases in IN and GN IN.init_biases(0.0) GN.init_biases(0.1) # Initialize the GIPair GIP = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN, i_net=IN, \ data_dim=data_dim, prior_dim=prior_dim, params=None) GIP.set_lam_l2w(1e-4) # Set initial learning rate and basic SGD hyper parameters learn_rate = 0.001 for i in range(750000): scale = min(1.0, float(i) / 25000.0) if (((i+1) % 100000) == 0): learn_rate = learn_rate * 0.75 GIP.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.95) GIP.set_lam_nll(lam_nll=1.0) GIP.set_lam_kld(lam_kld=(1.0 * scale)) # get some data to train with tr_idx = npr.randint(low=0,high=tr_samples,size=(100,)) Xd_batch = Xtr.take(tr_idx, axis=0) #binarize_data(Xtr.take(tr_idx, axis=0)) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of the model, and compute some costs outputs = GIP.train_joint(Xd_batch, Xc_batch, Xm_batch) joint_cost = 1.0 * outputs[0] data_nll_cost = 1.0 * outputs[1] post_kld_cost = 1.0 * outputs[2] other_reg_cost = 1.0 * outputs[3] if ((i % 1000) == 0): print("batch: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \ i, joint_cost, data_nll_cost, post_kld_cost, other_reg_cost)) if ((i % 5000) == 0): file_name = "GIP_CHAIN_SAMPLES_b{0:d}.png".format(i) Xd_samps = np.repeat(Xd_batch[0:10,:], 3, axis=0) sample_lists = GIP.sample_gil_from_data(Xd_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw inference net first layer weights file_name = "GIP_INF_WEIGHTS_b{0:d}.png".format(i) utils.visualize_net_layer(GIP.IN.shared_layers[0], file_name) # draw generator net final layer weights file_name = "GIP_GEN_WEIGHTS_b{0:d}.png".format(i) utils.visualize_net_layer(GIP.GN.mlp_layers[-1],
file_name, use_transpose=True) print("TESTING COMPLETE!") return
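# A pitfall worth noting for decay tests in loops like the one above:
# % binds tighter than +, so (i+1 % 100000) parses as i + (1 % 100000),
# i.e. just i + 1, and the intended trigger needs the extra parentheses
# used above. A two-line demonstration:

i = 99999
print(i + 1 % 100000)      # 100000 -> the unparenthesized form is just i + 1
print((i + 1) % 100000)    # 0      -> the intended decay trigger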
if __name__ == "__main__": # TEST CODE FOR MODEL SAVING AND LOADING from load_data import load_udm, load_udm_ss, load_mnist from NetLayers import relu_actfun, softplus_actfun, \ safe_softmax, safe_log # Simple test code, to check that everything is basically functional. print("TESTING...") # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0] Xtr = Xtr.get_value(borrow=False) Xva = datasets[1][0] Xva = Xva.get_value(borrow=False) print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape), str(Xva.shape))) # get and set some basic dataset information tr_samples = Xtr.shape[0] data_dim = Xtr.shape[1] batch_size = 128 prior_dim = 50 prior_sigma = 1.0 Xtr_mean = np.mean(Xtr, axis=0, keepdims=True) Xtr_mean = (0.0 * Xtr_mean) + np.mean(Xtr)
def train_walk_from_pretrained_osm(lam_kld=0.0): # Simple test code, to check that everything is basically functional. print("TESTING...") # Initialize a source of randomness rng = np.random.RandomState(1234) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm(dataset, zero_mean=False) Xtr = datasets[0][0] Xtr = Xtr.get_value(borrow=False) Xva = datasets[2][0] Xva = Xva.get_value(borrow=False) print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),str(Xva.shape))) # get and set some basic dataset information tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] data_dim = Xtr.shape[1] batch_size = 100 batch_reps = 5 prior_sigma = 1.0 Xtr_mean = np.mean(Xtr, axis=0, keepdims=True) Xtr_mean = (0.0 * Xtr_mean) + np.mean(np.mean(Xtr,axis=1)) Xc_mean = np.repeat(Xtr_mean, batch_size, axis=0) # Symbolic inputs Xd = T.matrix(name='Xd') Xc = T.matrix(name='Xc') Xm = T.matrix(name='Xm') Xt = T.matrix(name='Xt') ############################### # Setup discriminator network # ############################### # Set some reasonable mlp parameters dn_params = {} # Set up some proto-networks pc0 = [data_dim, (300, 4), (300, 4), 10] dn_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = {'proto_key': 0, 'input_noise': 0.0, 'bias_noise': 0.1, 'do_dropout': True} #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} dn_params['spawn_configs'] = [sc0] dn_params['spawn_weights'] = [1.0] # Set remaining params dn_params['init_scale'] = 0.75 dn_params['lam_l2a'] = 1e-2 dn_params['vis_drop'] = 0.2 dn_params['hid_drop'] = 0.5 # Initialize a network object to use as the discriminator DN = PeaNet(rng=rng, Xd=Xd, params=dn_params) DN.init_biases(0.0) ####################################################### # Load inferencer and generator from saved parameters # ####################################################### gn_fname = RESULT_PATH+"pt_osm_params_b80000_GN.pkl" in_fname = RESULT_PATH+"pt_osm_params_b80000_IN.pkl" IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = load_infnet_from_file(f_name=gn_fname, rng=rng, Xd=Xd) ######################################################## # Define parameters for the VCGLoop, and initialize it # ######################################################## print("Building the VCGLoop...") vcgl_params = {} vcgl_params['x_type'] = 'bernoulli' vcgl_params['xt_transform'] = 'sigmoid' vcgl_params['logvar_bound'] = LOGVAR_BOUND vcgl_params['cost_decay'] = 0.0 vcgl_params['chain_type'] = 'walkout' vcgl_params['lam_l2d'] = 5e-2 VCGL = VCGLoop(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, Xt=Xt, \ i_net=IN, g_net=GN, d_net=DN, chain_len=6, \ data_dim=data_dim, prior_dim=PRIOR_DIM, params=vcgl_params) out_file = open(RESULT_PATH+"pt_walk_results.txt", 'wb') #################################################### # Train the VCGLoop by unrolling and applying BPTT # #################################################### learn_rate = 0.0004 cost_1 = [0. 
for i in range(10)] for i in range(50000): scale = float(min((i+1), 5000)) / 5000.0 if (((i+1) % 25000) == 0): learn_rate = learn_rate * 0.9 ######################################## # TRAIN THE CHAIN IN FREE-RUNNING MODE # ######################################## VCGL.set_all_sgd_params(learn_rate=(scale*learn_rate), \ mom_1=0.5, mom_2=0.99) VCGL.set_disc_weights(dweight_gn=25.0, dweight_dn=25.0) VCGL.set_lam_chain_nll(1.0) VCGL.set_lam_chain_kld(lam_kld) # get some data to train with tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) Xd_batch = Xtr.take(tr_idx, axis=0) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # examples from the target distribution, to train discriminator tr_idx = npr.randint(low=0,high=tr_samples,size=(2*batch_size,)) Xt_batch = Xtr.take(tr_idx, axis=0) # do a minibatch update of the model, and compute some costs outputs = VCGL.train_joint(Xd_batch, Xc_batch, Xm_batch, Xt_batch, batch_reps) cost_1 = [(cost_1[k] + 1.*outputs[k]) for k in range(len(outputs))] if ((i % 500) == 0): cost_1 = [(v / 500.0) for v in cost_1] o_str_1 = "batch: {0:d}, joint_cost: {1:.4f}, chain_nll_cost: {2:.4f}, chain_kld_cost: {3:.4f}, disc_cost_gn: {4:.4f}, disc_cost_dn: {5:.4f}".format( \ i, cost_1[0], cost_1[1], cost_1[2], cost_1[5], cost_1[6]) print(o_str_1) cost_1 = [0. for v in cost_1] if ((i % 1000) == 0): tr_idx = npr.randint(low=0,high=Xtr.shape[0],size=(5,)) va_idx = npr.randint(low=0,high=Xva.shape[0],size=(5,)) Xd_batch = np.vstack([Xtr.take(tr_idx, axis=0), Xva.take(va_idx, axis=0)]) # draw some chains of samples from the VAE loop file_name = RESULT_PATH+"pt_walk_chain_samples_b{0:d}.png".format(i) Xd_samps = np.repeat(Xd_batch, 3, axis=0) sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw some masked chains of samples from the VAE loop file_name = RESULT_PATH+"pt_walk_mask_samples_b{0:d}.png".format(i) Xd_samps = np.repeat(Xc_mean[0:Xd_batch.shape[0],:], 3, axis=0) Xc_samps = np.repeat(Xd_batch, 3, axis=0) Xm_rand = sample_masks(Xc_samps, drop_prob=0.0) Xm_patch = sample_patch_masks(Xc_samps, (28,28), (16,16)) Xm_samps = Xm_rand * Xm_patch sample_lists = VCGL.OSM.sample_from_chain(Xd_samps, \ X_c=Xc_samps, X_m=Xm_samps, loop_iters=20) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name, num_rows=20) # draw some samples independently from the GenNet's prior file_name = RESULT_PATH+"pt_walk_prior_samples_b{0:d}.png".format(i) Xs = VCGL.sample_from_prior(20*20) utils.visualize_samples(Xs, file_name, num_rows=20) # DUMP PARAMETERS FROM TIME-TO-TIME if (i % 10000 == 0): DN.save_to_file(f_name=RESULT_PATH+"pt_walk_params_b{0:d}_DN.pkl".format(i)) IN.save_to_file(f_name=RESULT_PATH+"pt_walk_params_b{0:d}_IN.pkl".format(i)) GN.save_to_file(f_name=RESULT_PATH+"pt_walk_params_b{0:d}_GN.pkl".format(i)) return
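# sample_masks and sample_patch_masks come from elsewhere in the codebase;
# from their use here, one evidently draws per-pixel keep/drop masks and
# the other zeroes a square patch per image. A rough numpy sketch with
# that shape of behavior (conventions such as whether 1 means "kept" are
# assumptions):

import numpy as np

def sample_drop_masks(X, drop_prob, rng):
    """Per-pixel masks: 1 with probability (1 - drop_prob), else 0."""
    return (rng.rand(*X.shape) > drop_prob).astype(X.dtype)

def sample_square_patch_masks(X, im_shape, patch_shape, rng):
    """Masks that zero out one random patch_shape window per image."""
    rows, cols = im_shape
    p_rows, p_cols = patch_shape
    M = np.ones(X.shape, dtype=X.dtype)
    for k in range(X.shape[0]):
        r = rng.randint(rows - p_rows + 1)
        c = rng.randint(cols - p_cols + 1)
        img = M[k].reshape((rows, cols))
        img[r:r+p_rows, c:c+p_cols] = 0.0
        M[k] = img.ravel()  # write the modified window back explicitly
    return M

# combined occlusion, as in the loop above: Xm = drop_mask * patch_mask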
def test_imocld_imp_mnist(step_type='add', occ_dim=14, drop_prob=0.0, attention=False):
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    x_dim = Xtr.shape[1]
    write_dim = 300
    enc_dim = 300
    dec_dim = 300
    mix_dim = 20
    z_dim = 100
    n_iter = 16
    dp_int = int(100.0 * drop_prob)

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    att_tag = "NA"  # attention not implemented yet

    # setup the reader and writer (shared by primary and guide policies)
    read_dim = 2*x_dim  # dimension of output from reader_mlp
    reader_mlp = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
    writer_mlp = MLP([None, None], [dec_dim, write_dim, x_dim], \
                     name="writer_mlp", **inits)

    # mlps for setting conditionals over z_mix
    mix_var_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_var_mlp", **inits)
    mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \
                          name="mix_enc_mlp", **inits)
    # mlp for decoding z_mix into a distribution over initial LSTM states
    mix_dec_mlp = MLP([Tanh(), Tanh()], \
                      [mix_dim, 250, (2*enc_dim + 2*dec_dim + 2*enc_dim)], \
                      name="mix_dec_mlp", **inits)
    # mlps for processing inputs to LSTMs
    var_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim], \
                     name="var_mlp_in", **inits)
    enc_mlp_in = MLP([Identity()], [(read_dim + dec_dim), 4*enc_dim], \
                     name="enc_mlp_in", **inits)
    dec_mlp_in = MLP([Identity()], [z_dim, 4*dec_dim], \
                     name="dec_mlp_in", **inits)
    # mlps for turning LSTM outputs into conditionals over z_gen
    var_mlp_out = CondNet([], [enc_dim, z_dim], name="var_mlp_out", **inits)
    enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits)
    # LSTMs for the actual LSTMs (obviously, perhaps)
    var_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)
    enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="enc_rnn", **rnninits)
    dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="dec_rnn", **rnninits)

    draw = IMoCLDrawModels(
                n_iter,
                step_type=step_type,  # step_type can be 'add' or 'jump'
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                mix_enc_mlp=mix_enc_mlp,
                mix_dec_mlp=mix_dec_mlp,
                mix_var_mlp=mix_var_mlp,
                enc_mlp_in=enc_mlp_in,
                enc_mlp_out=enc_mlp_out,
                enc_rnn=enc_rnn,
                dec_mlp_in=dec_mlp_in,
                dec_rnn=dec_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn)
    draw.initialize()

    # build the cost gradients, training function, samplers, etc.
    draw.build_model_funcs()

    #draw.load_model_params(f_name="TBCLM_IMP_PARAMS_OD{}_DP{}_{}_{}.pkl".format(occ_dim, dp_int, step_type, att_tag))

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("TBCLM_IMP_RESULTS_OD{}_DP{}_{}_{}.txt".format(occ_dim, dp_int, step_type, att_tag), 'wb')
    out_file.flush()
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.5
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i+1) / 1000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        if (i > 10000):
            momentum = 0.90
        else:
            momentum = 0.50
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        zero_ary = np.zeros((1,))
        draw.lr.set_value(to_fX(zero_ary + learn_rate))
        draw.mom_1.set_value(to_fX(zero_ary + momentum))
        draw.mom_2.set_value(to_fX(zero_ary + 0.99))
        # perform a minibatch update and record the cost for this batch
        Xb = to_fX(Xtr.take(batch_idx, axis=0))
        _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \
                                          occ_dim=occ_dim, data_mean=None)
        result = draw.train_joint(Xb, Mb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        if ((i % 200) == 0):
            costs = [(v / 200.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = " total_cost: {0:.4f}".format(costs[0])
            str3 = " nll_bound : {0:.4f}".format(costs[1])
            str4 = " nll_term : {0:.4f}".format(costs[2])
            str5 = " kld_q2p : {0:.4f}".format(costs[3])
            str6 = " kld_p2q : {0:.4f}".format(costs[4])
            str7 = " reg_term : {0:.4f}".format(costs[5])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            draw.save_model_params("TBCLM_IMP_PARAMS_OD{}_DP{}_{}_{}.pkl".format(occ_dim, dp_int, step_type, att_tag))
            # compute a small-sample estimate of NLL bound on validation set
            Xva = row_shuffle(Xva)
            Xb = to_fX(Xva[:5000])
            _, Xb, Mb = construct_masked_data(Xb, drop_prob=drop_prob, \
                                              occ_dim=occ_dim, data_mean=None)
            va_costs = draw.compute_nll_bound(Xb, Mb)
            str1 = " va_nll_bound : {}".format(va_costs[1])
            str2 = " va_nll_term : {}".format(va_costs[2])
            str3 = " va_kld_q2p : {}".format(va_costs[3])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
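# Almost every imputation test in this file goes through
# `construct_masked_data(xb, drop_prob, occ_dim, data_mean)` and unpacks a
# triple (xi, xo, xm). The sketch below records the interface these tests
# appear to assume: xo is the unmodified target, xm is a binary mask with 1
# marking *known* pixels, and xi is xo with unknown pixels filled from
# data_mean (or zeros when data_mean is None). This is an illustrative
# assumption about the helper, not its actual code.

def construct_masked_data_sketch(xb, drop_prob=0.0, occ_dim=None, data_mean=None):
    xo = xb.copy()
    # random pixel dropout...
    xm = 1.0 * (npr.rand(*xb.shape) > drop_prob)
    # ...optionally combined with one square occlusion of side occ_dim
    if (occ_dim is not None) and (occ_dim > 0):
        im_dim = int(np.sqrt(xb.shape[1]))
        off = npr.randint(0, im_dim - occ_dim + 1, size=(xb.shape[0], 2))
        for k in range(xb.shape[0]):
            m = xm[k].reshape((im_dim, im_dim))  # view into row k of xm
            m[off[k, 0]:(off[k, 0] + occ_dim), off[k, 1]:(off[k, 1] + occ_dim)] = 0.0
    fill = 0.0 if (data_mean is None) else data_mean
    xi = (xm * xo) + ((1.0 - xm) * fill)
    return to_fX(xi), to_fX(xo), to_fX(xm)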
def test_seq_cond_gen_copy(step_type='add', res_tag="AAA"):
    ##############################
    # File tag, for output stuff #
    ##############################
    result_tag = "{}TEST_{}".format(RESULT_PATH, res_tag)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # merge validation set and training set, and test on test set.
    #Xtr = np.concatenate((Xtr, Xva), axis=0)
    #Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))

    # basic params
    batch_size = 128
    traj_len = 20
    im_dim = 28
    obs_dim = im_dim * im_dim

    def sample_batch(np_ary, bs=100):
        row_count = np_ary.shape[0]
        samp_idx = npr.randint(low=0, high=row_count, size=(bs,))
        xb = np_ary.take(samp_idx, axis=0)
        return xb

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    total_steps = traj_len
    init_steps = 5
    exit_rate = 0.1
    nll_weight = 0.0
    x_dim = obs_dim
    y_dim = obs_dim
    z_dim = 128
    att_spec_dim = 5
    rnn_dim = 512
    mlp_dim = 512

    def visualize_attention(result, pre_tag="AAA", post_tag="AAA"):
        seq_len = result[0].shape[0]
        samp_count = result[0].shape[1]
        # get generated predictions
        x_samps = np.zeros((seq_len * samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                x_samps[idx] = result[0][s2, s1, :]
                idx += 1
        file_name = "{0:s}_traj_xs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(x_samps, file_name, num_rows=samp_count)
        # get sequential attention maps
        seq_samps = np.zeros((seq_len * samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[1][s2, s1, :]
                idx += 1
        file_name = "{0:s}_traj_att_maps_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
        # get sequential attention maps (read out values)
        seq_samps = np.zeros((seq_len * samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[2][s2, s1, :]
                idx += 1
        file_name = "{0:s}_traj_read_outs_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
        # get original input sequences
        seq_samps = np.zeros((seq_len * samp_count, obs_dim))
        idx = 0
        for s1 in range(samp_count):
            for s2 in range(seq_len):
                seq_samps[idx] = result[3][s2, s1, :]
                idx += 1
        file_name = "{0:s}_traj_xs_in_{1:s}.png".format(pre_tag, post_tag)
        utils.visualize_samples(seq_samps, file_name, num_rows=samp_count)
        return

    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # module for doing local 2d read defined by an attention specification
    img_scale = 1.0  # image coords will range over [-img_scale...img_scale]
    read_N = 2       # use NxN grid for reader
    reader_mlp = FovAttentionReader2d(x_dim=obs_dim,
                                      width=im_dim, height=im_dim, N=read_N,
                                      img_scale=img_scale, att_scale=0.5,
                                      **inits)
    read_dim = reader_mlp.read_dim  # total number of "pixels" read by reader

    # MLP for updating belief state based on con_rnn
    writer_mlp = MLP([None, None], [rnn_dim, mlp_dim, obs_dim], \
                     name="writer_mlp", **inits)

    # mlps for processing inputs to LSTMs
    con_mlp_in = MLP([Identity()], \
                     [z_dim, 4*rnn_dim], \
                     name="con_mlp_in", **inits)
    var_mlp_in = MLP([Identity()], \
                     [(read_dim + read_dim + att_spec_dim + rnn_dim), 4*rnn_dim], \
                     name="var_mlp_in", **inits)
    gen_mlp_in = MLP([Identity()], \
                     [(read_dim + att_spec_dim + rnn_dim), 4*rnn_dim], \
                     name="gen_mlp_in", **inits)

    # mlps for turning LSTM outputs into conditionals over z_gen
    con_mlp_out = CondNet([], [rnn_dim, att_spec_dim], \
                          name="con_mlp_out", **inits)
    gen_mlp_out = CondNet([], [rnn_dim, z_dim], name="gen_mlp_out", **inits)
    var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits)

    # LSTMs for the actual LSTMs (obviously, perhaps)
    con_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="con_rnn", **rnninits)
    gen_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="gen_rnn", **rnninits)
    var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \
                         name="var_rnn", **rnninits)

    SCG = SeqCondGenRAM(
                x_and_y_are_seqs=False,
                total_steps=total_steps,
                init_steps=init_steps,
                exit_rate=exit_rate,
                nll_weight=nll_weight,
                step_type=step_type,
                x_dim=obs_dim,
                y_dim=obs_dim,
                reader_mlp=reader_mlp,
                writer_mlp=writer_mlp,
                con_mlp_in=con_mlp_in,
                con_mlp_out=con_mlp_out,
                con_rnn=con_rnn,
                gen_mlp_in=gen_mlp_in,
                gen_mlp_out=gen_mlp_out,
                gen_rnn=gen_rnn,
                var_mlp_in=var_mlp_in,
                var_mlp_out=var_mlp_out,
                var_rnn=var_rnn)
    SCG.initialize()

    compile_start_time = time.time()

    # build the attention trajectory sampler
    SCG.build_attention_funcs()

    # quick test of attention trajectory sampler
    Xb = sample_batch(Xtr, bs=32)
    result = SCG.sample_attention(Xb, Xb)
    visualize_attention(result, pre_tag=result_tag, post_tag="b0")

    # build the main model functions (i.e. training and cost functions)
    SCG.build_model_funcs()

    compile_end_time = time.time()
    compile_minutes = (compile_end_time - compile_start_time) / 60.0
    print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes))

    # TEST SAVE/LOAD FUNCTIONALITY
    param_save_file = "{}_params.pkl".format(result_tag)
    SCG.save_model_params(param_save_file)
    SCG.load_model_params(param_save_file)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    print("Beginning to train the model...")
    out_file = open("{}_results.txt".format(result_tag), 'wb')
    out_file.flush()
    costs = [0. for i in range(10)]
    learn_rate = 0.0001
    momentum = 0.95
    for i in range(250000):
        lr_scale = min(1.0, ((i + 1) / 5000.0))
        mom_scale = min(1.0, ((i + 1) / 10000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # set sgd and objective function hyperparams for this update
        SCG.set_sgd_params(lr=lr_scale*learn_rate,
                           mom_1=mom_scale*momentum, mom_2=0.99)
        SCG.set_lam_kld(lam_kld_q2p=0.95, lam_kld_p2q=0.05, \
                        lam_kld_amu=0.0, lam_kld_alv=0.1)
        # perform a minibatch update and record the cost for this batch
        Xb = sample_batch(Xtr, bs=batch_size)
        result = SCG.train_joint(Xb, Xb)
        costs = [(costs[j] + result[j]) for j in range(len(result))]
        # output diagnostic information and checkpoint parameters, etc.
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = " total_cost: {0:.4f}".format(costs[0])
            str3 = " nll_term : {0:.4f}".format(costs[1])
            str4 = " kld_q2p : {0:.4f}".format(costs[2])
            str5 = " kld_p2q : {0:.4f}".format(costs[3])
            str6 = " kld_amu : {0:.4f}".format(costs[4])
            str7 = " kld_alv : {0:.4f}".format(costs[5])
            str8 = " reg_term : {0:.4f}".format(costs[6])
            joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7, str8])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 500) == 0):
            SCG.save_model_params("{}_params.pkl".format(result_tag))
            #############################################
            # check model performance on validation set #
            #############################################
            Xb = sample_batch(Xva, bs=500)
            result = SCG.compute_nll_bound(Xb, Xb)
            str2 = " va_total_cost: {0:.4f}".format(float(result[0]))
            str3 = " va_nll_term : {0:.4f}".format(float(result[1]))
            str4 = " va_kld_q2p : {0:.4f}".format(float(result[2]))
            str5 = " va_kld_p2q : {0:.4f}".format(float(result[3]))
            str6 = " va_kld_amu : {0:.4f}".format(float(result[4]))
            str7 = " va_kld_alv : {0:.4f}".format(float(result[5]))
            str8 = " va_reg_term : {0:.4f}".format(float(result[6]))
            joint_str = "\n".join([str2, str3, str4, str5, str6, str7, str8])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            ###########################################
            # sample and draw attention trajectories. #
            ###########################################
            Xb = sample_batch(Xva, bs=32)
            result = SCG.sample_attention(Xb, Xb)
            post_tag = "b{0:d}".format(i)
            visualize_attention(result, pre_tag=result_tag, post_tag=post_tag)
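# The preprocessing pair `to_fX(shift_and_scale_into_01(X))` recurs in every
# test above. Plausible one-liners consistent with how they are called are
# sketched here; both are assumptions about repo helpers (in particular,
# whether the rescaling is global or per-example is not visible from the
# call sites).

def to_fX_sketch(np_ary):
    """Hypothetical to_fX: cast an array to theano's configured float type."""
    return np_ary.astype(theano.config.floatX)

def shift_and_scale_into_01_sketch(X):
    """Hypothetical shift_and_scale_into_01: affinely map X into [0, 1]."""
    X = X - np.min(X)
    X = X / (np.max(X) + 1e-8)
    return X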
def test_gc_pair():
    # Simple test code, to check that everything is basically functional.
    print("TESTING...")

    # Initialize a source of randomness
    rng = np.random.RandomState(1234)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, zero_mean=False)
    Xtr = datasets[0][0]
    tr_samples = Xtr.get_value(borrow=True).shape[0]
    data_dim = Xtr.get_value(borrow=True).shape[1]
    mm_proj_dim = 250

    # Do moment matching in some transformed space
    #P = np.identity(data_dim)
    P = npr.randn(data_dim, mm_proj_dim) / np.sqrt(float(mm_proj_dim))
    P = theano.shared(value=P.astype(theano.config.floatX), name='P_proj')

    target_mean, target_cov = projected_moments(Xtr, P, ary_type='theano')
    P = P.get_value(borrow=False).astype(theano.config.floatX)

    ###########################
    # Setup generator network #
    ###########################
    # Choose some parameters for the generative network
    gn_params = {}
    gn_config = [200, 1000, 1000, 28*28]
    gn_params['mlp_config'] = gn_config
    gn_params['lam_l2a'] = 1e-3
    gn_params['vis_drop'] = 0.0
    gn_params['hid_drop'] = 0.0
    gn_params['bias_noise'] = 0.1
    gn_params['out_type'] = 'bernoulli'
    gn_params['activation'] = relu_actfun
    gn_params['init_scale'] = 4.0

    # Symbolic input matrix to generator network
    Xp_sym = T.matrix(name='Xp_sym')
    Xd_sym = T.matrix(name='Xd_sym')

    # Initialize a generator network object
    GN = GenNet(rng=rng, Xp=Xp_sym, prior_sigma=1.0, params=gn_params)

    ###############################
    # Setup discriminator network #
    ###############################
    # Set some reasonable mlp parameters
    dn_params = {}
    # Set up some proto-networks
    pc0 = [28*28, (250, 4), (250, 4), 11]
    dn_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    dn_params['spawn_configs'] = [sc0]
    dn_params['spawn_weights'] = [1.0]
    # Set remaining params
    dn_params['ear_type'] = 2
    dn_params['ear_lam'] = 0.0
    dn_params['lam_l2a'] = 1e-3
    dn_params['vis_drop'] = 0.2
    dn_params['hid_drop'] = 0.5
    # Initialize a network object to use as the discriminator
    DN = PeaNet(rng=rng, Xd=Xd_sym, params=dn_params)

    ########################################################################
    # Initialize the joint controller for the generator/discriminator pair #
    ########################################################################
    gcp_params = {}
    gcp_params['d_net'] = DN
    gcp_params['g_net'] = GN
    gcp_params['lam_l2d'] = 1e-2
    gcp_params['mom_mix_rate'] = 0.05
    gcp_params['mom_match_weight'] = 0.05
    gcp_params['mom_match_proj'] = P
    gcp_params['target_mean'] = target_mean
    gcp_params['target_cov'] = target_cov
    # Initialize a GCPair instance using the previously constructed generator
    # and discriminator networks.
    GCP = GCPair(rng=rng, Xd=Xd_sym, Xp=Xp_sym, d_net=DN, g_net=GN, \
                 data_dim=28*28, params=gcp_params)

    gn_learn_rate = 0.02
    dn_learn_rate = 0.01
    GCP.set_gn_sgd_params(learn_rate=gn_learn_rate, momentum=0.98)
    GCP.set_dn_sgd_params(learn_rate=dn_learn_rate, momentum=0.98)
    # Init generator's mean and covariance estimates with many samples
    GCP.init_moments(10000)

    batch_idx = T.lvector('batch_idx')
    batch_sample = theano.function(inputs=[batch_idx], \
                                   outputs=Xtr.take(batch_idx, axis=0))

    for i in range(750000):
        tr_idx = npr.randint(low=0, high=tr_samples, size=(100,)).astype(np.int32)
        Xn_np = GN.sample_from_prior(100)
        Xd_batch = batch_sample(tr_idx)
        Xd_batch = Xd_batch.astype(theano.config.floatX)
        Xn_batch = Xn_np.astype(theano.config.floatX)
        all_idx = np.arange(200)
        data_idx = all_idx[:100]
        noise_idx = all_idx[100:]
        scale = min(1.0, float(i+1)/10000.0)
        GCP.set_disc_weights(dweight_gn=scale, dweight_dn=scale)
        outputs = GCP.train_joint(Xd_batch, Xn_batch, data_idx, noise_idx)
        mom_match_cost = 1.0 * outputs[0]
        disc_cost_gn = 1.0 * outputs[1]
        disc_cost_dn = 1.0 * outputs[2]
        if (((i+1) % 100000) == 0):
            gn_learn_rate = gn_learn_rate * 0.7
            dn_learn_rate = dn_learn_rate * 0.7
            GCP.set_gn_sgd_params(learn_rate=gn_learn_rate, momentum=0.98)
            GCP.set_dn_sgd_params(learn_rate=dn_learn_rate, momentum=0.98)
        if ((i % 1000) == 0):
            print("batch: {0:d}, mom_match_cost: {1:.4f}, disc_cost_gn: {2:.4f}, disc_cost_dn: {3:.4f}".format( \
                    i, mom_match_cost, disc_cost_gn, disc_cost_dn))
        if ((i % 5000) == 0):
            file_name = "GCP_SAMPLES_b{0:d}.png".format(i)
            Xs = GCP.sample_from_gn(200)
            utils.visualize_samples(Xs, file_name)
            file_name = "GCP_WEIGHTS_b{0:d}.png".format(i)
            utils.visualize(GCP.DN, 0, 0, file_name)
    print("TESTING COMPLETE!")
    return
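# `test_gc_pair` matches generator moments against `target_mean` and
# `target_cov` from `projected_moments(Xtr, P, ary_type='theano')`. A numpy
# sketch of the assumed computation (mean and covariance of the data after
# projecting through P) is given below; the real helper presumably builds
# the equivalent theano graph when ary_type='theano'.

def projected_moments_sketch(X, P):
    """Hypothetical projected_moments: first and second moments of X @ P."""
    Xp = np.dot(X, P)                       # (n_samples, proj_dim)
    mu = np.mean(Xp, axis=0)                # projected mean
    Xc = Xp - mu
    sigma = np.dot(Xc.T, Xc) / Xp.shape[0]  # projected covariance
    return mu, sigma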
def test_mnist_results(step_type='add', imp_steps=6, occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}GPSI_OD{}_DP{}_IS{}_{}_NA".format(RESULT_PATH, occ_dim,
                                                      dp_int, imp_steps, step_type)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xte = datasets[2][0]
    # Merge validation set and training set, and test on test set.
    Xtr = np.concatenate((Xtr, Xva), axis=0)
    Xva = Xte
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 250
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    # Load parameters from a previously trained model
    print("Testing model load from file...")
    GPSI = load_gpsimputer_from_file(f_name="{}_PARAMS.pkl".format(result_tag), \
                                     rng=rng)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_FINAL_RESULTS_NEW.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    Xva = row_shuffle(Xva)
    # record an estimate of performance on the test set
    str0 = "GUIDED SAMPLE BOUND:"
    print(str0)
    xi, xo, xm = construct_masked_data(Xva[:5000], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_0, kld_0 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=True)
    xi, xo, xm = construct_masked_data(Xva[5000:], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_1, kld_1 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=True)
    nll = np.concatenate((nll_0, nll_1))
    kld = np.concatenate((kld_0, kld_1))
    vfe = np.mean(nll) + np.mean(kld)
    str1 = " va_nll_bound : {}".format(vfe)
    str2 = " va_nll_term : {}".format(np.mean(nll))
    str3 = " va_kld_q2p : {}".format(np.mean(kld))
    joint_str = "\n".join([str0, str1, str2, str3])
    print(joint_str)
    out_file.write(joint_str + "\n")
    out_file.flush()
    # record an estimate of performance on the test set
    str0 = "UNGUIDED SAMPLE BOUND:"
    print(str0)
    xi, xo, xm = construct_masked_data(Xva[:5000], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_0, kld_0 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=False)
    xi, xo, xm = construct_masked_data(Xva[5000:], drop_prob=drop_prob, \
                                       occ_dim=occ_dim, data_mean=data_mean)
    nll_1, kld_1 = GPSI.compute_fe_terms(xi, xo, xm, sample_count=10, \
                                         use_guide_policy=False)
    nll = np.concatenate((nll_0, nll_1))
    kld = np.concatenate((kld_0, kld_1))
    str1 = " va_nll_bound : {}".format(np.mean(nll))
    str2 = " va_nll_term : {}".format(np.mean(nll))
    str3 = " va_kld_q2p : {}".format(np.mean(kld))
    joint_str = "\n".join([str0, str1, str2, str3])
    print(joint_str)
    out_file.write(joint_str + "\n")
    out_file.flush()
def test_with_model_init():
    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = to_fX(datasets[0][0])
    Xva = to_fX(datasets[1][0])
    Ytr = datasets[0][1]
    Yva = datasets[1][1]
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    BD = lambda ary: binarize_data(ary)

    #######################################
    # Setup some parameters for the model #
    #######################################
    obs_dim = Xtr.shape[1]
    z_dim = 64
    init_scale = 0.2

    # some InfNet instances to build the TwoStageModel from
    x_in = T.matrix('x_in')
    y_in = T.lvector('y_in')

    ###############
    # q_z_given_x #
    ###############
    print("Building q_z_given_x...")
    params = {}
    shared_config = [obs_dim, 1000, 1000]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.2
    params['hid_drop'] = 0.5
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_z_given_x = InfNet(rng=rng, Xd=x_in, \
                         params=params, shared_param_dicts=None)
    q_z_given_x.init_biases(0.2)

    ###########################################################
    # Define parameters for the ClassModel, and initialize it #
    ###########################################################
    print("Building the ClassModel...")
    CM = ClassModel(rng=rng, \
                    x_in=x_in, y_in=y_in, \
                    q_z_given_x=q_z_given_x, \
                    class_count=10, \
                    z_dim=z_dim, \
                    use_samples=False)
    CM.set_drop_rate(0.5)
    CM.set_lam_nll(lam_nll=1.0)
    CM.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0)
    CM.set_lam_l2w(lam_l2w=1e-5)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    out_file = open("CM_RESULTS.txt", 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.9
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(250000):
        scale = min(1.0, ((i+1) / 1000.0))
        if (((i + 1) % 10000) == 0):
            learn_rate = learn_rate * 0.95
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr, Ytr = row_shuffle(Xtr, Ytr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        CM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \
                          mom_1=scale*momentum, mom_2=0.99)
        # perform a minibatch update and record the cost for this batch
        Xi_tr = Xtr.take(batch_idx, axis=0)
        Yi_tr = Ytr.take(batch_idx, axis=0)
        result = CM.train_joint(Xi_tr, Yi_tr)
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        # output useful information about training progress
        if ((i % 500) == 0):
            costs = [(v / 500.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = " joint_cost : {0:.4f}".format(costs[0])
            str3 = " nll_cost : {0:.4f}".format(costs[1])
            str4 = " kld_cost : {0:.4f}".format(costs[2])
            str5 = " reg_cost : {0:.4f}".format(costs[3])
            joint_str = "\n".join([str1, str2, str3, str4, str5])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))):
            #####################################################
            # compute multi-sample estimates of the free-energy #
            #####################################################
            # training set...
            fe_terms = CM.compute_fe_terms(Xtr[0:2500], Ytr[0:2500], 30)
            fe_nll = np.mean(fe_terms[0])
            fe_kld = np.mean(fe_terms[1])
            fe_joint = fe_nll + fe_kld
            joint_str = " vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \
                    fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]),
                    fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1]))
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            # validation set...
            Xva, Yva = row_shuffle(Xva, Yva)
            fe_terms = CM.compute_fe_terms(Xva[0:2500], Yva[0:2500], 30)
            fe_nll = np.mean(fe_terms[0])
            fe_kld = np.mean(fe_terms[1])
            fe_joint = fe_nll + fe_kld
            joint_str = " vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \
                    fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]),
                    fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1]))
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            ##########################################################
            # compute multi-sample estimates of classification error #
            ##########################################################
            # training set...
            tr_error, tr_preds = CM.class_error(Xtr[:2500], Ytr[:2500], samples=30)
            joint_str = " tr-class-error: {0:.4f}".format(tr_error)
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            # validation set...
            va_error, va_preds = CM.class_error(Xva[:2500], Yva[:2500], samples=30)
            joint_str = " va-class-error: {0:.4f}".format(va_error)
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
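# The epoch logic above calls `row_shuffle` both as `Xtr = row_shuffle(Xtr)`
# and as `Xtr, Ytr = row_shuffle(Xtr, Ytr)`. A sketch consistent with both
# call patterns (one permutation shared across aligned arrays) follows;
# this is an assumption about the helper, not its actual definition.

def row_shuffle_sketch(X, Y=None):
    perm = npr.permutation(X.shape[0])
    if Y is None:
        return X[perm]
    return X[perm], Y[perm]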
def batch_test_ss_mlp_pt(test_count=10, su_count=1000):
    """Setup basic test for semisupervised EAR-regularized MLP."""
    # Set some reasonable sgd parameters
    sgd_params = {}
    sgd_params['start_rate'] = 0.01
    sgd_params['decay_rate'] = 0.998
    sgd_params['wt_norm_bound'] = 3.5
    sgd_params['epochs'] = 1000
    sgd_params['batch_size'] = 100
    sgd_params['result_tag'] = '---'
    # Set some reasonable mlp parameters
    mlp_params = {}
    # Set up some proto-networks
    pc0 = [28*28, 800, 800, 11]
    mlp_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    mlp_params['spawn_configs'] = [sc0, sc1]
    mlp_params['spawn_weights'] = [0.0, 1.0]
    # Set remaining params
    mlp_params['ear_type'] = 5
    mlp_params['ear_lam'] = 1.0
    mlp_params['lam_l2a'] = 1e-2
    mlp_params['reg_all_obs'] = True

    for test_num in range(test_count):
        rng_seed = test_num
        sgd_params['result_tag'] = "test_{0:d}".format(test_num)

        # Initialize a random number generator for this test
        rng = np.random.RandomState(rng_seed)
        # Load some data to train/validate/test with
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm(dataset, zero_mean=False)

        # Construct the EAR_NET object that we will be training
        x_in = T.matrix('x_in')
        NET = EAR_NET(rng=rng, input=x_in, params=mlp_params)
        init_biases(NET, b_init=0.05)

        ##########################################
        # First, pretrain each layer in the mlp. #
        ##########################################
        sgd_params['result_tag'] = "ss_ear_pt_s{0:d}_t{1:d}".format(su_count, test_num)
        sgd_params['batch_size'] = 25
        sgd_params['start_rate'] = 0.02
        sgd_params['epochs'] = 40
        for i in range(len(NET.dae_costs)):
            print("==================================================")
            print("Pretraining hidden layer(s) at depth {0:d}".format(i+1))
            print("==================================================")
            train_dae(NET, i, mlp_params, sgd_params)

        # Load some data to train/validate/test with
        rng = np.random.RandomState(rng_seed)
        dataset = 'data/mnist.pkl.gz'
        datasets = load_udm_ss(dataset, su_count, rng, zero_mean=False)

        # Run semisupervised training on the given MLP
        sgd_params['batch_size'] = 100
        sgd_params['start_rate'] = 0.04
        # Train with weak EAR regularization
        sgd_params['top_only'] = True
        sgd_params['epochs'] = 5
        NET.set_ear_lam(0.0)
        train_ss_mlp(NET, sgd_params, datasets)
        COMMENT = """
        # Train with no EAR regularization
        sgd_params['top_only'] = False
        sgd_params['epochs'] = 100
        NET.set_ear_lam(0.0)
        train_ss_mlp(NET, sgd_params, datasets)
        """
        # Train with weak EAR regularization
        sgd_params['top_only'] = False
        sgd_params['epochs'] = 5
        NET.set_ear_lam(0.5)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with weak EAR regularization
        sgd_params['epochs'] = 10
        NET.set_ear_lam(1.0)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with more EAR regularization
        sgd_params['epochs'] = 15
        NET.set_ear_lam(1.5)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with more EAR regularization
        sgd_params['epochs'] = 20
        NET.set_ear_lam(2.0)
        train_ss_mlp(NET, sgd_params, datasets)
        # Train with more EAR regularization
        sgd_params['epochs'] = 100
        NET.set_ear_lam(3.0)
        train_ss_mlp(NET, sgd_params, datasets)
    return
def test_mnist(occ_dim=15, drop_prob=0.0):
    #########################################
    # Format the result tag more thoroughly #
    #########################################
    dp_int = int(100.0 * drop_prob)
    result_tag = "{}VAE_OD{}_DP{}".format(RESULT_PATH, occ_dim, dp_int)

    ##########################
    # Get some training data #
    ##########################
    rng = np.random.RandomState(1234)
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm(dataset, as_shared=False, zero_mean=False)
    Xtr = datasets[0][0]
    Xva = datasets[1][0]
    Xtr = to_fX(shift_and_scale_into_01(Xtr))
    Xva = to_fX(shift_and_scale_into_01(Xva))
    tr_samples = Xtr.shape[0]
    va_samples = Xva.shape[0]
    batch_size = 200
    batch_reps = 1
    all_pix_mean = np.mean(np.mean(Xtr, axis=1))
    data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],)))

    ############################################################
    # Setup some parameters for the Iterative Refinement Model #
    ############################################################
    obs_dim = Xtr.shape[1]
    z_dim = 100
    imp_steps = 15  # we'll check for the best step count (found oracularly)
    init_scale = 1.0

    x_in_sym = T.matrix('x_in_sym')
    x_out_sym = T.matrix('x_out_sym')
    x_mask_sym = T.matrix('x_mask_sym')

    #################
    # p_zi_given_xi #
    #################
    params = {}
    shared_config = [obs_dim, 500, 500]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_zi_given_xi = InfNet(rng=rng, Xd=x_in_sym, \
                           params=params, shared_param_dicts=None)
    p_zi_given_xi.init_biases(0.2)

    ###################
    # p_xip1_given_zi #
    ###################
    params = {}
    shared_config = [z_dim, 500, 500]
    output_config = [obs_dim, obs_dim]
    params['shared_config'] = shared_config
    params['output_config'] = output_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    p_xip1_given_zi = HydraNet(rng=rng, Xd=x_in_sym, \
                               params=params, shared_param_dicts=None)
    p_xip1_given_zi.init_biases(0.2)

    ###################
    # q_zi_given_x_xi #
    ###################
    params = {}
    shared_config = [(obs_dim + obs_dim), 500, 500]
    top_config = [shared_config[-1], z_dim]
    params['shared_config'] = shared_config
    params['mu_config'] = top_config
    params['sigma_config'] = top_config
    params['activation'] = relu_actfun
    params['init_scale'] = init_scale
    params['lam_l2a'] = 0.0
    params['vis_drop'] = 0.0
    params['hid_drop'] = 0.0
    params['bias_noise'] = 0.0
    params['input_noise'] = 0.0
    params['build_theano_funcs'] = False
    q_zi_given_x_xi = InfNet(rng=rng, Xd=x_in_sym, \
                             params=params, shared_param_dicts=None)
    q_zi_given_x_xi.init_biases(0.2)

    ###########################################################
    # Define parameters for the GPSImputer, and initialize it #
    ###########################################################
    print("Building the GPSImputer...")
    gpsi_params = {}
    gpsi_params['obs_dim'] = obs_dim
    gpsi_params['z_dim'] = z_dim
    gpsi_params['imp_steps'] = imp_steps
    gpsi_params['step_type'] = 'jump'
    gpsi_params['x_type'] = 'bernoulli'
    gpsi_params['obs_transform'] = 'sigmoid'
    gpsi_params['use_osm_mode'] = True
    GPSI = GPSImputer(rng=rng,
                      x_in=x_in_sym, x_out=x_out_sym, x_mask=x_mask_sym, \
                      p_zi_given_xi=p_zi_given_xi, \
                      p_xip1_given_zi=p_xip1_given_zi, \
                      q_zi_given_x_xi=q_zi_given_x_xi, \
                      params=gpsi_params, \
                      shared_param_dicts=None)

    #########################################################################
    # Define parameters for the underlying OneStageModel, and initialize it #
    #########################################################################
    print("Building the OneStageModel...")
    osm_params = {}
    osm_params['x_type'] = 'bernoulli'
    osm_params['xt_transform'] = 'sigmoid'
    OSM = OneStageModel(rng=rng, \
                        x_in=x_in_sym, \
                        p_x_given_z=p_xip1_given_zi, \
                        q_z_given_x=p_zi_given_xi, \
                        x_dim=obs_dim, z_dim=z_dim, \
                        params=osm_params)

    ################################################################
    # Apply some updates, to check that they aren't totally broken #
    ################################################################
    log_name = "{}_RESULTS.txt".format(result_tag)
    out_file = open(log_name, 'wb')
    costs = [0. for i in range(10)]
    learn_rate = 0.0002
    momentum = 0.5
    batch_idx = np.arange(batch_size) + tr_samples
    for i in range(200000):
        scale = min(1.0, ((i+1) / 5000.0))
        if (((i + 1) % 15000) == 0):
            learn_rate = learn_rate * 0.92
        if (i > 10000):
            momentum = 0.90
        else:
            momentum = 0.50
        # get the indices of training samples for this batch update
        batch_idx += batch_size
        if (np.max(batch_idx) >= tr_samples):
            # we finished an "epoch", so we rejumble the training set
            Xtr = row_shuffle(Xtr)
            batch_idx = np.arange(batch_size)
        # set sgd and objective function hyperparams for this update
        OSM.set_sgd_params(lr=scale*learn_rate, \
                           mom_1=scale*momentum, mom_2=0.99)
        OSM.set_lam_nll(lam_nll=1.0)
        OSM.set_lam_kld(lam_kld_1=1.0, lam_kld_2=0.0)
        OSM.set_lam_l2w(1e-4)
        # perform a minibatch update and record the cost for this batch
        xb = to_fX(Xtr.take(batch_idx, axis=0))
        result = OSM.train_joint(xb, batch_reps)
        costs = [(costs[j] + result[j]) for j in range(len(result)-1)]
        if ((i % 250) == 0):
            costs = [(v / 250.0) for v in costs]
            str1 = "-- batch {0:d} --".format(i)
            str2 = " joint_cost: {0:.4f}".format(costs[0])
            str3 = " nll_cost : {0:.4f}".format(costs[1])
            str4 = " kld_cost : {0:.4f}".format(costs[2])
            str5 = " reg_cost : {0:.4f}".format(costs[3])
            joint_str = "\n".join([str1, str2, str3, str4, str5])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
            costs = [0.0 for v in costs]
        if ((i % 1000) == 0):
            Xva = row_shuffle(Xva)
            # record an estimate of performance on the test set
            xi, xo, xm = construct_masked_data(Xva[0:5000], drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            step_nll, step_kld = GPSI.compute_per_step_cost(xi, xo, xm, sample_count=10)
            min_nll = np.min(step_nll)
            str1 = " va_nll_bound : {}".format(min_nll)
            str2 = " va_nll_min : {}".format(min_nll)
            str3 = " va_nll_final : {}".format(step_nll[-1])
            joint_str = "\n".join([str1, str2, str3])
            print(joint_str)
            out_file.write(joint_str + "\n")
            out_file.flush()
        if ((i % 5000) == 0):
            # Get some validation samples for evaluating model performance
            xb = to_fX(Xva[0:100])
            xi, xo, xm = construct_masked_data(xb, drop_prob=drop_prob, \
                                               occ_dim=occ_dim, data_mean=data_mean)
            xi = np.repeat(xi, 2, axis=0)
            xo = np.repeat(xo, 2, axis=0)
            xm = np.repeat(xm, 2, axis=0)
            # draw some sample imputations from the model
            samp_count = xi.shape[0]
            _, model_samps = GPSI.sample_imputer(xi, xo, xm, use_guide_policy=False)
            seq_len = len(model_samps)
            seq_samps = np.zeros((seq_len*samp_count, model_samps[0].shape[1]))
            idx = 0
            for s1 in range(samp_count):
                for s2 in range(seq_len):
                    seq_samps[idx] = model_samps[s2][s1]
                    idx += 1
            file_name = "{}_samples_ng_b{}.png".format(result_tag, i)
            utils.visualize_samples(seq_samps, file_name, num_rows=20)
            # get visualizations of policy parameters
            file_name = "{}_gen_gen_weights_b{}.png".format(result_tag, i)
            W = GPSI.gen_gen_weights.get_value(borrow=False)
            utils.visualize_samples(W[:, :obs_dim], file_name, num_rows=20)
            file_name = "{}_gen_inf_weights_b{}.png".format(result_tag, i)
            W = GPSI.gen_inf_weights.get_value(borrow=False).T
            utils.visualize_samples(W[:, :obs_dim], file_name, num_rows=20)
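# Several training loops in this file share the same minibatch bookkeeping
# idiom: batch_idx starts past the end of the data so the very first
# iteration triggers a shuffle, then slides forward by batch_size until it
# runs off the end again. A self-contained sketch of just that idiom:

def demo_batch_wraparound(tr_samples=1000, batch_size=250, n_updates=10):
    X = np.arange(tr_samples)
    batch_idx = np.arange(batch_size) + tr_samples  # forces a shuffle on step 0
    for i in range(n_updates):
        batch_idx += batch_size
        if np.max(batch_idx) >= tr_samples:
            X = npr.permutation(X)          # "rejumble" the training set
            batch_idx = np.arange(batch_size)
        xb = X.take(batch_idx, axis=0)      # this update's minibatch
        print("update {}: rows {}..{}".format(i, batch_idx[0], batch_idx[-1]))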
def test_dA(learning_rate=0.1, training_epochs=30,
            dataset='./data/mnist.pkl.gz',
            batch_size=25, output_folder='dA_plots'):
    """Train denoising autoencoders on MNIST, first with 0% and then with
    30% input corruption, and save images of the learned filters."""
    datasets = load_udm(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################
    rng = np.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_da = theano.function(inputs=[index], outputs=[cost], updates=updates,
            givens={x: train_set_x[index * batch_size:(index + 1) * batch_size, :]})

    ############
    # TRAINING #
    ############
    # go through training epochs
    for epoch in range(training_epochs):
        # go through training set
        c = []
        t1 = time.time()
        for batch_index in range(n_train_batches):
            c.append(train_da(batch_index))
        t2 = time.time()
        print("Training epoch {0:d}, cost {1:.4f}, time {2:.4f}".format( \
                epoch, np.mean(c), (t2 - t1)))

    image = PIL.Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_00.png')

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################
    rng = np.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function([index], cost, updates=updates,
            givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    ############
    # TRAINING #
    ############
    # go through training epochs
    for epoch in range(training_epochs):
        # go through training set
        c = []
        t1 = time.time()
        for batch_index in range(n_train_batches):
            c.append(train_da(batch_index))
        t2 = time.time()
        print("Training epoch {0:d}, cost {1:.4f}, time {2:.4f}".format( \
                epoch, np.mean(c), (t2 - t1)))

    image = PIL.Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_30.png')

    os.chdir('../')
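# Example invocation of the denoising-autoencoder test above, assuming
# MNIST is available at the default path; filter images are written into
# the (created if necessary) output_folder:
#
#   test_dA(learning_rate=0.1, training_epochs=30,
#           dataset='./data/mnist.pkl.gz', batch_size=25,
#           output_folder='dA_plots')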