def run_cond_aevb(base_data, cond_data):
    start_time = time.time()

    # Create aevb function
    # Training parameters
    D_c = cond_data.shape[1]
    D_b = base_data.shape[1]
    N_data = cond_data.shape[0]
    assert cond_data.shape[0] == base_data.shape[0]
    enc_layers = [D_c, hidden_units, hidden_units, hidden_units, 2 * latent_dimensions]
    dec_layers = [latent_dimensions + D_b, hidden_units, hidden_units, hidden_units, D_c]

    N_weights_enc, encoder, encoder_log_like = make_gaussian_nn(enc_layers)
    N_weights_dec, decoder, decoder_log_like = make_binary_nn(dec_layers)

    # Optimize aevb
    batch_size = 100
    num_training_iters = 1600
    rs = npr.RandomState(0)

    parser = WeightsParser()
    parser.add_shape('encoding weights', (N_weights_enc,))
    parser.add_shape('decoding weights', (N_weights_dec,))
    initial_combined_weights = rs.randn(len(parser)) * param_scale

    batch_idxs = make_batches(N_data, batch_size)

    def batch_value_and_grad(weights, iter):
        iter = iter % len(batch_idxs)
        # cur_base = base_data[batch_idxs[iter]]
        cur_cond = cond_data[batch_idxs[iter]]
        cur_im = base_data[batch_idxs[iter]]
        cur_b = apply_mask(cur_im)
        return lower_bound(weights, encoder, decoder_log_like, N_weights_enc,
                           cur_b, cur_cond, samples_per_image, latent_dimensions, rs)

    lb_grad = grad(batch_value_and_grad)

    base_test = np.repeat(apply_mask(base_data[0:10, :]), 10, axis=0)

    def callback(params, i, grad):
        ml = batch_value_and_grad(params, i)
        print "log marginal likelihood:", ml

        # Generate samples
        num_samples = 100
        images_per_row = 10
        zs = rs.randn(100, latent_dimensions)
        # zs = rs.randn(10, latent_dimensions)
        # zs = np.repeat(zs, 10, axis=0)
        # base_test = base_data[0:num_samples, :]
        # base_test = np.repeat(base_data[0:10, :], 10, axis=0)
        dec_in = np.concatenate((zs, base_test), axis=1)
        samples = decoder(parser.get(params, 'decoding weights'), dec_in)

        fig = plt.figure(1)
        fig.clf()
        ax = fig.add_subplot(111)
        # plot_images(samples, ax, ims_per_row=images_per_row)

        plot_shape = (100, 784)
        im_samples = np.zeros(plot_shape)
        im_mean = np.zeros(plot_shape)
        im_map = np.zeros(plot_shape)
        for k in xrange(plot_shape[0]):
            if k % 10 == 0:
                im_samples[k, :] = base_test[k, :]
                im_mean[k, :] = base_test[k, :]
                im_map[k, :] = base_test[k, :]
            else:
                im_mean[k, :] = samples[k - 1, :]
                im_samples[k, :] = np.random.binomial(1, samples[k - 1, :])
                im_map[k, :] = np.round(samples[k - 1, :])

        plot_images(im_samples, ax, ims_per_row=images_per_row)
        plt.savefig('samples.png')

        fig = plt.figure(1)
        fig.clf()
        ax = fig.add_subplot(111)
        plot_images(im_mean, ax, ims_per_row=images_per_row)
        plt.savefig('mean_samples.png')

        fig = plt.figure(1)
        fig.clf()
        ax = fig.add_subplot(111)
        plot_images(im_map, ax, ims_per_row=images_per_row)
        plt.savefig('map_samples.png')

        fig = plt.figure(1)
        fig.clf()
        ax = fig.add_subplot(111)
        plot_images(base_test, ax, ims_per_row=images_per_row)
        plt.savefig('blurred_samples.png')

    final_params = adam(lb_grad, initial_combined_weights,
                        num_training_iters, callback=callback)

    def decoder_with_weights(zs):
        return decoder(parser.get(final_params, 'decoding weights'), zs)
    return decoder_with_weights

    finish_time = time.time()
    print "total runtime", finish_time - start_time
def dropout_grad(weights, i):
    mask = npr.RandomState(seed * 10**6 + i).rand(len(weights)) > dropout_fraction
    masked_weights = weights * mask / (1 - dropout_fraction)
    return grad(masked_weights, i)
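# A minimal, self-contained sketch (not from the original source) of the trick used above:
# the dropout mask is derived from an RNG seeded by (seed, iteration), so exactly the same
# mask is regenerated whenever the same iteration is revisited. The names `seed`,
# `dropout_fraction`, and the toy weight vector below are illustrative assumptions.
import numpy as np
import numpy.random as npr

seed, dropout_fraction = 0, 0.5
weights = np.linspace(-1.0, 1.0, 8)

def masked(weights, i):
    # Same mask for the same iteration i, every time it is called.
    mask = npr.RandomState(seed * 10**6 + i).rand(len(weights)) > dropout_fraction
    return weights * mask / (1 - dropout_fraction)   # inverted-dropout rescaling

assert np.allclose(masked(weights, 3), masked(weights, 3))   # reproducible per iteration
print(masked(weights, 3))
print(masked(weights, 4))   # generally a different mask for a different iteration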
def init_var_params(D, rs=npr.RandomState(0), **kwargs):
    log_weights = np.ones(k)
    component_weights = [init_component_var_params(D, rs=rs, **kwargs)
                         for i in range(k)]
    return np.concatenate([log_weights] + component_weights)
    # Initialize the pseudo inputs for the first layer by sampling from the data,
    # and set the pseudo outputs equal to the inputs.
    x0[0] = np.ndarray.flatten(
        np.array(X)[rs.choice(len(X), num_pseudo_params, replace=False), :])
    y0[0] = x0[0]

    # For every other layer, set the inducing outputs to the inducing inputs
    # (which are sampled from N(0, .01)) and make the lengthscale large.
    for layer in xrange(1, n_layers):
        y0[layer] = x0[layer]
        layer_params[layer][3] = np.log(1)

    return pack_all_params(layer_params, x0, y0)

# Initialize covariance parameters and hiddens.
rs = npr.RandomState(1234)
init_params = .1 * rs.randn(total_num_params)

print("Optimizing covariance parameters...")
objective = lambda params: -log_likelihood(params)

if smart_init == 1:
    init_params = smart_initialize_params(init_params)

plot_xs = np.reshape(np.linspace(-5, 5, 300), (300, 1))
plot_full_gp(ax_first, init_params, plot_xs)
ax_first.set_title("Initial full predictions")
print("Objective: ", objective(init_params))

cov_params = minimize(value_and_grad(objective), init_params,
def build_parabola(D=1, n_data=20, noise_std=0.1):
    rs = npr.RandomState(0)
    inputs = np.linspace(-4, 4, num=n_data)
    targets = inputs**2 + rs.randn(n_data) * noise_std
    inputs = inputs.reshape((len(inputs), D))
    return inputs, targets
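# Usage sketch (illustrative; assumes numpy as np and numpy.random as npr are imported as
# in the surrounding snippets). Shows the returned shapes: inputs (n_data, D), targets (n_data,).
inputs, targets = build_parabola(D=1, n_data=20, noise_std=0.1)
print(inputs.shape, targets.shape)   # (20, 1) (20,)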
# Model hyper-parameters
T = 10
Dx = 5
Dy = 3
alpha = 0.42
r = 0.1
obs = 'sparse'

# Training parameters
param_scale = 0.5
num_epochs = 1000
step_size = 0.001

N = 4

data_seed = npr.RandomState(0)
model_params = init_model_params(Dx, Dy, alpha, r, obs, data_seed)

print("Generating data...")
x_true, y_true = generate_data(model_params, T, data_seed)

lml = log_marginal_likelihood(model_params, T, y_true)
print("True log-marginal likelihood: " + str(lml))

seed = npr.RandomState(0)

# Initialize proposal parameters
prop_params = init_prop_params(T, Dx, param_scale, seed)
combined_init_params = (model_params, prop_params)

lgss_smc_obj = lgss_smc(T, Dx, Dy, N)
def sample_data(n_data=80, noise_std=0.1, context_size=3):
    rs = npr.RandomState(0)
    inputs = np.linspace(-6, 6, n_data)
    targets = np.sin(inputs)**3 + rs.randn(n_data) * noise_std
    return inputs[:, None], targets[:, None]
def sample(var_param, n_samples, seed=None):
    my_rs = rs if seed is None else npr.RandomState(seed)
    mean, beta = unpack_params(var_param)
    L = beta_to_L(beta)
    return np.dot(my_rs.randn(n_samples, dim), L) + mean
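# Self-contained check (an illustration, not from the source): with x = z @ L + mean and
# z ~ N(0, I), the induced covariance is L.T @ L, which is how the beta-parameterized
# factor L above encodes a full covariance. `dim`, `mean`, and L below are assumptions.
import numpy as np
import numpy.random as npr

dim = 2
L = np.array([[1.0, 0.0],
              [0.8, 0.5]])          # stand-in for beta_to_L(beta)
mean = np.array([1.0, -1.0])
z = npr.RandomState(0).randn(200000, dim)
x = np.dot(z, L) + mean
print(np.round(np.cov(x, rowvar=False), 2))   # approximately L.T @ L
print(np.round(np.dot(L.T, L), 2))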
def sample(var_param, n_samples, seed=None):
    my_rs = rs if seed is None else npr.RandomState(seed)
    mean, log_scale = unpack_params(var_param)
    return mean + np.exp(log_scale) * my_rs.standard_t(df, size=(n_samples, dim))
def run_aevb(train_images):
    # run_aevb(train_images)
    start_time = time.time()

    # Create aevb function
    # Training parameters
    D = train_images.shape[1]
    dec_layers = [latent_dimensions, hidden_units, hidden_units, D]

    init_mean = np.zeros(latent_dimensions)
    init_log_stddevs = np.log(1 * np.ones(latent_dimensions))
    init_log_stepsize = np.log(.001)

    rs = npr.RandomState(0)
    sample_and_run_grad, parser = build_grad_sampler(latent_dimensions, 100, approx=True)

    N_weights_dec, decoder, decoder_log_like = make_binary_nn(dec_layers)
    N_weights_enc = len(parser)
    encoder = sample_and_run_grad

    parser.add_shape('decoding weights', (N_weights_dec,))
    params = np.zeros(len(parser))
    parser.put(params, 'mean', init_mean)
    parser.put(params, 'log_stddev', init_log_stddevs)
    parser.put(params, 'log_stepsize', init_log_stepsize)
    dec_w = get_pretrained_dec_w()
    parser.put(params, 'decoding weights', dec_w)
    # parser.put(params, 'decoding weights', rs.randn(N_weights_dec) * param_scale)

    # Optimize aevb
    batch_size = 100
    num_training_iters = 1600
    rs = npr.RandomState(0)

    batch_idxs = make_batches(train_images.shape[0], batch_size)

    def batch_value_and_grad(weights, iter):
        iter = iter % len(batch_idxs)
        cur_data = train_images[batch_idxs[iter]]
        return lower_bound(weights, encoder, decoder_log_like, N_weights_enc,
                           cur_data, samples_per_image, latent_dimensions, rs)

    lb_grad = grad(batch_value_and_grad)

    def callback(params, i, grad):
        ml = batch_value_and_grad(params, i)
        print "log marginal likelihood:", ml

        # Print params
        print 'norm of stdev', np.linalg.norm(np.exp(parser.get(params, 'mean')))
        print 'stepsize', np.exp(parser.get(params, 'log_stepsize'))

        # Generate samples
        num_samples = 100
        images_per_row = 10
        zs = rs.randn(num_samples, latent_dimensions)
        # samples = np.random.binomial(1, decoder(parser.get(params, 'decoding weights'), zs))
        samples = decoder(parser.get(params, 'decoding weights'), zs)
        fig = plt.figure(1)
        fig.clf()
        ax = fig.add_subplot(111)
        plot_images(samples, ax, ims_per_row=images_per_row)
        plt.savefig('samples.png')

    final_params = adam(lb_grad, params, num_training_iters, callback=callback)

    def decoder_with_weights(zs):
        return decoder(parser.get(final_params, 'decoding weights'), zs)
    return decoder_with_weights

    finish_time = time.time()
    print "total runtime", finish_time - start_time
def train_nn(pred_fun, loss_fun, num_weights, train_smiles, train_raw_targets,
             train_params, seed=0, validation_smiles=None, validation_raw_targets=None):
    """loss_fun has inputs (weights, smiles, targets)"""
    print("Total number of weights in the network:", num_weights)
    init_weights = npr.RandomState(seed).randn(num_weights) * train_params['init_scale']

    num_print_examples = 100
    train_targets, undo_norm = normalize_array(train_raw_targets)
    training_curve = []

    def callback(weights, iter):
        if iter % 10 == 0:
            train_preds = undo_norm(pred_fun(weights, train_smiles[:num_print_examples]))
            cur_loss = loss_fun(weights, train_smiles[:num_print_examples],
                                train_targets[:num_print_examples])
            training_curve.append(cur_loss)
            print("Iteration", iter, "loss", cur_loss,
                  "train RMSE", rmse(train_preds, train_raw_targets[:num_print_examples]),
                  "max of weights", np.max(np.abs(weights)), end="")
            if validation_smiles is not None:
                validation_preds = undo_norm(pred_fun(weights, validation_smiles))
                print("Validation RMSE", iter, ":",
                      rmse(validation_preds, validation_raw_targets), end="")
            print("")

    # Build gradient using autograd.
    print("gradding")
    grad_fun = grad(loss_fun)
    # grad_fun_with_data computes the gradient of the loss given the weights and the batch
    # number (which determines the training inputs and targets for that batch).
    grad_fun_with_data = build_batched_grad(grad=grad_fun,
                                            batch_size=train_params['batch_size'],
                                            inputs=train_smiles,
                                            targets=train_targets)

    print("optimizing")
    # Optimize weights.
    trained_weights = adam(grad_fun_with_data, init_weights, callback=callback,
                           num_iters=train_params['num_iters'],
                           step_size=train_params['step_size'])
    print("optimized")

    def predict_func(new_smiles):
        """Returns to the original units that the raw targets were in."""
        return undo_norm(pred_fun(trained_weights, new_smiles))
    return predict_func, trained_weights, training_curve
def init_net_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Build a list of (weights, biases) tuples, one for each layer."""
    return [(scale * rs.randn(m, n),   # weight matrix
             scale * rs.randn(n))      # bias vector
            for m, n in zip(layer_sizes[:-1], layer_sizes[1:])]
def train(self, X, indX, XY, V, indV, VY, IM, VIM, count_dict, word_dict, embed_map, prog):
    """
    Trains the LBL
    """
    self.start = self.seed
    self.init_params(embed_map, count_dict)
    self.step = 0
    inds = np.arange(len(X))
    numbatches = len(inds) / self.batchsize
    tic = time.time()
    bleu = [0.0] * 4
    best = 0.0
    scores = '/'.join([str(b) for b in bleu])
    patience = 10
    count = 0
    done = False

    # delete
    x = []
    y = []

    # Main loop
    lm_tools.display_phase(1)
    for epoch in range(self.maxepoch):
        if done:
            break
        self.epoch = epoch
        prng = npr.RandomState(self.seed + epoch + 1)
        prng.shuffle(inds)
        for minibatch in range(numbatches):
            batchX = X[inds[minibatch::numbatches]]
            batchY = XY[inds[minibatch::numbatches]].toarray()
            batchindX = indX[inds[minibatch::numbatches]].astype(int).flatten()
            batchindX = np.floor(batchindX / 5).astype(int)
            batchIm = IM[batchindX]

            loss_val = self.compute_obj(self.params, batchX, batchIm, batchY)
            self.backward(self.params, batchX, batchIm, batchY)
            self.update_params(batchX)

            if np.isnan(loss_val):
                print 'NaNs... breaking out'
                done = True
                break

            # Print out progress
            if np.mod(minibatch * self.batchsize, prog['_details']) == 0 and minibatch > 0:
                print "epoch/pts: %04d/%05d, cross-entropy loss: %.2f, time: %.2f" % (
                    epoch + 1, minibatch * self.batchsize, loss_val, (time.time() - tic) / 60)

            if np.mod(minibatch * self.batchsize, prog['_samples']) == 0 and minibatch > 0:
                print "best: %s" % (scores)
                print '\nSamples:'
                # lm_tools.generate_and_show(self, word_dict, count_dict, VIM, k=3)
                VIM_example = VIM[prog['_val_example_idx'], :]
                VIM_file_list = prog['_val_example_file']
                # lm_tools.generate_and_show_html(self, word_dict, count_dict, VIM_example,
                #                                 VIM_file_list, show=prog['_show_browser'], k=3)
                print ' '

            if np.mod(minibatch * self.batchsize, prog['_update']) == 0 and minibatch > 0:
                self.update_hyperparams()
                self.step += 1
                print "learning rate: %.4f, momentum: %.4f" % (self.eta_t, self.p_t)

            # Compute BLEU
            if np.mod(minibatch * self.batchsize, prog['_bleu']) == 0 and minibatch > 0:
                bleu = lm_tools.compute_bleu(self, word_dict, count_dict, VIM, prog, k=3)
                x.append(minibatch * self.batchsize)
                y.append(bleu[-1])
                if bleu[-1] >= best:
                    count = 0
                    best = bleu[-1]
                    scores = '/'.join([str(b) for b in bleu])
                    print 'bleu score = {}'.format(bleu[-1])
                    lm_tools.save_model(self, self.loc)
                else:
                    count += 1
                    if count == patience:
                        done = True
                        break

        self.update_hyperparams()
        self.step += 1

    pl.plot(x, y)
    pl.show()
    return best
def create_rnn_params(input_size, state_size, output_size,
                      param_scale=0.01, rs=npr.RandomState(0)):
    return {'init hiddens': rs.randn(1, state_size) * param_scale,
            'change':  rs.randn(input_size + state_size + 1, state_size) * param_scale,
            'predict': rs.randn(state_size + 1, output_size) * param_scale}
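# Illustrative sketch (an assumption about how these shapes are typically consumed, not code
# from the source): the `+ 1` row in 'change' and 'predict' leaves room for a bias term when
# the inputs and hidden state are concatenated with a column of ones before the matrix multiply.
import numpy as np
import numpy.random as npr

input_size, state_size, output_size, batch = 4, 6, 2, 3
params = create_rnn_params(input_size, state_size, output_size, rs=npr.RandomState(0))
inputs = npr.RandomState(1).randn(batch, input_size)
hiddens = np.repeat(params['init hiddens'], batch, axis=0)
cat = np.concatenate((inputs, hiddens, np.ones((batch, 1))), axis=1)  # (batch, input+state+1)
new_hiddens = np.tanh(np.dot(cat, params['change']))                  # (batch, state_size)
print(new_hiddens.shape)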
def init_random_params(scale, layer_sizes, rs=npr.RandomState(0)):
    """Build a list of (weights, biases) tuples, one for each layer."""
    return [(rs.randn(insize, outsize) * scale,   # weight matrix
             rs.randn(outsize) * scale)           # bias vector
            for insize, outsize in zip(layer_sizes[:-1], layer_sizes[1:])]
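# Usage sketch (illustrative; assumes numpy as np and numpy.random as npr are imported as in
# the surrounding snippets). For layer_sizes = [2, 3, 1] this yields two (W, b) pairs with
# shapes (2, 3)/(3,) and (3, 1)/(1,).
params = init_random_params(scale=0.1, layer_sizes=[2, 3, 1])
print([(W.shape, b.shape) for W, b in params])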
def sample(var_param, n_samples, seed=None):
    my_rs = rs if seed is None else npr.RandomState(seed)
    mean, log_std = unpack_params(var_param)
    return my_rs.randn(n_samples, dim) * np.exp(log_std) + mean
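# Self-contained sketch (illustrative, not from the source) of the reparameterization used
# above: samples are drawn as eps * exp(log_std) + mean, so their empirical moments match the
# variational parameters. `dim` and the parameter values below are assumptions.
import numpy as np
import numpy.random as npr

dim = 3
mean = np.array([1.0, -2.0, 0.0])
log_std = np.log(np.array([0.5, 1.0, 2.0]))
draws = npr.RandomState(0).randn(100000, dim) * np.exp(log_std) + mean
print(np.round(draws.mean(axis=0), 2))   # close to mean
print(np.round(draws.std(axis=0), 2))    # close to exp(log_std)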
train_data = train_data.reshape(len(train_data), 1)
# train_data = np.loadtxt('/cluster/mshen/prj/gans/out/2017-06-19/a_generate/X.csv', delimiter=',')
# train_data = train_data.reshape(len(train_data), 1)

init_gen_params = init_random_params(param_scale, gen_layer_sizes)
init_dsc_params = init_random_params(param_scale, dsc_layer_sizes)

num_batches = int(np.ceil(len(train_data) / batch_size))

def batch_indices(iter):
    idx = iter % num_batches
    return slice(idx * batch_size, (idx + 1) * batch_size)

# Define training objective
seed = npr.RandomState(1)

def objective(gen_params, dsc_params, iter):
    idx = batch_indices(iter)
    return gan_objective(gen_params, dsc_params, train_data[idx],
                         batch_size, noise_dim, seed)

# Get gradients of objective using autograd.
both_objective_grad = multigrad(objective, argnums=[0, 1])

print("     Epoch     |    Objective     |    Fake probability    |    Real Probability    ")

def print_perf(gen_params, dsc_params, iter, gen_gradient, dsc_gradient):
    if iter % 10 == 0:
# Load MNIST and Set Up Data
N_data, train_images, train_labels, test_images, test_labels = load_mnist()
train_images = np.round(train_images[0:10000])
train_labels = train_labels[0:10000]
test_images = np.round(test_images[0:10000])

# Starter Code for 4d
# A correct solution here only requires you to correctly write the neglogprob!
# Because this setup is numerically finicky,
# the default parameterization I've given should give results if neglogprob is correct.
K = 30
D = 784

# Random initialization, with set seed for easier debugging
# Try changing the weighting of the initial randomization, default 0.01
init_params = npr.RandomState(0).randn(K, D) * 0.01

# Implemented batching for you
batch_size = 10
num_batches = int(np.ceil(len(train_images) / batch_size))

def batch_indices(iter):
    idx = iter % num_batches
    return slice(idx * batch_size, (idx + 1) * batch_size)

# This is numerically stable code for the log of a Bernoulli density.
# In particular, notice that we keep everything in log space and use logaddexp;
# we never want to take things out of log space, for stability.
def bernoulli_log_density(targets, unnormalized_logprobs):
def init_prop_params(T, Dx, scale=0.5, rs=npr.RandomState(0)):
    return [(scale * rs.randn(Dx),        # Bias
             1. + scale * rs.randn(Dx),   # Linear times A/mu0
             scale * rs.randn(Dx))        # Log-var
            for t in range(T)]
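# Illustrative sketch (an assumption about how per-time-step proposal parameters like these
# are commonly used in an SMC proposal, not code from the source): each (bias, linear,
# log-var) triple defines a diagonal-Gaussian proposal whose mean is an affine function of
# the previous state.
import numpy as np
import numpy.random as npr

T, Dx = 10, 5
prop_params = init_prop_params(T, Dx, scale=0.5, rs=npr.RandomState(0))
x_prev = np.zeros(Dx)
bias, linear, log_var = prop_params[0]
mean = bias + linear * x_prev                     # elementwise affine mean
std = np.sqrt(np.exp(log_var))
x_sample = mean + std * npr.RandomState(1).randn(Dx)
print(x_sample.shape)   # (5,)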
def run_variational_network(train_images, N_weights_dec, decoder,
                            decoder_log_like, trained_weights, all_mean):
    start_time = time.time()

    # Create aevb function
    # Training parameters
    D = train_images.shape[1]
    enc_layers = [D, hidden_units, hidden_units, 2 * latent_dimensions]
    N_weights_enc, encoder, encoder_log_like = make_gaussian_nn(enc_layers)

    # Optimize aevb
    batch_size = 10
    num_training_iters = 1600
    rs = npr.RandomState(0)

    parser = WeightsParser()
    parser.add_shape('encoding weights', (N_weights_enc,))
    initial_enc_w = rs.randn(len(parser)) * param_scale

    batch_idxs = make_batches(train_images.shape[0], batch_size)

    banded_cov = create_banded_cov(all_cov.shape[0], 10)
    log_prior = build_logprob_mvn(all_mean, banded_cov)

    def batch_value_and_grad(enc_w, iter):
        iter = iter % len(batch_idxs)
        cur_data = train_images[batch_idxs[iter]]
        return enc_lower_bound(enc_w, trained_weights, encoder, decoder_log_like,
                               log_prior, N_weights_enc, cur_data,
                               samples_per_image, latent_dimensions, rs)

    lb_grad = grad(batch_value_and_grad)

    def callback(params, i, grad):
        ml = batch_value_and_grad(params, i)
        print "log marginal likelihood:", ml

        # Generate samples
        num_samples = 100
        images_per_row = 10
        # zs = train_images[0:100, :]
        zs = np.zeros((100, 10))
        zs[:, 1] = .5
        zs[:, 5] = .5
        (mus, log_sigs) = encoder(params, zs)
        # sigs = np.exp(log_sigs)
        # noise = rs.randn(1, 100, 784)
        # samples = mus + sigs * noise
        # samples = np.reshape(samples, (100 * 1, 784), order='F')
        samples = mus
        fig = plt.figure(1)
        fig.clf()
        ax = fig.add_subplot(111)
        plot_images(samples, ax, ims_per_row=images_per_row)
        plt.savefig('samples.png')

    final_params = adam(lb_grad, initial_enc_w, num_training_iters, callback=callback)

    def decoder_with_weights(zs):
        return decoder(parser.get(final_params, 'decoding weights'), zs)
    return decoder_with_weights

    finish_time = time.time()
    print "total runtime", finish_time - start_time
def main(num_petridish_iter=1, num_top_points=2, perf_thresh=0.97,
         layer_sizes=[2, 3, 1], L2_reg=0.00001, param_scale=0.15,
         batch_size=20, num_epochs=20, step_size=0.01,
         hyper_iter=600, hyper_step_size=2.0, hyper_decay=0.4,
         hyper_decay_after=200, hyper_decay_every=150, hyper_L2_reg=0.00001,
         rank_loss_scaling_factor=100.0, mse_tolerance=None,
         outputFname='/tmp/results',
         train_sorted_c_r_list=None, full_sorted_c_r_list=None,
         fake_train_images=None, fake_train_labels=None,
         fake_valid_images=None, fake_valid_labels=None):

    seed = None
    random_state = npr.RandomState(seed=seed)  # seed is supplied from outerloop_petridish

    # Generate weight initialization.
    # The following two lines will generate a new init file from Macbook with petridish (seed=0)
    # for each new layer_sizes config:
    # init_params_list = petridish.generate_init_params(full_sorted_c_r_list, param_scale, layer_sizes, random_state)
    dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
    init_param_fname = dir_path + 'init_param_files/' + str(layer_sizes) + '_' + str(param_scale) + '.pkl'
    # helper.save_pickle(init_param_fname, init_params_list)
    # init_params_list = helper.load_pickle(init_param_fname)

    if fake_valid_images is None or fake_valid_labels is None:
        fake_valid_images = np.copy(fake_train_images)
        fake_valid_labels = np.copy(fake_train_labels)

    fake_train_images = random_state.randn(batch_size, layer_sizes[0])
    fake_train_labels = np.round(random_state.rand(batch_size, layer_sizes[-1]))  # np.random.randint(2, size=(batch_size, layer_sizes[-1]))
    # fake_valid_images = random_state.randn(batch_size, layer_sizes[0])
    # fake_valid_labels = np.random.randint(2, size=(batch_size, layer_sizes[-1]))
    fake_valid_images = np.copy(fake_train_images)
    fake_valid_labels = np.copy(fake_train_labels)

    # 1. Warm up petridish with K number of points
    #    a. call petridish with a specific weight initialization and other hyper-parameters
    #    b. returns learnt training data
    # Sample K=4 random points (for now 6 fixed points)
    best_scores = []  # Ground-truth values of the best predicted scores at each iteration.
                      # Start with the first K scores.
    for c, r in train_sorted_c_r_list:
        best_scores.append(r)  # for the warm-up, best scores have ground-truth values of the initial training points

    # Warming up Petridish
    success_flag = False
    iteration = 0
    lines = []  # Contains all the logs to be returned
    while iteration < num_petridish_iter:
        lines.append('Outerloop: Iteration ' + str(iteration) + '\n')
        lines.append('train_sorted_c_r_list ' + str(train_sorted_c_r_list) + '\n')
        outputFname_iter = outputFname  # + str(iteration) + '.txt'
        # if os.path.exists(outputFname_iter):
        #     os.remove(outputFname_iter)  # Remove file if it exists so that we do not keep appending to it

        combined_init_params, mask_params, init_params = helper.get_init_param_list(
            1, train_sorted_c_r_list, param_scale, layer_sizes, None, random_state)
        # combined_init_params, mask_params = petridish.create_combined_init_params(
        #     init_params_list[0:len(train_sorted_c_r_list)], layer_sizes)

        # Train
        results = petridish.main(layer_sizes=layer_sizes, L2_reg=L2_reg, param_scale=param_scale,
                                 batch_size=batch_size, num_epochs=num_epochs, step_size=step_size,
                                 hyper_iter=hyper_iter, hyper_step_size=hyper_step_size,
                                 hyper_decay=hyper_decay, hyper_decay_after=hyper_decay_after,
                                 hyper_decay_every=hyper_decay_every, hyper_L2_reg=hyper_L2_reg,
                                 rank_loss_scaling_factor=rank_loss_scaling_factor,
                                 mse_tolerance=mse_tolerance, outputFname=outputFname_iter,
                                 sorted_c_r_list=train_sorted_c_r_list, init_params_list=None,
                                 combined_init_params=combined_init_params, mask_params=mask_params,
                                 random_state=random_state, hyper_train=True,
                                 fake_train_images=fake_train_images, fake_train_labels=fake_train_labels,
                                 fake_valid_images=fake_valid_images, fake_valid_labels=fake_valid_labels)

        # Extract trained data (this currently returns True if the rank order is correct)
        (train_mse, success_flag, train_score_list, learnt_images,
         learnt_train_labels, learnt_valid_images, learnt_valid_labels) = results
        for idx, (c, r) in enumerate(train_sorted_c_r_list):
            lines.append('Outerloop: Train Point Scores ' + str(c) + ' ' + str(-train_score_list[idx]) + '\n')

        # If results did not converge, then check for intermediate convergence
        # (first MSE convergence and then rank convergence)
        # if success_flag == False:
        #     extracted_learnt_images = helper.read_images_from_file(fname=outputFname_iter,
        #         num_images=len(fake_train_images), checkStr="MSE Loss converged")
        #     if extracted_learnt_images is None:  # No MSE convergence
        #         extracted_learnt_images = helper.read_images_from_file(fname=outputFname_iter,
        #             num_images=len(fake_train_images), checkStr="Rank order correct")
        #     else:
        #         print ('Outerloop: MSE converged')
        # if extracted_learnt_images is not None:
        #     learnt_images = extracted_learnt_images
        #     print ('Outerloop: Rank correct')

        # 2. Run full grid-search (architecture search/auto-tune) with petridish
        #    a. Evaluate hundreds of points cheaply by running petridish in test/evaluation mode (no hyper-training)
        #    b. Take Y (say two) points from petridish (two best? or best and worst?)
        # Reduce the number of points over which we will do the petridish search:
        # exclude the points whose ground truth we have already evaluated; no need to predict those.
        # For now, predict all for debugging purposes, but for selecting top predicted points,
        # select from test_sorted_c_r_list (TODO).
        test_sorted_c_r_list = sorted(list(set(full_sorted_c_r_list) - set(train_sorted_c_r_list)),
                                      key=lambda tup: tup[1], reverse=True)  # TODO: Change this approach of set
        # if len(test_sorted_c_r_list) != (len(full_sorted_c_r_list) - len(train_sorted_c_r_list)):
        #     print ("Outerloop: Error in Computing test_sorted_c_r_list"); import sys; sys.exit()
        # combined_init_params, mask_params = petridish.create_combined_init_params(
        #     init_params_list[0:len(test_sorted_c_r_list)], layer_sizes)
        combined_init_params, mask_params, _ = helper.get_init_param_list(
            1, test_sorted_c_r_list, param_scale, layer_sizes, init_params, random_state)

        # Test (setting hyper_train=False and fake_train_images=learnt_images)
        results = petridish.main(layer_sizes=layer_sizes, L2_reg=L2_reg, param_scale=param_scale,
                                 batch_size=batch_size, num_epochs=num_epochs, step_size=step_size,
                                 hyper_iter=hyper_iter, hyper_step_size=hyper_step_size,
                                 hyper_decay=hyper_decay, hyper_decay_after=hyper_decay_after,
                                 hyper_decay_every=hyper_decay_every, hyper_L2_reg=hyper_L2_reg,
                                 rank_loss_scaling_factor=rank_loss_scaling_factor,
                                 sorted_c_r_list=test_sorted_c_r_list, init_params_list=None,
                                 combined_init_params=combined_init_params, mask_params=mask_params,
                                 random_state=random_state, hyper_train=False,
                                 fake_train_images=learnt_images, fake_train_labels=fake_train_labels,
                                 fake_valid_images=learnt_valid_images, fake_valid_labels=learnt_valid_labels)

        # Extract results
        (test_mse, success_flag, test_score_list, learnt_images,
         learnt_train_labels, learnt_valid_images, learnt_valid_labels) = results
        for idx, (c, r) in enumerate(test_sorted_c_r_list):
            lines.append('Outerloop: Test Point Scores ' + str(c) + ' ' + str(-test_score_list[idx]) + '\n')

        # Extract the top num_top_points-1 points with the best predicted performance.
        top_c_r_list = helper.extract_top_arch(test_score_list, test_sorted_c_r_list, K=num_top_points - 1)

        # Select one point randomly (not the best). This is to prevent the model from greedily
        # selecting low-performance points in a local region. Ideally, this should be done with
        # some random probability: initially, when the model is poor, do random selection; later,
        # when the model is more accurate, do only greedy selection.
        reduced_c_r_list = [(c, r) for (c, r) in test_sorted_c_r_list if (c, r) not in top_c_r_list]
        randomly_selected_c_r_list = helper.randomly_select_K(reduced_c_r_list, perf_thresh, K=1)
        top_c_r_list = top_c_r_list + randomly_selected_c_r_list
        lines.append('Train and Test mse ' + str(train_mse) + ' ' + str(test_mse) + '\n')

        # 3. Evaluate ground truth for the points with the best predicted score
        for idx, (c, r) in enumerate(top_c_r_list):
            if idx == num_top_points:
                break
            best_scores.append(r)  # Evaluate ground-truth for the best points and append to this list

        # 4. Stopping condition - if we reached the best
        lines.append('Outerloop: Scores ' + str(best_scores) + '\n')
        for score in best_scores:
            if score >= perf_thresh:
                lines.append('Outerloop: Found best solution' + str(best_scores) + '\n')
                return (lines, best_scores, train_mse, test_mse)

        lines.append('Adding Top predicted points ' + str(top_c_r_list) + '\n')
        # Continue - add the Y top (one or two) points to train_sorted_c_r_list for further evaluation
        train_sorted_c_r_list = helper.add_new_arch_ground_truth(train_sorted_c_r_list, top_c_r_list)

        # 5. Run petridish again with the additional points
        #    (restart, or resume from learnt images / learnt images + learnt weights?)
        # 6. Go back to 2.
        iteration = iteration + 1

    lines.append('Outerloop: NOT Found best solution' + str(best_scores) + '\n')
    return (lines, best_scores, train_mse, test_mse)
test_images = np.round(test_images[0:10000])
# test_labels = np.round(test_images[0:10000])  # (a typo)
test_labels = test_labels[0:10000]

K = 10
prior_std = 10.

# Choose two pixels and plot the K specific weights against each other
contourK = 2
px1 = 392            # Middle pixel
# px2 = px1 + 28*5 + 1   # Middle pixel + 5 rows down
px2 = px1 + 14       # Middle left-most edge

# Random initialization, with set seed for easier debugging
# Try changing the weighting of the initial randomization, default 0.01
init_params = (npr.RandomState(0).randn(K, D) * 0.01,
               npr.RandomState(1).randn(K, D) * 0.01)

def logistic_logprob(params, images, labels):
    # params is a block of S x K x D params
    # images is N x D
    # labels is N x K one-hot
    # return S logprobs, summing over N
    mul = np.einsum('skd,nd->snk', params, images)
    normalized = mul - logsumexp(mul, axis=-1, keepdims=True)
    return np.einsum('snk,nk->s', normalized, labels)

def diag_gaussian_log_density(x, mu, log_std):
    # assumes that mu and log_std are (S x K x D),
    # so we sum out the last two dimensions.
def build_step_function_dataset(D=1, n_data=40, noise_std=0.1):
    rs = npr.RandomState(0)
    inputs = np.linspace(-2, 2, num=n_data)
    targets = np.sign(inputs) + rs.randn(n_data) * noise_std
    inputs = inputs.reshape((len(inputs), D))
    return inputs, targets
def objective(var_params, iter):
    return -elbo_estimate(var_params, logprob_given_data,
                          num_samples=50, rs=npr.RandomState(iter))
def build_mog_bbsvi(logprob, num_samples, k=10, rs=npr.RandomState(0)):
    init_component_var_params = init_gaussian_var_params
    component_log_density = variational_log_density_gaussian
    component_sample = sample_diag_gaussian

    def unpack_mixture_params(mixture_params):
        log_weights = log_normalize(mixture_params[:k])
        var_params = np.reshape(mixture_params[k:], (k, -1))
        return log_weights, var_params

    def init_var_params(D, rs=npr.RandomState(0), **kwargs):
        log_weights = np.ones(k)
        component_weights = [init_component_var_params(D, rs=rs, **kwargs)
                             for i in range(k)]
        return np.concatenate([log_weights] + component_weights)

    def sample(var_mixture_params, num_samples, rs):
        """Sample locations aren't a continuous function of parameters
        due to multinomial sampling."""
        log_weights, var_params = unpack_mixture_params(var_mixture_params)
        samples = np.concatenate(
            [component_sample(params_k, num_samples, rs)[:, np.newaxis, :]
             for params_k in var_params], axis=1)
        ixs = np.random.choice(k, size=num_samples, p=np.exp(log_weights))
        return np.array([samples[i, ix, :] for i, ix in enumerate(ixs)])

    def mixture_log_density(var_mixture_params, x):
        """Returns a weighted average over component densities."""
        log_weights, var_params = unpack_mixture_params(var_mixture_params)
        component_log_densities = np.vstack(
            [component_log_density(params_k, x) for params_k in var_params]).T
        # print((component_log_densities).shape)
        # print((component_log_densities + log_weights).shape)
        # print(logsumexp(component_log_densities + log_weights, axis=1, keepdims=False).shape)
        return logsumexp(component_log_densities + log_weights,
                         axis=1, keepdims=False)  # over clusters

    def mixture_elbo(var_mixture_params, t):
        # We only need to sample the continuous component parameters,
        # and integrate over the discrete component choice.
        def mixture_lower_bound(params):
            """Provides a stochastic estimate of the variational lower bound."""
            samples = component_sample(params, num_samples, rs)
            log_qs = mixture_log_density(var_mixture_params, samples)
            log_ps = logprob(samples, t)
            log_ps = np.reshape(log_ps, (num_samples, -1))
            log_qs = np.reshape(log_qs, (num_samples, -1))
            return np.mean(log_ps - log_qs)  # over samples
            # log_w = log_ps - log_qs
            # elbo = logmeanexp(log_w)
            # return elbo

        log_weights, var_params = unpack_mixture_params(var_mixture_params)
        component_elbos = np.stack(
            [mixture_lower_bound(params_k) for params_k in var_params])
        # print(component_elbos.shape)
        # print(log_weights.shape)
        # return np.sum(component_elbos + log_weights)  # over clusters
        return np.sum(component_elbos * np.exp(log_weights))

    return init_var_params, mixture_elbo, mixture_log_density, sample
def train_nn(pred_fun, loss_fun, num_weights, train_aa, train_raw_targets,
             train_params, seed=0, validation_aa=None, validation_raw_targets=None):
    """loss_fun has inputs (weights, smiles, targets)"""
    print "Total number of weights in the network:", num_weights
    init_weights = npr.RandomState(seed).randn(num_weights) * train_params['init_scale']

    num_print_examples = 32
    train_targets, undo_norm = normalize_array(train_raw_targets)
    training_curve = [[], [], []]  # Training loss, train RMSE, validation RMSE

    def callback(weights, iter):
        if iter % 10 == 0:
            print "max of weights", np.max(np.abs(weights))
            selection = npr.choice(train_aa.size, size=num_print_examples)
            train_preds = undo_norm(pred_fun(weights, train_aa[selection]))
            cur_loss = loss_fun(weights, train_aa[selection], train_targets[selection])
            # train_preds = undo_norm(pred_fun(weights, train_aa[:num_print_examples]))
            # cur_loss = loss_fun(weights, train_aa[:num_print_examples], train_targets[:num_print_examples])
            training_curve[0].append(cur_loss)
            train_RMSE = rmse(train_preds, train_raw_targets[selection])
            # train_RMSE = rmse(train_preds, train_raw_targets[:num_print_examples])
            training_curve[1].append(train_RMSE)
            print "Iteration", iter, "loss", cur_loss, "train RMSE", train_RMSE,
            if validation_aa is not None:
                selection = npr.choice(validation_aa.size, size=num_print_examples)
                validation_preds = undo_norm(pred_fun(weights, validation_aa[selection]))
                val_RMSE = rmse(validation_preds, validation_raw_targets[selection])
                training_curve[2].append(val_RMSE)
                print "Validation RMSE", iter, ":", val_RMSE,

    # Build gradient using autograd.
    grad_fun = grad(loss_fun)
    grad_fun_with_data = build_batched_grad(grad_fun, train_params['batch_size'],
                                            train_aa, train_targets)

    # Optimize weights.
    trained_weights = adam(grad_fun_with_data, init_weights, callback=callback,
                           num_iters=train_params['num_iters'],
                           step_size=train_params['step_size'])

    def predict_func(new_aa):
        """Returns to the original units that the raw targets were in."""
        return undo_norm(pred_fun(trained_weights, new_aa))
    return predict_func, trained_weights, training_curve
def init_net_params(layer_sizes, scale=0.1, rs=npr.RandomState(0)):
    return [(scale * rs.randn(m, n), scale * rs.randn(n))
            for m, n in zip(layer_sizes[:-1], layer_sizes[1:])]
combined_init_params = (init_gen_params, init_rec_params)
# print(np.array(init_gen_params).shape)
# print(np.array(init_rec_params).shape)

print("Loading training data...")
X = bimodal_data()
print(X.shape)

num_batches = int(np.ceil(len(X) / batch_size))

def batch_indices(iter):
    idx = iter % num_batches
    return slice(idx * batch_size, (idx + 1) * batch_size)

# Define training objective
seed = npr.RandomState(0)

def objective(combined_params, iter):
    data_idx = batch_indices(iter)
    gen_params, rec_params = combined_params
    return -vae_lower_bound(gen_params, rec_params, X[data_idx], seed) / data_dim

# Get gradients of objective using autograd.
objective_grad = grad(objective)

# TO VISUALIZE
# plt.ion()
# plt.show(block=False)

print(
    plot_xs = np.reshape(np.linspace(-5, 5, 300), (300, 1))
    pred_mean, pred_cov = combined_predict_fun(params, X, y, plot_xs)
    plot_gp(ax_end_to_end, X, y, pred_mean, pred_cov, plot_xs)
    ax_end_to_end.set_title("X to y")

    layer1_params, layer2_params, hiddens = unpack_all_params(params)
    h_star_mean, h_star_cov = predict_layer_funcs[0](layer1_params, X, hiddens, plot_xs)
    y_star_mean, y_star_cov = predict_layer_funcs[0](layer2_params, np.atleast_2d(hiddens).T, y, plot_xs)

    plot_gp(ax_x_to_h, X, hiddens, h_star_mean, h_star_cov, plot_xs)
    ax_x_to_h.set_title("X to hiddens")

    plot_gp(ax_h_to_y, np.atleast_2d(hiddens).T, y, y_star_mean, y_star_cov, plot_xs)
    ax_h_to_y.set_title("hiddens to y")

    plt.draw()
    plt.pause(1.0 / 60.0)

# Initialize covariance parameters and hiddens.
rs = npr.RandomState(0)
init_params = 0.1 * rs.randn(total_num_params)

print("Optimizing covariance parameters...")

def objective(params):
    return -log_marginal_likelihood(params)

cov_params = minimize(value_and_grad(objective), init_params,
                      jac=True, method='CG', callback=callback)

plt.pause(10.0)
def fit_nn_reg(X, y, hidden_layer_sizes, batch_size, epochs, X_test, y_test,
               mean_y_train=0.0, std_y_train=1.0, nonln='relu',
               weight_prior_std=1.0, noise_var=0.1, plot_toy=False):

    layer_sizes = np.array([X.shape[1]] + hidden_layer_sizes + [1])
    if nonln == 'tanh':
        nonlinearity = np.tanh
    elif nonln == 'relu':
        nonlinearity = lambda x: np.maximum(x, 0.0)
    elif nonln == 'rbf':
        nonlinearity = lambda x: norm.pdf(x, 0, 1)
    elif nonln == 'sin':
        nonlinearity = lambda x: np.sin(x)
    elif nonln == 'sigmoid':
        nonlinearity = lambda x: 1 / (1 + np.exp(-x))

    num_weights, predictions, logprob, get_error = \
        make_nn_funs(layer_sizes, nonlinearity=nonlinearity,
                     weight_prior_std=weight_prior_std, noise_var=noise_var)
    logprob_grad = grad(logprob)
    Ntrain = X.shape[0]

    print("   Epoch   |   train RMSE   |   test RMSE")

    if plot_toy:
        # Set up figure.
        fig = plt.figure(figsize=(12, 8), facecolor='white')
        ax = fig.add_subplot(111, frameon=True)
        plt.show(block=False)

    def print_perf(epoch, w):
        rmse_train = get_error(w, X, y, location=0.0, scale=1.0)
        rmse_test = get_error(w, X_test, y_test, location=0.0, scale=1.0)
        print("{0:15}|{1:15}|{2:15}|".format(epoch, rmse_train, rmse_test))
        if plot_toy:
            # Plot data and functions.
            plt.cla()
            ax.plot(X.ravel(), y.ravel(), 'bx')
            plot_inputs = np.reshape(np.linspace(-7, 7, num=300), (300, 1))
            outputs = predictions(w, plot_inputs)
            ax.plot(plot_inputs, outputs)
            ax.set_ylim([-1, 1])
            plt.draw()
            plt.pause(1.0 / 60.0)

    # Train with adam
    batch_idxs = make_batches(X.shape[0], batch_size)

    # Initialize parameters
    rs = npr.RandomState(0)
    init_weights = 0.1 * rs.randn(num_weights)
    w = init_weights
    N_test = X_test.shape[0]

    m1 = 0
    m2 = 0
    beta1 = 0.9
    beta2 = 0.999
    epsilon = 1e-8
    alpha = 1e-2
    t = 0
    log_prob_vec = []
    for epoch in range(epochs):
        permutation = np.random.choice(range(X.shape[0]), X.shape[0], replace=False)
        print_perf(epoch, w)
        for idxs in batch_idxs:
            t += 1
            lp = logprob(w, X[permutation[idxs]], y[permutation[idxs]], X.shape[0])
            log_prob_vec.append(lp)
            grad_w = logprob_grad(w, X[permutation[idxs]], y[permutation[idxs]], X.shape[0])
            m1 = beta1 * m1 + (1 - beta1) * grad_w
            m2 = beta2 * m2 + (1 - beta2) * grad_w**2
            m1_hat = m1 / (1 - beta1**t)
            m2_hat = m2 / (1 - beta2**t)
            w += alpha * m1_hat / (np.sqrt(m2_hat) + epsilon)
            t += 1

    return w, np.array(log_prob_vec)