def fit(self, target_log_q, n_iter=1000, n_samples=1, n_samples_per_report=10,
        report_interval=10, annealed=True, step_size=0.01, l2_penalty=1.0):
    """Optimize the parameters of self.model to minimize the variational free
    energy between target_log_q and the distribution of y = model.transform(x),
    x ~ N(0, 1)."""
    if annealed:
        beta = np.linspace(0.01, 1.0, n_iter)
    else:
        beta = np.ones(n_iter)

    self.optimization_history = []
    progress_log_callback = self.progress_logger_factory(
        target_log_q, n_samples_per_report, report_interval)

    # L2 penalty on the parameter vector.
    normalization = lambda params: l2_penalty * np.sum(np.abs(params) ** 2)

    # Clipped reparameterization gradient plus the gradient of the penalty.
    reparam_gradient = lambda params, i: clip_gradients(
        self.reparameterization_gradient(params, target_log_q, n_samples, beta[i]),
        1) + grad(normalization)(params)

    self.params = adam(grad=reparam_gradient, init_params=self.model.params,
                       step_size=step_size, callback=progress_log_callback,
                       num_iters=n_iter)
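# Hedged sketch (assumption): `clip_gradients` is not defined in the snippet
# above. A minimal version that rescales a flat gradient vector to a maximum
# L2 norm, matching the call `clip_gradients(g, 1)`, might look like this:
import autograd.numpy as np

def clip_gradients(gradient, max_norm):
    """Rescale `gradient` so its L2 norm does not exceed `max_norm`."""
    norm = np.sqrt(np.sum(gradient ** 2))
    if norm > max_norm:
        gradient = gradient * (max_norm / norm)
    return gradient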
def train(self, X_train, F_train, y_train, batch_size=32, num_iters=1000,
          lr=1e-3, param_scale=0.01, log_every=100, init_weights=None):
    grad_fun = build_batched_grad_fences(grad(self.objective), batch_size,
                                         X_train, F_train, y_train)
    print('Batched gradient fences building completed')

    if init_weights is None:
        init_weights = self.init_weights(param_scale)

    saved_weights = np.zeros((num_iters, self.num_weights))

    def callback(weights, i, gradients):
        # Computing the log-likelihood over the full training set is too slow,
        # so it is skipped here.
        apl = self.average_path_length(weights, X_train, F_train, y_train)
        saved_weights[i, :] = weights
        loss_train = self.objective(weights, X_train, F_train, y_train)
        if i % log_every == 0:
            print('model: gru | iter: {} | loss: {:.2f} | apl: {:.2f}'.format(
                i, loss_train, apl))

    print('Optimization started.')
    print(self.num_weights)
    optimized_weights = adam(grad_fun, init_weights, num_iters=num_iters,
                             step_size=lr, callback=callback)
    self.saved_weights = saved_weights
    self.weights = optimized_weights
    return optimized_weights
def train(self, X_train, y_train, batch_size=32, num_iters=1000, lr=1e-3,
          param_scale=0.01, log_every=100, init_weights=None):
    grad_fun = build_batched_grad(grad(self.objective), batch_size,
                                  X_train, y_train)
    if init_weights is None:
        init_weights = self.init_weights(param_scale)

    def callback(weights, i, gradients):
        loss_train = self.objective(weights, X_train, y_train)
        if i % log_every == 0:
            print('model: mlp | iter: {} | loss: {:.2f}'.format(i, loss_train))

    optimized_weights = adam(grad_fun, init_weights, num_iters=num_iters,
                             step_size=lr, callback=callback)
    self.weights = optimized_weights
    return optimized_weights
def fit(self, inputs, targets, A=None, num_epochs=64, batch_size=256,
        step_size=0.001, rs=npr, nonlinearity=relu, verbose=False,
        normalize=False, always_include=None, **input_grad_kwargs):
    X = inputs.astype(np.float32)
    y = one_hot(targets)
    if A is None:
        A = np.zeros_like(X).astype(bool)

    params = init_random_params(
        0.1, [X.shape[1]] + self.layers + [y.shape[1]], rs=rs)

    if type(verbose) == int:
        v = verbose
        verbose = lambda x: x % v == 0

    batch_size = min(batch_size, X.shape[0])
    num_batches = int(np.ceil(X.shape[0] / batch_size))

    def batch_indices(iteration):
        idx = iteration % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    def objective(params, iteration):
        idx = batch_indices(iteration)
        Ai = A[idx]
        Xi = X[idx]
        yi = y[idx]
        if always_include is not None:
            Ai = np.vstack((A[always_include], Ai))
            Xi = np.vstack((X[always_include], Xi))
            yi = np.vstack((y[always_include], yi))
        if normalize:
            sumA = max(1., float(Ai.sum()))
            lenX = max(1., float(len(Xi)))
        else:
            sumA = 1.
            lenX = 1.
        crossentropy = -np.sum(feed_forward(params, Xi, nonlinearity) * yi) / lenX
        rightreasons = self.l2_grads * l2_norm(
            input_gradients(params, **input_grad_kwargs)(Xi)[Ai]) / sumA
        smallparams = self.l2_params * l2_norm(params)
        if verbose and verbose(iteration):
            print('Iteration={}, crossentropy={}, rightreasons={}, '
                  'smallparams={}, sumA={}, lenX={}'.format(
                      iteration, crossentropy.value, rightreasons.value,
                      smallparams.value, sumA, lenX))
        return crossentropy + rightreasons + smallparams

    self.params = adam(grad(objective), params, step_size=step_size,
                       num_iters=num_epochs * num_batches)
def fit(self, target_log_q, init_params=None, n_iter=1000, n_samples=1,
        n_samples_per_report=10, report_interval=10, step_size=0.01,
        annealed=True):
    """Fit normalizing flow to target_log_q by minimizing the variational
    free energy."""
    if annealed:
        beta = np.linspace(0.001, 1.0, n_iter)
    else:
        beta = np.ones(n_iter)

    if init_params is None:
        init_params = 0.01 * np.random.randn(self.n_params)

    self.optimization_history = []
    progress_log_callback = self.progress_logger_factory(
        target_log_q, n_samples_per_report, report_interval)
    reparam_gradient = lambda params, i: self.reparameterization_gradient(
        params, target_log_q, n_samples, beta[i])

    self.params = adam(grad=reparam_gradient, init_params=init_params,
                       step_size=step_size, callback=progress_log_callback,
                       num_iters=n_iter)
def fit(self, inputs, targets, A=None, num_epochs=64, batch_size=256,
        step_size=0.001, rs=npr, nonlinearity=relu, **input_grad_kwargs):
    X = inputs.astype(np.float32)
    y = one_hot(targets)
    if A is None:
        A = np.zeros_like(X).astype(bool)

    params = init_random_params(
        0.1, [X.shape[1]] + self.layers + [y.shape[1]], rs=rs)

    batch_size = min(batch_size, X.shape[0])
    num_batches = int(np.ceil(X.shape[0] / batch_size))

    def batch_indices(iteration):
        idx = iteration % num_batches
        return slice(idx * batch_size, (idx + 1) * batch_size)

    def objective(params, iteration):
        idx = batch_indices(iteration)
        return -(
            np.sum(feed_forward(params, X[idx], nonlinearity) * y[idx])  # cross-entropy
            - self.l2_params * l2_norm(params)  # L2 regularization on parameters directly
            - self.l2_grads * l2_norm(input_gradients(  # "explanation regularization"
                params, **input_grad_kwargs)(X[idx])[A[idx]]))

    self.params = adam(grad(objective), params, step_size=step_size,
                       num_iters=num_epochs * num_batches)
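# Hedged sketch (assumption): `l2_norm` is not shown in the snippets above.
# For nested autograd parameter structures (e.g. lists of (W, b) tuples) a
# flattening-based sum of squares could serve:
import autograd.numpy as np
from autograd.misc.flatten import flatten

def l2_norm(params):
    """Sum of squared entries of an arbitrarily nested parameter structure."""
    flat, _ = flatten(params)
    return np.dot(flat, flat)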
def train(train_data, test_data, layer_widths, step_size, num_epochs, batch_size):
    num_batches = int(np.ceil(len(train_data) / batch_size))
    num_iters = num_batches * num_epochs
    print("num_batches: ", num_batches)
    print("num_iters: ", num_iters)

    def objective(params, iteration):
        idx = iteration % num_batches
        chunk = slice(idx * batch_size, (idx + 1) * batch_size)
        return auto_enc_loss(train_data[chunk], params)

    def print_perf(params, iteration, gradient):
        if iteration % num_batches == 0:
            print(100.0 * iteration / num_iters, '% done')
            print('Training error: ', auto_enc_loss(train_data, params))
            print('Test error: ', auto_enc_loss(test_data, params))

    objective_grad = grad(objective)
    params = init_params(layer_widths, 0.1)
    optimized_params = adam(
        grad=objective_grad,
        init_params=params,
        step_size=step_size,
        num_iters=num_iters,
        callback=print_perf
    )
    # optimized_params = rmsprop(
    #     grad=objective_grad,
    #     init_params=params,
    #     step_size=step_size,
    #     num_iters=num_iters,
    #     callback=print_perf
    # )
    return optimized_params
        elbos.append(elbo_val)
        if t % 50 == 0:
            print("Iteration {} lower bound {}".format(t, elbo_val))

    init_mean = -1 * np.ones(D)
    init_log_std = -5 * np.ones(D)
    init_var_params = np.concatenate([init_mean, init_log_std])
    variational_params = optfun(num_iters, init_var_params, callback)
    return np.array(elbos)

# let's optimize this with a few different step sizes
elbo_lists = []
step_sizes = [.1, .25, .5]
for step_size in step_sizes:
    # optimize with standard gradient + adam
    optfun = lambda n, init, cb: adam(gradient, init, step_size=step_size,
                                      num_iters=n, callback=cb)
    standard_lls = optimize_and_lls(optfun)

    # optimize with natural gradient + sgd, no momentum
    optnat = lambda n, init, cb: sgd(natural_gradient, init, step_size=step_size,
                                     num_iters=n, callback=cb, mass=.001)
    natural_lls = optimize_and_lls(optnat)
    elbo_lists.append((standard_lls, natural_lls))

# visually compare the ELBO
plt.figure(figsize=(12, 8))
colors = ['b', 'k', 'g']
for col, ss, (stand_lls, nat_lls) in zip(colors, step_sizes, elbo_lists):
    plt.plot(np.arange(len(stand_lls)), stand_lls, '--',
             label="standard (adam, step-size = %2.2f)" % ss, alpha=.5, c=col)
    plt.plot(np.arange(len(nat_lls)), nat_lls, '-',
                          L2_VAR_2, NUM_TRAIN, train_images, train_labels,
                          C, D, L)

# Build callback for ADAM optimizer
def init_callback(params, t, g):
    lik = -objective(params, t)
    print("Initialization iteration {} log-likelihood {}".format(t, lik))

# initialize weights
pre_init_weights = np.ones(L)

# optimize weights
print("Initializing weights...")
init_weights = adam(init_gradient, pre_init_weights, step_size=0.1,
                    num_iters=INIT_ITERS, callback=init_callback)

# pickle processed data in /cache (if it doesn't already exist)
if not os.path.exists('cache'):
    print('creating cache folder')
    os.makedirs('cache')
if not os.path.isfile(picklefilename):
    print('saving pickled regression initialization data')
    np.savez(picklefilename, init_weights=init_weights)

###############################################################
###############################################################
#               OPTIMIZE NOISE-AWARE LIKELIHOOD               #
###############################################################
print("Iteration {} lower bound {}".format( t, -objective(params, t, 1000))) if t % 10 == 0: plt.cla() # plot target target_distribution = lambda x: np.exp(lnpdf(x, t)) plot_isocontours(ax, target_distribution, fill=True) # plot approximate distribution plot_q_dist(ax, params) ax.set_xlim((-3, 3)) ax.set_ylim((-4, 4)) plt.draw() plt.pause(1.0 / 30.0) ##################### # Run optimization # ##################### print("Optimizing variational parameters...") th = .5 * npr.randn(num_variational_params) - 3. num_objective_samps = 10 def grad_wrap(th, t): return gradient(th, t, num_objective_samps) variational_params = adam(grad_wrap, th, step_size=.02, num_iters=10000, callback=callback)
    ax.set_xticks([])
    ax.set_ylim([-4, 1])
    ax.set_xlim([-2, 2])

# Set up figure.
fig = plt.figure(figsize=(8, 8), facecolor='white')
ax = fig.add_subplot(111, frameon=False)
plt.ion()
plt.show(block=False)

num_plotting_samples = 51

def callback(params, t, g):
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))

    plt.cla()
    target_distribution = lambda x: np.exp(log_posterior(x, t))
    var_distribution = lambda x: np.exp(diag_gaussian_density_from_params(params, x))
    plot_isocontours(ax, target_distribution)
    plot_isocontours(ax, var_distribution)

    rs = npr.RandomState(0)
    samples = iwae_sample(log_posterior, params, t, k, num_plotting_samples, rs)
    plt.plot(samples[:, 0], samples[:, 1], 'x')

    plt.draw()
    plt.pause(1.0 / 30.0)

print("Optimizing variational parameters...")
adam(grad(objective), init_gaussian_var_params(D), step_size=0.1,
     num_iters=2000, callback=callback)
    # log_weights = params[:10] - logsumexp(params[:10])
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))
    # print(np.exp(log_weights))

    plt.cla()
    target_distribution = lambda x: np.exp(log_density(x))
    var_distribution = lambda x: np.exp(variational_log_density(params, x))
    plot_isocontours(ax, target_distribution)
    plot_isocontours(ax, var_distribution, cmap=plt.cm.bone)
    ax.set_autoscale_on(False)

    # rs = npr.RandomState(0)
    # samples = variational_sampler(params, num_plotting_samples, rs)
    # plt.plot(samples[:, 0], samples[:, 1], 'x')

    plt.draw()
    plt.pause(1.0 / 30.0)

print("Optimizing variational parameters...")
variational_params = adam(grad(objective), init_var_params(D),
                          step_size=0.1, num_iters=2000, callback=callback)
        elbos.append(elbo_val)
        if t % 50 == 0:
            print("Iteration {} lower bound {}".format(t, elbo_val))

    init_mean = -1 * np.ones(D)
    init_log_std = -5 * np.ones(D)
    init_var_params = np.concatenate([init_mean, init_log_std])
    variational_params = optfun(num_iters, init_var_params, callback)
    return np.array(elbos)

# let's optimize this with a few different step sizes
elbo_lists = []
step_sizes = [.1, .25, .5]
for step_size in step_sizes:
    # optimize with standard gradient + adam
    optfun = lambda n, init, cb: adam(gradient, init, step_size=step_size,
                                      num_iters=n, callback=cb)
    standard_lls = optimize_and_lls(optfun)

    # optimize with natural gradient + sgd, no momentum
    optnat = lambda n, init, cb: sgd(natural_gradient, init, step_size=step_size,
                                     num_iters=n, callback=cb, mass=.001)
    natural_lls = optimize_and_lls(optnat)
    elbo_lists.append((standard_lls, natural_lls))

# visually compare the ELBO
plt.figure(figsize=(12, 8))
colors = ['b', 'k', 'g']
def callback(params, t, g):
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))

    # Sample functions from posterior.
    rs = npr.RandomState(0)
    mean, log_std = unpack_params(params)
    # rs = npr.RandomState(0)
    sample_weights = rs.randn(10, num_weights) * np.exp(log_std) + mean
    plot_inputs = np.linspace(-8, 8, num=400)
    outputs = predictions(sample_weights, np.expand_dims(plot_inputs, 1))

    # Plot data and functions.
    plt.cla()
    ax.plot(inputs.ravel(), targets.ravel(), 'bx')
    ax.plot(plot_inputs, outputs[:, :, 0].T)
    ax.set_ylim([-2, 3])
    plt.draw()
    plt.pause(1.0 / 60.0)

# Initialize variational parameters
rs = npr.RandomState(0)
init_mean = rs.randn(num_weights)
init_log_std = -5 * np.ones(num_weights)
init_var_params = np.concatenate([init_mean, init_log_std])

print("Optimizing variational parameters...")
variational_params = adam(gradient, init_var_params,
                          step_size=0.1, num_iters=1000, callback=callback)
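# Hedged sketch (assumption): `unpack_params` is not shown in the excerpt
# above. Since init_var_params concatenates a mean and a log-std of length
# num_weights each, it presumably splits the flat vector back apart:
def unpack_params(params):
    """Split a flat variational parameter vector into (mean, log_std)."""
    mean, log_std = params[:num_weights], params[num_weights:]
    return mean, log_std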
inputs, targets = build_toy_dataset()

def objective(weights, t):
    return -logprob(weights, inputs, targets) \
           - log_gaussian(weights, weight_prior_variance)

print(grad(objective)(init_params, 0))

# Set up figure.
fig = plt.figure(figsize=(12, 8), facecolor='white')
ax = fig.add_subplot(111, frameon=False)
plt.show(block=False)

def callback(params, t, g):
    print("Iteration {} log likelihood {}".format(t, -objective(params, t)))

    # Plot data and functions.
    plt.cla()
    ax.plot(inputs.ravel(), targets.ravel(), 'bx', ms=12)
    plot_inputs = np.reshape(np.linspace(-7, 7, num=300), (300, 1))
    outputs = nn_predict(params, plot_inputs)
    ax.plot(plot_inputs, outputs, 'r', lw=3)
    ax.set_ylim([-1, 1])
    plt.draw()
    plt.pause(1.0 / 60.0)

print("Optimizing network parameters...")
optimized_params = adam(grad(objective), init_params,
                        step_size=0.01, num_iters=1000, callback=callback)
log_likelihoods = []
print(" Epoch | params ")

def print_logLikelihood(params, iter, gradient):
    log_likelihood = logLikelihood(params, iter)
    h1, h2, s = feed_forward(params, iter)
    # h = np.vstack((h1, h2))
    plt.scatter(h1, h2)
    plt.show()
    log_likelihoods.append(log_likelihood)
    print("{:15}|{:20}".format(iter, log_likelihood))
    # h1, h2 = np.random.randn(batch_size, 2).T
    # x1, x2 = inverse_flow(h1, h2, params)
    # plt.scatter(x1, x2)
    # plt.show()

optimized_params = adam(grad_logLikelihood, init_params, step_size=learning_rate,
                        num_iters=num_epoch, callback=print_logLikelihood)

x_axis = np.linspace(0, num_epoch, num_epoch)
plt.plot(x_axis, log_likelihoods)
plt.show()

h1 = np.random.randn(batch_size, 1)
h2 = np.random.randn(batch_size, 1)
x1, x2 = inverse_flow(h1, h2, optimized_params)
plt.scatter(x1, x2)
plt.show()
        training_text = one_hot_to_string(train_inputs[:, t, :])
        predicted_text = one_hot_to_string(logprobs[:, t, :])
        print(training_text.replace('\n', ' ') + "|" +
              predicted_text.replace('\n', ' '))

def training_loss(params, iter):
    return -rnn_log_likelihood(params, train_inputs, train_inputs)

def callback(weights, iter, gradient):
    if iter % 10 == 0:
        print("Iteration", iter, "Train loss:", training_loss(weights, 0))
        print_training_prediction(weights)

# Build gradient of loss function using autograd.
training_loss_grad = grad(training_loss)

print("Training RNN...")
trained_params = adam(training_loss_grad, init_params,
                      step_size=0.1, num_iters=1000, callback=callback)

print()
print("Generating text from RNN...")
num_letters = 30
for t in range(20):
    text = ""
    for i in range(num_letters):
        seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :]
        logprobs = rnn_predict(trained_params, seqs)[-1].ravel()
        text += chr(npr.choice(len(logprobs), p=np.exp(logprobs)))
    print(text)
# Testing the mapping mechanism
w_spc = utils.spacing_gen(10, -1, +1, dim=1)
# A, B, C, L, P = sig.params_init(10, mode='linear')
params = sig.params_init(num_sig=50, mode='random')
z_spc = sig.reparam(w_spc, params, indep=False)
dzdw = sig.df_dw(w_spc, params)
# z_spc = sig.reparam(w_spc, A, B, C, L, P, indep=True)
# disply.line_2d(w_spc, z_spc)

SAMPLING_SIZE = 1000
log_qw, w_gen = utils.uniform_init(-1, +1, dim=1)
log_pz, pz_gen = utils.gaussian_mix_init(np.array([1.0, -0.5, -2.]),
                                         np.array([0.1, 0.2, 0.05]),
                                         np.array([0.3, 0.3, 0.4]))
# sig.plot_qz(params, log_qw, target=log_qw, testing=True)
# w_samples = w_gen(SAMPLING_SIZE)

grad_kl = sig.grad_kl_init(log_pz, log_qw, params, w_gen, SAMPLING_SIZE)
trained_params = adam(grad_kl, params, step_size=0.1, num_iters=500)

# sig.plot_qz(params, log_qw, target=log_pz, testing=True)
sig.plot_qz(trained_params, log_qw, target=log_pz, testing=True)

# grad_A = sig.grad_kl(w_samples, log_pz, log_qw, params)
# print
# print grad_A
print('Done')
g - ignore
'''
def callback(params, t, g):
    print("i {}, lower bound {}, test {}, train {} ".format(
        t, -objective(params, t),
        accuracy(test_images, test_labels, params),
        accuracy(train_images, train_labels, params)))

print("Optimizing variational parameters...")
init_mean = 0 * np.ones(D)
init_log_std = 0 * np.ones(D)
init_var_params = np.concatenate([init_mean, init_log_std])
variational_params = adam(gradient, init_var_params, step_size=learning_rate,
                          num_iters=train_iters, callback=callback)

# ---------------- STOCHASTIC VARIATIONAL INFERENCE DONE ---------------
# now get Monte Carlo estimate p(t | x) over the test and training set
print('TRAIN set accuracy: ',
      accuracy(train_images, train_labels, variational_params))
print('TEST set accuracy: ',
      accuracy(test_images, test_labels, variational_params))

means = variational_params[:D].reshape(784, 10).T
std = np.exp(variational_params[D:]).reshape(784, 10).T
save_images(means, 'svi_means_sigma_%.5f.png' % sigma_prior, ims_per_row=5)
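# Hedged sketch (assumption): `accuracy` is not defined in this excerpt. Since
# the variational parameters are a mean and log-std over D = 784 * 10 softmax
# regression weights, a Monte-Carlo predictive accuracy could look like this
# (hypothetical helper; num_samples and the reshape are assumptions):
import autograd.numpy as np
import autograd.numpy.random as npr
from scipy.special import logsumexp

def accuracy(images, labels, var_params, num_samples=10):
    """Accuracy of the Monte-Carlo averaged predictive distribution."""
    mean, log_std = var_params[:D], var_params[D:]
    probs = 0.
    for _ in range(num_samples):
        w = (mean + np.exp(log_std) * npr.randn(D)).reshape(784, 10)
        logits = np.dot(images, w)
        probs = probs + np.exp(logits - logsumexp(logits, axis=1, keepdims=True))
    preds = np.argmax(probs, axis=1)
    return np.mean(preds == np.argmax(labels, axis=1))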
              predicted_text.replace('\n', ' '))

def training_loss(params, iter):
    return -rnn_log_likelihood(params, train_inputs, train_inputs)

def callback(weights, iter, gradient):
    if iter % 10 == 0:
        print("Iteration", iter, "Train loss:", training_loss(weights, 0))
        print_training_prediction(weights)

# Build gradient of loss function using autograd.
training_loss_grad = grad(training_loss)

print("Training RNN...")
trained_params = adam(training_loss_grad, init_params,
                      step_size=0.1, num_iters=1000, callback=callback)

print()
print("Generating text from RNN...")
num_letters = 30
for t in range(20):
    text = ""
    for i in range(num_letters):
        seqs = string_to_one_hot(text, num_chars)[:, np.newaxis, :]
        logprobs = rnn_predict(trained_params, seqs)[-1].ravel()
        text += chr(npr.choice(len(logprobs), p=np.exp(logprobs)))
    print(text)
                 Z, zdir='z', offset=-100, cmap=cm.coolwarm, zorder=0,
                 levels=np.linspace(0, 30, 30))
    a = Arrow(params[0], params[1], -g[0], -g[1], width=0.5, zorder=2)
    ax2.add_patch(a)
    art3d.pathpatch_2d_to_3d(a, z=-100, zdir="z")

    # ax2.plot([params[0], params[0]],
    #          [params[1], params[1]],
    #          [-50, elbo(params, 0)], '--', linewidth=2.0, zorder=5)
    # ax2.scatter(params[0], params[1], elbo(params, 0), marker='o', s=100)

    plt.draw()
    plt.pause(1.0 / 30.0)

gradient = grad(elbo)
init_mean = 4 * np.ones(1)
init_log_std = -5 * np.ones(1)
init_var_params = np.concatenate([init_mean, init_log_std])
variational_params = adam(gradient, init_var_params,
                          step_size=0.1, num_iters=400, callback=callback)

plt.show()
def run_variational_inference_gumbel(Ys, A, W_true, Ps_true, Cs, etasq,
                                     stepsize=0.1, init_with_true=True,
                                     num_iters=250, temp_prior=0.1,
                                     num_sinkhorn=20, num_mcmc_samples=500,
                                     temp=1):

    def sample_q(params, unpack_W, unpack_Ps, Cs, num_sinkhorn, temp):
        # Sample W
        mu_W, log_sigmasq_W, log_mu_Ps = params
        W_flat = mu_W + np.sqrt(np.exp(log_sigmasq_W)) * npr.randn(*mu_W.shape)
        W = unpack_W(W_flat)
        # W = W_true

        # Sample Ps: run Sinkhorn to move mu close to the Birkhoff polytope
        Ps = []
        for log_mu_P, unpack_P, C in zip(log_mu_Ps, unpack_Ps, Cs):
            # Unpack the mean, run Sinkhorn, then pack it again
            log_mu_P = unpack_P(log_mu_P)
            a = log_mu_P.shape
            log_mu_P = (log_mu_P + -np.log(-np.log(
                np.random.uniform(0, 1, (a[0], a[1]))))) / temp
            log_mu_P = sinkhorn_logspace(log_mu_P - 1e8 * (1 - C), num_sinkhorn)
            log_mu_P = log_mu_P[C]
            # Notice how we limit the variance
            P = np.exp(log_mu_P)
            P = unpack_P(P)
            Ps.append(P)
        Ps = np.array(Ps)
        return W, Ps

    def elbo(params, unpack_W, unpack_Ps, Ys, A, Cs, etasq,
             num_sinkhorn, num_mcmc_samples, temp_prior, temp):
        """
        Provides a stochastic estimate of the variational lower bound.

        sigma_Lim: limits for the variance of the re-parameterization of
        the permutation
        """
        def gumbel_distance(log_mu_Ps, temp_prior, temperature, Cs):
            arr = 0
            for n in range(len(log_mu_Ps)):
                log_mu_P = unpack_Ps[n](log_mu_Ps[n])
                C = Cs[n]
                log_mu_P = log_mu_P[C]
                log_mu_P = log_mu_P[:]
                arr += np.sum(
                    np.log(temp_prior)
                    - 0.5772156649 * temp_prior / temperature
                    - log_mu_P * temp_prior / temperature
                    - np.exp(gammaln(1 + temp_prior / temperature)
                             - log_mu_P * temp_prior / temperature)
                    - (np.log(temperature) - 1 - 0.5772156649))
            return arr

        M, T, N = Ys.shape
        assert A.shape == (N, N)
        assert len(unpack_Ps) == M
        mu_W, log_sigmasq_W, log_mu_Ps = params

        L = 0
        for smpl in range(num_mcmc_samples):
            W, Ps = sample_q(params, unpack_W, unpack_Ps, Cs, num_sinkhorn, temp)
            # Compute the ELBO
            L += log_likelihood(Ys, A, W, Ps, etasq) / num_mcmc_samples

        L += gumbel_distance(log_mu_Ps, temp_prior, temp, Cs)

        # Add the entropy terms
        L += gaussian_entropy(log_sigmasq_W)

        fac = 1000
        # This term adds the KL divergence between the W prior and posterior,
        # with the entries of W having a prior variance sigma = 1/fac;
        # for details see the appendix of the VAE paper.
        L += -0.5 * log_sigmasq_W.size * (np.log(2 * np.pi)) \
             - 0.5 * fac * np.sum(np.exp(log_sigmasq_W)) \
             - 0.5 * fac * np.sum(np.power(mu_W, 2))

        # Normalize objective
        L /= (T * M * N)
        return L

    M, T, N = Ys.shape

    # Initialize variational parameters
    if init_with_true:
        mu_W, log_sigmasq_W, unpack_W, log_mu_Ps, unpack_Ps = \
            initialize_params_gumbel(A, Cs, map_W=W_true)
    else:
        mu_W, log_sigmasq_W, unpack_W, log_mu_Ps, unpack_Ps = \
            initialize_params_gumbel(A, Cs)

    # Make a function to convert an array of params into a set of parameters
    # mu_W, sigmasq_W, [mu_P1, sigmasq_P1, ...]
    flat_params, unflatten = \
        flatten((mu_W, log_sigmasq_W, log_mu_Ps))

    objective = lambda flat_params, t: \
        -1 * elbo(unflatten(flat_params), unpack_W, unpack_Ps, Ys, A, Cs, etasq,
                  num_sinkhorn, num_mcmc_samples, temp_prior, temp)

    # Define a callback to monitor optimization progress
    elbos = []
    lls = []
    mses = []
    num_corrects = []
    times = []
    W_samples = []
    Ps_samples = []

    def collect_stats(params, t):
        if t % 10 == 0:
            W_samples.append([])
            Ps_samples.append([])
            for i in range(100):
                W, Ps = sample_q(unflatten(params), unpack_W, unpack_Ps, Cs,
                                 num_sinkhorn, temp)
                W_samples[-1].append(W)
                Ps_samples[-1].append(Ps)

        times.append(time.time())
        elbos.append(-1 * objective(params, 0))

        # Sample the variational posterior and compute the number of correct matches
        mu_W, log_sigmasq_W, log_mu_Ps = unflatten(params)
        W, Ps = sample_q(unflatten(params), unpack_W, unpack_Ps, Cs, 10, 1.0)

        list = []
        for i in range(A.shape[0]):
            list.extend(np.where(Ps[0, i, :] + Ps_true[0, i, :] == 1)[0])

        mses.append(np.mean((W * A - W_true * A) ** 2))

        # Round the doubly stochastic matrix P to the nearest permutation matrix
        num_correct = np.zeros(M)
        Ps2 = np.zeros((Ps.shape[0], A.shape[0], A.shape[0]))
        for m, P in enumerate(Ps):
            row, col = linear_sum_assignment(-P + 1e8 * (1 - Cs[m]))
            Ps2[m] = perm_to_P(col)
            num_correct[m] = n_correct(perm_to_P(col), Ps_true[m])
        num_corrects.append(num_correct)
        lls.append(log_likelihood(Ys, A, W, Ps2, etasq) / (M * T * N))

    def callback(params, t, g):
        collect_stats(params, t)
        print("Iteration {}. ELBO: {:.4f} LL: {:.4f} MSE(W): {:.4f}, Num Correct: {}"
              .format(t, elbos[-1], lls[-1], mses[-1], num_corrects[-1]))

    # Run optimizer
    callback(flat_params, -1, None)
    variational_params = adam(grad(objective), flat_params, step_size=stepsize,
                              num_iters=num_iters, callback=callback)

    times = np.array(times)
    times -= times[0]

    return times, np.array(elbos), np.array(lls), np.array(mses), \
        np.array(num_corrects), Ps_samples, W_samples, A, W_true
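# Hedged sketch (assumption): `perm_to_P` and `n_correct` are not defined in
# this excerpt. Plausible minimal versions, given how they are used above:
import numpy as np

def perm_to_P(perm):
    """Convert a permutation (array of column indices) to a permutation matrix."""
    K = len(perm)
    P = np.zeros((K, K))
    P[np.arange(K), perm] = 1.0
    return P

def n_correct(P, P_true):
    """Count the rows of P that match the corresponding rows of P_true."""
    return np.sum(np.all(P == P_true, axis=1))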
# construct recognition and decoder networks and initialize them
recognize, recogn_params = \
    init_gresnet(P, [(40, np.tanh), (40, np.tanh), (2*N, gaussian_info)])
decode, loglike_params = \
    init_gresnet(N, [(40, np.tanh), (40, np.tanh), (2*P, gaussian_mean)])
loglike = make_loglike(decode)

# initialize gmm parameters
pgm_params = init_pgm_param(T, N, alpha=1., niw_conc=1., random_scale=3.)
params = pgm_params, loglike_params, recogn_params

# set up encoder/decoder and plotting
encode_mean, decode_mean = make_encoder_decoder(recognize, decode)
plot = make_plotter_2d(recognize, decode, data, num_clusters, params,
                       plot_every=100, plot_every_density=500)

# instantiate svae gradient function
gradfun = make_gradfun(run_inference, recognize, loglike, pgm_prior_params, data)

# optimize
params = adam(gradfun(batch_size=50, num_samples=1, callback=plot),
              params, num_iters=1000)
seed = npr.RandomState(0)

def objective(combined_params, iter):
    data_idx = batch_indices(iter)
    gen_params, rec_params = combined_params
    return -vae_lower_bound(gen_params, rec_params,
                            train_images[data_idx], seed) / data_dim

# Get gradients of objective using autograd.
objective_grad = grad(objective)

print(" Epoch | Objective | Fake probability | Real Probability ")

def print_perf(combined_params, iter, grad):
    if iter % 10 == 0:
        gen_params, rec_params = combined_params
        bound = np.mean(objective(combined_params, iter))
        print("{:15}|{:20}".format(iter // num_batches, bound))
        fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
        save_images(fake_data, 'vae_samples.png', vmin=0, vmax=1)

# The optimizers provided can optimize lists, tuples, or dicts of parameters.
optimized_params = adam(objective_grad, combined_init_params,
                        step_size=step_size,
                        num_iters=num_epochs * num_batches,
                        callback=print_perf)
        annealing = (.999 ** iter)

        # fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
        # save_images(fake_data, 'vae_samples.png', vmin=0, vmax=1)

        # epoch = iter // num_batches
        # batch_size = original_batch_size * (.5 ** epoch)
        # if batch_size < 1: batch_size = 1
        # print('batch size', batch_size)

        print("{}|{:15}|{:20}|{:25}".format(iter, iter // num_batches,
                                            bound, annealing))

        # if new_epoch != epoch:
        #     batch_size = batch_size / 2
        #     if batch_size < 1: batch_size = 1
        #     print('batch size', batch_size)
        #     epoch = new_epoch

# # The optimizers provided can optimize lists, tuples, or dicts of parameters.
# optimized_params = adam(objective_grad, combined_init_params, step_size=step_size,
#                         num_iters=num_epochs * num_batches, callback=print_perf)

# The optimizers provided can optimize lists, tuples, or dicts of parameters.
optimized_params = adam(objective_grad, params, step_size=step_size,
                        num_iters=2000, callback=print_perf)
params = optimized_params
combined_init_params = (init_gen_params, init_rec_params)

num_batches = int(np.ceil(len(train_images) / batch_size))

def batch_indices(iter):
    idx = iter % num_batches
    return slice(idx * batch_size, (idx + 1) * batch_size)

# Define training objective
seed = npr.RandomState(0)

def objective(combined_params, iter):
    data_idx = batch_indices(iter)
    gen_params, rec_params = combined_params
    return -vae_lower_bound(gen_params, rec_params,
                            train_images[data_idx], seed) / data_dim

# Get gradients of objective using autograd.
objective_grad = grad(objective)

print(" Epoch | Objective | Fake probability | Real Probability ")

def print_perf(combined_params, iter, grad):
    if iter % 10 == 0:
        gen_params, rec_params = combined_params
        bound = np.mean(objective(combined_params, iter))
        print("{:15}|{:20}".format(iter // num_batches, bound))
        fake_data = generate_from_prior(gen_params, 20, latent_dim, seed)
        save_images(fake_data, 'vae_samples.png', vmin=0, vmax=1)

# The optimizers provided can optimize lists, tuples, or dicts of parameters.
optimized_params = adam(objective_grad, combined_init_params,
                        step_size=step_size,
                        num_iters=num_epochs * num_batches,
                        callback=print_perf)
    if iter % 10 == 0:
        bound = training_loss_noAnneal(params)
        print("{:15}|{:20}".format(iter, bound))
        with open(r'dkfTrainTest.csv', 'a') as f:
            btrain = np.mean(training_loss_noAnneal(params))
            if btrain > 1:
                btrain = 1
            if btrain < -1:
                btrain = -1
            writer = csv.writer(f)
            writer.writerow([btrain])

training_loss_grad = grad(training_loss)
# pdb.set_trace()
trained_params = adam(training_loss_grad, params, step_size=0.05,
                      num_iters=1000, callback=print_perf)

def plotTrainingCurve():
    X = np.genfromtxt(r'dkfTrainTest.csv', delimiter=',')
    t = np.arange(X.shape[0])
    plt.clf()
    plt.plot(t, X)
    # plt.plot(t, X[:, 1])
    # plt.legend(['Train', 'Test'])
    plt.savefig('trainingCurvedkf.jpg')

plotTrainingCurve()
    ax.set_xticks([])

fig = plt.figure(figsize=(8, 8), facecolor='white')
ax = fig.add_subplot(111, frameon=False)
plt.ion()
plt.show(block=False)

num_plotting_samples = 51

def callback(params, t, g):
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))
    print(params)

    plt.cla()
    target_distribution = lambda x: np.exp(log_density(x, t))
    var_distribution = lambda x: np.exp(variational_log_density(params, x))
    plot_isocontours(ax, target_distribution)
    plot_isocontours(ax, var_distribution, cmap=plt.cm.bone)
    ax.set_autoscale_on(False)

    rs = npr.RandomState(np.random.randint(0, 6))
    samples = variational_sampler(params, num_plotting_samples, rs)
    plt.plot(samples[:, 0], samples[:, 1], 'x')

    plt.draw()
    plt.pause(1.0 / 30.0)

print("Optimizing variational parameters...")
variational_params = adam(grad(objective), init_var_params(D),
                          step_size=0.1, num_iters=2000, callback=callback)
plt.ion()
plt.show(block=False)

num_plotting_samples = 51

def callback(params, t, g):
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))

    plt.cla()
    target_distribution = lambda x: np.exp(log_posterior(x, t))
    plot_isocontours(ax, target_distribution)

    for inner_params in np.split(params, k):
        var_distribution = lambda x: np.exp(
            diag_gaussian_density_from_params(inner_params, x))
        plot_isocontours(ax, var_distribution)

    rs = npr.RandomState(0)
    samples = iwae_qf_sample(log_posterior, params, t, k,
                             num_plotting_samples, rs)
    plt.plot(samples[:, 0], samples[:, 1], 'x')

    plt.draw()
    plt.pause(1.0 / 30.0)

print("Optimizing variational parameters...")
adam(grad(objective), init_qf_params(k, D), step_size=0.1,
     num_iters=2000, callback=callback)