def main():
    (parser, loss) = KLD(50)
    print(parser)
    print(parser.idxs_and_shapes)

    datum = {}
    datum['mu1'] = np.zeros(50)
    datum['mu2'] = np.ones(50)
    datum['sig1'] = 5
    datum['sig2'] = 6

    trial_vecs = []
    for _ in range(5):
        trial_vecs.append(np.random.rand(50))
    value_and_grad_fun = value_and_grad(pairwise_distance)
    value, grad = value_and_grad_fun(trial_vecs)
    print(trial_vecs)

    weights = parser.stack(datum)
    value_and_grad_fun = value_and_grad(loss)
    value, grad = value_and_grad_fun(weights)
    print(value)

    weights = weights - 10e-4 * grad
    value, grad = value_and_grad_fun(weights)
    print(value)
def opt_traj(func, fdict, T, opt_method='SGD', init=None,
             learning_rate=0.1, seed=100, momentum=False, noise_level=0.0):
    # do optimization and return the trajectory
    params = {'x': 0.0, 'y': 0.0}
    domain = fdict['domain']
    optimum = fdict['optimum']
    loss_and_grad = value_and_grad(func)
    #quick_grad_check(func, params)
    params = init_params(params, domain, init, seed)
    check_grads(func, params)
    opt_server = Parameter_Server(opt_method, momentum)
    opt_server.init_gradient_storage(params)

    x_traj = []
    y_traj = []
    f_traj = []

    print('optimising function using %s...' % opt_method)
    for t in range(T):
        (func_value, func_grad) = loss_and_grad(params)
        x_traj.append(params['x'])
        y_traj.append(params['y'])
        f_traj.append(func_value)
        func_grad = inject_noise(func_grad, noise_level)
        if opt_method == 'SGD':
            norm = np.sqrt(func_grad['x'] ** 2 + func_grad['y'] ** 2)
            if norm >= 2.0:
                func_grad['x'] /= norm / 2
                func_grad['y'] /= norm / 2
        params = opt_server.update(params, func_grad, learning_rate)

    return np.array(x_traj), np.array(y_traj), np.array(f_traj)
def test_value_and_grad():
    fun = lambda x: np.sum(np.sin(x)**2)
    dfun = grad(fun)
    dfun_both = value_and_grad(fun)
    x = npr.randn(5)
    check_equivalent(fun(x), dfun_both(x)[0])
    check_equivalent(dfun(x), dfun_both(x)[1])
def value_and_grad(learner, trainingData, weights, extraObjective=None):
    def trainIt(weights):
        global globalObjective
        globalObjective += learner.train(trainingData, weights)
        if extraObjective is not None:
            globalObjective += extraObjective(weights)
        return globalObjective
    return autograd.value_and_grad(trainIt)
def max_likelihood(self, data, weights=None, stats=None, lmbda=0.1):
    """
    As an alternative to MCMC with Polya-gamma augmentation, we also
    implement maximum likelihood learning via gradient descent with
    autograd. This follows the pybasicbayes convention.

    :param data: list of tuples, (x,y), for each dataset.
    :param weights: Not used in this implementation.
    :param stats: Not used in this implementation.
    """
    import autograd.numpy as anp
    from autograd import value_and_grad, hessian_vector_product
    from scipy.optimize import minimize

    assert weights is None
    assert stats is None

    if not isinstance(data, list):
        assert isinstance(data, tuple) and len(data) == 2
        data = [data]

    # Define a helper function for the log of the logistic fn
    def loglogistic(psi):
        return psi - anp.log(1 + anp.exp(psi))

    # optimize each row of A and b
    for n in range(self.D_out):
        # Define an objective function for the n-th row of hstack((A, b))
        # This is the negative log likelihood of the n-th column of data.
        def nll(abn):
            an, bn = abn[:-1], abn[-1]
            T = 0
            ll = 0
            for (x, y) in data:
                T += x.shape[0]
                yn = y[:, n]
                psi = anp.dot(x, an) + bn
                ll += anp.sum(yn * loglogistic(psi))
                ll += anp.sum((1 - yn) * loglogistic(-1. * psi))

            # Include a penalty on the weights
            ll -= lmbda * T * anp.sum(an**2)
            ll -= lmbda * T * bn**2

            return -1 * ll / T

        abn0 = np.concatenate((self.A[n], self.b[n]))
        res = minimize(value_and_grad(nll), abn0,
                       tol=1e-3,
                       method="Newton-CG",
                       jac=True,
                       hessp=hessian_vector_product(nll))
        assert res.success
        self.A[n] = res.x[:-1]
        self.b[n] = res.x[-1]
def test_return_both():
    fun = lambda x: 3.0 * x**3.2
    d_fun = grad(fun)
    f_and_d_fun = value_and_grad(fun)

    test_x = 1.7
    f, d = f_and_d_fun(test_x)
    assert f == fun(test_x)
    assert d == d_fun(test_x)
def test_value_and_grad():
    fun = lambda x: np.sum(np.sin(x)**2)
    dfun = grad(fun)
    dfun_both = value_and_grad(fun)
    x = npr.randn(5)
    assert not isbox(dfun_both(x)[0])
    check_equivalent(fun(x), dfun_both(x)[0])
    check_equivalent(dfun(x), dfun_both(x)[1])

    def fun2(x):
        return dfun_both(x)[0]
    check_grads(fun2)(x)
def test_comparison_values():
    compare_funs = [lambda x, y: np.sum(x < x),
                    lambda x, y: np.sum(x <= y),
                    lambda x, y: np.sum(x > y),
                    lambda x, y: np.sum(x >= y),
                    lambda x, y: np.sum(x == y),
                    lambda x, y: np.sum(x != y)]

    for arg1, arg2 in arg_pairs():
        for fun in compare_funs:
            fun_val = fun(arg1, arg2)
            fun_val_from_grad, _ = value_and_grad(fun)(arg1, arg2)
            assert fun_val == fun_val_from_grad, (fun_val, fun_val_from_grad)
def choose_next_point(domain_min, domain_max, acquisition_function, num_tries=15, rs=npr.RandomState(0)):
    """Uses gradient-based optimization to find the next query point."""
    init_points = rs.rand(num_tries, D) * (domain_max - domain_min) + domain_min

    grad_obj = value_and_grad(lambda x: -acquisition_function(x))

    def optimize_point(init_point):
        print('.', end='')
        result = minimize(grad_obj, x0=init_point, jac=True, method='L-BFGS-B',
                          options={'maxiter': 10},
                          bounds=list(zip(domain_min, domain_max)))
        return result.x, acquisition_function(result.x)

    optimized_points, optimized_values = list(zip(*list(map(optimize_point, init_points))))
    print()
    best_ix = np.argmax(optimized_values)
    return np.atleast_2d(optimized_points[best_ix])
def _M_step(self, free_vars, resp, alpha, free_vars_shape, fixed_vars,
            is_fixed_vars, priors, optim_opts={}, **kwargs):

    # inconvenient reshaping of responsibilities
    responsibs = ([item[:, i] for item in resp]
                  for i in range(len(self._ifix)))

    Cg = self.latentforces[0].kernel(self.ttc[:, None])
    Cg[np.diag_indices_from(Cg)] += 1e-5
    Lg = np.linalg.cholesky(Cg)
    Cginv = cho_solve((Lg, True), np.eye(Lg.shape[0]))

    rr = [*responsibs]

    def _objfunc(arg):
        g, vbeta, mu_ivp = _var_mixer(arg, free_vars_shape,
                                      fixed_vars, is_fixed_vars)
        # some reshaping
        beta = vbeta.reshape((self.dim.R + 1, self.dim.D))
        mu_ivp = mu_ivp.reshape((len(self._ifix),
                                 len(self.Y_train_),
                                 self.dim.K))

        vals = []
        for i, ifx in enumerate(self._ifix):
            vals.append(
                self.forward_error(g, beta, alpha, mu_ivp[i], ifx, rr[i]))

        logprior = -0.5 * np.dot(g, np.dot(Cginv, g))

        for vn, x in zip(['beta'], [vbeta]):
            try:
                prior_logpdf = priors[vn]
                logprior = logprior + prior_logpdf(x)
            except KeyError:
                pass

        return np.sum(vals) - logprior

    res = minimize(autograd.value_and_grad(_objfunc), free_vars,
                   jac=True, **optim_opts)

    return res.x
def test_value_and_multigrad():
    def complicated_fun(a, b, c, d, e, f=1.1, g=9.0):
        return a + np.sin(b) + np.cosh(c) + np.cos(d) + np.tan(e) + f + g

    A = 0.5
    B = -0.3
    C = 0.2
    D = -1.1
    E = 0.7
    F = 0.6
    G = -0.1

    dfun = grad(complicated_fun, argnum=[3, 1])
    dfun_both = value_and_grad(complicated_fun, argnum=[3, 1])

    check_equivalent(complicated_fun(A, B, C, D, E, f=F, g=G),
                     dfun_both(A, B, C, D, E, f=F, g=G)[0])

    check_equivalent(dfun(A, B, C, D, E, f=F, g=G),
                     dfun_both(A, B, C, D, E, f=F, g=G)[1])
def distance_from_target_image(smoke):
    return np.mean((target - smoke)**2)

def convert_param_vector_to_matrices(params):
    vx = np.reshape(params[:(rows*cols)], (rows, cols))
    vy = np.reshape(params[(rows*cols):], (rows, cols))
    return vx, vy

def objective(params):
    init_vx, init_vy = convert_param_vector_to_matrices(params)
    final_smoke = simulate(init_vx, init_vy, init_smoke, simulation_timesteps)
    return distance_from_target_image(final_smoke)

# Specify gradient of objective function using autograd.
objective_with_grad = value_and_grad(objective)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, frameon=False)

def callback(params):
    init_vx, init_vy = convert_param_vector_to_matrices(params)
    simulate(init_vx, init_vy, init_smoke, simulation_timesteps, ax)

print("Optimizing initial conditions...")
result = minimize(objective_with_grad, init_dx_and_dy, jac=True, method='CG',
                  options={'maxiter': 25, 'disp': True}, callback=callback)

print("Rendering optimized flow...")
init_vx, init_vy = convert_param_vector_to_matrices(result.x)
simulate(init_vx, init_vy, init_smoke, simulation_timesteps, ax, render=True)
        sum_loss_M += loss_M(Y, X, U, b)
        diff_U = U - sigma_u * np.eye(U.shape[0])
        sum_reg_u += np.linalg.norm(diff_U, ord='fro')**2

    Rx = lambda_x * np.sum((np.linalg.norm(X, axis=1)**2))
    Ru = lambda_u * sum_reg_u
    return Rx + Ru + sum_loss_M

# %%
def paramsX_to_minimize(params, Ys, Us, b):
    X = params.reshape((int(params.size / D), D))
    return loss_all(Ys, X, Us, b)

paramsX_to_minimize_with_grad = value_and_grad(paramsX_to_minimize)

# %%
def paramsUs_to_minimize(params, Ys, X, b):
    return loss_all(Ys, X, params, b)

paramsUs_to_minimize_with_grad = value_and_grad(paramsUs_to_minimize)

# %%
def transform_clusters(raw_clusters):
    '''
    input
    [
        [['item1','item3'],['item2','item4']],
self.locationMaps, 1) self.graphic.update( 'forward time: {0} of {1}'.format(t, self.nTime), self.pausetime) measurements = [] for lExp in range(-4, 5): myProblem = parabolicProblem() myProblem.l = 10**lExp print "Regularization is {0}".format(myProblem.l) x = 0.0 * np.copy( myProblem.referenceRHS[0::myProblem.nDofs]) #np.zeros(myProblem.nTime) myProWithGrad = value_and_grad(myProblem) #print "value_and_grad: " + str(myProWithGrad(x)) #print x0 #x0[0::myProblem.nDofs] = np.ones(myProblem.nTime)*0.001 x, f, d = scipy.optimize.fmin_l_bfgs_b(myProWithGrad, x, fprime=None, args=(), approx_grad=0, bounds=None, m=10, factr=1e0, pgtol=1e-20, iprint=1,
]), alpha=.15, fc='Blue', ec='None') # Show samples from posterior. rs = npr.RandomState(0) sampled_funcs = rs.multivariate_normal(pred_mean, pred_cov, size=10) ax.plot(plot_xs, sampled_funcs.T) ax.plot(X, y, 'kx') ax.set_ylim([-1.5, 1.5]) ax.set_xticks([]) ax.set_yticks([]) plt.draw() plt.pause(20.0 / 60.0) # Initialize covariance parameters rs = npr.RandomState(0) init_params = 0.1 * rs.randn(num_params) import pdb pdb.set_trace() print("Optimizing covariance parameters...") cov_params = minimize(value_and_grad(objective), init_params, jac=True, method='CG', callback=callback) plt.pause(10.0)
def _value_and_grad(x, i):
    v, g = value_and_grad(unflatten(x), i)
    return v, flatten(g)[0]
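# A minimal, self-contained sketch (names here are illustrative, not from the snippet
# above) of the flatten/unflatten pattern such a wrapper relies on: evaluate the
# objective on structured parameters, then hand a flat gradient back to the optimizer.
# It assumes autograd's flatten helper (autograd.misc.flatten in recent versions).
import autograd.numpy as np
from autograd import value_and_grad
from autograd.misc import flatten

params = {'w': np.ones((2, 2)), 'b': np.zeros(2)}   # structured parameters
flat_x0, unflatten = flatten(params)                # 1-D array + inverse mapping

def loss(p, i):
    return np.sum(p['w'] ** 2) + np.sum(p['b'] ** 2) + 0.0 * i

loss_and_grad = value_and_grad(loss)

def _flat_value_and_grad(x, i):
    # evaluate on the structured parameters, return the gradient flattened
    v, g = loss_and_grad(unflatten(x), i)
    return v, flatten(g)[0]

print(_flat_value_and_grad(flat_x0, 0))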
def _fit_stochastic_em(self, optimizer, datas, inputs, masks, tags, num_epochs=100, **kwargs):
    """
    Replace the M-step of EM with a stochastic gradient update using the ELBO
    computed on a minibatch of data.
    """
    M = len(datas)
    T = sum([data.shape[0] for data in datas])

    # A helper to grab a minibatch of data
    perm = [np.random.permutation(M) for _ in range(num_epochs)]

    def _get_minibatch(itr):
        epoch = itr // M
        m = itr % M
        i = perm[epoch][m]
        return datas[i], inputs[i], masks[i], tags[i]

    # Define the objective (negative ELBO)
    def _objective(params, itr):
        # Grab a minibatch of data
        data, input, mask, tag = _get_minibatch(itr)
        Ti = data.shape[0]

        # E step: compute expected latent states with current parameters
        Ez, Ezzp1, _ = self.expected_states(data, input, mask, tag)

        # M step: set the parameter and compute the (normalized) objective function
        self.params = params
        pi0 = self.init_state_distn.initial_state_distn
        log_Ps = self.transitions.log_transition_matrices(data, input, mask, tag)
        log_likes = self.observations.log_likelihoods(data, input, mask, tag)

        # Compute the expected log probability
        # (Scale by the length of this minibatch.)
        obj = self.log_prior()
        obj += np.sum(Ez[0] * np.log(pi0)) * M
        obj += np.sum(Ezzp1 * log_Ps) * (T - M) / (Ti - 1)
        obj += np.sum(Ez * log_likes) * T / Ti
        assert np.isfinite(obj)

        return -obj / T

    # Set up the progress bar
    lls = [-_objective(self.params, 0) * T]
    pbar = trange(num_epochs * M)
    pbar.set_description("Epoch {} Itr {} LP: {:.1f}".format(0, 0, lls[-1]))

    # Run the optimizer
    step = dict(sgd=sgd_step, rmsprop=rmsprop_step, adam=adam_step)[optimizer]
    state = None
    for itr in pbar:
        self.params, val, g, state = step(value_and_grad(_objective), self.params,
                                          itr, state, **kwargs)
        epoch = itr // M
        m = itr % M
        lls.append(-val * T)
        pbar.set_description("Epoch {} Itr {} LP: {:.1f}".format(epoch, m, lls[-1]))
        pbar.update(1)

    return lls
def initialize(self, base_model, datas, inputs=None, masks=None, tags=None, num_em_iters=50, num_tr_iters=50): print("Initializing...") print("First with FA using {} steps of EM.".format(num_em_iters)) fa, xhats, Cov_xhats, lls = factor_analysis_with_imputation( self.D, datas, masks=masks, num_iters=num_em_iters) if self.D == 1 and base_model.transitions.__class__.__name__ == "DDMTransitions": d_init = np.mean([y[0:3] for y in datas], axis=(0, 1)) u_sum = np.array([np.sum(u) for u in inputs]) y_end = np.array([y[-3:] for y in datas]) u_l, u_u = np.percentile( u_sum, [20, 80]) # use 20th and 80th percentile input y_U = y_end[np.where(u_sum >= u_u)] y_L = y_end[np.where(u_sum <= u_l)] C_init = (1.0 / 2.0) * np.mean( (np.mean(y_U, axis=0) - np.mean(y_L, axis=0)), axis=0) self.Cs = C_init.reshape([1, self.N, self.D]) self.ds = d_init.reshape([1, self.N]) self.inv_etas = np.log(fa.sigmasq).reshape([1, self.N]) else: # define objective Td = sum([x.shape[0] for x in xhats]) def _objective(params, itr): new_datas = [np.dot(x, params[0].T) + params[1] for x in xhats] obj = base_model.log_likelihood(new_datas, inputs=inputs) return -obj / Td # initialize R and r R = 0.1 * np.random.randn(self.D, self.D) r = 0.01 * np.random.randn(self.D) params = [R, r] print( "Next by transforming latents to match AR-HMM prior using {} steps of max log likelihood." .format(num_tr_iters)) state = None lls = [-_objective(params, 0) * Td] pbar = trange(num_tr_iters) pbar.set_description("Epoch {} Itr {} LP: {:.1f}".format( 0, 0, lls[-1])) for itr in pbar: params, val, g, state = sgd_step(value_and_grad(_objective), params, itr, state) lls.append(-val * Td) pbar.set_description("LP: {:.1f}".format(lls[-1])) pbar.update(1) R = params[0] r = params[1] # scale x's to be max at 1.1 for d in range(self.D): x_transformed = [(np.dot(x, R.T) + r)[:, d] for x in xhats] max_x = np.max(x_transformed) R[d, :] *= 1.1 / max_x r[d] *= 1.1 / max_x self.Cs = (fa.W @ np.linalg.inv(R)).reshape([1, self.N, self.D]) self.ds = fa.mean - fa.W @ np.linalg.inv(R) @ r self.inv_etas = np.log(fa.sigmasq).reshape([1, self.N])
def optimize_gp_params(init_params, X, y):
    log_hyperprior = lambda params: np.sum(norm.logpdf(params, 0., 100.))
    objective = lambda params: -log_marginal_likelihood(params, X, y) - log_hyperprior(params)
    return minimize(value_and_grad(objective), init_params, jac=True, method='CG').x
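# A self-contained toy sketch (quadratic-plus-sine objective, not the GP objective above)
# of the pattern used throughout these snippets: with jac=True, scipy.optimize.minimize
# accepts a single callable, value_and_grad(objective), that returns (value, gradient).
import autograd.numpy as np
from autograd import value_and_grad
from scipy.optimize import minimize

def toy_objective(params):
    return np.sum(np.sin(params) ** 2) + 0.1 * np.sum(params ** 2)

x0 = np.array([0.5, -1.3, 2.0])
result = minimize(value_and_grad(toy_objective), x0, jac=True, method='CG')
print(result.x, result.fun)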
ax_true_latents.set_title("True latents") ax_true_latents.set_xticks([]) ax_true_weights.set_xticks([]) ax_true_latents.set_yticks([]) ax_true_weights.set_yticks([]) def objective(params): weight_matrix, latents, noise_std = unpack_params(params) return -logprob(weight_matrix, latents, noise_std, data)/n_samples def callback(params): weights, latents, noise_std = unpack_params(params) print("Log likelihood {}, noise_std {}".format(-objective(params), noise_std)) ax_est_weights.cla() ax_est_weights.scatter(weights[:, 0], weights[:, 1]) ax_est_weights.set_title("Estimated weights") ax_est_latents.cla() color_scatter(ax_est_latents, latents[0, :], latents[1, :]) ax_est_latents.set_title("Estimated latents") ax_est_weights.set_yticks([]) ax_est_latents.set_yticks([]) ax_est_weights.set_xticks([]) ax_est_latents.set_xticks([]) plt.draw() plt.pause(1.0/60.0) # Initialize and optimize model. rs = npr.RandomState(0) init_params = rs.randn(total_num_params) minimize(value_and_grad(objective), init_params, jac=True, method='CG', callback=callback) plt.pause(20)
# Build likelihood model. L2_reg = 1 layer_sizes = [784, 200, 100, 10] num_weights, make_predictions, likelihood = make_classification_nn( layer_sizes) classifier_loglik = lambda image, c: make_predictions( trained_weights, np.atleast_2d(image))[:, c] data_L = create_prob_of_data(parameters, encoder, decoder_log_like) # Combine prior and likelihood. model_ll = lambda image, c: data_L(image) + classifier_loglik(image, c) def model_nll(image, c): return -1 * model_ll(image, c) model_nll_with_grad = value_and_grad(model_nll) # Optimize a random image to maximize this likelihood. cur_class = 2 start_image = np.zeros((28 * 28)) # quick_grad_check(data_L, start_image) def callback(image): #print "Cur loglik: ", image_prior_nll(image), "mean loglik:", image_prior_nll(all_mean) matplotlib.image.imsave("optimizing", image.reshape((28, 28))) # Optimize using conjugate gradients. result = minimize(model_nll_with_grad, callback=callback,
def _step(self, optimizer, X, scalings):
    obj, grad = value_and_grad(calc_potential_energy)(scalings, X)
    scalings = optimizer.next(scalings, np.array(grad))
    scalings = normalize(scalings, xl=0, xu=scalings.max())
    return scalings, obj
nruns_J = int(sys.argv[5]) replicate_point = (len(sys.argv) >= 7 and sys.argv[6] == "-rep") fn_in = dir_in + fn fn_out = dir_out + fn alphas,means,icf,x,wishart_gamma,wishart_m = gmm.read_gmm_instance(fn_in + ".txt", replicate_point) start = t.time() for i in range(nruns_f): err = gmm.gmm_objective(alphas,means,icf,x,wishart_gamma,wishart_m) end = t.time() tf = (end - start)/nruns_f k = alphas.size grad_gmm_objective_split_inner_wrapper = value_and_grad(gmm_objective_split_inner_wrapper) grad_gmm_objective_split_other_wrapper = value_and_grad(gmm_objective_split_other_wrapper) start = t.time() for i in range(nruns_J): grad = grad_gmm_objective_split_other_wrapper((alphas,means,icf),x,wishart_gamma,wishart_m) for ix in range(x.shape[0]): grad = add_grad(grad,grad_gmm_objective_split_inner_wrapper((alphas,means,icf),x[ix,:])) end = t.time() tJ = 0 name = "Autograd_split" if nruns_J>0: tJ = (end - start)/nruns_J gmm.write_J(fn_out + "_J_" + name + ".txt",grad[1]) gmm.write_times(fn_out + "_times_" + name + ".txt",tf,tJ)
print("Training text Predicted text") logprobs = np.asarray(pred_fun(weights, train_inputs)) for t in range(logprobs.shape[1]): training_text = one_hot_to_string(train_targets[:, t, :]) predicted_text = one_hot_to_string(logprobs[:, t, :]) print( training_text.replace('\n', ' ') + "| " + predicted_text.replace('\n', ' ')) def callback(weights): print("Train loss:", loss_fun(weights, train_inputs, train_targets)) print_training_prediction(weights, train_inputs, train_targets) # Build gradient of loss function using autograd. loss_and_grad = value_and_grad(loss_fun) # Wrap function to only have one argument, for scipy.minimize. def training_loss_and_grad(weights): return loss_and_grad(weights, train_inputs, train_targets) init_weights = npr.randn(num_weights) * param_scale # Check the gradients numerically, just to be safe quick_grad_check(loss_fun, init_weights, (train_inputs, train_targets)) print("Training LSTM...") result = minimize(training_loss_and_grad, init_weights, jac=True, method='CG', options={'maxiter': train_iters},
fn_out = dir_out + fn def gmm_objective_wrapper(params, x, wishart_gamma, wishart_m): return gmm.gmm_objective(params[0], params[1], params[2], x, wishart_gamma, wishart_m) alphas, means, icf, x, wishart_gamma, wishart_m = gmm.read_gmm_instance(fn_in + ".txt", replicate_point) start = t.time() for i in range(nruns_f): err = gmm.gmm_objective(alphas, means, icf, x, wishart_gamma, wishart_m) end = t.time() tf = (end - start) / nruns_f k = alphas.size grad_gmm_objective_wrapper = value_and_grad(gmm_objective_wrapper) start = t.time() for i in range(nruns_J): grad = grad_gmm_objective_wrapper((alphas, means, icf), x, wishart_gamma, wishart_m) end = t.time() tJ = 0 name = "Autograd" if nruns_J > 0: tJ = (end - start) / nruns_J gmm.write_J(fn_out + "_J_" + name + ".txt", grad[1]) gmm.write_times(fn_out + "_times_" + name + ".txt", tf, tJ)
def bealeFunction(conf): global line, point, path, f f = lambda x, y: (1.5 - x + x * y)**2 + (2.25 - x + x * y**2)**2 + ( 2.625 - x + x * y**3)**2 xmin, xmax, xstep = -4.5, 4.5, .2 ymin, ymax, ystep = -4.5, 4.5, .2 x, y = np.meshgrid(np.arange(xmin, xmax + xstep, xstep), np.arange(ymin, ymax + ystep, ystep)) z = f(x, y) minima = np.array([3., .5]) minima_ = minima.reshape(-1, 1) x0 = np.array([3., 4.]) func = value_and_grad(lambda args: f(*args)) path_ = [x0] res = minimize(func, x0=x0, method='Newton-CG', jac=True, tol=1e-20, callback=make_minimize_cb(path_)) path = np.array(path_).T #3D surface plot fig = plt.figure(figsize=(8, 5)) ax = plt.axes(projection='3d', elev=50, azim=-50) ax.plot_surface(x, y, z, norm=LogNorm(), rstride=1, cstride=1, edgecolor='none', alpha=.8, cmap=plt.cm.jet) ax.plot(minima_[0], minima_[1], f(minima_[0], minima_[1]), 'r*', markersize=10) line, = ax.plot([], [], [], 'b', label='Newton-CG', lw=2) point, = ax.plot([], [], [], 'bo') ax.set_xlabel('$x$') ax.set_ylabel('$y$') ax.set_zlabel('$z$') ax.set_xlim((xmin, xmax)) ax.set_ylim((ymin, ymax)) anim = animation.FuncAnimation(fig, animate, init_func=init, frames=path.shape[1], interval=60, repeat_delay=5, blit=True) anim.save('basic_animation.mp4', fps=30, extra_args=['-vcodec', 'libx264']) return
def experiment(sname, seed, nystr=False): def LMO_err(params, M=2): al, bl = np.exp(params) L = bl * bl * np.exp(-L0 / al / al / 2) + 1e-6 * EYEN if nystr: tmp_mat = L @ eig_vec_K C = L - tmp_mat @ np.linalg.inv(eig_vec_K.T @ tmp_mat / N2 + inv_eig_val_K) @ tmp_mat.T / N2 c = C @ W_nystr_Y * N2 else: LWL_inv = chol_inv(L @ W @ L + L / N2 + JITTER * EYEN) C = L @ LWL_inv @ L / N2 c = C @ W @ Y * N2 c_y = c - Y lmo_err = 0 N = 0 for ii in range(1): permutation = np.random.permutation(X.shape[0]) for i in range(0, X.shape[0], M): indices = permutation[i:i + M] K_i = W[np.ix_(indices, indices)] * N2 C_i = C[np.ix_(indices, indices)] c_y_i = c_y[indices] b_y = np.linalg.inv(np.eye(M) - C_i @ K_i) @ c_y_i # print(I_CW_inv.shape,c_y_i.shape) lmo_err += b_y.T @ K_i @ b_y N += 1 return lmo_err[0, 0] / N / M**2 def callback0(params, timer=None): global Nfeval, prev_norm, opt_params, opt_test_err if Nfeval % 1 == 0: n_params = len(params) al, bl = np.exp(params) L = bl * bl * np.exp(-L0 / al / al / 2) + 1e-6 * EYEN if nystr: tmp_mat = eig_vec_K.T @ L alpha = EYEN - eig_vec_K @ np.linalg.inv( tmp_mat @ eig_vec_K / N2 + inv_eig_val_K) @ tmp_mat / N2 alpha = alpha @ W_nystr_Y * N2 else: LWL_inv = chol_inv(L @ W @ L + L / N2 + JITTER * EYEN) alpha = LWL_inv @ L @ W @ Y test_L = bl * bl * np.exp( -test_L0 / al / al / 2) # l(test_X,X,al,bl) pred_mean = test_L @ alpha if timer: return test_err = ((pred_mean - test_G)**2).mean( ) # ((pred_mean-test_G)**2/np.diag(pred_cov)).mean()+(np.log(np.diag(pred_cov))).mean() norm = alpha.T @ L @ alpha Nfeval += 1 if prev_norm is not None: if norm[0, 0] / prev_norm >= 3: if opt_params is None: opt_test_err = test_err opt_params = params print(True, opt_params, opt_test_err, prev_norm, norm[0, 0]) raise Exception if prev_norm is None or norm[0, 0] <= prev_norm: prev_norm = norm[0, 0] opt_test_err = test_err opt_params = params print('params,test_err, norm: ', opt_params, opt_test_err, prev_norm, norm[0, 0]) folder = ROOT_PATH + "/MMR_IVs/results/mendelian/" + sname + "/" os.makedirs(folder, exist_ok=True) train, dev, test = load_data(ROOT_PATH + "/data/mendelian/" + sname + '.npz', Torch=False) X = train.x Y = train.y Z = train.z test_X = test.x test_G = test.g t0 = time.time() EYEN = np.eye(X.shape[0]) N2 = X.shape[0]**2 W = np.load(ROOT_PATH + '/mendelian_precomp/{}_train_K.npy'.format(sname)) / N2 L0, test_L0 = _sqdist(X, None), _sqdist(test_X, X) params0 = np.random.randn(2) / 10 bounds = None # [[0.01,10],[0.01,5]] if nystr: for _ in range(seed + 1): random_indices = np.sort( np.random.choice(range(W.shape[0]), nystr_M, replace=False)) eig_val_K, eig_vec_K = nystrom_decomp(W * N2, random_indices) inv_eig_val_K = np.diag(1 / eig_val_K / N2) W_nystr = eig_vec_K @ np.diag(eig_val_K) @ eig_vec_K.T / N2 W_nystr_Y = W_nystr @ Y obj_grad = value_and_grad(lambda params: LMO_err(params)) try: res = minimize(obj_grad, x0=params0, bounds=bounds, method='L-BFGS-B', jac=True, options={'maxiter': 5000}, callback=callback0) except Exception as e: print(e) PATH = ROOT_PATH + "/MMR_IVs/results/mendelian/" + sname + "/" np.save(PATH + 'LMO_errs_{}_nystr_{}.npy'.format(seed, train.x.shape[0]), [opt_params, prev_norm, opt_test_err])
def experiment(sname, seed, nystr=True): def LMO_err(params, M=2, verbal=False): global Nfeval params = np.exp(params) al, bl = params[:-1], params[ -1] # params[:int(n_params/2)], params[int(n_params/2):] # [np.exp(e) for e in params] if train.x.shape[1] < 5: train_L = bl**2 * np.exp(-train_L0 / al**2 / 2) + 1e-4 * EYEN else: train_L, dev_L = 0, 0 for i in range(len(al)): train_L += train_L0[i] / al[i]**2 train_L = bl * bl * np.exp(-train_L / 2) + 1e-4 * EYEN tmp_mat = train_L @ eig_vec_K C = train_L - tmp_mat @ np.linalg.inv(eig_vec_K.T @ tmp_mat / N2 + inv_eig_val) @ tmp_mat.T / N2 c = C @ W_nystr_Y * N2 c_y = c - train.y lmo_err = 0 N = 0 for ii in range(1): permutation = np.random.permutation(train.x.shape[0]) for i in range(0, train.x.shape[0], M): indices = permutation[i:i + M] K_i = train_W[np.ix_(indices, indices)] * N2 C_i = C[np.ix_(indices, indices)] c_y_i = c_y[indices] b_y = np.linalg.inv(np.eye(M) - C_i @ K_i) @ c_y_i lmo_err += b_y.T @ K_i @ b_y N += 1 return lmo_err[0, 0] / M**2 def callback0(params): global Nfeval, prev_norm, opt_params, opt_test_err if Nfeval % 1 == 0: params = np.exp(params) print('params:', params) al, bl = params[:-1], params[-1] if train.x.shape[1] < 5: train_L = bl**2 * np.exp(-train_L0 / al**2 / 2) + 1e-4 * EYEN test_L = bl**2 * np.exp(-test_L0 / al**2 / 2) else: train_L, test_L = 0, 0 for i in range(len(al)): train_L += train_L0[i] / al[i]**2 test_L += test_L0[i] / al[i]**2 train_L = bl * bl * np.exp(-train_L / 2) + 1e-4 * EYEN test_L = bl * bl * np.exp(-test_L / 2) if nystr: tmp_mat = eig_vec_K.T @ train_L alpha = EYEN - eig_vec_K @ np.linalg.inv( tmp_mat @ eig_vec_K / N2 + inv_eig_val) @ tmp_mat / N2 alpha = alpha @ W_nystr_Y * N2 else: LWL_inv = chol_inv(train_L @ train_W @ train_L + train_L / N2 + JITTER * EYEN) alpha = LWL_inv @ train_L @ train_W @ train.y pred_mean = test_L @ alpha test_err = ((pred_mean - test.g)**2).mean() norm = alpha.T @ train_L @ alpha Nfeval += 1 if prev_norm is not None: if norm[0, 0] / prev_norm >= 3: if opt_test_err is None: opt_test_err = test_err opt_params = params print(True, opt_params, opt_test_err, prev_norm, norm[0, 0]) raise Exception if prev_norm is None or norm[0, 0] <= prev_norm: prev_norm = norm[0, 0] opt_test_err = test_err opt_params = params print(True, opt_params, opt_test_err, prev_norm, norm[0, 0]) train, dev, test = load_data(ROOT_PATH + '/data/' + sname + '/main.npz') del dev # avoid same indices when run on the cluster for _ in range(seed + 1): random_indices = np.sort( np.random.choice(range(train.x.shape[0]), nystr_M, replace=False)) EYEN = np.eye(train.x.shape[0]) N2 = train.x.shape[0]**2 # precompute to save time on parallized computation if train.z.shape[1] < 5: ak = get_median_inter_mnist(train.z) else: ak = np.load(ROOT_PATH + '/mnist_precomp/{}_ak.npy'.format(sname)) train_W = np.load(ROOT_PATH + '/mnist_precomp/{}_train_K0.npy'.format(sname)) train_W = (np.exp(-train_W / ak / ak / 2) + np.exp( -train_W / ak / ak / 200) + np.exp(-train_W / ak / ak * 50)) / 3 / N2 if train.x.shape[1] < 5: train_L0 = _sqdist(train.x, None) test_L0 = _sqdist(test.x, train.x) else: L0s = np.load(ROOT_PATH + '/mnist_precomp/{}_Ls.npz'.format(sname)) train_L0 = L0s['train_L0'] # dev_L0 = L0s['dev_L0'] test_L0 = L0s['test_L0'] del L0s if train.x.shape[1] < 5: params0 = np.random.randn(2) * 0.1 else: params0 = np.random.randn(len(train_L0) + 1) * 0.1 bounds = None eig_val_K, eig_vec_K = nystrom_decomp(train_W * N2, random_indices) W_nystr_Y = eig_vec_K @ np.diag(eig_val_K) @ eig_vec_K.T @ train.y / N2 
inv_eig_val = np.diag(1 / eig_val_K / N2) obj_grad = value_and_grad(lambda params: LMO_err(params)) res = minimize(obj_grad, x0=params0, bounds=bounds, method='L-BFGS-B', jac=True, options={ 'maxiter': 5000, 'disp': True, 'ftol': 0 }, callback=callback0) PATH = ROOT_PATH + "/MMR_IVs/results/" + sname + "/" os.makedirs(PATH, exist_ok=True) np.save(PATH + 'LMO_errs_{}_nystr.npy'.format(seed), [opt_params, prev_norm, opt_test_err])
# Show posterior marginals. plot_xs = np.reshape(np.linspace(-7, 7, 300), (300,1)) pred_mean, pred_cov = predict(params, X, y, plot_xs) marg_std = np.sqrt(np.diag(pred_cov)) ax.plot(plot_xs, pred_mean, 'b') ax.fill(np.concatenate([plot_xs, plot_xs[::-1]]), np.concatenate([pred_mean - 1.96 * marg_std, (pred_mean + 1.96 * marg_std)[::-1]]), alpha=.15, fc='Blue', ec='None') # Show samples from posterior. rs = npr.RandomState(0) sampled_funcs = rs.multivariate_normal(pred_mean, pred_cov, size=10) ax.plot(plot_xs, sampled_funcs.T) ax.plot(X, y, 'kx') ax.set_ylim([-1.5, 1.5]) ax.set_xticks([]) ax.set_yticks([]) plt.draw() plt.pause(1.0/60.0) # Initialize covariance parameters rs = npr.RandomState(0) init_params = 0.1 * rs.randn(num_params) print("Optimizing covariance parameters...") cov_params = minimize(value_and_grad(objective), init_params, jac=True, method='CG', callback=callback) plt.pause(10.0)
Fhat = forward_pass(W1, W2, W3, b1, b2, b3, x) # ######## # Note that this function assumes a Gaussian likelihood (with variance 1) # You must modify this function to consider a categorical (generalized Bernoulli) likelihood # ######## #nll = 0.5*np.sum(np.square(Fhat - y)) + 0.5*y.size*np.log(2.*np.pi)(Gaussian likelihood) #Implementation of Categorical (Generalized Bernoulli) Likelihood #Compute Inner Product Vector inner_prod_v = np.einsum('ij, ij->i', Fhat, y) nll = np.sum(inner_prod_v) nnll = -1 * nll return nnll nll_gradients = value_and_grad(negative_log_likelihood, argnum=[0, 1, 2, 3, 4, 5]) """ returns the output of `negative_log_likelihood` as well as the gradient of the output with respect to all weights and biases Inputs: same as negative_log_likelihood (W1, W2, W3, b1, b2, b3, x, y) Outputs: (nll, (W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad)) nll : output of `negative_log_likelihood` W1_grad : (M, 784) gradient of the nll with respect to the weights of first (hidden) layer W2_grad : (M, M) gradient of the nll with respect to the weights of second (hidden) layer W3_grad : (10, M) gradient of the nll with respect to the weights of third (output) layer b1_grad : (M, 1) gradient of the nll with respect to the biases of first (hidden) layer b2_grad : (M, 1) gradient of the nll with respect to the biases of second (hidden) layer b3_grad : (10, 1) gradient of the nll with respect to the biases of third (output) layer """
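# A minimal, hypothetical sketch (toy affine model, not the three-layer network above)
# of how a list-valued argnum behaves: the returned gradient is a tuple with one entry
# per selected argument, which is what lets nll_gradients unpack into per-parameter grads.
import autograd.numpy as np
from autograd import value_and_grad

def toy_nll(W, b, x, y):
    pred = np.dot(W, x) + b
    return np.sum((pred - y) ** 2)

toy_nll_gradients = value_and_grad(toy_nll, argnum=[0, 1])

W = np.ones((3, 2)); b = np.zeros(3)
x = np.array([1.0, 2.0]); y = np.zeros(3)
nll, (W_grad, b_grad) = toy_nll_gradients(W, b, x, y)
print(nll, W_grad.shape, b_grad.shape)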
sampler_params = np.zeros(len(parser)) parser.put(sampler_params, 'mean', init_mean) parser.put(sampler_params, 'log_stddev', init_stddevs) parser.put(sampler_params, 'log_stepsizes', init_log_stepsizes) parser.put(sampler_params, 'log_noise_sizes', init_log_noise_sizes) def get_batch_marginal_likelihood_estimate(sampler_params): samples, marginal_likelihood_estimates = sample_and_run_langevin( sampler_params, rs, num_samples) matplotlib.image.imsave("optimizing", (samples[0, :].reshape( (28, 28))).value) return np.mean(marginal_likelihood_estimates) ml_and_grad = value_and_grad(get_batch_marginal_likelihood_estimate) # Optimize Langevin parameters. # for i in xrange(num_sampler_optimization_steps): # ml, dml = ml_and_grad(sampler_params) # print "log marginal likelihood:", ml # plot_sampler_params(sampler_params, 'sampler_params.png') # sampler_params = sampler_params + sampler_learn_rate * dml # print 'dml norm', np.linalg.norm(dml) # print 'dml max', np.max(dml) # fig = plt.figure() # fig.clf() # ax = fig.add_subplot(111) # ax.plot(dml[-(2*num_langevin_steps):-1],'o') # plt.savefig('dml.png')
    logprobs = np.asarray(pred_fun(weights, train_inputs))
    for t in range(logprobs.shape[1]):
        training_text = one_hot_to_string(train_inputs[:, t, :])
        predicted_text = one_hot_to_string(logprobs[:, t, :])
        print(training_text.replace('\n', ' ') + "|" +
              predicted_text.replace('\n', ' '))

# Wrap function to only have one argument, for scipy.minimize.
def training_loss(weights):
    return -loglike_fun(weights, train_inputs, train_inputs)

def callback(weights):
    print("Train loss:", training_loss(weights))
    print_training_prediction(weights)

# Build gradient of loss function using autograd.
training_loss_and_grad = value_and_grad(training_loss)

init_weights = npr.randn(num_weights) * param_scale

# Check the gradients numerically, just to be safe
quick_grad_check(training_loss, init_weights)

print("Training LSTM...")
result = minimize(training_loss_and_grad, init_weights, jac=True, method='CG',
                  options={'maxiter': train_iters}, callback=callback)
trained_weights = result.x

print("\nGenerating text from LSTM model...")
num_letters = 30
for t in range(20):
    text = ""
    for i in range(num_letters):
def _fit_variational_em(self, variational_posterior, datas, inputs, masks, tags, learning=True, alpha=.75, optimizer="adam", num_iters=100, **kwargs): """ Let gamma denote the emission parameters and theta denote the transition and initial discrete state parameters. This is a mix of EM and SVI: 1. Sample x ~ q(x; phi) 2. Compute L(x, theta') = E_p(z | x, theta)[log p(x, z; theta')] 3. Set theta = (1 - alpha) theta + alpha * argmax L(x, theta') 4. Set gamma = gamma + eps * nabla log p(y | x; gamma) 5. Set phi = phi + eps * dx/dphi * d/dx [L(x, theta) + log p(y | x; gamma) - log q(x; phi)] """ # Optimize the standard ELBO when updating gamma (emissions params) # and phi (variational params) T = sum([data.shape[0] for data in datas]) def _objective(params, itr): if learning: self.emissions.params, variational_posterior.params = params else: variational_posterior.params = params obj = self._surrogate_elbo(variational_posterior, datas, inputs, masks, tags, **kwargs) return -obj / T # Initialize the parameters if learning: params = (self.emissions.params, variational_posterior.params) else: params = variational_posterior.params # Set up the progress bar elbos = [-_objective(params, 0) * T] pbar = trange(num_iters) pbar.set_description("Surrogate ELBO: {:.1f}".format(elbos[0])) # Run the optimizer step = dict(sgd=sgd_step, rmsprop=rmsprop_step, adam=adam_step)[optimizer] state = None for itr in pbar: # Update the emission and variational posterior parameters params, val, g, state = step(value_and_grad(_objective), params, itr, state) elbos.append(-val * T) # Update progress bar pbar.set_description("Surrogate ELBO: {:.1f}".format(elbos[-1])) pbar.update() # Save the final emission and variational parameters if learning: self.emissions.params, variational_posterior.params = params else: variational_posterior.params = params return elbos
def experiment(sname, seed, datasize, nystr=False, args=None): np.random.seed(1) random.seed(1) def LMO_err(params, M=10): np.random.seed(2) random.seed(2) al, bl = np.exp(params) L = bl * bl * np.exp(-L0 / al / al / 2) * np.exp( -L1 / al / al / 2) + 1e-6 * EYEN if nystr: tmp_mat = L @ eig_vec_K C = L - tmp_mat @ np.linalg.inv(eig_vec_K.T @ tmp_mat / N2 + inv_eig_val_K) @ tmp_mat.T / N2 c = C @ W_nystr_Y * N2 else: LWL_inv = chol_inv(L @ W @ L + L / N2 + JITTER * EYEN) C = L @ LWL_inv @ L / N2 c = C @ W @ Y * N2 c_y = c - Y lmo_err = 0 N = 0 for ii in range(1): permutation = np.random.permutation(X.shape[0]) for i in range(0, X.shape[0], M): indices = permutation[i:i + M] K_i = W[np.ix_(indices, indices)] * N2 C_i = C[np.ix_(indices, indices)] c_y_i = c_y[indices] b_y = np.linalg.inv(np.eye(M) - C_i @ K_i) @ c_y_i lmo_err += b_y.T @ K_i @ b_y N += 1 return lmo_err[0, 0] / N / M**2 def callback0(params, timer=None): global Nfeval, prev_norm, opt_params, opt_test_err np.random.seed(3) random.seed(3) if Nfeval % 1 == 0: al, bl = params L = bl * bl * np.exp(-L0 / al / al / 2) * np.exp( -L1 / al / al / 2) + 1e-6 * EYEN if nystr: alpha = EYEN - eig_vec_K @ np.linalg.inv( eig_vec_K.T @ L @ eig_vec_K / N2 + np.diag(1 / eig_val_K / N2)) @ eig_vec_K.T @ L / N2 alpha = alpha @ W_nystr @ Y * N2 else: LWL_inv = chol_inv(L @ W @ L + L / N2 + JITTER * EYEN) alpha = LWL_inv @ L @ W @ Y # L_W_inv = chol_inv(W*N2+L_inv) test_L = bl * bl * np.exp(-test_L0 / al / al / 2) * np.exp( -test_L1 / al / al / 2) pred_mean = test_L @ alpha if timer: return test_err = ((pred_mean - test_Y)**2).mean( ) # ((pred_mean-test_Y)**2/np.diag(pred_cov)).mean()+(np.log(np.diag(pred_cov))).mean() norm = alpha.T @ L @ alpha Nfeval += 1 if prev_norm is not None: if norm[0, 0] / prev_norm >= 3: if opt_params is None: opt_test_err = test_err opt_params = params print(True, opt_params, opt_test_err, prev_norm) raise Exception if prev_norm is None or norm[0, 0] <= prev_norm: prev_norm = norm[0, 0] opt_test_err = test_err opt_params = params print('params,test_err, norm: ', opt_params, opt_test_err, prev_norm) def get_causal_effect(params, do_A, w): "to be called within experiment function." 
np.random.seed(4) random.seed(4) al, bl = params L = bl * bl * np.exp(-L0 / al / al / 2) * np.exp( -L1 / al / al / 2) + 1e-6 * EYEN if nystr: alpha = EYEN - eig_vec_K @ np.linalg.inv( eig_vec_K.T @ L @ eig_vec_K / N2 + np.diag(1 / eig_val_K / N2)) @ eig_vec_K.T @ L / N2 alpha = alpha @ W_nystr @ Y * N2 else: LWL_inv = chol_inv(L @ W @ L + L / N2 + JITTER * EYEN) alpha = LWL_inv @ L @ W @ Y # L_W_inv = chol_inv(W*N2+L_inv) EYhat_do_A = [] for a in do_A: a = np.repeat(a, [w.shape[0]]).reshape(-1, 1) w = w.reshape(-1, 1) aw = np.concatenate([a, w], axis=-1) ate_L0 = _sqdist(aw[:, 0:1], X[:, 0:1]) ate_L1 = _sqdist(aw[:, 1:2], X[:, 1:2]) ate_L = bl * bl * np.exp(-ate_L0 / al / al / 2) * np.exp( -ate_L1 / al / al / 2) h_out = ate_L @ alpha mean_h = np.mean(h_out).reshape(-1, 1) EYhat_do_A.append(mean_h) print('a = {}, beta_a = {}'.format(np.mean(a), mean_h)) return np.concatenate(EYhat_do_A) # train,dev,test = load_data(ROOT_PATH+'/data/zoo/{}_{}.npz'.format(sname,datasize)) # X = np.vstack((train.x,dev.x)) # Y = np.vstack((train.y,dev.y)) # Z = np.vstack((train.z,dev.z)) # test_X = test.x # test_Y = test.g t1 = time.time() train, dev, test = load_data(ROOT_PATH + "/data/zoo/" + sname + '/main_{}.npz'.format(args.sem)) # train, dev, test = train[:300], dev[:100], test[:100] t2 = time.time() print('t2 - t1 = ', t2 - t1) Y = np.concatenate((train.y, dev.y), axis=0).reshape(-1, 1) # test_Y = test.y AZ_train, AW_train = bundle_az_aw(train.a, train.z, train.w) AZ_test, AW_test = bundle_az_aw(test.a, test.z, test.w) AZ_dev, AW_dev = bundle_az_aw(dev.a, dev.z, test.w) X, Z = np.concatenate((AW_train, AW_dev), axis=0), np.concatenate( (AZ_train, AZ_dev), axis=0) test_X, test_Y = AW_test, test.y.reshape(-1, 1) # TODO: is test.g just test.y? t3 = time.time() print('t3 - t2', t3 - t2) EYEN = np.eye(X.shape[0]) ak0, ak1 = get_median_inter_mnist(Z[:, 0:1]), get_median_inter_mnist(Z[:, 1:2]) N2 = X.shape[0]**2 W0, W1 = _sqdist(Z[:, 0:1], None), _sqdist(Z[:, 1:2], None) print('av kernel indicator: ', args.av_kernel) W = np.exp(-W0 / ak0 / ak0 / 2) * np.exp(-W1 / ak1 / ak1 / 2) / N2 if not args.av_kernel \ else (np.exp(-W0 / ak0 / ak0 / 2) + np.exp(-W0 / ak0 / ak0 / 200) + np.exp( -W0 / ak0 / ak0 * 50)) / 3 / N2 * (np.exp(-W1 / ak1 / ak1 / 2) + np.exp(-W1 / ak1 / ak1 / 200) + np.exp( -W1 / ak1 / ak1 * 50)) / 3 # W = (np.exp(-W0 / ak0 / ak0 / 2) + np.exp(-W0 / ak0 / ak0 / 200) + np.exp( # -W0 / ak0 / ak0 * 50)) / 3 / N2 * (np.exp(-W1 / ak1 / ak1 / 2) + np.exp(-W1 / ak1 / ak1 / 200) + np.exp( # -W1 / ak1 / ak1 * 50)) / 3 # TODO: recompute W for my case del W0, W1 L0, test_L0 = _sqdist(X[:, 0:1], None), _sqdist(test_X[:, 0:1], X[:, 0:1]) L1, test_L1 = _sqdist(X[:, 1:2], None), _sqdist(test_X[:, 1:2], X[:, 1:2]) t4 = time.time() print('t4 - t3', t4 - t3) # measure time # callback0(np.random.randn(2)/10,True) # np.save(ROOT_PATH + "/MMR_IVs/results/zoo/" + sname + '/LMO_errs_{}_nystr_{}_time.npy'.format(seed,train.x.shape[0]),time.time()-t0) # return # params0 = np.random.randn(2) # /10 params0 = np.array([1., 1.]) print('starting param: ', params0) bounds = None # [[0.01,10],[0.01,5]] if nystr: for _ in range(seed + 1): random_indices = np.sort( np.random.choice(range(W.shape[0]), nystr_M, replace=False)) eig_val_K, eig_vec_K = nystrom_decomp(W * N2, random_indices) inv_eig_val_K = np.diag(1 / eig_val_K / N2) W_nystr = eig_vec_K @ np.diag(eig_val_K) @ eig_vec_K.T / N2 W_nystr_Y = W_nystr @ Y t5 = time.time() print('t5 - t4', t5 - t4) obj_grad = value_and_grad(lambda params: LMO_err(params)) try: res = 
minimize(obj_grad, x0=params0, bounds=bounds, method='L-BFGS-B', jac=True, options={'maxiter': 5000}, callback=callback0) # res stands for results (not residuals!). except Exception as e: print(e) PATH = ROOT_PATH + "/MMR_IVs/results/zoo/" + sname + "/" if not os.path.exists(PATH + str(date.today())): os.mkdir(PATH + str(date.today())) assert opt_params is not None params = opt_params do_A = np.load(ROOT_PATH + "/data/zoo/" + sname + '/do_A_{}.npz'.format(args.sem))['do_A'] EY_do_A_gt = np.load(ROOT_PATH + "/data/zoo/" + sname + '/do_A_{}.npz'.format(args.sem))['gt_EY_do_A'] w_sample = train.w EYhat_do_A = get_causal_effect(params=params, do_A=do_A, w=w_sample) plt.figure() plt.plot([i + 1 for i in range(20)], EYhat_do_A) plt.xlabel('A') plt.ylabel('EYdoA-est') plt.savefig( os.path.join( PATH, str(date.today()), 'causal_effect_estimates_nystr_{}'.format(AW_train.shape[0]) + '.png')) plt.close() print('ground truth ate: ', EY_do_A_gt) visualise_ATEs(EY_do_A_gt, EYhat_do_A, x_name='E[Y|do(A)] - gt', y_name='beta_A', save_loc=os.path.join(PATH, str(date.today())) + '/', save_name='ate_{}_nystr.png'.format(AW_train.shape[0])) causal_effect_mean_abs_err = np.mean(np.abs(EY_do_A_gt - EYhat_do_A)) causal_effect_mae_file = open( os.path.join(PATH, str(date.today()), "ate_mae_{}_nystrom.txt".format(AW_train.shape[0])), "a") causal_effect_mae_file.write( "mae_: {}\n".format(causal_effect_mean_abs_err)) causal_effect_mae_file.close() os.makedirs(PATH, exist_ok=True) np.save( os.path.join( PATH, str(date.today()), 'LMO_errs_{}_nystr_{}.npy'.format(seed, AW_train.shape[0])), [opt_params, prev_norm, opt_test_err])
        reproj_err[i] = compute_reproj_err(cams[obs[i, 0]], X[obs[i, 1]], w[i], feats[i])

    w_err = 1. - np.square(w)

    return (reproj_err, w_err)

########## derivative extras #############

def compute_w_err(w):
    return 1. - w * w

compute_w_err_d = value_and_grad(compute_w_err)

def compute_reproj_err_wrapper(params, feat):
    X_off = BA_NCAMPARAMS
    return compute_reproj_err(params[0:X_off], params[X_off:X_off + 3], params[-1], feat)

compute_reproj_err_d = jacobian(compute_reproj_err_wrapper)

def compute_ba_J(cams, X, w, obs, feats):
    p = obs.shape[0]
    reproj_err_d = []
    for i in range(p):
def _fit_bbvi(self, variational_posterior, datas, inputs, masks, tags, verbose=2, learning=True, optimizer="adam", num_iters=100, **kwargs): """ Fit with black box variational inference using a Gaussian approximation for the latent states x_{1:T}. """ # Define the objective (negative ELBO) T = sum([data.shape[0] for data in datas]) def _objective(params, itr): if learning: self.params, variational_posterior.params = params else: variational_posterior.params = params obj = self._bbvi_elbo(variational_posterior, datas, inputs, masks, tags) return -obj / T # Initialize the parameters if learning: params = (self.params, variational_posterior.params) else: params = variational_posterior.params # Set up the progress bar elbos = [-_objective(params, 0) * T] pbar = ssm_pbar(num_iters, verbose, "LP: {:.1f}", [elbos[0]]) # Run the optimizer step = dict(sgd=sgd_step, rmsprop=rmsprop_step, adam=adam_step)[optimizer] state = None for itr in pbar: params, val, g, state = step(value_and_grad(_objective), params, itr, state) elbos.append(-val * T) # TODO: Check for convergence -- early stopping # Update progress bar if verbose == 2: pbar.set_description("ELBO: {:.1f}".format(elbos[-1])) pbar.update() # Save the final parameters if learning: self.params, variational_posterior.params = params else: variational_posterior.params = params return np.array(elbos)
ax_smart_full = fig.add_subplot(322, frameon=False) ax_smart_one = fig.add_subplot(324, frameon=False) ax_smart_two = fig.add_subplot(326, frameon=False) plt.show(block=False) for initialization in initialization_set: init_params = .1 * npr.randn(total_num_params) deep_map = create_deep_map(init_params) if initialization: init_params = initialize(deep_map, X, num_pseudo_params) print("Optimizing covariance parameters...") objective = lambda params: -log_likelihood(params,X,y,n_samples) params = minimize(value_and_grad(objective), init_params, jac=True, method='BFGS', callback=callback,options={'maxiter':1000}) params = params['x'] plot_xs = np.reshape(np.linspace(-5, 5, 300), (300,1)) if initialization: ax_full = ax_smart_full ax_one = ax_smart_one ax_two = ax_smart_two title = "Two Layers, Smart Initialization" else: ax_full = ax_random_full ax_one = ax_random_one ax_two = ax_random_two title = "Two Layers, Random Initialization"
    return minimize_cb

def init():
    line.set_data([], [])
    point.set_data([], [])
    return line, point

def animate(i):
    line.set_data(*path[::, :i])
    point.set_data(*path[::, i - 1:i])
    return line, point

func = value_and_grad(lambda args: f(*args))
res = minimize(func, x0=x0, method='Newton-CG', jac=True, tol=1e-20,
               callback=make_minimize_cb(path_))

path = np.array(path_).T
path.shape

fig, ax = plt.subplots(figsize=(10, 6))

ax.contour(x, y,
def grad_rho(parameters, X_data, Y_data, sample_indices, kernel_keyword="RBF", reg=0.000001):
    grad_K = value_and_grad(rho, 1)
    rho_value, gradient = grad_K(parameters, X_data, Y_data, sample_indices,
                                 kernel_keyword, reg=reg)
    return rho_value, gradient
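# A minimal sketch (toy function, hypothetical names) of what the second positional
# argument to value_and_grad selects: the index of the argument to differentiate with
# respect to, as in value_and_grad(rho, 1) above.
import autograd.numpy as np
from autograd import value_and_grad

def rho_toy(theta, x):
    return np.sum((x - theta) ** 2)

val_and_dtheta = value_and_grad(rho_toy, 0)   # gradient w.r.t. theta (argument 0)
val_and_dx = value_and_grad(rho_toy, 1)       # gradient w.r.t. x (argument 1)

theta, x = np.array([1.0, 2.0]), np.array([0.5, 0.5])
print(val_and_dtheta(theta, x))
print(val_and_dx(theta, x))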
def __init__(self, NSIDE, npix, clv=True): """ Args: NSIDE (int) : the healpix NSIDE parameter, must be a power of 2, less than 2**30 npix (int) : number of pixel in the X and Y axis of the final projected map rot_velocity (float) : rotation velocity of the star in the equator in km/s Returns: None """ self.NSIDE = int(NSIDE) self.npix = int(npix) self.hp_npix = hp.nside2npix(NSIDE) # self.rot_velocity = rot_velocity self.clv = clv # Generate the indices of all healpix pixels self.indices = np.arange(hp.nside2npix(NSIDE), dtype='int') self.n_healpix_pxl = len(self.indices) # Define the orthographic projector that generates the maps of the star on the plane of the sky self.projector = hp.projector.OrthographicProj(xsize=int(self.npix)) # This function returns the pixel associated with a vector (x,y,z). This is needed by the projector self.f_vec2pix = lambda x, y, z: hp.pixelfunc.vec2pix(int(self.NSIDE), x, y, z) # Generate a mesh grid of X and Y points in the plane of the sky that covers only the observed hemisphere of the star x = np.linspace(-2.0,0.0,int(self.npix/2)) y = np.linspace(-1.0,1.0,int(self.npix/2)) X, Y = np.meshgrid(x,y) # Rotational velocity vector (pointing in the z direction and unit vector) omega = np.array([0,0,1]) # Compute the radial vector at each position in the map and the projected velocity on the plane of the sky radial_vector = np.array(self.projector.xy2vec(X.flatten(), Y.flatten())).reshape((3,int(self.npix/2),int(self.npix/2))) self.vel_projection = np.cross(omega[:,None,None], radial_vector, axisa=0, axisb=0)[:,:,0] # Compute the mu angle (astrocentric angle) self.mu_angle = radial_vector[0,:,:] # Read all Kurucz models from the database. Hardwired temperature and mu angles print("Reading Kurucz spectra...") self.T = 3500 + 250 * np.arange(27) self.mus = np.array([1.0,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1,0.05,0.02])[::-1] for i in tqdm(range(27)): f = 'kurucz_models/RESULTS/T_{0:d}_logg4.0_feh0.0.spec'.format(self.T[i]) vel, _, spec = _read_kurucz_spec(f) if (i == 0): self.nlambda, self.nmus = spec.shape self.velocity = np.zeros((self.nlambda)) self.spectrum = np.zeros((27,self.nmus,self.nlambda)) self.velocity = vel self.spectrum[i,:,:] = spec[:,::-1].T # Generate a fake temperature map in the star using spherical harmonics # self.temperature_map = 5000 * np.ones(self.npix) # self.temperature_map = 5000 + 250 * hp.sphtfunc.alm2map(np.ones(10,dtype='complex'),self.NSIDE) #np.random.rand(self.n_healpix_pxl) * 2000 + 5000 # self.temperature_map = 5000 * np.ones(self.hp_npix) self.coeffs = hp.sphtfunc.map2alm(self.temperature_map) self.velocity_per_pxl = self.velocity[1] - self.velocity[0] self.freq_grid = np.fft.fftfreq(self.nlambda) self.gradient = value_and_grad(self.loss)
    k_xx = calcSigma(x, x, l)
    marg_data = 0.5 * np.dot(y.T, np.dot(np.linalg.inv(k_xx + (noise_var**2) * np.identity(k_xx.shape[0])), y)) \
        - 0.5 * np.log(np.linalg.det(np.linalg.inv(k_xx + (noise_var**2) * np.identity(k_xx.shape[0])))) \
        - (len(y) * 0.5) * np.log(2 * np.pi)
    return -1.0 * marg_data

###################################
####         Gradient          ####
###################################
g_ml = lambda l: marg_likelihood(x_train, y_train, l)
init_params = 0.1 * rs.randn(num_params)
grad_ml = grad(g_ml)
cov_params = minimize(value_and_grad(g_ml), init_params, jac=True, method='CG')

print(marg_likelihood(x_train, y_train, length_scale))
print(grad_ml(length_scale))
print("Initial Parameters: ", init_params)
print("Optimized Parameters: ", cov_params.x)

opt_length_scale = np.exp(cov_params.x[0])

import pdb
pdb.set_trace()

Omg = np.linalg.inv(K + ((noise_var / 2.)**2 * np.identity(n_train)))
Beta = np.dot(Omg, y_train).reshape((-1, 1))
def run_expt(config, loss_opt=0): ttl = config_to_str(config) print '\nstarting experiment {}'.format(ttl) print config Xtrain, Ytrain, params_true, true_fun, fun_name = demo.make_data_linreg_1d(config['N'], config['fun_type']) data_dim = Xtrain.shape[1] N = Xtrain.shape[0] Xtrain, Ytrain = opt.shuffle_data(Xtrain, Ytrain) model_type = config['model_type'] if model_type == 'linear': model = LinregModel(data_dim, add_ones=True) params, loss = model.ols_fit(Xtrain, Ytrain) elif model_type[0:3] == 'mlp': _, layer_sizes = model_type.split(':') layer_sizes = [int(n) for n in layer_sizes.split('-')] model = MLP(layer_sizes, 'regression', L2_reg=0.001, Ntrain=N) else: raise ValueError('unknown model type {}'.format(model_type)) initial_params = model.init_params() param_dim = len(initial_params) plot_data = (data_dim == 1) plot_params = (param_dim == 2) nplots = 2 if plot_data: nplots += 1 if plot_params: nplots += 1 plot_rows, plot_cols = util.nsubplots(nplots) if config['optimizer'] == 'BFGS': obj_fun = lambda params: model.PNLL(params, Xtrain, Ytrain) logger = opt.OptimLogger(lambda params, iter: obj_fun(params), store_freq=1, print_freq=10) params = opt.bfgs(autograd.value_and_grad(obj_fun), initial_params, logger.callback, config['num_epochs']) if config['optimizer'] == 'SGD': B = config['batch_size'] M = N / B # num_minibatches_per_epoch (num iter per epoch) max_iters = config['num_epochs'] * M grad_fun = opt.build_batched_grad(model.gradient, config['batch_size'], Xtrain, Ytrain) #obj_fun = opt.build_batched_grad(model.PNLL, config['batch_size'], Xtrain, Ytrain) obj_fun = lambda params, iter: model.PNLL(params, Xtrain, Ytrain) logger = opt.OptimLogger(obj_fun, store_freq=M, print_freq=M*10, store_params=plot_params) if config.has_key('lr_fun'): if config['lr_fun'] == 'exp': lr_fun = lambda iter: opt.lr_exp_decay(iter, config['init_lr'], config['lr_decay']) elif config['lr_fun'] == 'const': lr_fun = opt.const_lr(config['init_lr']) else: raise ValueError('Unknown lr-fun {}'.format(lr_fun)) #sgd_fun = config['sgd-fun'] #params = sgd_fun(grad_fun, initial_params, logger.callback, \ # max_iters, lr_fun, *config['args']) if config['sgd-method'] == 'momentum': params = opt.sgd(grad_fun, initial_params, logger.callback, \ max_iters, lr_fun, config['mass']) elif config['sgd-method'] == 'RMSprop': params = opt.rmsprop(grad_fun, initial_params, logger.callback, \ max_iters, lr_fun, config['grad_sq_decay']) elif config['sgd-method'] == 'ADAM': params = opt.adam(grad_fun, initial_params, logger.callback, \ max_iters, lr_fun, config['grad_decay'], config['grad_sq_decay']) elif config['sgd-method'] == 'AutoADAM': eval_fn = lambda params: model.PNLL(params, Xtrain, Ytrain) params, lr, scores = opt.autoadam(grad_fun, initial_params, logger.callback, \ max_iters, eval_fn, config['auto-method']) config['init_lr'] = lr config['lr_fun'] = 'const' ttl = config_to_str(config) print 'autoadam: chose {:0.3f} as lr'.format(lr) print scores else: raise ValueError('Unknown SGD method {}'.format(config['method'])) training_loss = model.PNLL(params, Xtrain, Ytrain) print 'finished fitting, training loss {:0.3f}, {} obj calls, {} grad calls'.\ format(training_loss, model.num_obj_fun_calls, model.num_grad_fun_calls) fig = plt.figure() ax = fig.add_subplot(plot_rows, plot_cols, 1) opt.plot_loss_trace(logger.obj_trace, loss_opt, ax) ax.set_title('final objective {:0.3f}'.format(training_loss)) ax.set_xlabel('epochs') ax = fig.add_subplot(plot_rows, plot_cols, 2) ax.plot(logger.grad_norm_trace) ax.set_title('gradient norm 
vs num updates') if plot_data: ax = fig.add_subplot(plot_rows, plot_cols, 3) predict_fun = lambda X: model.predictions(params, X) demo.plot_data_and_predictions_1d(Xtrain, Ytrain, true_fun, predict_fun, ax) if plot_params: ax = fig.add_subplot(plot_rows, plot_cols, 4) loss_fun = lambda w0, w1: model.PNLL(np.array([w0, w1]), Xtrain, Ytrain) demo.plot_error_surface_2d(loss_fun, params, params_true, config['fun_type'], ax) demo.plot_param_trace_2d(logger.param_trace, ax) fig.suptitle(ttl) folder = 'figures/linreg-sgd' fname = os.path.join(folder, 'linreg_1d_sgd_{}.png'.format(ttl)) plt.savefig(fname) return training_loss
def RMSprop(g,w,x_train,y_train,alpha,max_its,batch_size,**kwargs): verbose = True if 'verbose' in kwargs: verbose = kwargs['verbose'] # rmsprop params gamma=0.9 eps=10**-8 if 'gamma' in kwargs: gamma = kwargs['gamma'] if 'eps' in kwargs: eps = kwargs['eps'] # flatten the input function, create gradient based on flat function g_flat, unflatten, w = flatten_func(g, w) grad = value_and_grad(g_flat) # initialize average gradient avg_sq_grad = np.ones(np.size(w)) if 'ave_sq_grad' in kwargs: avg_sq_grad = kwargs['avg_sq_grad'] # record history num_train = y_train.shape[1] w_hist = [unflatten(w)] train_hist = [g_flat(w,x_train,y_train,np.arange(num_train))] # how many mini-batches equal the entire dataset? num_batches = int(np.ceil(np.divide(num_train, batch_size))) # over the line for k in range(max_its): # loop over each minibatch start = timer() train_cost = 0 for b in range(num_batches): # collect indices of current mini-batch batch_inds = np.arange(b*batch_size, min((b+1)*batch_size, num_train)) # plug in value into func and derivative cost_eval,grad_eval = grad(w,x_train,y_train,batch_inds) grad_eval.shape = np.shape(w) # update exponential average of past gradients avg_sq_grad = gamma*avg_sq_grad + (1 - gamma)*grad_eval**2 # take descent step w = w - alpha*grad_eval / (avg_sq_grad**(0.5) + eps) end = timer() # update training and validation cost train_cost = g_flat(w,x_train,y_train,np.arange(num_train)) # record weight update, train and val costs w_hist.append(unflatten(w)) train_hist.append(train_cost) if verbose == True: print ('step ' + str(k+1) + ' done in ' + str(np.round(end - start,1)) + ' secs, train cost = ' + str(np.round(train_hist[-1][0],4))) if verbose == True: print ('finished all ' + str(max_its) + ' steps') return w_hist,train_hist,avg_sq_grad
data = make_pinwheel_data()

def objective(params):
    return -log_marginal_likelihood(params, data)

def plot_gmm(params, ax, num_points=100):
    angles = np.expand_dims(np.linspace(0, 2*np.pi, num_points), 1)
    xs, ys = np.cos(angles), np.sin(angles)
    circle_pts = np.concatenate([xs, ys], axis=1) * 2.0
    for log_proportion, mean, chol in zip(*unpack_params(params)):
        cur_pts = mean + np.dot(circle_pts, chol)
        ax.plot(cur_pts[:, 0], cur_pts[:, 1], '-')

fig = plt.figure(figsize=(12, 8), facecolor='white')
ax = fig.add_subplot(111, frameon=False)
plt.show(block=False)

def callback(params):
    print("Log likelihood {}".format(-objective(params)))
    ax.cla()
    ax.plot(data[:, 0], data[:, 1], 'bx')
    ax.set_xticks([])
    ax.set_yticks([])
    plot_gmm(params, ax)
    plt.draw()
    plt.pause(1.0/60.0)

# Initialize and optimize model.
rs = npr.RandomState(0)
init_params = rs.randn(num_gmm_params) * 0.1
minimize(value_and_grad(objective), init_params, jac=True, method='CG', callback=callback)
    p = obs.shape[0]
    reproj_err = np.empty((p, 2))
    for i in range(p):
        reproj_err[i] = compute_reproj_err(cams[obs[i, 0]], X[obs[i, 1]], w[i], feats[i])

    w_err = 1. - np.square(w)

    return (reproj_err, w_err)

########## derivative extras #############

def compute_w_err(w):
    return 1. - w*w

compute_w_err_d = value_and_grad(compute_w_err)

def compute_reproj_err_wrapper(params, feat):
    X_off = BA_NCAMPARAMS
    return compute_reproj_err(params[0:X_off], params[X_off:X_off+3], params[-1], feat)

compute_reproj_err_d = jacobian(compute_reproj_err_wrapper)

def compute_ba_J(cams, X, w, obs, feats):
    p = obs.shape[0]
    reproj_err_d = []
    for i in range(p):
        params = np.hstack((cams[obs[i, 0]], X[obs[i, 1]], w[i]))
        reproj_err_d.append(compute_reproj_err_d(params, feats[i]))

    w_err_d = []
    for curr_w in w:
def optimize_newton(fo, diagonal, random_sample, var_sample, tol, \ num_intents, num_var_samples, T, joint_sample_x, joint_sample_y, \ var_samples_x, var_samples_y, frame, num_peds, time_array, \ ess, top_Z_indices, robot_mu_x, robot_mu_y, ped_mu_x, ped_mu_y, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x, inv_cov_ped_y, \ one_over_cov_sum_x, one_over_cov_sum_y, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, \ ll_converge, vg, hess_opt, opt_iter_robot, opt_iter_all, \ x_ped, y_ped, agent_disrupt, robot_agent_disrupt, opt_method): if random_sample: f = [0. for _ in range(num_intents + 1)] ll = [0. for _ in range(num_intents + 1)] if var_sample: f = [0. for _ in range(2 * num_var_samples + 1)] ll = [0. for _ in range(2 * num_var_samples + 1)] ped_mu_x_ess = [0. for _ in range(ess)] ped_mu_y_ess = [0. for _ in range(ess)] inv_cov_ped_x_ess = [0. for _ in range(ess)] inv_cov_ped_y_ess = [0. for _ in range(ess)] one_over_cov_sum_x_ess = [0. for _ in range(ess)] one_over_cov_sum_y_ess = [0. for _ in range(ess)] one_over_std_sum_x_ess = [0. for _ in range(ess)] one_over_std_sum_y_ess = [0. for _ in range(ess)] for ped in range(ess): top = top_Z_indices[ped] ped_mu_x_ess[ped] = ped_mu_x[top] ped_mu_y_ess[ped] = ped_mu_y[top] inv_cov_ped_x_ess[ped] = inv_cov_ped_x[top] inv_cov_ped_y_ess[ped] = inv_cov_ped_y[top] one_over_cov_sum_x_ess[ped] = one_over_cov_sum_x[top] one_over_cov_sum_y_ess[ped] = one_over_cov_sum_y[top] # one_over_std_sum_x_ess[ped] = one_over_std_sum_x[top] # one_over_std_sum_y_ess[ped] = one_over_std_sum_y[top] t0 = time.time() if random_sample: for intent in range(num_intents + 1): if intent == 0: x0 = robot_mu_x x0 = np.concatenate((x0, robot_mu_y)) for ped in range(ess): top = top_Z_indices[ped] x0 = np.concatenate((x0, ped_mu_x[top])) x0 = np.concatenate((x0, ped_mu_y[top])) else: x0 = joint_sample_x[num_peds, intent - 1, :] x0 = np.concatenate((x0, joint_sample_y[num_peds, intent - 1, :])) for ped in range(ess): top = top_Z_indices[ped] x0 = np.concatenate((x0, joint_sample_x[top, intent - 1, :])) x0 = np.concatenate((x0, joint_sample_y[top, intent - 1, :])) if opt_iter_robot or opt_iter_all: f[intent] = optimize_iterate(fo, tol, diagonal, frame, x0, num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, \ normalize, ll_converge, T, opt_iter_robot, opt_iter_all) if diagonal: ll[intent] = so_diagonal.ll(f[intent], num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, \ normalize, T) else: if fo: ll[intent] = fo_dense.ll(f[intent], num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, normalize) else: ll[intent] = so_dense.ll(f[intent], num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, 
\ one_over_cov_sumij_x, one_over_cov_sumij_y, \ normalize, T) else: if diagonal: f[intent] = sp.optimize.minimize(so_diagonal.ll, x0, \ args=(num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, \ inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, T), \ method=opt_method, jac=so_diagonal.d_ll, \ hess=so_diagonal.dd_ll, \ options={'xtol': tol}) #trust-ncg--VERY SLOW #trust-krylov---VERY SLOW #Newton-CG---.56 SECONDS, GOOD RESULT #trust-exact---.52 seconds. ll[intent] = so_diagonal.ll(f[intent].x, num_peds, ess, \ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, \ normalize, T) else: if fo: f[intent] = sp.optimize.minimize(fo_dense.ll, x0, args=(num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, normalize), \ method=opt_method, jac=so_dense.d_ll, hess=fo_dense.dd_ll, \ options={'xtol': tol}) ll[intent] = fo_dense.ll(f[intent].x, num_peds, ess, \ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, normalize) else: f[intent] = sp.optimize.minimize(so_dense.ll, x0, args=(num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, \ normalize, T), \ method=opt_method, jac=so_dense.d_ll, hess=so_dense.dd_ll, \ options={'xtol': tol}) ll[intent] = so_dense.ll(f[intent].x, num_peds, ess, \ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, \ normalize, T) ll[intent] = math.trunc(ll[intent] * 1e3) / 1e3 print('intent =', intent, end=" ", flush=True) if var_sample: # high_value_var_sampler(var_samples_x, var_samples_y) for var in range(2 * num_var_samples + 1): if var == 0: x0 = robot_mu_x x0 = np.concatenate((x0, robot_mu_y)) for ped in range(ess): top = top_Z_indices[ped] x0 = np.concatenate((x0, ped_mu_x[top])) x0 = np.concatenate((x0, ped_mu_y[top])) else: x0 = var_samples_x[num_peds, var - 1, :] x0 = np.concatenate((x0, var_samples_y[num_peds, var - 1, :])) for ped in range(ess): top = top_Z_indices[ped] x0 = np.concatenate((x0, var_samples_x[top, var - 1, :])) x0 = np.concatenate((x0, var_samples_y[top, var - 1, :])) if opt_iter_robot or opt_iter_all: print('OPT ITER FO', fo) f[var] = optimize_iterate(fo, tol, diagonal, frame, x0, num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ 
one_over_cov_sumij_x, one_over_cov_sumij_y, \ normalize, ll_converge, T, opt_iter_robot, opt_iter_all) if diagonal: ll[var] = so_diagonal.ll(f[var], num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, T) else: if fo: ll[var] = fo_dense.ll(f[var], num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ normalize) else: ll[var] = so_dense.ll(f[var], num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, T) else: if diagonal: f[var] = sp.optimize.minimize(so_diagonal.ll, x0, args=(num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, T), \ method=opt_method, jac=so_diagonal.d_ll, \ hess=so_diagonal.dd_ll, \ options={'xtol': tol}) ll[var] = so_diagonal.ll(f[var].x, num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, T) elif vg: print('VALUE AND GRAD') print('') f[var] = sp.optimize.minimize(value_and_grad(so_dense.ll), x0, \ args=(num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, \ inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, T), \ jac=True, method='BFGS', options={'xtol': 1e-8, 'disp': True}) # f[var] = sp.optimize.minimize(so_dense.ll, x0, \ # args=(num_peds, ess,\ # robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ # cov_robot_x, cov_robot_y, \ # inv_cov_robot_x, inv_cov_robot_y, \ # cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ # one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ # one_over_cov_sumij_x, one_over_cov_sumij_y, normalize), \ # jac=so_dense.d_ll, method='BFGS', options={'xtol': 1e-8, 'disp': True}) ll[var] = so_dense.ll(f[var].x, num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, T) elif hess_opt: if fo: print('HAND DERIVED SCIPY HESS OPT FO') print('') f[var] = sp.optimize.minimize(fo_dense.ll, x0, args=(num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ normalize), \ 
method=opt_method, jac=fo_dense.d_ll, hess=fo_dense.dd_ll, \ options={'xtol': tol}) ll[var] = fo_dense.ll(f[var].x, num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ normalize) else: print('HAND DERIVED SCIPY HESS OPT SO') print('') f[var] = sp.optimize.minimize(so_dense.ll, x0, args=(num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, T), \ method=opt_method, jac=so_dense.d_ll, hess=so_dense.dd_ll, \ options={'xtol': tol}) ll[var] = so_dense.ll(f[var].x, num_peds, ess,\ robot_mu_x, robot_mu_y, ped_mu_x_ess, ped_mu_y_ess, \ cov_robot_x, cov_robot_y, inv_cov_robot_x, inv_cov_robot_y, \ cov_ped_x, cov_ped_y, inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ one_over_cov_sumij_x, one_over_cov_sumij_y, normalize, T) if not math.isinf(ll[var]) and not math.isnan(ll[var]): ll[var] = math.trunc(ll[var] * 1e3) / 1e3 print('variance sample = ', var, end=" ", flush=True) #######################HAND ROLLED GRAD+HESS # f = optimize_iterate(frame, x0, num_peds, ess,\ # robot_mu_x, robot_mu_y, \ # ped_mu_x_ess, ped_mu_y_ess, \ # inv_cov_robot_x, inv_cov_robot_y, \ # inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ # one_over_cov_sum_x_ess, \ # one_over_cov_sum_y_ess, \ # one_over_std_sum_x_ess, \ # one_over_std_sum_y_ess) #######################SCIPY LL+GRAD+HESS # f[intent] = sp.optimize.minimize(so_diagonal.ll, x0, \ # args=(num_peds, ess,\ # robot_mu_x, robot_mu_y, \ # ped_mu_x_ess, ped_mu_y_ess, \ # cov_robot_x, cov_robot_y, \ # inv_cov_robot_x, inv_cov_robot_y, \ # cov_ped_x, cov_ped_y, \ # inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ # one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ # one_over_std_sum_x_ess, one_over_std_sum_y_ess), \ # method=opt_method, jac=so_diagonal.d_ll, \ # hess=so_diagonal.dd_ll) # # options={'xtol': tol}) # ll[intent] = so_diagonal.ll(f[intent].x, num_peds, ess,\ # robot_mu_x, robot_mu_y, \ # ped_mu_x_ess, ped_mu_y_ess, \ # cov_robot_x, cov_robot_y, \ # inv_cov_robot_x, inv_cov_robot_y, \ # cov_ped_x, cov_ped_y, \ # inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ # one_over_cov_sum_x_ess, one_over_cov_sum_y_ess, \ # one_over_std_sum_x_ess, one_over_std_sum_y_ess) # ll[intent] = math.trunc(ll[intent]*1e3)/1e3 # print(intent, end =" ", flush=True) # newton iterate, tol=1e-8, num_peds: .138+/-.077s ######################### TIMING ON DIFFERENT APPROACHES # num_peds, Tdex_max=25,no collisions: # trust-krlov w/ gtol=1e-8 .708+/-.223s # trust-krylov: no tol: .523+/-.192s # Newton-CG: xtol=1e-8: 0.701+/-.229s # Newton-CG, no tol: .568+/-.243s # newton-cg/ess=True, xtol=1e-8, .227+/-.099 # NCG/ess=True, no xtol .2+/-.098 # Newton-CG, gtol=1e-8: 0.943+/-0.419s # Newton-CG, no tol: .791+/-.288s # Newton-CG,ess=True, .116+/-.059 # trust-ncg, gtol=1e-8: 1.622+/-0.962s # trust-ncg, no tol: .539+/-.123s #######################SCIPY LL+GRAD # f = sp.optimize.minimize(so_diagonal.ll, x0, \ # args=(num_peds, ess,\ # robot_mu_x, robot_mu_y, \ # ped_mu_x_ess, ped_mu_y_ess, \ # inv_cov_robot_x, inv_cov_robot_y, \ # inv_cov_ped_x_ess, inv_cov_ped_y_ess, \ # one_over_cov_sum_x_ess, \ # one_over_cov_sum_y_ess, \ # one_over_std_sum_x_ess, \ # 
one_over_std_sum_y_ess), \ # method='BFGS', jac=so_diagonal.d_ll, \ # options={'disp': True}) #######################SCIPY LL # f = sp.optimize.minimize(\ # value_and_grad(ll_diag_slice_grad), x0, \ # jac=True, method='BFGS',\ # options={'xtol': 1e-8, 'disp': True}) def coupling(f, x0, one_over_cov_sum_x, one_over_cov_sum_y): n = 2 uncoupling = 0. for ped in range(ess): vel_x = f[:T] - x0[n * T:(n + 1) * T] vel_y = f[T:2 * T] - x0[(n + 1) * T:(n + 2) * T] n = n + 2 vel_x_2 = np.power(vel_x, 2) vel_y_2 = np.power(vel_y, 2) quad_x = np.multiply(vel_x_2, np.diag(one_over_cov_sum_x[ped])) quad_y = np.multiply(vel_y_2, np.diag(one_over_cov_sum_y[ped])) Z_x = np.exp(-0.5 * quad_x) Z_y = np.exp(-0.5 * quad_y) Z = np.multiply(Z_x, Z_y) log_znot = np.sum(np.log1p(-Z)) uncoupling = uncoupling + log_znot return -1 * uncoupling #WE WANT TO MAKE UNCOUPLING LARGE; SO -UNCOUPLING SHOULD # BE SMALL. LARGE VALUE OF -UNCOUPLING MEANS LOTS OF COUPLING global_optima_dex = np.argmin(ll) if opt_iter_robot or opt_iter_all: agent_disrupt[frame] = np.linalg.norm(f[global_optima_dex][2 * T] - x0[2 * T:]) robot_agent_disrupt[frame] = coupling(f[global_optima_dex], x0, \ one_over_cov_sum_x, one_over_cov_sum_y) else: agent_disrupt[frame] = np.linalg.norm(f[global_optima_dex].x[2 * T] - x0[2 * T:]) robot_agent_disrupt[frame] = coupling(f[global_optima_dex].x, x0, \ one_over_cov_sum_x, one_over_cov_sum_y) opt_time = time.time() - t0 time_array[frame] = opt_time ave_time = math.trunc(1e3 * np.mean(time_array[:frame + 1])) / 1e3 std_time = math.trunc(1e3 * np.std(time_array[:frame + 1])) / 1e3 return f, ll, opt_time, time_array, ave_time, std_time, \ agent_disrupt, robot_agent_disrupt
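Every branch of optimize_newton above ends in the same call shape: scipy.optimize.minimize given either the hand-derived d_ll/dd_ll callables or an autograd value_and_grad wrapper with jac=True. A minimal, self-contained sketch of that pattern, with a toy quadratic standing in for so_dense.ll (the names toy_ll, mu, and inv_cov below are illustrative, not part of the project), is:

import autograd.numpy as np
import scipy.optimize as sp_opt
from autograd import value_and_grad, hessian

def toy_ll(x, mu, inv_cov):
    # Stand-in negative log likelihood: a simple Gaussian quadratic form.
    diff = x - mu
    return 0.5 * np.dot(diff, np.dot(inv_cov, diff))

mu = np.array([1.0, -2.0])
inv_cov = np.array([[2.0, 0.3], [0.3, 1.0]])

# jac=True tells scipy the objective returns a (value, gradient) pair;
# autograd's hessian() supplies the second-order information Newton-CG wants.
res = sp_opt.minimize(value_and_grad(toy_ll), np.zeros(2),
                      args=(mu, inv_cov), jac=True,
                      hess=hessian(toy_ll), method='Newton-CG',
                      options={'xtol': 1e-8})
print(res.x)  # should land close to mu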
def fit(self, x_ND, x_valid_ND=None, verbose=True):
    ''' Fit this estimator to provided training data using LBFGS algorithm

    Args
    ----
    x_ND : 2D array, shape (N, D)
        Dataset used for training.
        Each row is an observed feature vector of size D
    x_valid_ND : 2D array, shape (Nvalid, D), optional
        Optional, dataset used for heldout validation.
        Each row is an observed feature vector of size D
        If provided, used to measure heldout likelihood at every checkpoint.
        These likelihoods will be recorded in
        self.history['valid_neg_log_lik_per_pixel']
    verbose : boolean, optional, defaults to True
        If provided, a message will be printed to stdout after every
        iteration, indicating the current training loss and (if possible)
        validation score.

    Returns
    -------
    self : this GMM object
        Internal attributes log_pi_K, mu_KD, stddev_KD updated.
        Performance metrics stored after every iteration in history
    '''
    N = np.maximum(x_ND.shape[0], 1.0)

    ## Create history attribute to store progress at every checkpoint (every iteration)
    self.history = defaultdict(list)

    ## Create initial parameters at random, using self.seed for the random seed
    # Will always create same parameters if self.seed is the same value.
    log_pi_K, mu_KD, stddev_KD = self.generate_initial_parameters(x_ND)

    ## Package up parameters into one vector of unconstrained parameters
    init_param_vec = self.to_flat_array_of_unconstrained_parameters(
        log_pi_K, mu_KD, stddev_KD)

    ## Define loss function in terms of a single vector containing all unconstrained parameters
    # Will compute the "per pixel" or "per dimension" loss
    def calc_loss(vec_M):
        ''' Compute per-pixel loss (negative log likelihood plus penalty)

        Returns
        -------
        loss : float
        '''
        # First, take current unconstrained parameters and transform back to common parameters
        # This provided transformation is autograd-able.
        log_pi_K, mu_KD, stddev_KD = self.to_common_parameters_from_flat_array(vec_M)
        # Second, compute the loss
        # TODO replace this placeholder!
        loss_placeholder = ag_np.sum(ag_np.square(vec_M))
        # Finally, be sure this is per-pixel loss (total num pixels = N * D)
        return loss_placeholder / (N * self.D)

    ## Define gradient in terms of single vector of unconstrained parameters
    calc_grad = autograd.grad(calc_loss)
    calc_loss_and_grad = autograd.value_and_grad(calc_loss)

    ## Define callback function for monitoring progress of gradient descent
    # Will be called at every checkpoint (after every iteration of LBFGS)
    self.callback_count = 0
    self.start_time_sec = time.time()

    def callback_update_history(cur_param_vec):
        cur_loss, cur_grad_vec = calc_loss_and_grad(cur_param_vec)
        self.history['iter'].append(self.callback_count)
        self.history['train_loss_per_pixel'].append(cur_loss)
        log_pi_K, mu_KD, stddev_KD = self.to_common_parameters_from_flat_array(
            cur_param_vec)
        if x_valid_ND is None:
            valid_neg_log_lik_msg = ""  # empty message when no validation set provided
        else:
            ## TODO compute the per-pixel negative log likelihood on validation set
            ## Use calc_negative_log_lik and x_valid_ND
            valid_neg_log_lik_per_pixel = 0.0
            valid_neg_log_lik_msg = "| valid score % 9.6f" % (
                valid_neg_log_lik_per_pixel)
            self.history['valid_neg_log_lik_per_pixel'].append(
                valid_neg_log_lik_per_pixel)
        if verbose:
            print("iter %4d / %4d after %9.1f sec | train loss % 9.6f %s" % (
                self.callback_count, self.max_iter,
                time.time() - self.start_time_sec,
                cur_loss, valid_neg_log_lik_msg))
        ## Track L1 norm of the gradient
        # This should slowly go to exactly zero if we have converged
        self.history['grad_norm'].append(
            np.sum(np.abs(cur_grad_vec)) / cur_grad_vec.size)
        self.callback_count += 1

    ## Perform callback on initial parameters
    # Always good to know performance at original initialization
    callback_update_history(init_param_vec)

    ## Call LBFGS routine from scipy
    # This will perform many LBFGS update iterations,
    # and after each one will perform a callback using our provided function.
    # See scipy.optimize.minimize docs for details
    result = scipy.optimize.minimize(
        calc_loss, init_param_vec,
        jac=calc_grad,
        method='l-bfgs-b',
        callback=callback_update_history,
        options=dict(maxiter=self.max_iter, ftol=self.ftol))

    ## Unpack the result of the optimization
    self.result = result
    self.message = str(result.message)
    optimal_param_vec = result.x
    self.log_pi_K, self.mu_KD, self.stddev_KD = \
        self.to_common_parameters_from_flat_array(optimal_param_vec)
    return self
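The fit() above leans on two helpers not shown in this snippet: to_flat_array_of_unconstrained_parameters and to_common_parameters_from_flat_array. A hedged sketch of what such a round trip could look like follows; the log/exp transform for stddev and the log-softmax normalization for the mixture weights are assumptions, chosen only because they keep parameters valid while staying autograd-differentiable. (Note the design choice above: scipy receives calc_grad via jac=, while the callback uses calc_loss_and_grad so the loss and gradient norm come from a single evaluation.)

import autograd.numpy as ag_np
from autograd.scipy.special import logsumexp

def pack_params(log_pi_K, mu_KD, stddev_KD):
    # Unconstrained vector: raw mixture scores, means, and log standard deviations.
    return ag_np.concatenate(
        [log_pi_K, mu_KD.ravel(), ag_np.log(stddev_KD).ravel()])

def unpack_params(vec_M, K, D):
    raw_pi_K = vec_M[:K]
    log_pi_K = raw_pi_K - logsumexp(raw_pi_K)      # exp(log_pi_K) sums to one
    mu_KD = vec_M[K:K + K * D].reshape(K, D)
    stddev_KD = ag_np.exp(vec_M[K + K * D:]).reshape(K, D)  # strictly positive
    return log_pi_K, mu_KD, stddev_KD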
import autograd.numpy as np
from autograd import value_and_grad
from scipy.optimize import minimize

def rosenbrock(x):
    return 100*(x[1] - x[0]**2)**2 + (1 - x[0])**2

# Build a function that also returns gradients using autograd.
rosenbrock_with_grad = value_and_grad(rosenbrock)

# Optimize using conjugate gradients.
result = minimize(rosenbrock_with_grad, x0=np.array([0.0, 0.0]), jac=True, method='CG')
print("Found minimum at {0}".format(result.x))
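A quick sanity check on the result above: the analytic minimum of the Rosenbrock function is at (1, 1), so the gradient at the reported minimizer should be close to zero.

from autograd import grad
print(grad(rosenbrock)(result.x))  # expect entries near zero at the optimum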
def train(self):
    result = minimize(value_and_grad(self.likelihood), self.hyp,
                      jac=True, method='L-BFGS-B', callback=self.callback)
    self.hyp = result.x
fn_out = dir_out + fn

def gmm_objective_wrapper(params, x, wishart_gamma, wishart_m):
    return gmm.gmm_objective(params[0], params[1], params[2],
                             x, wishart_gamma, wishart_m)

alphas, means, icf, x, wishart_gamma, wishart_m = gmm_io.read_gmm_instance(
    fn_in + ".txt", replicate_point)

tf = utils.timer(gmm.gmm_objective,
                 (alphas, means, icf, x, wishart_gamma, wishart_m),
                 nruns=nruns_f, limit=time_limit)

name = "Autograd"

if nruns_J > 0:
    # k = alphas.size
    grad_gmm_objective_wrapper = value_and_grad(gmm_objective_wrapper)
    tJ, grad = utils.timer(grad_gmm_objective_wrapper,
                           ((alphas, means, icf), x, wishart_gamma, wishart_m),
                           nruns=nruns_J, limit=time_limit, ret_val=True)
    gmm_io.write_J(fn_out + "_J_" + name + ".txt", grad[1])
else:
    tJ = 0

utils.write_times(fn_out + "_times_" + name + ".txt", tf, tJ)
    frate = F(F0, rebonato_vol, var, tj)
    return frate

# for a specific combination of S, T, K
v0 = 1.0
F0 = pd.read_pickle("start_rate.pkl")
z = np.random.rand(120)

def f(params):
    a, b, c, theta, kappa, epsilon, rho = params
    estimated = []
    for j in range(30):
        fi = f_rate(F0[j], v0, a, b, c, rho, kappa, theta, epsilon,
                    betas[j], z, z1, z2, j)
        estimated.append(fi)
    diff = target - estimated
    # sum of squared differences between market and estimated values
    return np.dot(diff, diff)

iters = 100

## dy is a function that will return
# 1. the value of the objective (the squared difference between market and estimated values)
# 2. the gradients w.r.t. each parameter
dy = value_and_grad(f)

init_guess = np.array([0.15, 0.015, 0.015, 0.015, 0.015, 0.015, 0.5])

# begin optimization
sol = root(dy, init_guess, jac=True, method='lm', options={"maxiter": iters})
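Because f returns a single scalar (the squared calibration error), root(..., method='lm') is effectively running least squares on one residual. A more conventional alternative, sketched here with the same f, init_guess, and iters, is to minimize the scalar objective directly:

from scipy.optimize import minimize

# value_and_grad(f) returns (objective value, gradient); jac=True tells scipy that.
sol = minimize(value_and_grad(f), init_guess, jac=True,
               method='L-BFGS-B', options={"maxiter": iters})
calibrated_params = sol.x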
pred_fun, loss_fun, frac_err, num_weights = build_lstm(input_size, state_size, output_size)

def print_training_prediction(weights, train_inputs, train_targets):
    print("Training text                         Predicted text")
    logprobs = np.asarray(pred_fun(weights, train_inputs))
    for t in range(logprobs.shape[1]):
        training_text = one_hot_to_string(train_targets[:, t, :])
        predicted_text = one_hot_to_string(logprobs[:, t, :])
        print(training_text.replace('\n', ' ') + "| " +
              predicted_text.replace('\n', ' '))

def callback(weights):
    print("Train loss:", loss_fun(weights, train_inputs, train_targets))
    print_training_prediction(weights, train_inputs, train_targets)

# Build gradient of loss function using autograd.
loss_and_grad = value_and_grad(loss_fun)

# Wrap function to only have one argument, for scipy.minimize.
def training_loss_and_grad(weights):
    return loss_and_grad(weights, train_inputs, train_targets)

init_weights = npr.randn(num_weights) * param_scale

# Check the gradients numerically, just to be safe
quick_grad_check(loss_fun, init_weights, (train_inputs, train_targets))

print("Training LSTM...")
result = minimize(training_loss_and_grad, init_weights, jac=True, method='CG',
                  options={'maxiter': train_iters}, callback=callback)
trained_weights = result.x

print("\nGenerating text from LSTM model...")
if __name__ == '__main__':
    # Network parameters
    input_size, h1_size, h2_size, output_size = 14*14, 200, 80, 10
    print(input_size, h1_size, h2_size, output_size)

    # Training parameters
    param_scale = 0.1
    learning_rate = 0.1 / img_all_num
    momentum = 0.9
    batch_size = 512
    num_epochs = 5000

    # training function & backward gradient
    clac_loss, num_weights, p_o_b, accuracy, parser = \
        rnn_for_mnist(input_size, h1_size, h2_size, output_size)
    loss_and_grad = value_and_grad(clac_loss, argnum=1)

    # set batch indices
    def make_batches(img_all_num, batch_size):
        return [slice(i, min(i + batch_size, img_all_num))
                for i in range(0, img_all_num, batch_size)]
    batch_idxs = make_batches(train_images.shape[1], batch_size)

    # init random weights
    rs = npr.RandomState()
    weights = rs.randn(num_weights) * param_scale

    # init backward gradient matrix
    weights_back = np.zeros(num_weights)
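The snippet stops just before its training loop. A heavily hedged sketch of the momentum-SGD loop these variables appear to set up follows; the call signature clac_loss(inputs, weights, targets), the slicing of train_images along its second axis, and the train_labels array are all assumptions (the snippet itself only fixes argnum=1, i.e. differentiation with respect to the weights):

for epoch in range(num_epochs):
    for idxs in batch_idxs:
        # loss_and_grad differentiates w.r.t. the second argument (the weights)
        loss, grad_w = loss_and_grad(train_images[:, idxs], weights,
                                     train_labels[idxs])
        # classic momentum update: accumulate a velocity, then take a step
        weights_back = momentum * weights_back - learning_rate * grad_w
        weights = weights + weights_back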
fig = plt.figure(figsize=(20, 8), facecolor="white")
ax_large = fig.add_subplot(121, frameon=False)
ax_small = fig.add_subplot(122, frameon=False)
plt.show(block=False)
axes_set = [False, True]

# Architecture of the GP. Last layer should always be 1
init_params = 0.1 * rs.randn(total_num_params)
deep_map = create_deep_map(init_params)
init_params = initialize(deep_map, X, num_pseudo_params)

print("Optimizing covariance parameters...")
objective = lambda params: -log_likelihood(params, X, y, n_samples)

params = minimize(
    value_and_grad(objective), init_params, jac=True,
    method="BFGS", callback=callback, options={"maxiter": 200}
)
params = params["x"]

plot_xs = np.reshape(np.linspace(-5, 5, 300), (300, 1))
deep_map = create_deep_map(params)

for axes in axes_set:
    if axes:
        ax = ax_small
        title = "Close up"
    else:
        ax = ax_large
        title = "Far"
    plot_deep_gp(ax, params, plot_xs)
    ax.plot(np.ndarray.flatten(deep_map[0][0]["x0"]), deep_map[0][0]["y0"], "ro")