def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
             pending_samples=100, noiseless=False, burnin=100,
             grid_subset=20):
    self.cov_func = getattr(gp, covar)
    self.locker = Locker()
    self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
    self.stats_file = os.path.join(
        expt_dir, self.__module__ + "_hyperparameters.txt")
    self.mcmc_iters = int(mcmc_iters)
    self.burnin = int(burnin)
    self.needs_burnin = True
    self.pending_samples = int(pending_samples)
    self.D = -1
    self.hyper_iters = 1
    # Number of points to optimize EI over
    self.grid_subset = int(grid_subset)
    self.noiseless = bool(int(noiseless))
    self.hyper_samples = []

    self.noise_scale = 0.1  # horseshoe prior
    self.amp2_scale = 1     # zero-mean log normal prior
    self.max_ls = 10        # top-hat prior on length scales

    self.prior = False
    self.prior_mean = None
    self.prior_cov = None
    self.expt_grid = None
def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
             pending_samples=100, noiseless=False, burnin=100,
             grid_subset=20, use_multiprocessing=True):
    self.cov_func = getattr(gp, covar)
    self.locker = Locker()
    self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
    self.stats_file = os.path.join(
        expt_dir, self.__module__ + "_hyperparameters.txt")
    self.mcmc_iters = int(mcmc_iters)
    self.burnin = int(burnin)
    self.needs_burnin = True
    self.pending_samples = int(pending_samples)
    self.D = -1
    self.hyper_iters = 1
    # Number of points to optimize EI over
    self.grid_subset = int(grid_subset)
    self.noiseless = bool(int(noiseless))
    self.hyper_samples = []

    self.noise_scale = 0.1  # horseshoe prior
    self.amp2_scale = 1     # zero-mean log normal prior
    self.max_ls = 2         # top-hat prior on length scales

    self.sample_points = 4
    self.samples_per_point = 3
    self.sample_from = 10

    # If multiprocessing fails or deadlocks, set this to False
    self.use_multiprocessing = bool(int(use_multiprocessing))
def __init__(self, expt_dir, variables=None, grid_size=None, grid_seed=1):
    self.expt_dir = expt_dir
    self.jobs_pkl = os.path.join(expt_dir, 'expt-grid.pkl')
    self.locker = Locker()

    # Only one process at a time is allowed to have access to this.
    sys.stderr.write("Waiting to lock grid...")
    self.locker.lock_wait(self.jobs_pkl)
    sys.stderr.write("...acquired\n")

    # Does this exist already?
    if variables is not None and not os.path.exists(self.jobs_pkl):
        # Set up the grid for the first time.
        self.seed = grid_seed
        self.vmap = GridMap(variables, grid_size)
        self.grid = self.hypercube_grid(self.vmap.card(), grid_size)
        self.status = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE
        self.values = np.zeros(grid_size) + np.nan
        self.durs = np.zeros(grid_size) + np.nan
        self.sgeids = np.zeros(grid_size, dtype=int)

        # Save this out.
        self._save_jobs()
    else:
        # Load in from the pickle.
        self._load_jobs()
def job_submit(name, output_file, job_file, working_dir):
    cmd = ('''python spearmint_sync.py --wrapper "%s" > %s''' %
           (job_file, output_file))
    output_file = open(output_file, 'w')

    # Submit the job.
    locker = Locker()
    locker.unlock(working_dir + '/expt-grid.pkl')
    process = subprocess.Popen(cmd, stdout=output_file,
                               stderr=output_file, shell=True)
    return process
def job_submit(name, output_file, job_file, working_dir):
    spearmint_path = os.path.realpath(__file__)
    cmd = ('python ' + spearmint_path + ' --wrapper "%s" > %s' %
           (job_file, output_file))
    output_file = open(output_file, 'w')

    # Submit the job.
    locker = Locker()
    locker.unlock(working_dir + '/expt-grid.pkl')
    process = subprocess.Popen(cmd, stdout=output_file,
                               stderr=output_file, shell=True)
    return process
def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
             pending_samples=100, noiseless=False):
    self.cov_func = getattr(gp, covar)
    self.locker = Locker()
    self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
    self.mcmc_iters = int(mcmc_iters)
    self.pending_samples = pending_samples
    self.D = -1
    self.hyper_iters = 1
    self.noiseless = bool(int(noiseless))

    self.noise_scale = 0.1  # horseshoe prior
    self.amp2_scale = 1     # zero-mean log normal prior
    self.max_ls = 2         # top-hat prior on length scales
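# For reference, the three prior scales set above enter the slice-sampling
# log-probabilities used later in this section (see _sample_noisy and
# _sample_ls). A minimal sketch of those terms, assuming only numpy; the
# function name log_prior is hypothetical and not part of the original code:
import numpy as np

def log_prior(noise, amp2, ls, noise_scale=0.1, amp2_scale=1.0, max_ls=2.0):
    # Top-hat prior on length scales: flat inside [0, max_ls], else -inf.
    if np.any(ls < 0) or np.any(ls > max_ls):
        return -np.inf
    lp = 0.0
    # Horseshoe-like prior on the observation noise.
    lp += np.log(np.log(1 + (noise_scale / noise) ** 2))
    # Zero-mean log-normal prior on the amplitude.
    lp -= 0.5 * (np.log(amp2) / amp2_scale) ** 2
    return lp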
def job_submit(name, output_file, job_file, working_dir):
    if os.name == 'nt':
        cmd = ('''python spearmint_sync.py --wrapper "%s"''' % (job_file))
    else:
        cmd = ('''python spearmint_sync.py --wrapper "%s" > "%s"''' %
               (job_file, output_file))
    output_file = open(output_file, 'w')

    # Submit the job.
    locker = Locker()
    locker.unlock(os.path.join(working_dir, 'expt-grid.pkllock'))
    process = subprocess.Popen(cmd, stdout=output_file,
                               stderr=output_file, shell=True)
    return process
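# A hedged usage sketch of the submission helpers above. The paths are
# hypothetical, and note that the 'name' argument is unused by all three
# variants. The returned Popen handle can be waited on or polled:
proc = job_submit('job-0001',
                  '/tmp/expt/output/job-0001.out',  # stdout/stderr log
                  '/tmp/expt/jobs/job-0001.pb',     # serialized job file
                  '/tmp/expt')                      # working directory
ret = proc.wait()  # or proc.poll() to check completion asynchronously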
def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
             pending_samples=100, noiseless=False, burnin=100,
             grid_subset=20):
    self.cov_func = getattr(gp, covar)
    self.locker = Locker()
    self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
    self.stats_file = os.path.join(
        expt_dir, self.__module__ + "_hyperparameters.txt")
    self.mcmc_iters = int(mcmc_iters)
    self.burnin = int(burnin)
    self.needs_burnin = True
    self.pending_samples = pending_samples
    self.D = -1
    self.hyper_iters = 1
    # Number of points to optimize EI over
    self.grid_subset = int(grid_subset)
    self.noiseless = bool(int(noiseless))
    self.hyper_samples = []
    self.time_hyper_samples = []

    self.noise_scale = 0.1  # horseshoe prior
    self.amp2_scale = 1     # zero-mean log normal prior
    self.max_ls = 10        # top-hat prior on length scales

    self.time_noise_scale = 0.1  # horseshoe prior
    self.time_amp2_scale = 1     # zero-mean log normal prior
    self.time_max_ls = 10        # top-hat prior on length scales
def __init__(self, expt_dir, covar="Matern52", mcmc_iters=20,
             pending_samples=100, noiseless=False, burnin=100,
             grid_subset=20, constraint_violating_value=np.inf,
             visualize2D=False):
    self.cov_func = getattr(gp, covar)
    self.locker = Locker()
    self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
    self.stats_file = os.path.join(
        expt_dir, self.__module__ + "_hyperparameters.txt")
    self.mcmc_iters = int(mcmc_iters)
    self.burnin = int(burnin)
    self.needs_burnin = True
    self.pending_samples = pending_samples
    self.D = -1
    self.hyper_iters = 1
    # Number of points to optimize EI over
    self.grid_subset = int(grid_subset)
    self.noiseless = bool(int(noiseless))
    self.hyper_samples = []
    self.constraint_hyper_samples = []
    self.ff = None
    self.ff_samples = []

    self.noise_scale = 0.1  # horseshoe prior
    self.amp2_scale = 1     # zero-mean log normal prior
    self.max_ls = 2         # top-hat prior on length scales

    self.constraint_noise_scale = 0.1  # horseshoe prior
    self.constraint_amp2_scale = 1     # zero-mean log normal prior
    self.constraint_gain = 1           # gain of the logistic squashing
    self.constraint_max_ls = 2         # top-hat prior on length scales
    self.bad_value = float(constraint_violating_value)
    self.visualize2D = visualize2D
def __init__(self, expt_dir, variables=None, grid_size=None, grid_seed=1):
    self.expt_dir = expt_dir
    self.jobs_pkl = os.path.join(expt_dir, EXPERIMENT_GRID_FILE)
    self.locker = Locker()

    # Only one process at a time is allowed to have access to the grid.
    self.locker.lock_wait(self.jobs_pkl)

    # Set up the grid for the first time if it doesn't exist.
    if variables is not None and not os.path.exists(self.jobs_pkl):
        self.seed = grid_seed
        self.vmap = GridMap(variables, grid_size)
        self.grid = self._hypercube_grid(self.vmap.card(), grid_size)
        self.status = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE
        self.values = np.zeros(grid_size) + np.nan
        self.durs = np.zeros(grid_size) + np.nan
        self.proc_ids = np.zeros(grid_size, dtype=int)
        self._save_jobs()

    # Or load in the grid from the pickled file.
    else:
        self._load_jobs()
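# The body of _hypercube_grid is not shown here. Spearmint historically
# fills the unit hypercube with a low-discrepancy (Sobol) sequence; as a
# hedged stand-in, a seeded uniform sample gives the same shape and
# reproducibility, assuming only numpy:
import numpy as np

def _hypercube_grid(self, dims, size):
    # 'size' points in [0, 1]^dims, reproducible via the stored grid seed.
    # NOTE: stand-in sketch, not the original Sobol-based implementation.
    rng = np.random.RandomState(self.seed)
    return rng.uniform(0.0, 1.0, size=(size, dims))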
class GPEIOptChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False, burnin=100,
                 grid_subset=20):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file = os.path.join(
            expt_dir, self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters = int(mcmc_iters)
        self.burnin = int(burnin)
        self.needs_burnin = True
        self.pending_samples = int(pending_samples)
        self.D = -1
        self.hyper_iters = 1
        # Number of points to optimize EI over
        self.grid_subset = int(grid_subset)
        self.noiseless = bool(int(noiseless))
        self.hyper_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1     # zero-mean log normal prior
        self.max_ls = 10        # top-hat prior on length scales

        self.prior = False
        self.prior_mean = None
        self.prior_cov = None
        self.expt_grid = None

    def setPrior(self, prior):
        # Hard-coded prior over a three-dimensional original input space.
        self.prior = prior
        self.prior_mean = np.array([0.4, 1, 75])
        self.prior_cov = np.array([[6, 0.5, 0.5],
                                   [0.5, 1.1, -0.5],
                                   [0.5, -0.5, 6]])
        #np.array([[1.1,0.5,0.5],[0.5,1.1,-0.5],[0.5,-0.5,1.1]])

    def dump_hypers(self):
        sys.stderr.write("Waiting to lock hyperparameter pickle...")
        self.locker.lock_wait(self.state_pkl)
        sys.stderr.write("...acquired\n")

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'dims':  self.D,
                      'ls':    self.ls,
                      'amp2':  self.amp2,
                      'noise': self.noise,
                      'mean':  self.mean}, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

        # Write the hyperparameters out to a human readable file as well
        fh = open(self.stats_file, 'w')
        fh.write('Mean Noise Amplitude <length scales>\n')
        fh.write('-----------ALL SAMPLES-------------\n')
        meanhyps = 0 * np.hstack(self.hyper_samples[0])
        for i in self.hyper_samples:
            hyps = np.hstack(i)
            meanhyps += (1 / float(len(self.hyper_samples))) * hyps
            for j in hyps:
                fh.write(str(j) + ' ')
            fh.write('\n')

        fh.write('-----------MEAN OF SAMPLES-------------\n')
        for j in meanhyps:
            fh.write(str(j) + ' ')
        fh.write('\n')
        fh.close()

    def _real_init(self, dims, values):
        sys.stderr.write("Waiting to lock hyperparameter pickle...")
        self.locker.lock_wait(self.state_pkl)
        sys.stderr.write("...acquired\n")

        self.randomstate = npr.get_state()
        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.needs_burnin = False
        else:
            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values)

            # Initial observation noise.
            self.noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)

            # Save hyperparameter samples
            self.hyper_samples.append((self.mean, self.noise, self.amp2,
                                       self.ls))

        self.locker.unlock(self.state_pkl)

    def cov(self, x1, x2=None):
        if x2 is None:
            return self.amp2 * (self.cov_func(self.ls, x1, None) +
                                1e-6 * np.eye(x1.shape[0]))
        else:
            return self.amp2 * self.cov_func(self.ls, x1, x2)

    def set_expt_grid(self, expt_grid):
        self.expt_grid = expt_grid

    # Given a set of completed 'experiments' in the unit hypercube with
    # corresponding objective 'values', pick the next experiment to run
    # according to the acquisition function.
    def next(self, grid, values, durations, candidates, pending, complete):
        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]
        numcand = cand.shape[0]

        # Spray a set of candidates around the min so far
        best_comp = np.argmin(vals)
        cand2 = np.vstack((np.random.randn(10, comp.shape[1]) * 0.001 +
                           comp[best_comp, :], cand))

        if self.mcmc_iters > 0:
            # Possibly burn in.
            if self.needs_burnin:
                for mcmc_iter in xrange(self.burnin):
                    self.sample_hypers(comp, vals)
                    sys.stderr.write("BURN %d/%d] mean: %.2f amp: %.2f "
                                     "noise: %.4f min_ls: %.4f max_ls: %.4f\n"
                                     % (mcmc_iter + 1, self.burnin, self.mean,
                                        np.sqrt(self.amp2), self.noise,
                                        np.min(self.ls), np.max(self.ls)))
                self.needs_burnin = False

            # Sample from hyperparameters.
            # Adjust the candidates to hit ei peaks
            self.hyper_samples = []
            for mcmc_iter in xrange(self.mcmc_iters):
                self.sample_hypers(comp, vals)
                sys.stderr.write("%d/%d] mean: %.2f amp: %.2f noise: %.4f "
                                 "min_ls: %.4f max_ls: %.4f\n"
                                 % (mcmc_iter + 1, self.mcmc_iters, self.mean,
                                    np.sqrt(self.amp2), self.noise,
                                    np.min(self.ls), np.max(self.ls)))
            self.dump_hypers()

            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            overall_ei = self.ei_over_hypers(comp, pend, cand2, vals)
            inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Old serial code that optimizes each point in turn. Uncomment
            # and replace the pool below if multiprocessing doesn't work.
            #for i in xrange(0, cand2.shape[0]):
            #    sys.stderr.write("Optimizing candidate %d/%d\n" %
            #                     (i+1, cand2.shape[0]))
            #    #self.check_grad_ei(cand2[i,:].flatten(), comp, pend, vals)
            #    ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers,
            #                            cand2[i,:].flatten(),
            #                            args=(comp,pend,vals),
            #                            bounds=b, disp=0)
            #    cand2[i,:] = ret[0]
            #cand = np.vstack((cand, cand2))

            # Optimize each point in parallel
            pool = multiprocessing.Pool(self.grid_subset)
            results = [pool.apply_async(optimize_pt,
                                        args=(c, b, comp, pend, vals,
                                              copy.copy(self)))
                       for c in cand2]
            for res in results:
                cand = np.vstack((cand, res.get(1e8)))
            pool.close()

            overall_ei = self.ei_over_hypers(comp, pend, cand, vals)
            best_cand = np.argmax(np.mean(overall_ei, axis=1))

            if best_cand >= numcand:
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])
        else:
            # Optimize hyperparameters
            self.optimize_hypers(comp, vals)
            sys.stderr.write("mean: %.2f amp: %.2f noise: %.4f "
                             "min_ls: %.4f max_ls: %.4f\n"
                             % (self.mean, np.sqrt(self.amp2), self.noise,
                                np.min(self.ls), np.max(self.ls)))

            # Optimize over EI
            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            for i in xrange(0, cand2.shape[0]):
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei,
                                        cand2[i, :].flatten(),
                                        args=(comp, pend, vals),
                                        bounds=b, disp=0)
                cand2[i, :] = ret[0]
            cand = np.vstack((cand, cand2))

            ei = self.compute_ei(comp, pend, cand, vals)
            best_cand = np.argmax(ei)

            if best_cand >= numcand:
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

    # Compute EI over hyperparameter samples
    def ei_over_hypers(self, comp, pend, cand, vals):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in xrange(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            overall_ei[:, mcmc_iter] = self.compute_ei(comp, pend, cand,
                                                       vals)
        return overall_ei

    # Check the analytical EI gradient against finite differences.
    def check_grad_ei(self, cand, comp, pend, vals):
        (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, pend, vals)
        dx2 = dx1 * 0
        idx = np.zeros(cand.shape[0])
        for i in xrange(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1, tmp) = self.grad_optimize_ei_over_hypers(
                cand + idx, comp, pend, vals)
            (ei2, tmp) = self.grad_optimize_ei_over_hypers(
                cand - idx, comp, pend, vals)
            dx2[i] = (ei1 - ei2) / (2 * 1e-6)  # central difference
            idx[i] = 0

        print 'computed grads', dx1
        print 'finite diffs', dx2
        print (dx1 / dx2)
        print np.sum((dx1 - dx2) ** 2)
        time.sleep(2)

    # Adjust points by optimizing EI over a set of hyperparameter samples
    def grad_optimize_ei_over_hypers(self, cand, comp, pend, vals,
                                     compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()
        ls = self.ls.copy()
        amp2 = self.amp2
        mean = self.mean
        noise = self.noise

        for hyper in self.hyper_samples:
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            if compute_grad:
                (ei, g_ei) = self.grad_optimize_ei(cand, comp, pend, vals,
                                                   compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand, comp, pend, vals,
                                           compute_grad)
            summed_ei += ei

        self.mean = mean
        self.amp2 = amp2
        self.noise = noise
        self.ls = ls.copy()

        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    # Adjust points based on optimizing their ei
    def grad_optimize_ei(self, cand, comp, pend, vals, compute_grad=True):
        if pend.shape[0] == 0:
            best = np.min(vals)
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # The primary covariances for prediction.
            comp_cov = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp, cand)

            # Predictive things.
            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            if not compute_grad:
                return ei

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5 * npdf / func_s

            # Apply covariance function
            grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(), grad_cross)
            grad_xp_v = np.dot(-2 * spla.cho_solve(
                (obsv_chol, True), cand_cross).transpose(), grad_cross)

            grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m +
                                         grad_xp_v * g_ei_s2)
            ei = -np.sum(ei)

            return ei, grad_xp.flatten()
        else:
            # If there are pending experiments, fantasize their outcomes.
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(comp_pend) +
                             self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = np.dot(pend_chol,
                               npr.randn(pend.shape[0],
                                         self.pending_samples)) + self.mean

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis],
                         (1, self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)
            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol, cand_cross,
                                         lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5 * npdf / func_s

            # Apply covariance function
            grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(), grad_cross)
            grad_xp_v = np.dot(-2 * spla.cho_solve(
                (comp_pend_chol, True), cand_cross).transpose(), grad_cross)

            grad_xp = 0.5 * self.amp2 * (
                grad_xp_m * np.tile(g_ei_m, (comp.shape[1], 1)).T +
                (grad_xp_v.T * g_ei_s2).T)

            ei = -np.mean(ei, axis=1)
            grad_xp = np.mean(grad_xp, axis=0)

            return ei, grad_xp.flatten()

    def compute_ei(self, comp, pend, cand, vals):
        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            if self.prior:
                # Weight EI by the prior density over the original
                # (untransformed) inputs.
                cand_orig = np.zeros((cand.shape[0], cand.shape[1]))
                for i in xrange(0, cand.shape[0]):
                    cand_orig[i, :] = \
                        self.expt_grid.vmap.get_datapoint_original(cand[i, :])
                p = mvn.pdf(cand_orig, self.prior_mean, self.prior_cov)
                ei = p * ei

            return ei
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(comp_pend) +
                             self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = np.dot(pend_chol,
                               npr.randn(pend.shape[0],
                                         self.pending_samples)) + self.mean

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis],
                         (1, self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol, cand_cross,
                                         lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)
        self.hyper_samples.append((self.mean, self.noise, self.amp2,
                                   self.ls))

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (self.amp2 * (self.cov_func(ls, comp, None) +
                                1e-6 * np.eye(comp.shape[0])) +
                   self.noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5 * np.dot(vals - self.mean, solve))
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5 * np.dot(vals - mean, solve))

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale / noise) ** 2))

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale) ** 2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5 * np.dot(vals - mean, solve))

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale) ** 2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        self.hyper_samples.append((self.mean, self.noise, self.amp2,
                                   self.ls))
        self.dump_hypers()

        return
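# The closed-form expected improvement that compute_ei and grad_optimize_ei
# evaluate above can be isolated into a few lines. A self-contained sketch
# under the same minimization convention (u = (best - mu) / sigma); the
# function name expected_improvement is hypothetical:
import numpy as np
import scipy.stats as sps

def expected_improvement(mu, var, best):
    # EI(x) = sigma * (u * Phi(u) + phi(u)) with u = (best - mu) / sigma,
    # where Phi and phi are the standard normal CDF and PDF.
    sigma = np.sqrt(var)
    u = (best - mu) / sigma
    return sigma * (u * sps.norm.cdf(u) + sps.norm.pdf(u))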
class GPEIperSecChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False, burnin=100,
                 grid_subset=20):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file = os.path.join(
            expt_dir, self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters = int(mcmc_iters)
        self.burnin = int(burnin)
        self.needs_burnin = True
        self.pending_samples = pending_samples
        self.D = -1
        self.hyper_iters = 1
        # Number of points to optimize EI over
        self.grid_subset = int(grid_subset)
        self.noiseless = bool(int(noiseless))
        self.hyper_samples = []
        self.time_hyper_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1     # zero-mean log normal prior
        self.max_ls = 10        # top-hat prior on length scales

        self.time_noise_scale = 0.1  # horseshoe prior
        self.time_amp2_scale = 1     # zero-mean log normal prior
        self.time_max_ls = 10        # top-hat prior on length scales

    # A simple function to dump out hyperparameters to allow for a hot start
    # if the optimization is restarted.
    def dump_hypers(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'dims':       self.D,
                      'ls':         self.ls,
                      'amp2':       self.amp2,
                      'noise':      self.noise,
                      'mean':       self.mean,
                      'time_ls':    self.time_ls,
                      'time_amp2':  self.time_amp2,
                      'time_noise': self.time_noise,
                      'time_mean':  self.time_mean}, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

    def _real_init(self, dims, values, durations):
        self.locker.lock_wait(self.state_pkl)

        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.time_ls = state['time_ls']
            self.time_amp2 = state['time_amp2']
            self.time_noise = state['time_noise']
            self.time_mean = state['time_mean']
        else:
            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)
            self.time_ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values) + 1e-4
            self.time_amp2 = np.std(durations) + 1e-4

            # Initial observation noise.
            self.noise = 1e-3
            self.time_noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)
            self.time_mean = np.mean(np.log(durations))

        self.locker.unlock(self.state_pkl)

    def cov(self, amp2, ls, x1, x2=None):
        if x2 is None:
            return amp2 * (self.cov_func(ls, x1, None) +
                           1e-6 * np.eye(x1.shape[0]))
        else:
            return amp2 * self.cov_func(ls, x1, x2)

    # Given a set of completed 'experiments' in the unit hypercube with
    # corresponding objective 'values', pick the next experiment to run
    # according to the acquisition function.
    def next(self, grid, values, durations, candidates, pending, complete):
        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete],
                            durations[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]
        durs = durations[complete]

        # Bring time into the log domain before we do anything
        # to maintain strict positivity
        durs = np.log(durs)

        # Spray a set of candidates around the min so far
        numcand = cand.shape[0]
        best_comp = np.argmin(vals)
        cand2 = np.vstack((np.random.randn(10, comp.shape[1]) * 0.001 +
                           comp[best_comp, :], cand))

        if self.mcmc_iters > 0:
            # Possibly burn in.
            if self.needs_burnin:
                for mcmc_iter in xrange(self.burnin):
                    self.sample_hypers(comp, vals, durs)
                    log("BURN %d/%d] mean: %.2f amp: %.2f "
                        "noise: %.4f min_ls: %.4f max_ls: %.4f"
                        % (mcmc_iter + 1, self.burnin, self.mean,
                           np.sqrt(self.amp2), self.noise,
                           np.min(self.ls), np.max(self.ls)))
                self.needs_burnin = False

            # Sample from hyperparameters.
            # Adjust the candidates to hit ei/sec peaks
            self.hyper_samples = []
            for mcmc_iter in xrange(self.mcmc_iters):
                self.sample_hypers(comp, vals, durs)
                log("%d/%d] mean: %.2f amp: %.2f noise: %.4f "
                    "min_ls: %.4f max_ls: %.4f"
                    % (mcmc_iter + 1, self.mcmc_iters, self.mean,
                       np.sqrt(self.amp2), self.noise,
                       np.min(self.ls), np.max(self.ls)))

                log("%d/%d] time_mean: %.2fs time_amp: %.2f time_noise: %.4f "
                    "time_min_ls: %.4f time_max_ls: %.4f"
                    % (mcmc_iter + 1, self.mcmc_iters,
                       np.exp(self.time_mean), np.sqrt(self.time_amp2),
                       np.exp(self.time_noise), np.min(self.time_ls),
                       np.max(self.time_ls)))
            self.dump_hypers()

            # Pick the top candidates to optimize over
            overall_ei = self.ei_over_hypers(comp, pend, cand2, vals, durs)
            inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Adjust the candidates to hit ei peaks
            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            for i in xrange(0, cand2.shape[0]):
                log("Optimizing candidate %d/%d" % (i + 1, cand2.shape[0]))
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers,
                                        cand2[i, :].flatten(),
                                        args=(comp, vals, durs, True),
                                        bounds=b, disp=0)
                cand2[i, :] = ret[0]
            cand = np.vstack((cand, cand2))

            overall_ei = self.ei_over_hypers(comp, pend, cand, vals, durs)
            best_cand = np.argmax(np.mean(overall_ei, axis=1))
            self.dump_hypers()

            if best_cand >= numcand:
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])
        else:
            # Optimize hyperparameters
            self.optimize_hypers(comp, vals, durs)
            log("mean: %f amp: %f noise: %f min_ls: %f max_ls: %f"
                % (self.mean, np.sqrt(self.amp2), self.noise,
                   np.min(self.ls), np.max(self.ls)))

            # Pick the top candidates to optimize over
            ei = self.compute_ei_per_s(comp, pend, cand2, vals, durs)
            inds = np.argsort(ei)[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Adjust the candidates to hit ei peaks
            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            for i in xrange(0, cand2.shape[0]):
                log("Optimizing candidate %d/%d" % (i + 1, cand2.shape[0]))
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei,
                                        cand2[i, :].flatten(),
                                        args=(comp, vals, durs, True),
                                        bounds=b, disp=0)
                cand2[i, :] = ret[0]
            cand = np.vstack((cand, cand2))

            ei = self.compute_ei_per_s(comp, pend, cand, vals, durs)
            best_cand = np.argmax(ei)
            self.dump_hypers()

            if best_cand >= numcand:
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

    # Compute EI over hyperparameter samples
    def ei_over_hypers(self, comp, pend, cand, vals, durs):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in xrange(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            time_hyper = self.time_hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]

            self.time_mean = time_hyper[0]
            self.time_noise = time_hyper[1]
            self.time_amp2 = time_hyper[2]
            self.time_ls = time_hyper[3]

            overall_ei[:, mcmc_iter] = self.compute_ei_per_s(
                comp, pend, cand, vals, durs.squeeze())
        return overall_ei

    # Check the analytical EI/sec gradient against finite differences.
    def check_grad_ei_per(self, cand, comp, vals, durs):
        (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, vals, durs)
        dx2 = dx1 * 0
        idx = np.zeros(cand.shape[0])
        for i in xrange(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1, tmp) = self.grad_optimize_ei_over_hypers(
                cand + idx, comp, vals, durs)
            (ei2, tmp) = self.grad_optimize_ei_over_hypers(
                cand - idx, comp, vals, durs)
            dx2[i] = (ei1 - ei2) / (2 * 1e-6)  # central difference
            idx[i] = 0

        print 'computed grads', dx1
        print 'finite diffs', dx2
        print (dx1 / dx2)
        print np.sum((dx1 - dx2) ** 2)
        time.sleep(2)

    # Adjust points by optimizing EI over a set of hyperparameter samples
    def grad_optimize_ei_over_hypers(self, cand, comp, vals, durs,
                                     compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()

        for mcmc_iter in xrange(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            time_hyper = self.time_hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]

            self.time_mean = time_hyper[0]
            self.time_noise = time_hyper[1]
            self.time_amp2 = time_hyper[2]
            self.time_ls = time_hyper[3]

            if compute_grad:
                (ei, g_ei) = self.grad_optimize_ei(cand, comp, vals, durs,
                                                   compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand, comp, vals, durs,
                                           compute_grad)
            summed_ei += ei

        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    def grad_optimize_ei(self, cand, comp, vals, durs, compute_grad=True):
        # Here we have to compute the gradients for ei per second.
        # This means deriving through the two kernels, the one for
        # predicting time and the one predicting ei.
        best = np.min(vals)
        cand = np.reshape(cand, (-1, comp.shape[1]))

        # First we make predictions for the durations.
        # Compute covariances
        comp_time_cov = self.cov(self.time_amp2, self.time_ls, comp)
        cand_time_cross = self.cov(self.time_amp2, self.time_ls, comp, cand)

        # Cholesky decompositions
        obsv_time_cov = (comp_time_cov +
                         self.time_noise * np.eye(comp.shape[0]))
        obsv_time_chol = spla.cholesky(obsv_time_cov, lower=True)

        # Linear systems
        t_alpha = spla.cho_solve((obsv_time_chol, True),
                                 durs - self.time_mean)

        # Predict marginal mean times and (possibly) variances
        func_time_m = np.dot(cand_time_cross.T, t_alpha) + self.time_mean
        # We don't really need the time variances now
        #func_time_v = self.time_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0)

        # Bring time out of the log domain
        func_time_m = np.exp(func_time_m)

        # Compute derivative of cross-distances.
        grad_cross_r = gp.grad_dist2(self.time_ls, comp, cand)

        # Apply covariance function
        cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
        cand_cross_grad = cov_grad_func(self.time_ls, comp, cand)
        grad_cross_t = np.squeeze(cand_cross_grad)

        # Now compute the gradients w.r.t. ei
        # The primary covariances for prediction.
        comp_cov = self.cov(self.amp2, self.ls, comp)
        cand_cross = self.cov(self.amp2, self.ls, comp, cand)

        # Compute the required Cholesky.
        obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
        obsv_chol = spla.cholesky(obsv_cov, lower=True)

        cand_cross_grad = cov_grad_func(self.ls, comp, cand)

        # Predictive things.
        # Solve the linear systems.
        alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
        beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

        # Predict the marginal means and variances at candidates.
        func_m = np.dot(cand_cross.T, alpha) + self.mean
        func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0)

        # Expected improvement
        func_s = np.sqrt(func_v)
        u = (best - func_m) / func_s
        ncdf = sps.norm.cdf(u)
        npdf = sps.norm.pdf(u)
        ei = func_s * (u * ncdf + npdf)

        ei_per_s = -np.sum(ei / func_time_m)
        if not compute_grad:
            return ei_per_s

        grad_time_xp_m = np.dot(t_alpha.transpose(), grad_cross_t)

        # Gradients of ei w.r.t. mean and variance
        g_ei_m = -ncdf
        g_ei_s2 = 0.5 * npdf / func_s

        # Apply covariance function
        grad_cross = np.squeeze(cand_cross_grad)

        grad_xp_m = np.dot(alpha.transpose(), grad_cross)
        grad_xp_v = np.dot(-2 * spla.cho_solve(
            (obsv_chol, True), cand_cross).transpose(), grad_cross)

        grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m +
                                     grad_xp_v * g_ei_s2)
        grad_time_xp_m = 0.5 * self.time_amp2 * grad_time_xp_m * func_time_m

        # Quotient rule for the gradient of ei / time.
        grad_xp = ((func_time_m * grad_xp - ei * grad_time_xp_m) /
                   (func_time_m ** 2))

        return ei_per_s, grad_xp.flatten()

    def compute_ei_per_s(self, comp, pend, cand, vals, durs):
        # First we make predictions for the durations as that
        # doesn't depend on pending experiments.
        # Compute covariances
        comp_time_cov = self.cov(self.time_amp2, self.time_ls, comp)
        cand_time_cross = self.cov(self.time_amp2, self.time_ls, comp, cand)

        # Cholesky decompositions
        obsv_time_cov = (comp_time_cov +
                         self.time_noise * np.eye(comp.shape[0]))
        obsv_time_chol = spla.cholesky(obsv_time_cov, lower=True)

        # Linear systems
        t_alpha = spla.cho_solve((obsv_time_chol, True),
                                 durs - self.time_mean)
        #t_beta = spla.solve_triangular(obsv_time_chol, cand_time_cross,
        #                               lower=True)

        # Predict marginal mean times and (possibly) variances
        func_time_m = np.dot(cand_time_cross.T, t_alpha) + self.time_mean
        # We don't really need the time variances now
        #func_time_v = self.time_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0)

        # Bring time out of the log domain
        func_time_m = np.exp(func_time_m)

        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov = self.cov(self.amp2, self.ls, comp)
            cand_cross = self.cov(self.amp2, self.ls, comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            ei_per_s = ei / func_time_m
            return ei_per_s
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(self.amp2, self.ls, comp_pend) +
                             self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(self.amp2, self.ls, comp, pend)
            pend_kappa = self.cov(self.amp2, self.ls, pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            pend_fant = (np.dot(pend_chol,
                                npr.randn(pend.shape[0],
                                          self.pending_samples)) +
                         pend_m[:, None])

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis],
                         (1, self.pending_samples)), pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(self.amp2, self.ls, comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol, cand_cross,
                                         lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return np.divide(np.mean(ei, axis=1), func_time_m)

    def sample_hypers(self, comp, vals, durs):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)

        self._sample_time_noisy(comp, durs.squeeze())
        self._sample_time_ls(comp, durs.squeeze())

        self.hyper_samples.append((self.mean, self.noise, self.amp2,
                                   self.ls))
        self.time_hyper_samples.append((self.time_mean, self.time_noise,
                                        self.time_amp2, self.time_ls))

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (self.amp2 * (self.cov_func(ls, comp, None) +
                                1e-6 * np.eye(comp.shape[0])) +
                   self.noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5 * np.dot(vals - self.mean, solve))
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_time_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.time_max_ls):
                return -np.inf

            cov = (self.time_amp2 * (self.cov_func(ls, comp, None) +
                                     1e-6 * np.eye(comp.shape[0])) +
                   self.time_noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.time_mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5 * np.dot(vals - self.time_mean, solve))
            return lp

        self.time_ls = util.slice_sample(self.time_ls, logprob,
                                         compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5 * np.dot(vals - mean, solve))

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale / noise) ** 2))
            #lp -= 0.5*(np.log(noise)/self.noise_scale)**2

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale) ** 2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = hypers[2]

    def _sample_time_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.time_ls, comp, None) +
                           1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5 * np.dot(vals - mean, solve))

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.time_noise_scale / noise) ** 2))
            #lp -= 0.5*(np.log(noise)/self.time_noise_scale)**2

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.time_amp2_scale) ** 2

            return lp

        hypers = util.slice_sample(np.array([self.time_mean, self.time_amp2,
                                             self.time_noise]),
                                   logprob, compwise=False)
        self.time_mean = hypers[0]
        self.time_amp2 = hypers[1]
        self.time_noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None) +
                           1e-6 * np.eye(comp.shape[0])) +
                   noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = (-np.sum(np.log(np.diag(chol))) -
                  0.5 * np.dot(vals - mean, solve))

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale) ** 2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals, durs):
        # First the GP to observations
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Now the GP to times
        timegp = gp.GP(self.cov_func.__name__)
        timegp.real_init(comp.shape[1], durs)
        timegp.optimize_hypers(comp, durs)
        self.time_mean = timegp.mean
        self.time_amp2 = timegp.amp2
        self.time_noise = timegp.noise
        self.time_ls = timegp.ls

        # Save hyperparameter samples
        self.hyper_samples.append((self.mean, self.noise, self.amp2,
                                   self.ls))
        self.time_hyper_samples.append((self.time_mean, self.time_noise,
                                        self.time_amp2, self.time_ls))
        self.dump_hypers()
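# util.slice_sample is referenced throughout but not shown in this section.
# A minimal component-wise stepping-out slice sampler in its spirit,
# assuming the same (init_x, logprob) calling convention; the
# compwise=False calls above slice along a random direction instead, which
# this sketch omits. The function body is a hedged reconstruction, not the
# original implementation:
import numpy as np
import numpy.random as npr

def slice_sample(init_x, logprob, sigma=1.0):
    # One sweep of univariate slice sampling over each coordinate.
    x = np.array(init_x, dtype=float, ndmin=1).copy()
    for i in xrange(x.shape[0]):
        def lp(v):
            y = x.copy()
            y[i] = v
            return logprob(y)
        llh = np.log(npr.rand()) + lp(x[i])  # slice height under the curve
        lower = x[i] - sigma * npr.rand()    # randomly placed bracket
        upper = lower + sigma
        while lp(lower) > llh:               # step out to the left
            lower -= sigma
        while lp(upper) > llh:               # step out to the right
            upper += sigma
        while True:                          # shrink until a point accepted
            v = lower + npr.rand() * (upper - lower)
            if lp(v) > llh:
                x[i] = v
                break
            elif v < x[i]:
                lower = v
            else:
                upper = v
    return x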
def __init__(self, expt_dir, expt_name,
             max_wallclock_time=sys.float_info.max, title=None, folds=1):
    self.expt_dir = expt_dir

    if folds < 1:
        folds = 1

    self.jobs_pkl = os.path.join(expt_dir, expt_name + ".pkl")
    self.locker = Locker.Locker()

    # Only one process at a time is allowed to have access to this.
    sys.stderr.write("Waiting to lock experiments file " +
                     self.jobs_pkl + "...")
    self.locker.lock_wait(self.jobs_pkl)
    sys.stderr.write("...acquired\n")

    # Does this exist already?
    if not os.path.exists(self.jobs_pkl):
        # Set up the experiments file for the first time.

        # General information
        # TODO: Unfortunately, this is also the optimizer name
        self.experiment_name = expt_name
        self.title = title
        self.optimizer = None
        self.folds = folds
        self.instance_order = []
        self.trials = []

        # Time information:
        # Wallclock time used for the functions (should be the sum of all
        # instance_durations)
        self.total_wallclock_time = 0
        # The maximum allowed wallclock time
        self.max_wallclock_time = max_wallclock_time
        # Time when wrapping.py kicks off the optimizer
        self.starttime = []
        # Time when the focus is passed back to the optimizer
        self.endtime = []
        # Recorded every time cv.py is called; used to calculate the
        # optimizer time
        self.cv_starttime = []
        # Recorded when cv.py exits; cv_starttime and cv_endtime are
        # recorded alternately when runsolver_wrapper is called by SMAC
        self.cv_endtime = []
        # Dummy field; this will be calculated by wrapping.py after
        # everything is finished
        self.optimizer_time = []

        # Save this out.
        self._save_jobs()
    else:
        # Load in from the pickle.
        self._load_jobs()
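# _save_jobs and _load_jobs are referenced above but not shown. Under the
# locking scheme they presumably reduce to a pickle round-trip of the state
# set in __init__; a hedged sketch, where the exact field list is an
# assumption:
import cPickle

def _save_jobs(self):
    # Caller already holds the lock on self.jobs_pkl.
    fh = open(self.jobs_pkl, 'w')
    cPickle.dump({'experiment_name': self.experiment_name,
                  'folds':           self.folds,
                  'instance_order':  self.instance_order,
                  'trials':          self.trials}, fh)
    fh.close()

def _load_jobs(self):
    fh = open(self.jobs_pkl, 'r')
    state = cPickle.load(fh)
    fh.close()
    for key, value in state.iteritems():
        setattr(self, key, value)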
class GPEIConstrainedChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False, burnin=100,
                 grid_subset=20, constraint_violating_value=-1):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file = os.path.join(
            expt_dir, self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters = int(mcmc_iters)
        self.burnin = int(burnin)
        self.needs_burnin = True
        self.pending_samples = pending_samples
        self.D = -1
        self.hyper_iters = 1
        # Number of points to optimize EI over
        self.grid_subset = int(grid_subset)
        self.noiseless = bool(int(noiseless))
        self.hyper_samples = []
        self.constraint_hyper_samples = []
        self.ff = None
        self.ff_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1     # zero-mean log normal prior
        self.max_ls = 2         # top-hat prior on length scales

        self.constraint_noise_scale = 0.1  # horseshoe prior
        self.constraint_amp2_scale = 1     # zero-mean log normal prior
        self.constraint_gain = 1           # gain of the logistic squashing
        self.constraint_max_ls = 2         # top-hat prior on length scales
        self.bad_value = float(constraint_violating_value)

    # A simple function to dump out hyperparameters to allow for a hot start
    # if the optimization is restarted.
    def dump_hypers(self):
        sys.stderr.write("Waiting to lock hyperparameter pickle...")
        self.locker.lock_wait(self.state_pkl)
        sys.stderr.write("...acquired\n")

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'dims':             self.D,
                      'ls':               self.ls,
                      'amp2':             self.amp2,
                      'noise':            self.noise,
                      'mean':             self.mean,
                      'constraint_ls':    self.constraint_ls,
                      'constraint_amp2':  self.constraint_amp2,
                      'constraint_noise': self.constraint_noise,
                      'constraint_gain':  self.constraint_gain,
                      'constraint_mean':  self.constraint_mean}, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

    def _real_init(self, dims, values, durations):
        sys.stderr.write("Waiting to lock hyperparameter pickle...")
        self.locker.lock_wait(self.state_pkl)
        sys.stderr.write("...acquired\n")

        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.constraint_ls = state['constraint_ls']
            self.constraint_amp2 = state['constraint_amp2']
            self.constraint_noise = state['constraint_noise']
            self.constraint_mean = state['constraint_mean']
            self.constraint_gain = state['constraint_gain']
            self.needs_burnin = False
        else:
            # Identify constraint violations
            goodvals = np.nonzero(values != self.bad_value)[0]

            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)
            self.constraint_ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values[goodvals])
            self.constraint_amp2 = 1  #np.std(durations)

            # Initial observation noise.
            self.noise = 1e-3
            self.constraint_noise = 1e-3
            self.constraint_gain = 1

            # Initial mean.
            self.mean = np.mean(values[goodvals])
            self.constraint_mean = 0.5

        self.locker.unlock(self.state_pkl)

    def cov(self, amp2, ls, x1, x2=None):
        if x2 is None:
            return amp2 * (self.cov_func(ls, x1, None) +
                           1e-6 * np.eye(x1.shape[0]))
        else:
            return amp2 * self.cov_func(ls, x1, x2)

    # Given a set of completed 'experiments' in the unit hypercube with
    # corresponding objective 'values', pick the next experiment to run
    # according to the acquisition function.
    def next(self, grid, values, durations, candidates, pending, complete):
        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete],
                            durations[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]

        # Find which completed jobs violated constraints
        badvals = np.nonzero(vals == self.bad_value)[0]
        goodvals = np.nonzero(vals != self.bad_value)[0]

        print 'Found %d constraint violating jobs' % (badvals.shape[0])

        labels = np.zeros(vals.shape[0])
        labels[goodvals] = 1

        if comp.shape[0] < 2:
            return int(candidates[0])

        # Spray a set of candidates around the min so far
        numcand = cand.shape[0]
        best_comp = np.argmin(vals)
        cand2 = np.vstack((np.random.randn(10, comp.shape[1]) * 0.001 +
                           comp[best_comp, :], cand))

        if self.mcmc_iters > 0:
            # Possibly burn in.
            if self.needs_burnin:
                for mcmc_iter in xrange(self.burnin):
                    self.sample_constraint_hypers(comp, labels)
                    self.sample_hypers(comp[goodvals, :], vals[goodvals])
                    sys.stderr.write("BURN %d/%d] mean: %.2f amp: %.2f "
                                     "noise: %.4f min_ls: %.4f max_ls: %.4f\n"
                                     % (mcmc_iter + 1, self.burnin, self.mean,
                                        np.sqrt(self.amp2), self.noise,
                                        np.min(self.ls), np.max(self.ls)))
                self.needs_burnin = False

            # Sample from hyperparameters.
            # Adjust the candidates to hit ei peaks
            self.hyper_samples = []
            for mcmc_iter in xrange(self.mcmc_iters):
                self.sample_constraint_hypers(comp, labels)
                self.sample_hypers(comp[goodvals, :], vals[goodvals])
                sys.stderr.write("%d/%d] mean: %.2f amp: %.2f noise: %.4f "
                                 "min_ls: %.4f max_ls: %.4f\n"
                                 % (mcmc_iter + 1, self.mcmc_iters, self.mean,
                                    np.sqrt(self.amp2), self.noise,
                                    np.min(self.ls), np.max(self.ls)))

                sys.stderr.write("%d/%d] constraint_mean: %.2f "
                                 "constraint_amp: %.2f constraint_gain: %.4f "
                                 "constraint_min_ls: %.4f constraint_max_ls: "
                                 "%.4f\n"
                                 % (mcmc_iter + 1, self.mcmc_iters,
                                    self.constraint_mean,
                                    np.sqrt(self.constraint_amp2),
                                    self.constraint_gain,
                                    np.min(self.constraint_ls),
                                    np.max(self.constraint_ls)))
            self.dump_hypers()

            comp_preds = np.zeros(labels.shape[0]).flatten()

            preds = self.pred_constraint_voilation(cand, comp,
                                                   labels).flatten()
            for ii in xrange(self.mcmc_iters):
                constraint_hyper = self.constraint_hyper_samples[ii]
                self.ff = self.ff_samples[ii]
                self.constraint_mean = constraint_hyper[0]
                self.constraint_gain = constraint_hyper[1]
                self.constraint_amp2 = constraint_hyper[2]
                self.constraint_ls = constraint_hyper[3]
                comp_preds += self.pred_constraint_voilation(
                    comp, comp, labels).flatten()
            comp_preds = comp_preds / float(self.mcmc_iters)

            print 'Prediction %f fraction violations (%d/%d): ' % (
                np.mean(preds < 0.5), np.sum(preds < 0.5), preds.shape[0])
            print 'Prediction %f fraction train accuracy (%d/%d): ' % (
                np.mean((comp_preds > 0.5) == labels),
                np.sum((comp_preds > 0.5) == labels), comp_preds.shape[0])

            # Optional 2D visualization of the acquisition surfaces
            # (disabled).
            if False:
                delta = 0.025
                x = np.arange(0, 1.0, delta)
                y = np.arange(0, 1.0, delta)
                X, Y = np.meshgrid(x, y)

                cpreds = np.zeros((X.shape[0], X.shape[1]))
                predei = np.zeros((X.shape[0], X.shape[1]))
                predei2 = np.zeros((X.shape[0], X.shape[1]))
                for ii in xrange(self.mcmc_iters):
                    constraint_hyper = self.constraint_hyper_samples[ii]
                    self.ff = self.ff_samples[ii]
                    self.constraint_mean = constraint_hyper[0]
                    self.constraint_gain = constraint_hyper[1]
                    self.constraint_amp2 = constraint_hyper[2]
                    self.constraint_ls = constraint_hyper[3]

                    cpred = self.pred_constraint_voilation(
                        np.hstack((X.flatten()[:, np.newaxis],
                                   Y.flatten()[:, np.newaxis])), comp, labels)
                    pei = self.compute_ei_per_s(
                        comp, pend,
                        np.hstack((X.flatten()[:, np.newaxis],
                                   Y.flatten()[:, np.newaxis])), vals, labels)
                    pei2 = self.compute_ei(
                        comp, pend,
                        np.hstack((X.flatten()[:, np.newaxis],
                                   Y.flatten()[:, np.newaxis])), vals, labels)

                    cpreds += np.reshape(cpred, (X.shape[0], X.shape[1]))
                    predei += np.reshape(pei, (X.shape[0], X.shape[1]))
                    predei2 += np.reshape(pei2, (X.shape[0], X.shape[1]))

                plt.figure(1)
                cpreds = cpreds / float(self.mcmc_iters)
                CS = plt.contour(X, Y, cpreds)
                plt.clabel(CS, inline=1, fontsize=10)
                plt.plot(comp[labels == 0, 0], comp[labels == 0, 1], 'rx')
                plt.plot(comp[labels == 1, 0], comp[labels == 1, 1], 'bx')
                plt.title('Contours of Classification GP '
                          '(Prob of not being a constraint violation)')
                plt.legend(('Constraint Violations', 'Good points'),
                           'lower left')
                plt.savefig('constrained_ei_chooser_class_contour.pdf')

                plt.figure(2)
                predei = predei / float(self.mcmc_iters)
                CS = plt.contour(X, Y, predei)
                plt.clabel(CS, inline=1, fontsize=10)
                plt.plot(comp[labels == 0, 0], comp[labels == 0, 1], 'rx')
                plt.plot(comp[labels == 1, 0], comp[labels == 1, 1], 'bx')
                plt.title('Contours of EI*P(not violating constraint)')
                plt.legend(('Constraint Violations', 'Good points'),
                           'lower left')
                plt.savefig('constrained_ei_chooser_eitimesprob_contour.pdf')

                plt.figure(3)
                predei2 = predei2 / float(self.mcmc_iters)
                CS = plt.contour(X, Y, predei2)
                plt.clabel(CS, inline=1, fontsize=10)
                plt.plot(comp[labels == 0, 0], comp[labels == 0, 1], 'rx')
                plt.plot(comp[labels == 1, 0], comp[labels == 1, 1], 'bx')
                plt.title('Contours of EI')
                plt.legend(('Constraint Violations', 'Good points'),
                           'lower left')
                plt.savefig('constrained_ei_chooser_ei_contour.pdf')
                plt.show()

            # Pick the top candidates to optimize over
            overall_ei = self.ei_over_hypers(comp, pend, cand2, vals, labels)
            inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Adjust the candidates to hit ei peaks
            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            for i in xrange(0, cand2.shape[0]):
                sys.stderr.write("Optimizing candidate %d/%d\n" %
                                 (i + 1, cand2.shape[0]))
                #self.check_grad_ei_per(cand2[i, :], comp, vals, labels)
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers,
                                        cand2[i, :].flatten(),
                                        args=(comp, vals, labels, True),
                                        bounds=b, disp=0)
                cand2[i, :] = ret[0]
            cand = np.vstack((cand, cand2))

            overall_ei = self.ei_over_hypers(comp, pend, cand, vals, labels)
            best_cand = np.argmax(np.mean(overall_ei, axis=1))
            self.dump_hypers()

            if best_cand >= numcand:
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])
        else:
            # Optimize hyperparameters
            self.optimize_hypers(comp, vals, labels)
            sys.stderr.write("mean: %f amp: %f noise: %f "
                             "min_ls: %f max_ls: %f\n"
                             % (self.mean, np.sqrt(self.amp2), self.noise,
                                np.min(self.ls), np.max(self.ls)))

            # Pick the top candidates to optimize over
            ei = self.compute_ei_per_s(comp, pend, cand2, vals, labels)
            inds = np.argsort(ei)[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Adjust the candidates to hit ei peaks
            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            for i in xrange(0, cand2.shape[0]):
                sys.stderr.write("Optimizing candidate %d/%d\n" %
                                 (i + 1, cand2.shape[0]))
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei,
                                        cand2[i, :].flatten(),
                                        args=(comp, vals, labels, True),
                                        bounds=b, disp=0)
                cand2[i, :] = ret[0]
            cand = np.vstack((cand, cand2))

            ei = self.compute_ei_per_s(comp, pend, cand, vals, labels)
            best_cand = np.argmax(ei)
            self.dump_hypers()

            if best_cand >= numcand:
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

    # Predict constraint violating points
    def pred_constraint_voilation(self, cand, comp, vals):
        # The primary covariances for prediction.
        comp_cov = self.cov(self.constraint_amp2, self.constraint_ls, comp)
        cand_cross = self.cov(self.constraint_amp2, self.constraint_ls,
                              comp, cand)

        # Compute the required Cholesky.
        obsv_cov = comp_cov + self.constraint_noise * np.eye(comp.shape[0])
        obsv_chol = spla.cholesky(obsv_cov, lower=True)

        cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
        cand_cross_grad = cov_grad_func(self.constraint_ls, comp, cand)

        # Predictive things.
        # Solve the linear systems.
        alpha = spla.cho_solve((obsv_chol, True),
                               self.ff)  # - self.constraint_mean
        beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

        # Predict the marginal means and variances at candidates.
        func_m = np.dot(cand_cross.T, alpha)  # + self.constraint_mean

        # Squash through a logistic to get probabilities.
        func_m = 1. / (1 + np.exp(-self.constraint_gain * func_m))
        return func_m

    # Compute EI over hyperparameter samples
    def ei_over_hypers(self, comp, pend, cand, vals, labels):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in xrange(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            constraint_hyper = self.constraint_hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]

            self.constraint_mean = constraint_hyper[0]
            self.constraint_gain = constraint_hyper[1]
            self.constraint_amp2 = constraint_hyper[2]
            self.constraint_ls = constraint_hyper[3]

            overall_ei[:, mcmc_iter] = self.compute_ei_per_s(
                comp, pend, cand, vals, labels)
        return overall_ei

    # Adjust points by optimizing EI over a set of hyperparameter samples
    def grad_optimize_ei_over_hypers(self, cand, comp, vals, labels,
                                     compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()

        for mcmc_iter in xrange(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            constraint_hyper = self.constraint_hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]

            self.constraint_mean = constraint_hyper[0]
            self.constraint_gain = constraint_hyper[1]
            self.constraint_amp2 = constraint_hyper[2]
            self.constraint_ls = constraint_hyper[3]

            if compute_grad:
                (ei, g_ei) = self.grad_optimize_ei(cand, comp, vals, labels,
                                                   compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand, comp, vals, labels,
                                           compute_grad)
            summed_ei += ei

        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    # Check the analytical gradient against finite differences.
    def check_grad_ei_per(self, cand, comp, vals, labels):
        (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, vals,
                                                      labels)
        dx2 = dx1 * 0
        idx = np.zeros(cand.shape[0])
        for i in xrange(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1, tmp) = self.grad_optimize_ei_over_hypers(
                cand + idx, comp, vals, labels)
            (ei2, tmp) = self.grad_optimize_ei_over_hypers(
                cand - idx, comp, vals, labels)
            dx2[i] = (ei1 - ei2) / (2 * 1e-6)  # central difference
            idx[i] = 0

        print 'computed grads', dx1
        print 'finite diffs', dx2
        print (dx1 / dx2)
        print np.sum((dx1 - dx2) ** 2)
        time.sleep(2)

    def grad_optimize_ei(self, cand, comp, vals, labels, compute_grad=True):
        # Here we have to compute the gradients for constrained ei.
        # This means deriving through the two kernels, the one for
        # predicting constraint violations and the one predicting ei.

        # First pull out violating points
        compfull = comp.copy()
        comp = comp[labels > 0, :]
        vals = vals[labels > 0]

        best = np.min(vals)
        cand = np.reshape(cand, (-1, comp.shape[1]))

        # First we make predictions for the constraint.
        # Compute covariances
        comp_constraint_cov = self.cov(self.constraint_amp2,
                                       self.constraint_ls, compfull)
        cand_constraint_cross = self.cov(self.constraint_amp2,
                                         self.constraint_ls, compfull, cand)

        # Cholesky decompositions
        obsv_constraint_cov = (comp_constraint_cov +
                               self.constraint_noise *
                               np.eye(compfull.shape[0]))
        obsv_constraint_chol = spla.cholesky(obsv_constraint_cov, lower=True)

        # Linear systems
        t_alpha = spla.cho_solve((obsv_constraint_chol, True),
                                 self.ff)  # - self.constraint_mean

        # Predict marginal means
        func_constraint_m = np.dot(cand_constraint_cross.T, t_alpha)

        # Squash through logistic to get probabilities
        func_constraint_m = 1. / (1 + np.exp(-self.constraint_gain *
                                             func_constraint_m))

        # Apply covariance function
        cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
        cand_cross_grad = cov_grad_func(self.constraint_ls, compfull, cand)
        grad_cross_t = np.squeeze(cand_cross_grad)

        # Now compute the gradients w.r.t. ei
        # The primary covariances for prediction.
        comp_cov = self.cov(self.amp2, self.ls, comp)
        cand_cross = self.cov(self.amp2, self.ls, comp, cand)

        comp_cov_full = self.cov(self.amp2, self.ls, compfull)
        cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand)

        # Compute the required Cholesky.
        obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
        obsv_chol = spla.cholesky(obsv_cov, lower=True)

        obsv_cov_full = comp_cov_full + self.noise * np.eye(compfull.shape[0])
        obsv_chol_full = spla.cholesky(obsv_cov_full, lower=True)

        # Predictive things.
        # Solve the linear systems.
        alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
        #beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)
        beta = spla.solve_triangular(obsv_chol_full, cand_cross_full,
                                     lower=True)

        # Predict the marginal means and variances at candidates.
        func_m = np.dot(cand_cross.T, alpha) + self.mean
        func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0)

        # Expected improvement
        func_s = np.sqrt(func_v)
        u = (best - func_m) / func_s
        ncdf = sps.norm.cdf(u)
        npdf = sps.norm.pdf(u)
        ei = func_s * (u * ncdf + npdf)

        ei_per_s = -np.sum(ei * func_constraint_m)
        if not compute_grad:
            return ei_per_s

        grad_constraint_xp_m = np.dot(t_alpha.transpose(), grad_cross_t)

        # Gradients of ei w.r.t. mean and variance
        g_ei_m = -ncdf
        g_ei_s2 = 0.5 * npdf / func_s

        # Apply covariance function
        cand_cross_grad = cov_grad_func(self.ls, comp, cand)
        grad_cross = np.squeeze(cand_cross_grad)

        cand_cross_grad_full = cov_grad_func(self.ls, compfull, cand)
        grad_cross_full = np.squeeze(cand_cross_grad_full)

        grad_xp_m = np.dot(alpha.transpose(), grad_cross)
        #grad_xp_v = np.dot(-2*spla.cho_solve((obsv_chol, True),
        #                                     cand_cross).transpose(),
        #                   grad_cross)
        grad_xp_v = np.dot(-2 * spla.cho_solve(
            (obsv_chol_full, True), cand_cross_full).transpose(),
            grad_cross_full)

        grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m + grad_xp_v * g_ei_s2)
        grad_constraint_xp_m = (0.5 * self.constraint_amp2 *
                                self.constraint_gain *
                                grad_constraint_xp_m *
                                func_constraint_m * (1 - func_constraint_m))

        # Product rule for the gradient of ei * P(valid).
        grad_xp = (func_constraint_m * grad_xp + ei * grad_constraint_xp_m)

        return ei_per_s, grad_xp.flatten()

    def compute_ei_per_s(self, comp, pend, cand, vals, labels):
        # Compute EI weighted by the probability of satisfying the
        # constraint; the constraint predictions don't depend on pending
        # experiments.

        # First pull out violating points
        compfull = comp.copy()
        comp = comp[labels > 0, :]
        vals = vals[labels > 0]

        # Compute covariances
        comp_constraint_cov = self.cov(self.constraint_amp2,
                                       self.constraint_ls, compfull)
        cand_constraint_cross = self.cov(self.constraint_amp2,
                                         self.constraint_ls, compfull, cand)

        # Cholesky decompositions
        obsv_constraint_cov = (comp_constraint_cov +
                               self.constraint_noise *
                               np.eye(compfull.shape[0]))
        obsv_constraint_chol = spla.cholesky(obsv_constraint_cov, lower=True)

        # Linear systems
        t_alpha = spla.cho_solve((obsv_constraint_chol, True),
                                 self.ff)  # - self.constraint_mean
        #t_beta = spla.solve_triangular(obsv_constraint_chol,
        #                               cand_constraint_cross, lower=True)

        # Predict marginal means and (possibly) variances
        func_constraint_m = np.dot(cand_constraint_cross.T,
                                   t_alpha)  # + self.constraint_mean
        # We don't really need the variances now
        #func_constraint_v = self.constraint_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0)
Squash through a logistic to get probability of not violating a constraint func_constraint_m = 1. / ( 1 + np.exp(-self.constraint_gain * func_constraint_m)) if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. best = np.min(vals) # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) comp_cov_full = self.cov(self.amp2, self.ls, compfull) cand_cross = self.cov(self.amp2, self.ls, comp, cand) cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0]) obsv_cov_full = comp_cov_full + self.noise * np.eye( compfull.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) obsv_chol_full = spla.cholesky(obsv_cov_full, lower=True) # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) #beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) beta = spla.solve_triangular(obsv_chol_full, cand_cross_full, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) ei_per_s = ei * func_constraint_m return ei_per_s else: # If there are pending experiments, fantasize their outcomes. # Create a composite vector of complete and pending. comp_pend = np.concatenate((comp, pend)) # Compute the covariance and Cholesky decomposition. comp_pend_cov = self.cov( self.amp2, self.ls, comp_pend) + self.noise * np.eye(comp_pend.shape[0]) comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True) # Compute submatrices. pend_cross = self.cov(self.amp2, self.ls, comp, pend) pend_kappa = self.cov(self.amp2, self.ls, pend) # Use the sub-Cholesky. obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]] # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.cho_solve((obsv_chol, True), pend_cross) # Finding predictive means and variances. pend_m = np.dot(pend_cross.T, alpha) + self.mean pend_K = pend_kappa - np.dot(pend_cross.T, beta) # Take the Cholesky of the predictive covariance. pend_chol = spla.cholesky(pend_K, lower=True) # Make predictions. pend_fant = np.dot(pend_chol, npr.randn(pend.shape[0], self.pending_samples)) + self.mean # Include the fantasies. fant_vals = np.concatenate( (np.tile(vals[:, np.newaxis], (1, self.pending_samples)), pend_fant)) # Compute bests over the fantasies. bests = np.min(fant_vals, axis=0) # Now generalize from these fantasies. cand_cross = self.cov(self.amp2, self.ls, comp_pend, cand) # Solve the linear systems. alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean) beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. 
func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v[:, np.newaxis]) u = (bests[np.newaxis, :] - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) return np.mean(ei, axis=1) * func_constraint_m def compute_ei(self, comp, pend, cand, vals, labels): # First we make predictions for the durations as that # doesn't depend on pending experiments # First pull out violating points compfull = comp.copy() comp = comp[labels > 0, :] vals = vals[labels > 0] # Compute covariances comp_constraint_cov = self.cov(self.constraint_amp2, self.constraint_ls, compfull) cand_constraint_cross = self.cov(self.constraint_amp2, self.constraint_ls, compfull, cand) # Cholesky decompositions obsv_constraint_cov = comp_constraint_cov + self.constraint_noise * np.eye( compfull.shape[0]) obsv_constraint_chol = spla.cholesky(obsv_constraint_cov, lower=True) # Linear systems t_alpha = spla.cho_solve((obsv_constraint_chol, True), self.ff) # - self.constraint_mean) #t_beta = spla.solve_triangular(obsv_constraint_chol, cand_constraint_cross, lower=True) # Predict marginal mean times and (possibly) variances func_constraint_m = (np.dot(cand_constraint_cross.T, t_alpha)) # + self.constraint_mean) # We don't really need the time variances now #func_constraint_v = self.constraint_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0) # Squash through a logistic to get probability of not violating a constraint func_constraint_m = 1. / ( 1 + np.exp(-self.constraint_gain * func_constraint_m)) if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. best = np.min(vals) # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) comp_cov_full = self.cov(self.amp2, self.ls, compfull) cand_cross = self.cov(self.amp2, self.ls, comp, cand) cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0]) obsv_cov_full = comp_cov_full + self.noise * np.eye( compfull.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) obsv_chol_full = spla.cholesky(obsv_cov_full, lower=True) # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) #beta = spla.solve_triangular(obsv_chol_full, cand_cross_full, lower=True) # Predict the marginal means and variances at candidates. 
func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) ei_per_s = ei #ei_per_s = ei return ei else: return 0 def sample_constraint_hypers(self, comp, labels): # The latent GP projection if self.ff is None: comp_cov = self.cov(self.amp2, self.ls, comp) obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) self.ff = np.dot(obsv_chol, npr.randn(obsv_chol.shape[0])) self._sample_constraint_noisy(comp, labels) self._sample_constraint_ls(comp, labels) self.constraint_hyper_samples.append( (self.constraint_mean, self.constraint_gain, self.constraint_amp2, self.constraint_ls)) self.ff_samples.append(self.ff) def sample_hypers(self, comp, vals): if self.noiseless: self.noise = 1e-3 self._sample_noiseless(comp, vals) else: self._sample_noisy(comp, vals) self._sample_ls(comp, vals) self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) def _sample_ls(self, comp, vals): def logprob(ls): if np.any(ls < 0) or np.any(ls > self.max_ls): return -np.inf cov = self.amp2 * (self.cov_func(ls, comp, None) + 1e-6 * np.eye( comp.shape[0])) + self.noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - self.mean) lp = (-np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - self.mean, solve)) return lp self.ls = util.slice_sample(self.ls, logprob, compwise=True) def _sample_constraint_ls(self, comp, vals): def lpSigmoid(ff, gain=self.constraint_gain): probs = 1. / (1. + np.exp(-gain * ff)) probs[probs <= 0] = 1e-12 probs[probs >= 1] = 1 - 1e-12 llh = np.sum(vals * np.log(probs) + (1 - vals) * np.log(1 - probs)) return llh def updateGain(gain): if gain < 0.01 or gain > 10: return -np.inf cov = self.constraint_amp2 * ( self.cov_func(self.constraint_ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) + self.constraint_noise * np.eye( comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals) # - self.constraint_mean) #lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(self.ff, solve) lp = lpSigmoid(self.ff, gain) return lp def logprob(ls): if np.any(ls < 0) or np.any(ls > self.constraint_max_ls): return -np.inf cov = self.constraint_amp2 * ( self.cov_func(ls, comp, None) + 1e-6 * np.eye(comp.shape[0]) ) + self.constraint_noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), self.ff) # - self.constraint_mean) #lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(self.ff, solve) lp = lpSigmoid(self.ff) return lp #hypers = util.slice_sample(np.hstack((self.constraint_ls, self.ff)), logprob, compwise=True) hypers = util.slice_sample(self.constraint_ls, logprob, compwise=True) self.constraint_ls = hypers cov = self.constraint_amp2 * ( self.cov_func(self.constraint_ls, comp, None) + 1e-6 * np.eye( comp.shape[0])) + self.constraint_noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=False) ff = self.ff for jj in xrange(20): (ff, lpell) = self.elliptical_slice(ff, chol, lpSigmoid) self.ff = ff # Update gain hypers = util.slice_sample(np.array([self.constraint_gain]), updateGain, compwise=True) self.constraint_gain = hypers def _sample_noisy(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = hypers[2] # This is pretty hacky, but keeps things sane. 
if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0 or noise < 0: return -np.inf cov = amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye( comp.shape[0])) + noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot( vals - mean, solve) # Roll in noise horseshoe prior. lp += np.log(np.log(1 + (self.noise_scale / noise)**2)) #lp -= 0.5*(np.log(noise)/self.noise_scale)**2 # Roll in amplitude lognormal prior lp -= 0.5 * (np.log(amp2) / self.amp2_scale)**2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = hypers[2] def _sample_constraint_noisy(self, comp, vals): def lpSigmoid(ff, gain=self.constraint_gain): probs = 1. / (1. + np.exp(-gain * ff)) probs[probs <= 0] = 1e-12 probs[probs >= 1] = 1 - 1e-12 llh = np.sum(vals * np.log(probs) + (1 - vals) * np.log(1 - probs)) return llh def logprob(hypers): #mean = hypers[0] amp2 = hypers[0] #gain = hypers[2] ff = hypers[1:] # This is pretty hacky, but keeps things sane. #if mean > np.max(vals) or mean < np.min(vals): # return -np.inf if amp2 < 0: return -np.inf noise = self.constraint_noise cov = amp2 * (self.cov_func(self.constraint_ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) + noise * np.eye( comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), ff) # - mean) #lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(ff-mean, solve) lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(ff, solve) # Roll in noise horseshoe prior. #lp += np.log(np.log(1 + (self.constraint_noise_scale/noise)**2)) #lp -= 0.5*(np.log(noise)/self.constraint_noise_scale)**2 # Roll in amplitude lognormal prior lp -= 0.5 * (np.log(amp2) / self.constraint_amp2_scale)**2 #lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(self.ff, solve) lp += lpSigmoid(ff, self.constraint_gain) return lp hypers = util.slice_sample(np.hstack( (np.array([self.constraint_amp2]), self.ff)), logprob, compwise=False) #self.constraint_mean = hypers[0] self.constraint_amp2 = hypers[0] #self.constraint_gain = hypers[2] self.ff = hypers[1:] cov = self.constraint_amp2 * ( self.cov_func(self.constraint_ls, comp, None) + 1e-6 * np.eye( comp.shape[0])) + self.constraint_noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=False) ff = self.ff for jj in xrange(50): (ff, lpell) = self.elliptical_slice(ff, chol, lpSigmoid) self.ff = ff def _sample_noiseless(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = 1e-3 # This is pretty hacky, but keeps things sane. 
if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0: return -np.inf cov = amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye( comp.shape[0])) + noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot( vals - mean, solve) # Roll in amplitude lognormal prior lp -= 0.5 * (np.log(amp2) / self.amp2_scale)**2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = 1e-3 def elliptical_slice(self, xx, chol_Sigma, log_like_fn, cur_log_like=None, angle_range=0): D = xx.shape[0] if cur_log_like is None: cur_log_like = log_like_fn(xx) nu = np.dot(chol_Sigma.T, np.random.randn(D, 1)).flatten() hh = np.log(np.random.rand()) + cur_log_like # Set up a bracket of angles and pick a first proposal. # "phi = (theta'-theta)" is a change in angle. if angle_range <= 0: # Bracket whole ellipse with both edges at first proposed point phi = np.random.rand() * 2 * math.pi phi_min = phi - 2 * math.pi phi_max = phi else: # Randomly center bracket on current point phi_min = -angle_range * np.random.rand() phi_max = phi_min + angle_range phi = np.random.rand() * (phi_max - phi_min) + phi_min # Slice sampling loop while True: # Compute xx for proposed angle difference and check if it's on the slice xx_prop = xx * np.cos(phi) + nu * np.sin(phi) cur_log_like = log_like_fn(xx_prop) if cur_log_like > hh: # New point is on slice, ** EXIT LOOP ** break # Shrink slice to rejected point if phi > 0: phi_max = phi elif phi < 0: phi_min = phi else: raise Exception( 'BUG DETECTED: Shrunk to current position and still not acceptable.' ) # Propose new angle difference phi = np.random.rand() * (phi_max - phi_min) + phi_min xx = xx_prop return (xx, cur_log_like) def optimize_hypers(self, comp, vals, labels): # First the GP to observations mygp = gp.GP(self.cov_func.__name__) mygp.real_init(comp.shape[1], vals) mygp.optimize_hypers(comp, vals) self.mean = mygp.mean self.ls = mygp.ls self.amp2 = mygp.amp2 self.noise = mygp.noise # Now the GP to times timegp = gp.GP(self.cov_func.__name__) timegp.real_init(comp.shape[1], labels) timegp.optimize_hypers(comp, labels) self.constraint_mean = timegp.mean self.constraint_amp2 = timegp.amp2 self.constraint_noise = timegp.noise self.constraint_ls = timegp.ls # Save hyperparameter samples self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) self.constraint_hyper_samples.append( (self.constraint_mean, self.constraint_noise, self.constraint_amp2, self.constraint_ls)) self.dump_hypers()
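# ---------------------------------------------------------------------------
# Standalone sketch (not part of the original module): how the constrained
# acquisition above combines plain expected improvement with the probability
# of satisfying the constraint.  The chooser squashes the latent constraint
# GP mean through a logistic, 1/(1+exp(-gain*m)), and multiplies EI by it.
# All numbers below are made up for illustration; the _demo_* name is an
# illustrative addition, not part of the chooser API.
# ---------------------------------------------------------------------------
def _demo_constrained_ei():
    import numpy as np
    import scipy.stats as sps

    best = 0.3                              # best feasible value so far
    func_m = np.array([0.25, 0.40, 0.60])   # predictive means at candidates
    func_s = np.array([0.10, 0.20, 0.30])   # predictive std devs
    latent = np.array([2.0, 0.0, -2.0])     # latent constraint GP means
    gain = 1.0                              # logistic gain

    # Closed-form EI, as in compute_ei above.
    u = (best - func_m) / func_s
    ei = func_s * (u * sps.norm.cdf(u) + sps.norm.pdf(u))

    # Probability of *not* violating the constraint.
    p_valid = 1. / (1 + np.exp(-gain * latent))

    # Constrained acquisition: EI weighted by feasibility probability.
    print 'EI          :', ei
    print 'P(valid)    :', p_valid
    print 'EI*P(valid) :', ei * p_valid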
class GPEIOptChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False, burnin=100,
                 grid_subset=20, use_multiprocessing=True):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")
        self.stats_file = os.path.join(
            expt_dir, self.__module__ + "_hyperparameters.txt")
        self.mcmc_iters = int(mcmc_iters)
        self.burnin = int(burnin)
        self.needs_burnin = True
        self.pending_samples = int(pending_samples)
        self.D = -1
        self.hyper_iters = 1
        # Number of points to optimize EI over
        self.grid_subset = int(grid_subset)
        self.noiseless = bool(int(noiseless))
        self.hyper_samples = []

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1     # zero-mean log normal prior
        self.max_ls = 2         # top-hat prior on length scales

        # If multiprocessing fails or deadlocks, set this to False
        self.use_multiprocessing = bool(int(use_multiprocessing))

    def dump_hypers(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'dims': self.D,
                      'ls': self.ls,
                      'amp2': self.amp2,
                      'noise': self.noise,
                      'hyper_samples': self.hyper_samples,
                      'mean': self.mean}, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

        # Write the hyperparameters out to a human readable file as well.
        fh = open(self.stats_file, 'w')
        fh.write('Mean Noise Amplitude <length scales>\n')
        fh.write('-----------ALL SAMPLES-------------\n')
        meanhyps = 0 * np.hstack(self.hyper_samples[0])
        for i in self.hyper_samples:
            hyps = np.hstack(i)
            meanhyps += (1 / float(len(self.hyper_samples))) * hyps
            for j in hyps:
                fh.write(str(j) + ' ')
            fh.write('\n')

        fh.write('-----------MEAN OF SAMPLES-------------\n')
        for j in meanhyps:
            fh.write(str(j) + ' ')
        fh.write('\n')
        fh.close()

    # This passes out html or javascript to display interesting
    # stats - such as the length scales (sensitivity to various
    # dimensions).
    def generate_stats_html(self):
        # Need this because the model may not necessarily be
        # initialized when this code is called.
        if not self._read_only():
            return 'Chooser not yet ready to display output'

        mean_mean = np.mean(np.vstack([h[0] for h in self.hyper_samples]))
        mean_noise = np.mean(np.vstack([h[1] for h in self.hyper_samples]))
        mean_ls = np.mean(np.vstack(
            [h[3][np.newaxis, :] for h in self.hyper_samples]), 0)

        try:
            output = (
                '<br /><span class=\"label label-info\">Estimated mean:</span> '
                + str(mean_mean)
                + '<br /><span class=\"label label-info\">Estimated noise:</span> '
                + str(mean_noise)
                + '<br /><br /><span class=\"label label-info\">'
                + 'Inverse parameter sensitivity'
                + ' - Gaussian Process length scales</span><br /><br />'
                + '<div id=\"lschart\"></div><script type=\"text/javascript\">'
                + 'var lsdata = ['
                + ','.join(['%.2f' % i for i in mean_ls])
                + '];')
        except:
            return 'Chooser not yet ready to display output.'

        output += ('bar_chart("#lschart", lsdata, ' + str(self.max_ls) + ');'
                   + '</script>')
        return output

    # Read in the chooser from file. Returns True only on success.
    def _read_only(self):
        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.hyper_samples = state['hyper_samples']
            self.needs_burnin = False
            return True

        return False

    def _real_init(self, dims, values):
        self.locker.lock_wait(self.state_pkl)
        self.randomstate = npr.get_state()

        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
            self.hyper_samples = state['hyper_samples']
            self.needs_burnin = False
        else:
            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values) + 1e-4

            # Initial observation noise.
            self.noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)

            # Save hyperparameter samples
            self.hyper_samples.append((self.mean, self.noise, self.amp2,
                                       self.ls))

        self.locker.unlock(self.state_pkl)

    def cov(self, x1, x2=None):
        if x2 is None:
            return self.amp2 * (self.cov_func(self.ls, x1, None)
                                + 1e-6 * np.eye(x1.shape[0]))
        else:
            return self.amp2 * self.cov_func(self.ls, x1, x2)

    # Given a set of completed 'experiments' in the unit hypercube with
    # corresponding objective 'values', pick the next experiment to run
    # according to the acquisition function.
    def next(self, grid, values, durations, candidates, pending, complete):
        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]
        numcand = cand.shape[0]

        # Spray a set of candidates around the min so far.
        best_comp = np.argmin(vals)
        cand2 = np.vstack((np.random.randn(10, comp.shape[1]) * 0.001
                           + comp[best_comp, :], cand))

        if self.mcmc_iters > 0:
            # Possibly burn in.
            if self.needs_burnin:
                for mcmc_iter in xrange(self.burnin):
                    self.sample_hypers(comp, vals)
                    log("BURN %d/%d] mean: %.2f amp: %.2f "
                        "noise: %.4f min_ls: %.4f max_ls: %.4f"
                        % (mcmc_iter + 1, self.burnin, self.mean,
                           np.sqrt(self.amp2), self.noise,
                           np.min(self.ls), np.max(self.ls)))
                self.needs_burnin = False

            # Sample from hyperparameters.
            # Adjust the candidates to hit EI peaks.
            self.hyper_samples = []
            for mcmc_iter in xrange(self.mcmc_iters):
                self.sample_hypers(comp, vals)
                log("%d/%d] mean: %.2f amp: %.2f noise: %.4f "
                    "min_ls: %.4f max_ls: %.4f"
                    % (mcmc_iter + 1, self.mcmc_iters, self.mean,
                       np.sqrt(self.amp2), self.noise,
                       np.min(self.ls), np.max(self.ls)))
            self.dump_hypers()

            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            overall_ei = self.ei_over_hypers(comp, pend, cand2, vals)
            inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:]
            cand2 = cand2[inds, :]

            # Optimize each point in parallel.
            if self.use_multiprocessing:
                pool = multiprocessing.Pool(self.grid_subset)
                results = [pool.apply_async(optimize_pt,
                                            args=(c, b, comp, pend, vals,
                                                  copy.copy(self)))
                           for c in cand2]
                for res in results:
                    cand = np.vstack((cand, res.get(1e8)))
                pool.close()
            else:
                # Optimize each point serially (fallback when
                # multiprocessing is disabled).
                for i in xrange(0, cand2.shape[0]):
                    log("Optimizing candidate %d/%d" % (i + 1,
                                                        cand2.shape[0]))
                    # self.check_grad_ei(cand2[i,:].flatten(), comp, pend, vals)
                    ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers,
                                            cand2[i, :].flatten(),
                                            args=(comp, pend, vals),
                                            bounds=b, disp=0)
                    cand2[i, :] = ret[0]
                cand = np.vstack((cand, cand2))

            overall_ei = self.ei_over_hypers(comp, pend, cand, vals)
            best_cand = np.argmax(np.mean(overall_ei, axis=1))

            if best_cand >= numcand:
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])
        else:
            # Optimize hyperparameters
            self.optimize_hypers(comp, vals)
            log("mean: %.2f amp: %.2f noise: %.4f "
                "min_ls: %.4f max_ls: %.4f"
                % (self.mean, np.sqrt(self.amp2), self.noise,
                   np.min(self.ls), np.max(self.ls)))

            # Optimize over EI
            b = []  # optimization bounds
            for i in xrange(0, cand.shape[1]):
                b.append((0, 1))

            for i in xrange(0, cand2.shape[0]):
                # grad_optimize_ei expects (comp, pend, vals, compute_grad).
                ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei,
                                        cand2[i, :].flatten(),
                                        args=(comp, pend, vals, True),
                                        bounds=b, disp=0)
                cand2[i, :] = ret[0]
            cand = np.vstack((cand, cand2))

            ei = self.compute_ei(comp, pend, cand, vals)
            best_cand = np.argmax(ei)

            if best_cand >= numcand:
                return (int(numcand), cand[best_cand, :])

            return int(candidates[best_cand])

    # Compute EI over hyperparameter samples.
    def ei_over_hypers(self, comp, pend, cand, vals):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in xrange(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            overall_ei[:, mcmc_iter] = self.compute_ei(comp, pend, cand,
                                                       vals)
        return overall_ei

    # Check the analytic EI gradient against central finite differences.
    def check_grad_ei(self, cand, comp, pend, vals):
        (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, pend, vals)
        dx2 = dx1 * 0
        idx = np.zeros(cand.shape[0])
        for i in xrange(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1, tmp) = self.grad_optimize_ei_over_hypers(
                cand + idx, comp, pend, vals)
            (ei2, tmp) = self.grad_optimize_ei_over_hypers(
                cand - idx, comp, pend, vals)
            # Central difference uses the two perturbed evaluations.
            dx2[i] = (ei1 - ei2) / (2 * 1e-6)
            idx[i] = 0

        print 'computed grads', dx1
        print 'finite diffs', dx2
        print (dx1 / dx2)
        print np.sum((dx1 - dx2)**2)
        time.sleep(2)

    # Adjust points by optimizing EI over a set of hyperparameter samples.
    def grad_optimize_ei_over_hypers(self, cand, comp, pend, vals,
                                     compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()
        ls = self.ls.copy()
        amp2 = self.amp2
        mean = self.mean
        noise = self.noise

        for hyper in self.hyper_samples:
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]

            if compute_grad:
                (ei, g_ei) = self.grad_optimize_ei(cand, comp, pend, vals,
                                                   compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand, comp, pend, vals,
                                           compute_grad)
            summed_ei += ei

        self.mean = mean
        self.amp2 = amp2
        self.noise = noise
        self.ls = ls.copy()

        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    # Adjust points based on optimizing their EI.
    def grad_optimize_ei(self, cand, comp, pend, vals, compute_grad=True):
        if pend.shape[0] == 0:
            best = np.min(vals)
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # The primary covariances for prediction.
            comp_cov = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp, cand)

            # Predictive things.
            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            if not compute_grad:
                return ei

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5 * npdf / func_s

            # Apply covariance function
            grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(), grad_cross)
            grad_xp_v = np.dot(-2 * spla.cho_solve((obsv_chol, True),
                                                   cand_cross).transpose(),
                               grad_cross)

            grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m
                                         + grad_xp_v * g_ei_s2)
            ei = -np.sum(ei)

            return ei, grad_xp.flatten()
        else:
            # If there are pending experiments, fantasize their outcomes.
            cand = np.reshape(cand, (-1, comp.shape[1]))

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(comp_pend)
                             + self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = (np.dot(pend_chol,
                                npr.randn(pend.shape[0],
                                          self.pending_samples))
                         + pend_m[:, None])

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis], (1, self.pending_samples)),
                 pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)
            cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__)
            cand_cross_grad = cov_grad_func(self.ls, comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol, cand_cross,
                                         lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            # Gradients of ei w.r.t. mean and variance
            g_ei_m = -ncdf
            g_ei_s2 = 0.5 * npdf / func_s

            # Apply covariance function
            grad_cross = np.squeeze(cand_cross_grad)

            grad_xp_m = np.dot(alpha.transpose(), grad_cross)
            grad_xp_v = np.dot(-2 * spla.cho_solve((comp_pend_chol, True),
                                                   cand_cross).transpose(),
                               grad_cross)

            grad_xp = 0.5 * self.amp2 * (
                grad_xp_m * np.tile(g_ei_m, (comp.shape[1], 1)).T
                + (grad_xp_v.T * g_ei_s2).T)
            ei = -np.mean(ei, axis=1)
            grad_xp = np.mean(grad_xp, axis=0)

            return ei, grad_xp.flatten()
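    # -----------------------------------------------------------------------
    # Standalone sketch (not part of the original chooser): grad_optimize_ei
    # above returns a (negated EI, gradient) pair precisely so that it can be
    # handed to scipy.optimize.fmin_l_bfgs_b, which minimizes a function
    # given its value and gradient.  The toy quadratic below stands in for
    # the EI surface; the _demo_* name is an illustrative addition.
    # -----------------------------------------------------------------------
    @staticmethod
    def _demo_lbfgs_contract():
        import numpy as np
        import scipy.optimize as spo

        def neg_objective(x):
            # Value and gradient of f(x) = sum((x - 0.7)**2); the chooser's
            # grad_optimize_ei obeys the same (value, gradient) contract.
            return np.sum((x - 0.7)**2), 2 * (x - 0.7)

        x0 = np.zeros(3)
        bounds = [(0, 1)] * 3  # same unit-hypercube bounds as in next()
        x_opt, f_opt, info = spo.fmin_l_bfgs_b(neg_objective, x0,
                                               bounds=bounds, disp=0)
        print 'optimum:', x_opt, 'value:', f_opt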
    def compute_ei(self, comp, pend, cand, vals):
        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return ei
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(comp_pend)
                             + self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            npr.set_state(self.randomstate)
            pend_fant = (np.dot(pend_chol,
                                npr.randn(pend.shape[0],
                                          self.pending_samples))
                         + pend_m[:, None])

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis], (1, self.pending_samples)),
                 pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol, cand_cross,
                                         lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (self.amp2 * (self.cov_func(ls, comp, None)
                                + 1e-6 * np.eye(comp.shape[0]))
                   + self.noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = (-np.sum(np.log(np.diag(chol)))
                  - 0.5 * np.dot(vals - self.mean, solve))
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf
            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None)
                           + 1e-6 * np.eye(comp.shape[0]))
                   + noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = (-np.sum(np.log(np.diag(chol)))
                  - 0.5 * np.dot(vals - mean, solve))

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale / noise)**2))

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = 1e-3

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf
            if amp2 < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None)
                           + 1e-6 * np.eye(comp.shape[0]))
                   + noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = (-np.sum(np.log(np.diag(chol)))
                  - 0.5 * np.dot(vals - mean, solve))

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        self.dump_hypers()

        return
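# ---------------------------------------------------------------------------
# Standalone sketch (not part of the original module): the worker-pool
# pattern used in GPEIOptChooser.next above.  Candidates are farmed out with
# pool.apply_async and collected with res.get(); the module-level worker
# below is a trivial stand-in for optimize_pt (workers must be module-level
# so they can be pickled).  The _demo_* names are illustrative additions.
# ---------------------------------------------------------------------------
def _demo_pool_worker(c):
    # Stand-in for optimize_pt: pretend to "optimize" one candidate.
    return c * 0.5


def _demo_pool_pattern():
    import multiprocessing
    import numpy as np

    cand2 = np.linspace(0, 1, 4)
    pool = multiprocessing.Pool(2)
    results = [pool.apply_async(_demo_pool_worker, args=(c,)) for c in cand2]
    out = np.array([res.get(1e8) for res in results])
    pool.close()
    pool.join()
    print 'optimized candidates:', out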
class ExperimentGrid:

    @staticmethod
    def job_running(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_running(id)

    @staticmethod
    def job_complete(expt_dir, id, value, duration):
        log("setting job %d complete" % id)
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_complete(id, value, duration)
        log("set...")

    @staticmethod
    def job_broken(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_broken(id)

    def __init__(self, expt_dir, variables=None, grid_size=None, grid_seed=1):
        self.expt_dir = expt_dir
        self.jobs_pkl = os.path.join(expt_dir, EXPERIMENT_GRID_FILE)
        self.locker = Locker()

        # Only one process at a time is allowed to have access to the grid.
        self.locker.lock_wait(self.jobs_pkl)

        # Set up the grid for the first time if it doesn't exist.
        if variables is not None and not os.path.exists(self.jobs_pkl):
            self.seed = grid_seed
            self.vmap = GridMap(variables, grid_size)
            self.grid = self._hypercube_grid(self.vmap.card(), grid_size)
            self.status = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE
            self.values = np.zeros(grid_size) + np.nan
            self.durs = np.zeros(grid_size) + np.nan
            self.executed = np.zeros(grid_size)
            self.proc_ids = np.zeros(grid_size, dtype=int)
            self._save_jobs()
        # Or load in the grid from the pickled file.
        else:
            self._load_jobs()

    def __del__(self):
        self._save_jobs()
        if not self.locker.unlock(self.jobs_pkl):
            raise Exception("Could not release lock on job grid.\n")

    def get_grid(self):
        return self.grid, self.values, self.durs

    def get_candidates(self):
        return np.nonzero(self.status == CANDIDATE_STATE)[0]

    def get_pending(self):
        return np.nonzero((self.status == SUBMITTED_STATE)
                          | (self.status == RUNNING_STATE))[0]

    def get_complete(self):
        return np.nonzero(self.status == COMPLETE_STATE)[0]

    def get_broken(self):
        return np.nonzero(self.status == BROKEN_STATE)[0]

    def get_executed(self):
        return np.nonzero(self.executed == 1)[0]

    def get_params(self, index):
        return self.vmap.get_params(self.grid[index, :])

    def get_best(self):
        finite = self.values[np.isfinite(self.values)]
        if len(finite) > 0:
            cur_min = np.min(finite)
            index = np.nonzero(self.values == cur_min)[0][0]
            return cur_min, index
        else:
            return np.nan, -1

    def get_proc_id(self, id):
        return self.proc_ids[id]

    def add_to_grid(self, candidate):
        # Checks to prevent numerical over/underflow from corrupting the grid
        candidate[candidate > 1.0] = 1.0
        candidate[candidate < 0.0] = 0.0

        # Set up the grid (keep all per-job arrays the same length).
        self.grid = np.vstack((self.grid, candidate))
        self.status = np.append(self.status,
                                np.zeros(1, dtype=int)
                                + int(CANDIDATE_STATE))
        self.values = np.append(self.values, np.zeros(1) + np.nan)
        self.durs = np.append(self.durs, np.zeros(1) + np.nan)
        self.executed = np.append(self.executed, np.zeros(1))
        self.proc_ids = np.append(self.proc_ids, np.zeros(1, dtype=int))

        # Save this out.
        self._save_jobs()
        return self.grid.shape[0] - 1

    def set_candidate(self, id):
        self.status[id] = CANDIDATE_STATE
        self._save_jobs()

    def set_submitted(self, id, proc_id):
        self.status[id] = SUBMITTED_STATE
        self.proc_ids[id] = proc_id
        self._save_jobs()

    def set_running(self, id):
        self.status[id] = RUNNING_STATE
        self._save_jobs()

    def set_complete(self, id, value, duration):
        self.status[id] = COMPLETE_STATE
        self.values[id] = value
        self.durs[id] = duration
        self.executed[id] = 1
        self._save_jobs()

    def set_broken(self, id):
        self.status[id] = BROKEN_STATE
        self._save_jobs()

    def _load_jobs(self):
        fh = open(self.jobs_pkl, 'r')
        jobs = cPickle.load(fh)
        fh.close()

        self.vmap = jobs['vmap']
        self.grid = jobs['grid']
        self.status = jobs['status']
        self.values = jobs['values']
        self.durs = jobs['durs']
        self.executed = jobs['executed']
        self.proc_ids = jobs['proc_ids']

    def _save_jobs(self):
        # Write everything to a temporary file first.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'vmap': self.vmap,
                      'grid': self.grid,
                      'status': self.status,
                      'values': self.values,
                      'durs': self.durs,
                      'executed': self.executed,
                      'proc_ids': self.proc_ids},
                     fh, protocol=-1)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.jobs_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

    def _hypercube_grid(self, dims, size):
        # Generate from a Sobol sequence.
        sobol_grid = np.transpose(i4_sobol_generate(dims, size, self.seed))
        return sobol_grid
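# ---------------------------------------------------------------------------
# Standalone sketch (not part of the original module): the status-array
# bookkeeping that ExperimentGrid uses.  get_candidates, get_pending, etc.
# are just np.nonzero masks over a single integer status vector.  The local
# constants below are stand-ins for the *_STATE values defined elsewhere in
# this module; the _demo_* name is an illustrative addition.
# ---------------------------------------------------------------------------
def _demo_status_masks():
    import numpy as np

    CANDIDATE, SUBMITTED, RUNNING, COMPLETE, BROKEN = range(5)
    status = np.array([CANDIDATE, SUBMITTED, RUNNING, COMPLETE, BROKEN,
                       CANDIDATE])

    candidates = np.nonzero(status == CANDIDATE)[0]
    pending = np.nonzero((status == SUBMITTED) | (status == RUNNING))[0]
    complete = np.nonzero(status == COMPLETE)[0]

    print 'candidates:', candidates   # -> [0 5]
    print 'pending   :', pending      # -> [1 2]
    print 'complete  :', complete     # -> [3]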
class GPEIChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False):
        self.cov_func = getattr(gp, covar)
        self.locker = Locker()
        self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl")

        self.mcmc_iters = int(mcmc_iters)
        self.pending_samples = pending_samples
        self.D = -1
        self.hyper_iters = 1
        self.noiseless = bool(int(noiseless))

        self.noise_scale = 0.1  # horseshoe prior
        self.amp2_scale = 1     # zero-mean log normal prior
        self.max_ls = 2         # top-hat prior on length scales

    def __del__(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'dims': self.D,
                      'ls': self.ls,
                      'amp2': self.amp2,
                      'noise': self.noise,
                      'mean': self.mean}, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

    def _real_init(self, dims, values):
        self.locker.lock_wait(self.state_pkl)

        if os.path.exists(self.state_pkl):
            fh = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D = state['dims']
            self.ls = state['ls']
            self.amp2 = state['amp2']
            self.noise = state['noise']
            self.mean = state['mean']
        else:
            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values) + 1e-4

            # Initial observation noise.
            self.noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)

        self.locker.unlock(self.state_pkl)

    def cov(self, x1, x2=None):
        if x2 is None:
            return self.amp2 * (self.cov_func(self.ls, x1, None)
                                + 1e-6 * np.eye(x1.shape[0]))
        else:
            return self.amp2 * self.cov_func(self.ls, x1, x2)

    def next(self, grid, values, durations, candidates, pending, complete):
        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]

        if self.mcmc_iters > 0:
            # Sample from hyperparameters.
            overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))

            for mcmc_iter in xrange(self.mcmc_iters):
                self.sample_hypers(comp, vals)
                log("mean: %f amp: %f noise: %f min_ls: %f max_ls: %f"
                    % (self.mean, np.sqrt(self.amp2), self.noise,
                       np.min(self.ls), np.max(self.ls)))
                overall_ei[:, mcmc_iter] = self.compute_ei(comp, pend, cand,
                                                           vals)

            best_cand = np.argmax(np.mean(overall_ei, axis=1))
            return int(candidates[best_cand])
        else:
            # Optimize hyperparameters
            try:
                self.optimize_hypers(comp, vals)
            except:
                # Fall back on default values if the optimization fails.
                # Initial length scales.
                self.ls = np.ones(self.D)
                # Initial amplitude.
                self.amp2 = np.std(vals)
                # Initial observation noise.
                self.noise = 1e-3
            log("mean: %f amp: %f noise: %f min_ls: %f max_ls: %f"
                % (self.mean, np.sqrt(self.amp2), self.noise,
                   np.min(self.ls), np.max(self.ls)))

            ei = self.compute_ei(comp, pend, cand, vals)
            best_cand = np.argmax(ei)
            return int(candidates[best_cand])

    def compute_ei(self, comp, pend, cand, vals):
        if pend.shape[0] == 0:
            # If there are no pending, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v)
            u = (best - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return ei
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov = (self.cov(comp_pend)
                             + self.noise * np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta = spla.cho_solve((obsv_chol, True), pend_cross)

            # Finding predictive means and variances.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions.
            pend_fant = (np.dot(pend_chol,
                                npr.randn(pend.shape[0],
                                          self.pending_samples))
                         + pend_m[:, None])

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis], (1, self.pending_samples)),
                 pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta = spla.solve_triangular(comp_pend_chol, cand_cross,
                                         lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement
            func_s = np.sqrt(func_v[:, np.newaxis])
            u = (bests[np.newaxis, :] - func_m) / func_s
            ncdf = sps.norm.cdf(u)
            npdf = sps.norm.pdf(u)
            ei = func_s * (u * ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (self.amp2 * (self.cov_func(ls, comp, None)
                                + 1e-6 * np.eye(comp.shape[0]))
                   + self.noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp = (-np.sum(np.log(np.diag(chol)))
                  - 0.5 * np.dot(vals - self.mean, solve))
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf
            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None)
                           + 1e-6 * np.eye(comp.shape[0]))
                   + noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = (-np.sum(np.log(np.diag(chol)))
                  - 0.5 * np.dot(vals - mean, solve))

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale / noise)**2))

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = hypers[2]
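    # -----------------------------------------------------------------------
    # Standalone sketch (not part of the original chooser): behaviour of the
    # horseshoe-style noise prior term used in _sample_noisy above,
    # log(log(1 + (scale/noise)**2)).  The term grows as the noise shrinks
    # and decays for large noise, so the slice sampler favours small but
    # nonzero noise values.  The _demo_* name is an illustrative addition.
    # -----------------------------------------------------------------------
    @staticmethod
    def _demo_noise_prior():
        import numpy as np

        scale = 0.1  # same default as self.noise_scale
        for noise in [1e-4, 1e-3, 1e-2, 1e-1, 1.0]:
            lp = np.log(np.log(1 + (scale / noise)**2))
            print 'noise=%g  prior log-density term=%f' % (noise, lp)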
    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean = hypers[0]
            amp2 = hypers[1]
            noise = 1e-3

            if amp2 < 0:
                return -np.inf

            cov = (amp2 * (self.cov_func(self.ls, comp, None)
                           + 1e-6 * np.eye(comp.shape[0]))
                   + noise * np.eye(comp.shape[0]))
            chol = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp = (-np.sum(np.log(np.diag(chol)))
                  - 0.5 * np.dot(vals - mean, solve))

            # Roll in amplitude lognormal prior
            lp -= 0.5 * (np.log(amp2) / self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean = hypers[0]
        self.amp2 = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean = mygp.mean
        self.ls = mygp.ls
        self.amp2 = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        # self.hyper_samples.append((self.mean, self.noise, self.amp2,
        #                            self.ls))
        # self.dump_hypers()

        return
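# ---------------------------------------------------------------------------
# Standalone sketch (not part of the original module): sanity check of the
# closed-form expected improvement used by all the choosers above,
# EI = s*(u*Phi(u) + phi(u)) with u = (best - m)/s, against a Monte Carlo
# estimate of E[max(best - f, 0)] for f ~ N(m, s**2).  The numbers and the
# _demo_* name are illustrative additions.
# ---------------------------------------------------------------------------
def _demo_ei_closed_form():
    import numpy as np
    import scipy.stats as sps

    best, func_m, func_s = 0.0, 0.2, 0.5

    u = (best - func_m) / func_s
    ei_exact = func_s * (u * sps.norm.cdf(u) + sps.norm.pdf(u))

    f = np.random.randn(1000000) * func_s + func_m
    ei_mc = np.mean(np.maximum(best - f, 0))

    print 'closed form EI :', ei_exact   # ~0.115 for these numbers
    print 'Monte Carlo EI :', ei_mc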
class ExperimentGrid:

    @staticmethod
    def job_running(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_running(id)

    @staticmethod
    def job_complete(expt_dir, id, value, duration):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_complete(id, value, duration)

    @staticmethod
    def job_broken(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_broken(id)

    def __init__(self, expt_dir, variables=None, grid_size=None, grid_seed=1):
        self.expt_dir = expt_dir
        self.jobs_pkl = os.path.join(expt_dir, 'expt-grid.pkl')
        self.locker = Locker()

        # Only one process at a time is allowed to have access to this.
        sys.stderr.write("Waiting to lock grid...")
        self.locker.lock_wait(self.jobs_pkl)
        sys.stderr.write("...acquired\n")

        # Does this exist already?
        if variables is not None and not os.path.exists(self.jobs_pkl):
            # Set up the grid for the first time.
            self.seed = grid_seed
            self.vmap = GridMap(variables, grid_size)
            self.grid = self._hypercube_grid(self.vmap.card(), grid_size)
            self.status = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE
            self.values = np.zeros(grid_size) + np.nan
            self.durs = np.zeros(grid_size) + np.nan
            self.sgeids = np.zeros(grid_size, dtype=int)

            # Save this out.
            self._save_jobs()
        else:
            # Load in from the pickle.
            self._load_jobs()

    def __del__(self):
        self._save_jobs()
        if self.locker.unlock(self.jobs_pkl):
            sys.stderr.write("Released lock on job grid.\n")
        else:
            raise Exception("Could not release lock on job grid.\n")

    def get_grid(self):
        return self.grid, self.values, self.durs

    def get_candidates(self):
        return np.nonzero(self.status == CANDIDATE_STATE)[0]

    def get_pending(self):
        return np.nonzero((self.status == SUBMITTED_STATE)
                          | (self.status == RUNNING_STATE))[0]

    def get_complete(self):
        return np.nonzero(self.status == COMPLETE_STATE)[0]

    def get_broken(self):
        return np.nonzero(self.status == BROKEN_STATE)[0]

    def get_params(self, index):
        return self.vmap.get_params(self.grid[index, :])

    def get_best(self):
        finite = self.values[np.isfinite(self.values)]
        if len(finite) > 0:
            cur_min = np.min(finite)
            index = np.nonzero(self.values == cur_min)[0][0]
            return cur_min, index
        else:
            return np.nan, -1

    def get_sgeid(self, id):
        return self.sgeids[id]

    def add_to_grid(self, candidate):
        # Set up the grid
        self.grid = np.vstack((self.grid, candidate))
        self.status = np.append(self.status,
                                np.zeros(1, dtype=int)
                                + int(CANDIDATE_STATE))
        self.values = np.append(self.values, np.zeros(1) + np.nan)
        self.durs = np.append(self.durs, np.zeros(1) + np.nan)
        self.sgeids = np.append(self.sgeids, np.zeros(1, dtype=int))

        # Save this out.
        self._save_jobs()
        return self.grid.shape[0] - 1

    def set_candidate(self, id):
        self.status[id] = CANDIDATE_STATE
        self._save_jobs()

    def set_submitted(self, id, sgeid):
        self.status[id] = SUBMITTED_STATE
        self.sgeids[id] = sgeid
        self._save_jobs()

    def set_running(self, id):
        self.status[id] = RUNNING_STATE
        self._save_jobs()

    def set_complete(self, id, value, duration):
        self.status[id] = COMPLETE_STATE
        self.values[id] = value
        self.durs[id] = duration
        self._save_jobs()

    def set_broken(self, id):
        self.status[id] = BROKEN_STATE
        self._save_jobs()

    def _load_jobs(self):
        fh = open(self.jobs_pkl, 'r')
        jobs = cPickle.load(fh)
        fh.close()

        self.vmap = jobs['vmap']
        self.grid = jobs['grid']
        self.status = jobs['status']
        self.values = jobs['values']
        self.durs = jobs['durs']
        self.sgeids = jobs['sgeids']

    def _save_jobs(self):
        # Write everything to a temporary file first.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'vmap': self.vmap,
                      'grid': self.grid,
                      'status': self.status,
                      'values': self.values,
                      'durs': self.durs,
                      'sgeids': self.sgeids}, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.jobs_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

    def _hypercube_grid(self, dims, size):
        # Generate from a Sobol sequence.
        # sobol_grid = np.transpose(i4_sobol_generate(dims, size, self.seed))
        sobol_grid = sobol_generate(dims, size, self.seed)
        return sobol_grid
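# ---------------------------------------------------------------------------
# Standalone sketch (not part of the original module): the write-to-temp-
# then-move pattern used by _save_jobs and dump_hypers above.  Writing the
# pickle to a temporary file in the destination directory and then moving it
# into place means readers never observe a half-written file; os.rename is
# the in-Python analogue of the 'mv' shell command used above (atomic within
# a single filesystem).  The _demo_* name is an illustrative addition.
# ---------------------------------------------------------------------------
def _demo_atomic_save(path, obj):
    import cPickle
    import os
    import tempfile

    # Create the temp file next to the destination so the rename stays on
    # the same filesystem (cross-device renames are not atomic).
    fh = tempfile.NamedTemporaryFile(
        mode='w', delete=False,
        dir=os.path.dirname(os.path.abspath(path)))
    cPickle.dump(obj, fh)
    fh.close()
    os.rename(fh.name, path)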
inline=1, fontsize=10) plt.plot(comp[labels == 0,0], comp[labels == 0,1], 'rx') plt.plot(comp[labels == 1,0], comp[labels == 1,1], 'bx') plt.title('Contours of EI') plt.legend(('Constraint Violations', 'Good points'),'lower left') plt.savefig('constrained_ei_chooser_ei_contour.pdf') plt.show() # Pick the top candidates to optimize over overall_ei = self.ei_over_hypers(comp,pend,cand2,vals,labels) inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:] cand2 = cand2[inds,:] # Adjust the candidates to hit ei peaks b = []# optimization bounds for i in xrange(0, cand.shape[1]): b.append((0, 1)) for i in xrange(0, cand2.shape[0]): sys.stderr.write("Optimizing candidate %d/%d\n" % (i+1, cand2.shape[0])) self.check_grad_ei_per(cand2[i,:], comp, vals, labels) ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers, cand2[i,:].flatten(), args=(comp,vals,labels,True), bounds=b, disp=0) cand2[i,:] = ret[0] cand = np.vstack((cand, cand2)) overall_ei = self.ei_over_hypers(comp,pend,cand,vals,labels) best_cand = np.argmax(np.mean(overall_ei, axis=1)) self.dump_hypers() if (best_cand >= numcand): return (int(numcand), cand[best_cand,:]) return int(candidates[best_cand]) else: # Optimize hyperparameters self.optimize_hypers(comp, vals, labels) sys.stderr.write("mean: %f amp: %f noise: %f " "min_ls: %f max_ls: %f\n" % (self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls))) # Pick the top candidates to optimize over ei = self.compute_ei_per_s(comp, pend, cand2, vals, labels) inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:] cand2 = cand2[inds,:] # Adjust the candidates to hit ei peaks b = []# optimization bounds for i in xrange(0, cand.shape[1]): b.append((0, 1)) for i in xrange(0, cand2.shape[0]): sys.stderr.write("Optimizing candidate %d/%d\n" % (i+1, cand2.shape[0])) ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei, cand2[i,:].flatten(), args=(comp,vals,labels,True), bounds=b, disp=0) cand2[i,:] = ret[0] cand = np.vstack((cand, cand2)) ei = self.compute_ei_per_s(comp, pend, cand, vals, labels) best_cand = np.argmax(ei) self.dump_hypers() if (best_cand >= numcand): return (int(numcand), cand[best_cand,:]) return int(candidates[best_cand]) # Predict constraint voilating points def pred_constraint_voilation(self, cand, comp, vals): # The primary covariances for prediction. comp_cov = self.cov(self.constraint_amp2, self.constraint_ls, comp) cand_cross = self.cov(self.constraint_amp2, self.constraint_ls, comp, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.constraint_noise*np.eye(comp.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.constraint_ls, comp, cand) # Predictive things. # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), self.ff)# - self.constraint_mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. 
func_m = np.dot(cand_cross.T, alpha)# + self.constraint_mean func_m = 1./(1 + np.exp(-self.constraint_gain*func_m)) return func_m # Compute EI over hyperparameter samples def ei_over_hypers(self,comp,pend,cand,vals,labels): overall_ei = np.zeros((cand.shape[0], self.mcmc_iters)) for mcmc_iter in xrange(self.mcmc_iters): hyper = self.hyper_samples[mcmc_iter] constraint_hyper = self.constraint_hyper_samples[mcmc_iter] self.mean = hyper[0] self.noise = hyper[1] self.amp2 = hyper[2] self.ls = hyper[3] self.constraint_mean = constraint_hyper[0] self.constraint_gain = constraint_hyper[1] self.constraint_amp2 = constraint_hyper[2] self.constraint_ls = constraint_hyper[3] overall_ei[:,mcmc_iter] = self.compute_ei_per_s(comp, pend, cand, vals, labels) return overall_ei # Adjust points by optimizing EI over a set of hyperparameter samples def grad_optimize_ei_over_hypers(self, cand, comp, vals, labels, compute_grad=True): summed_ei = 0 summed_grad_ei = np.zeros(cand.shape).flatten() for mcmc_iter in xrange(self.mcmc_iters): hyper = self.hyper_samples[mcmc_iter] constraint_hyper = self.constraint_hyper_samples[mcmc_iter] self.mean = hyper[0] self.noise = hyper[1] self.amp2 = hyper[2] self.ls = hyper[3] self.constraint_mean = constraint_hyper[0] self.constraint_gain = constraint_hyper[1] self.constraint_amp2 = constraint_hyper[2] self.constraint_ls = constraint_hyper[3] if compute_grad: (ei,g_ei) = self.grad_optimize_ei(cand,comp,vals,labels,compute_grad) summed_grad_ei = summed_grad_ei + g_ei else: ei = self.grad_optimize_ei(cand,comp,vals,labels,compute_grad) summed_ei += ei if compute_grad: return (summed_ei, summed_grad_ei) else: return summed_ei def check_grad_ei_per(self, cand, comp, vals, labels): (ei,dx1) = self.grad_optimize_ei_over_hypers(cand, comp, vals, labels) dx2 = dx1*0 idx = np.zeros(cand.shape[0]) for i in xrange(0, cand.shape[0]): idx[i] = 1e-6 (ei1,tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, vals, labels) (ei2,tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, vals, labels) dx2[i] = (ei - ei2)/(2*1e-6) idx[i] = 0 print 'computed grads', dx1 print 'finite diffs', dx2 print (dx1/dx2) print np.sum((dx1 - dx2)**2) time.sleep(2) def grad_optimize_ei(self, cand, comp, vals, labels, compute_grad=True): # Here we have to compute the gradients for constrained ei # This means deriving through the two kernels, the one for predicting # constraint violations and the one predicting ei # First pull out violating points compfull = comp.copy() comp = comp[labels > 0, :] vals = vals[labels > 0] best = np.min(vals) cand = np.reshape(cand, (-1, comp.shape[1])) # First we make predictions for the durations # Compute covariances comp_constraint_cov = self.cov(self.constraint_amp2, self.constraint_ls, compfull) cand_constraint_cross = self.cov(self.constraint_amp2, self.constraint_ls, compfull,cand) # Cholesky decompositions obsv_constraint_cov = comp_constraint_cov + self.constraint_noise*np.eye( compfull.shape[0]) obsv_constraint_chol = spla.cholesky( obsv_constraint_cov, lower=True) # Linear systems t_alpha = spla.cho_solve((obsv_constraint_chol, True), self.ff)# - self.constraint_mean) # Predict marginal mean times and (possibly) variances func_constraint_m = np.dot(cand_constraint_cross.T, t_alpha) # Squash through logistic to get probabilities func_constraint_m = 1./(1+np.exp(-self.constraint_gain*func_constraint_m)) # Apply covariance function cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.constraint_ls, compfull, cand) 
grad_cross_t = np.squeeze(cand_cross_grad) # Now compute the gradients w.r.t. ei # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) cand_cross = self.cov(self.amp2, self.ls, comp, cand) comp_cov_full = self.cov(self.amp2, self.ls, compfull) cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise*np.eye(comp.shape[0]) obsv_chol = spla.cholesky( obsv_cov, lower=True ) obsv_cov_full = comp_cov_full + self.noise*np.eye(compfull.shape[0]) obsv_chol_full = spla.cholesky( obsv_cov_full, lower=True) # Predictive things. # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) #beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) beta = spla.solve_triangular(obsv_chol_full, cand_cross_full, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s*(u*ncdf + npdf) ei_per_s = -np.sum(ei*func_constraint_m) if not compute_grad: return ei_per_s grad_constraint_xp_m = np.dot(t_alpha.transpose(),grad_cross_t) # Gradients of ei w.r.t. mean and variance g_ei_m = -ncdf g_ei_s2 = 0.5*npdf / func_s # Apply covariance function cand_cross_grad = cov_grad_func(self.ls, comp, cand) grad_cross = np.squeeze(cand_cross_grad) cand_cross_grad_full = cov_grad_func(self.ls, compfull, cand) grad_cross_full = np.squeeze(cand_cross_grad_full) grad_xp_m = np.dot(alpha.transpose(),grad_cross) #grad_xp_v = np.dot(-2*spla.cho_solve((obsv_chol, True), # cand_cross).transpose(),grad_cross) grad_xp_v = np.dot(-2*spla.cho_solve((obsv_chol_full, True), cand_cross_full).transpose(),grad_cross_full) grad_xp = 0.5*self.amp2*(grad_xp_m*g_ei_m + grad_xp_v*g_ei_s2) grad_constraint_xp_m = 0.5*self.constraint_amp2*self.constraint_gain*grad_constraint_xp_m*func_constraint_m*(1-func_constraint_m) grad_xp = (func_constraint_m*grad_xp + ei*grad_constraint_xp_m) return ei_per_s, grad_xp.flatten() def compute_ei_per_s(self, comp, pend, cand, vals, labels): # First we make predictions for the durations as that # doesn't depend on pending experiments # First pull out violating points compfull = comp.copy() comp = comp[labels > 0, :] vals = vals[labels > 0] # Compute covariances comp_constraint_cov = self.cov(self.constraint_amp2, self.constraint_ls, compfull) cand_constraint_cross = self.cov(self.constraint_amp2, self.constraint_ls, compfull,cand) # Cholesky decompositions obsv_constraint_cov = comp_constraint_cov + self.constraint_noise*np.eye( compfull.shape[0]) obsv_constraint_chol = spla.cholesky( obsv_constraint_cov, lower=True ) # Linear systems t_alpha = spla.cho_solve((obsv_constraint_chol, True), self.ff)# - self.constraint_mean) #t_beta = spla.solve_triangular(obsv_constraint_chol, cand_constraint_cross, lower=True) # Predict marginal mean times and (possibly) variances func_constraint_m = (np.dot(cand_constraint_cross.T, t_alpha))# + self.constraint_mean) # We don't really need the time variances now #func_constraint_v = self.constraint_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0) # Squash through a logistic to get probability of not violating a constraint func_constraint_m = 1./(1+np.exp(-self.constraint_gain*func_constraint_m)) if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. 
best = np.min(vals) # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) comp_cov_full = self.cov(self.amp2, self.ls, compfull) cand_cross = self.cov(self.amp2, self.ls, comp, cand) cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise*np.eye(comp.shape[0]) obsv_cov_full = comp_cov_full + self.noise*np.eye(compfull.shape[0]) obsv_chol = spla.cholesky( obsv_cov, lower=True ) obsv_chol_full = spla.cholesky( obsv_cov_full, lower=True ) # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) #beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) beta = spla.solve_triangular(obsv_chol_full, cand_cross_full, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s*( u*ncdf + npdf) ei_per_s = ei*func_constraint_m return ei_per_s else: # If there are pending experiments, fantasize their outcomes. # Create a composite vector of complete and pending. comp_pend = np.concatenate((comp, pend)) # Compute the covariance and Cholesky decomposition. comp_pend_cov = self.cov(self.amp2, self.ls, comp_pend) + self.noise*np.eye(comp_pend.shape[0]) comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True) # Compute submatrices. pend_cross = self.cov(self.amp2, self.ls, comp, pend) pend_kappa = self.cov(self.amp2, self.ls, pend) # Use the sub-Cholesky. obsv_chol = comp_pend_chol[:comp.shape[0],:comp.shape[0]] # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.cho_solve((obsv_chol, True), pend_cross) # Finding predictive means and variances. pend_m = np.dot(pend_cross.T, alpha) + self.mean pend_K = pend_kappa - np.dot(pend_cross.T, beta) # Take the Cholesky of the predictive covariance. pend_chol = spla.cholesky(pend_K, lower=True) # Make predictions. pend_fant = np.dot(pend_chol, npr.randn(pend.shape[0],self.pending_samples)) + self.mean # Include the fantasies. fant_vals = np.concatenate((np.tile(vals[:,np.newaxis], (1,self.pending_samples)), pend_fant)) # Compute bests over the fantasies. bests = np.min(fant_vals, axis=0) # Now generalize from these fantasies. cand_cross = self.cov(self.amp2, self.ls, comp_pend, cand) # Solve the linear systems. alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean) beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. 
func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v[:,np.newaxis]) u = (bests[np.newaxis,:] - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s*( u*ncdf + npdf) return np.mean(ei, axis=1)*func_constraint_m def compute_ei(self, comp, pend, cand, vals, labels): # First we make predictions for the durations as that # doesn't depend on pending experiments # First pull out violating points compfull = comp.copy() comp = comp[labels > 0, :] vals = vals[labels > 0] # Compute covariances comp_constraint_cov = self.cov(self.constraint_amp2, self.constraint_ls, compfull) cand_constraint_cross = self.cov(self.constraint_amp2, self.constraint_ls, compfull,cand) # Cholesky decompositions obsv_constraint_cov = comp_constraint_cov + self.constraint_noise*np.eye( compfull.shape[0]) obsv_constraint_chol = spla.cholesky( obsv_constraint_cov, lower=True ) # Linear systems t_alpha = spla.cho_solve((obsv_constraint_chol, True), self.ff)# - self.constraint_mean) #t_beta = spla.solve_triangular(obsv_constraint_chol, cand_constraint_cross, lower=True) # Predict marginal mean times and (possibly) variances func_constraint_m = (np.dot(cand_constraint_cross.T, t_alpha))# + self.constraint_mean) # We don't really need the time variances now #func_constraint_v = self.constraint_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0) # Squash through a logistic to get probability of not violating a constraint func_constraint_m = 1./(1+np.exp(-self.constraint_gain*func_constraint_m)) if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. best = np.min(vals) # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) comp_cov_full = self.cov(self.amp2, self.ls, compfull) cand_cross = self.cov(self.amp2, self.ls, comp, cand) cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise*np.eye(comp.shape[0]) obsv_cov_full = comp_cov_full + self.noise*np.eye(compfull.shape[0]) obsv_chol = spla.cholesky( obsv_cov, lower=True ) obsv_chol_full = spla.cholesky( obsv_cov_full, lower=True ) # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) #beta = spla.solve_triangular(obsv_chol_full, cand_cross_full, lower=True) # Predict the marginal means and variances at candidates. 
func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s*( u*ncdf + npdf) ei_per_s = ei #ei_per_s = ei return ei else: return 0 def sample_constraint_hypers(self, comp, labels): # The latent GP projection if self.ff is None: comp_cov = self.cov(self.amp2, self.ls, comp) obsv_cov = comp_cov + self.noise*np.eye(comp.shape[0]) obsv_chol = spla.cholesky( obsv_cov, lower=True ) self.ff = np.dot(obsv_chol,npr.randn(obsv_chol.shape[0])) self._sample_constraint_noisy(comp, labels) self._sample_constraint_ls(comp, labels) self.constraint_hyper_samples.append((self.constraint_mean, self.constraint_gain, self.constraint_amp2, self.constraint_ls)) self.ff_samples.append(self.ff) def sample_hypers(self, comp, vals): if self.noiseless: self.noise = 1e-3 self._sample_noiseless(comp, vals) else: self._sample_noisy(comp, vals) self._sample_ls(comp, vals) self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) def _sample_ls(self, comp, vals): def logprob(ls): if np.any(ls < 0) or np.any(ls > self.max_ls): return -np.inf cov = self.amp2 * (self.cov_func(ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + self.noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - self.mean) lp = (-np.sum(np.log(np.diag(chol))) - 0.5*np.dot(vals-self.mean, solve)) return lp self.ls = util.slice_sample(self.ls, logprob, compwise=True) def _sample_constraint_ls(self, comp, vals): def lpSigmoid(ff, gain=self.constraint_gain): probs = 1./(1. + np.exp(-gain*ff)); probs[probs <= 0] = 1e-12 probs[probs >= 1] = 1-1e-12 llh = np.sum(vals*np.log(probs) + (1-vals)*np.log(1-probs)); return llh def updateGain(gain): if gain < 0.01 or gain > 10: return -np.inf cov = self.constraint_amp2 * (self.cov_func(self.constraint_ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + self.constraint_noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals)# - self.constraint_mean) #lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(self.ff, solve) lp = lpSigmoid(self.ff, gain) return lp def logprob(ls): if np.any(ls < 0) or np.any(ls > self.constraint_max_ls): return -np.inf cov = self.constraint_amp2 * (self.cov_func(ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + self.constraint_noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), self.ff)# - self.constraint_mean) #lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(self.ff, solve) lp = lpSigmoid(self.ff) return lp #hypers = util.slice_sample(np.hstack((self.constraint_ls, self.ff)), logprob, compwise=True) hypers = util.slice_sample(self.constraint_ls, logprob, compwise=True) self.constraint_ls = hypers cov = self.constraint_amp2 * (self.cov_func(self.constraint_ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + self.constraint_noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=False) ff = self.ff for jj in xrange(20): (ff, lpell) = self.elliptical_slice(ff, chol, lpSigmoid); self.ff = ff # Update gain hypers = util.slice_sample(np.array([self.constraint_gain]), updateGain, compwise=True) self.constraint_gain = hypers def _sample_noisy(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = hypers[2] # This is pretty hacky, but keeps things sane. 
if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0 or noise < 0: return -np.inf cov = amp2 * (self.cov_func(self.ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-mean, solve) # Roll in noise horseshoe prior. lp += np.log(np.log(1 + (self.noise_scale/noise)**2)) #lp -= 0.5*(np.log(noise)/self.noise_scale)**2 # Roll in amplitude lognormal prior lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = hypers[2] def _sample_constraint_noisy(self, comp, vals): def lpSigmoid(ff,gain=self.constraint_gain): probs = 1./(1. + np.exp(-gain*ff)); probs[probs <= 0] = 1e-12 probs[probs >= 1] = 1-1e-12 llh = np.sum(vals*np.log(probs) + (1-vals)*np.log(1-probs)); return llh def logprob(hypers): #mean = hypers[0] amp2 = hypers[0] #gain = hypers[2] ff = hypers[1:] # This is pretty hacky, but keeps things sane. #if mean > np.max(vals) or mean < np.min(vals): # return -np.inf if amp2 < 0: return -np.inf noise = self.constraint_noise cov = amp2 * (self.cov_func(self.constraint_ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), ff)# - mean) #lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(ff-mean, solve) lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(ff, solve) # Roll in noise horseshoe prior. #lp += np.log(np.log(1 + (self.constraint_noise_scale/noise)**2)) #lp -= 0.5*(np.log(noise)/self.constraint_noise_scale)**2 # Roll in amplitude lognormal prior lp -= 0.5*(np.log(amp2)/self.constraint_amp2_scale)**2 #lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(self.ff, solve) lp += lpSigmoid(ff,self.constraint_gain) return lp hypers = util.slice_sample(np.hstack((np.array([self.constraint_amp2]), self.ff)), logprob, compwise=False) #self.constraint_mean = hypers[0] self.constraint_amp2 = hypers[0] #self.constraint_gain = hypers[2] self.ff = hypers[1:] cov = self.constraint_amp2 * (self.cov_func(self.constraint_ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + self.constraint_noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=False) ff = self.ff for jj in xrange(50): (ff, lpell) = self.elliptical_slice(ff, chol, lpSigmoid); self.ff = ff def _sample_noiseless(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = 1e-3 # This is pretty hacky, but keeps things sane. 
if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0: return -np.inf cov = amp2 * (self.cov_func(self.ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-mean, solve) # Roll in amplitude lognormal prior lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = 1e-3 def elliptical_slice(self, xx, chol_Sigma, log_like_fn, cur_log_like=None, angle_range=0): D = xx.shape[0] if cur_log_like is None: cur_log_like = log_like_fn(xx) nu = np.dot(chol_Sigma.T,np.random.randn(D, 1)).flatten() hh = np.log(np.random.rand()) + cur_log_like # Set up a bracket of angles and pick a first proposal. # "phi = (theta'-theta)" is a change in angle. if angle_range <= 0: # Bracket whole ellipse with both edges at first proposed point phi = np.random.rand()*2*math.pi; phi_min = phi - 2*math.pi; phi_max = phi; else: # Randomly center bracket on current point phi_min = -angle_range*np.random.rand(); phi_max = phi_min + angle_range; phi = np.random.rand()*(phi_max - phi_min) + phi_min; # Slice sampling loop while True: # Compute xx for proposed angle difference and check if it's on the slice xx_prop = xx*np.cos(phi) + nu*np.sin(phi); cur_log_like = log_like_fn(xx_prop); if cur_log_like > hh: # New point is on slice, ** EXIT LOOP ** break; # Shrink slice to rejected point if phi > 0: phi_max = phi; elif phi < 0: phi_min = phi; else: raise Exception('BUG DETECTED: Shrunk to current position and still not acceptable.'); # Propose new angle difference phi = np.random.rand()*(phi_max - phi_min) + phi_min; xx = xx_prop; return (xx, cur_log_like) def optimize_hypers(self, comp, vals, labels): # First the GP to observations mygp = gp.GP(self.cov_func.__name__) mygp.real_init(comp.shape[1], vals) mygp.optimize_hypers(comp,vals) self.mean = mygp.mean self.ls = mygp.ls self.amp2 = mygp.amp2 self.noise = mygp.noise # Now the GP to times timegp = gp.GP(self.cov_func.__name__) timegp.real_init(comp.shape[1], labels) timegp.optimize_hypers(comp, labels) self.constraint_mean = timegp.mean self.constraint_amp2 = timegp.amp2 self.constraint_noise = timegp.noise self.constraint_ls = timegp.ls # Save hyperparameter samples self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) self.constraint_hyper_samples.append((self.constraint_mean, self.constraint_noise, self.constraint_amp2, self.constraint_ls)) self.dump_hypers()
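# A condensed, standalone restatement of the acquisition the chooser above
# optimizes: ordinary Gaussian expected improvement, down-weighted by the
# latent classifier's probability that the candidate does not violate the
# constraint. This is a sketch for exposition, not part of the class.
import numpy as np
import scipy.stats as sps

def constraint_weighted_ei(func_m, func_v, best, prob_ok):
    # func_m, func_v: GP predictive mean/variance at the candidates.
    # best: best feasible objective value observed so far.
    # prob_ok: predicted probability of satisfying the constraint.
    func_s = np.sqrt(func_v)
    u = (best - func_m) / func_s
    ei = func_s * (u * sps.norm.cdf(u) + sps.norm.pdf(u))
    return ei * prob_ok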
import zmq
import Locker
import constraints as c
import cPickle
import commands

# The port to which this server will listen
PORT = '5060'

# Set up 0MQ to work as a socket server
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind("tcp://0.0.0.0:%s" % PORT)

lock = Locker.Lock(300)

# In testing mode no drive commands are executed
TESTING_MODE = True
#import Controller
#from Manual_Drive import *
# create controller entity
#controller = Controller.Controller()
#manualDrive = ManualDrive(controller.start_command, controller.forward,
#                          controller.backward, controller.left,
#                          controller.right, controller.stop)

# A dictionary mapping each possible command name to its handler.
# A correctly formatted request looks like: {'command':'NameCommand',ID:{}}
# Note: this assignment shadows the `commands` module imported above.
commands = {
    'LOCK':   {'nb_of_arguments': 0, 'function': func_lock},
    'UNLOCK': {'nb_of_arguments': 0, 'function': func_unlock},
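# The snippet above ends before the request loop. An illustrative REP-socket
# dispatch for a command table shaped like this one; using pyzmq's
# recv_pyobj/send_pyobj pickle helpers is an assumption, since the wire
# format is not shown in the original.
def serve_forever(socket, command_table):
    while True:
        msg = socket.recv_pyobj()          # e.g. {'command': 'LOCK', ...}
        entry = command_table.get(msg.get('command'))
        if entry is None:
            socket.send_pyobj({'error': 'unknown command'})
        else:
            socket.send_pyobj(entry['function']())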
import os
import sys

import Locker
import constraints as c
import cPickle
from commands import *

DEBUG = True

scriptPath = os.path.realpath(os.path.dirname(sys.argv[0]))
os.chdir(scriptPath)
# Append the relative location you want to import from
sys.path.append("../Socket")
import sockets_server

# Set up a new socket server
socket = sockets_server.SocketServer(6001)
socket.start()

# Create a lock entity with a lock time of 20 minutes
lock = Locker.Lock(1200)

if not DEBUG:
    # Import Controller, the entity responsible for starting and stopping
    # controller commands
    import Controller
    # Import the commands that the controller can start
    import ControllerCommands
    # ManualDrive is the entity that selects the right controller command
    # for the chosen keys
    from ManualDrive import *

    # Create the controller entity
    print 'Start controller'
    controller = Controller.Controller()
    manualDrive = ManualDrive(
        controller.start_command,
        ControllerCommands.forward,
        ControllerCommands.backward,
        ControllerCommands.left,
        ControllerCommands.right,
        ControllerCommands.forward_left,
        ControllerCommands.forward_right,
        ControllerCommands.backward_left,
class GPEIOptChooser: def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10, pending_samples=100, noiseless=False, burnin=100, grid_subset=20, use_multiprocessing=True): self.cov_func = getattr(gp, covar) self.locker = Locker() self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl") self.stats_file = os.path.join( expt_dir, self.__module__ + "_hyperparameters.txt") self.mcmc_iters = int(mcmc_iters) self.burnin = int(burnin) self.needs_burnin = True self.pending_samples = int(pending_samples) self.D = -1 self.hyper_iters = 1 # Number of points to optimize EI over self.grid_subset = int(grid_subset) self.noiseless = bool(int(noiseless)) self.hyper_samples = [] self.noise_scale = 0.1 # horseshoe prior self.amp2_scale = 1 # zero-mean log normal prior self.max_ls = 2 # top-hat prior on length scales # If multiprocessing fails or deadlocks, set this to False self.use_multiprocessing = bool(int(use_multiprocessing)) def dump_hypers(self): self.locker.lock_wait(self.state_pkl) # Write the hyperparameters out to a Pickle. fh = tempfile.NamedTemporaryFile(mode='w+b', delete=False) pickle.dump( { 'dims': self.D, 'ls': self.ls, 'amp2': self.amp2, 'noise': self.noise, 'hyper_samples': self.hyper_samples, 'mean': self.mean }, fh) fh.close() # Use an atomic move for better NFS happiness. cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl) os.system(cmd) # TODO: Should check system-dependent return status. self.locker.unlock(self.state_pkl) # Write the hyperparameters out to a human readable file as well fh = open(self.stats_file, 'wt') fh.write('Mean Noise Amplitude <length scales>\n') fh.write('-----------ALL SAMPLES-------------\n') meanhyps = 0 * np.hstack(self.hyper_samples[0]) for i in self.hyper_samples: hyps = np.hstack(i) meanhyps += (1 / float(len(self.hyper_samples))) * hyps for j in hyps: fh.write(str(j) + ' ') fh.write('\n') fh.write('-----------MEAN OF SAMPLES-------------\n') for j in meanhyps: fh.write(str(j) + ' ') fh.write('\n') fh.close() # This passes out html or javascript to display interesting # stats - such as the length scales (sensitivity to various # dimensions). def generate_stats_html(self): # Need this because the model may not necessarily be # initialized when this code is called. if not self._read_only(): return 'Chooser not yet ready to display output' mean_mean = np.mean(np.vstack([h[0] for h in self.hyper_samples])) mean_noise = np.mean(np.vstack([h[1] for h in self.hyper_samples])) mean_ls = np.mean( np.vstack([h[3][np.newaxis, :] for h in self.hyper_samples]), 0) try: output = ( '<br /><span class=\"label label-info\">Estimated mean:</span> ' + str(mean_mean) + '<br /><span class=\"label label-info\">Estimated noise:</span> ' + str(mean_noise) + '<br /><br /><span class=\"label label-info\">Inverse parameter sensitivity' + ' - Gaussian Process length scales</span><br /><br />' + '<div id=\"lschart\"></div><script type=\"text/javascript\">' + 'var lsdata = [' + ','.join(['%.2f' % i for i in mean_ls]) + '];') except: return 'Chooser not yet ready to display output.' output += ('bar_chart("#lschart", lsdata, ' + str(self.max_ls) + ');' + '</script>') return output # Read in the chooser from file. 
Returns True only on success def _read_only(self): if os.path.exists(self.state_pkl): fh = open(self.state_pkl, 'rb') state = pickle.load(fh) fh.close() self.D = state['dims'] self.ls = state['ls'] self.amp2 = state['amp2'] self.noise = state['noise'] self.mean = state['mean'] self.hyper_samples = state['hyper_samples'] self.needs_burnin = False return True return False def _real_init(self, dims, values): self.locker.lock_wait(self.state_pkl) self.randomstate = npr.get_state() if os.path.exists(self.state_pkl): fh = open(self.state_pkl, 'rb') state = pickle.load(fh) fh.close() self.D = state['dims'] self.ls = state['ls'] self.amp2 = state['amp2'] self.noise = state['noise'] self.mean = state['mean'] self.hyper_samples = state['hyper_samples'] self.needs_burnin = False else: # Input dimensionality. self.D = dims # Initial length scales. self.ls = np.ones(self.D) # Initial amplitude. self.amp2 = np.std(values) + 1e-4 # Initial observation noise. self.noise = 1e-3 # Initial mean. self.mean = np.mean(values) # Save hyperparameter samples self.hyper_samples.append( (self.mean, self.noise, self.amp2, self.ls)) self.locker.unlock(self.state_pkl) def cov(self, x1, x2=None): if x2 is None: return self.amp2 * (self.cov_func(self.ls, x1, None) + 1e-6 * np.eye(x1.shape[0])) else: return self.amp2 * self.cov_func(self.ls, x1, x2) # Given a set of completed 'experiments' in the unit hypercube with # corresponding objective 'values', pick from the next experiment to # run according to the acquisition function. def next(self, grid, values, durations, candidates, pending, complete): # Don't bother using fancy GP stuff at first. if complete.shape[0] < 2: return int(candidates[0]) # Perform the real initialization. if self.D == -1: self._real_init(grid.shape[1], values[complete]) # Grab out the relevant sets. comp = grid[complete, :] cand = grid[candidates, :] pend = grid[pending, :] vals = values[complete] numcand = cand.shape[0] # Spray a set of candidates around the min so far best_comp = np.argmin(vals) cand2 = np.vstack( (np.random.randn(10, comp.shape[1]) * 0.001 + comp[best_comp, :], cand)) if self.mcmc_iters > 0: # Possibly burn in. if self.needs_burnin: for mcmc_iter in range(self.burnin): self.sample_hypers(comp, vals) log("BURN %d/%d] mean: %.2f amp: %.2f " "noise: %.4f min_ls: %.4f max_ls: %.4f" % (mcmc_iter + 1, self.burnin, self.mean, np.sqrt(self.amp2), self.noise, np.min( self.ls), np.max(self.ls))) self.needs_burnin = False # Sample from hyperparameters. # Adjust the candidates to hit ei peaks self.hyper_samples = [] for mcmc_iter in range(self.mcmc_iters): self.sample_hypers(comp, vals) log("%d/%d] mean: %.2f amp: %.2f noise: %.4f " "min_ls: %.4f max_ls: %.4f" % (mcmc_iter + 1, self.mcmc_iters, self.mean, np.sqrt(self.amp2), self.noise, np.min( self.ls), np.max(self.ls))) self.dump_hypers() b = [] # optimization bounds for i in range(0, cand.shape[1]): b.append((0, 1)) overall_ei = self.ei_over_hypers(comp, pend, cand2, vals) inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:] cand2 = cand2[inds, :] # Optimize each point in parallel if self.use_multiprocessing: pool = multiprocessing.Pool(self.grid_subset) results = [ pool.apply_async(optimize_pt, args=(c, b, comp, pend, vals, copy.copy(self))) for c in cand2 ] for res in results: cand = np.vstack((cand, res.get(1e8))) pool.close() else: # This is old code to optimize each point in parallel. 
for i in range(0, cand2.shape[0]): log("Optimizing candidate %d/%d" % (i + 1, cand2.shape[0])) #self.check_grad_ei(cand2[i,:].flatten(), comp, pend, vals) ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers, cand2[i, :].flatten(), args=(comp, pend, vals), bounds=b, disp=0) cand2[i, :] = ret[0] cand = np.vstack((cand, cand2)) overall_ei = self.ei_over_hypers(comp, pend, cand, vals) best_cand = np.argmax(np.mean(overall_ei, axis=1)) if (best_cand >= numcand): return (int(numcand), cand[best_cand, :]) return int(candidates[best_cand]) else: # Optimize hyperparameters self.optimize_hypers(comp, vals) log("mean: %.2f amp: %.2f noise: %.4f " "min_ls: %.4f max_ls: %.4f" % (self.mean, np.sqrt( self.amp2), self.noise, np.min(self.ls), np.max(self.ls))) # Optimize over EI b = [] # optimization bounds for i in range(0, cand.shape[1]): b.append((0, 1)) for i in range(0, cand2.shape[0]): ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei, cand2[i, :].flatten(), args=(comp, vals, True), bounds=b, disp=0) cand2[i, :] = ret[0] cand = np.vstack((cand, cand2)) ei = self.compute_ei(comp, pend, cand, vals) best_cand = np.argmax(ei) if (best_cand >= numcand): return (int(numcand), cand[best_cand, :]) return int(candidates[best_cand]) # Compute EI over hyperparameter samples def ei_over_hypers(self, comp, pend, cand, vals): overall_ei = np.zeros((cand.shape[0], self.mcmc_iters)) for mcmc_iter in range(self.mcmc_iters): hyper = self.hyper_samples[mcmc_iter] self.mean = hyper[0] self.noise = hyper[1] self.amp2 = hyper[2] self.ls = hyper[3] overall_ei[:, mcmc_iter] = self.compute_ei(comp, pend, cand, vals) return overall_ei def check_grad_ei(self, cand, comp, pend, vals): (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, pend, vals) dx2 = dx1 * 0 idx = np.zeros(cand.shape[0]) for i in range(0, cand.shape[0]): idx[i] = 1e-6 (ei1, tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, pend, vals) (ei2, tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, pend, vals) dx2[i] = (ei - ei2) / (2 * 1e-6) idx[i] = 0 print('computed grads', dx1) print('finite diffs', dx2) print((dx1 / dx2)) print(np.sum((dx1 - dx2)**2)) time.sleep(2) # Adjust points by optimizing EI over a set of hyperparameter samples def grad_optimize_ei_over_hypers(self, cand, comp, pend, vals, compute_grad=True): summed_ei = 0 summed_grad_ei = np.zeros(cand.shape).flatten() ls = self.ls.copy() amp2 = self.amp2 mean = self.mean noise = self.noise for hyper in self.hyper_samples: self.mean = hyper[0] self.noise = hyper[1] self.amp2 = hyper[2] self.ls = hyper[3] if compute_grad: (ei, g_ei) = self.grad_optimize_ei(cand, comp, pend, vals, compute_grad) summed_grad_ei = summed_grad_ei + g_ei else: ei = self.grad_optimize_ei(cand, comp, pend, vals, compute_grad) summed_ei += ei self.mean = mean self.amp2 = amp2 self.noise = noise self.ls = ls.copy() if compute_grad: return (summed_ei, summed_grad_ei) else: return summed_ei # Adjust points based on optimizing their ei def grad_optimize_ei(self, cand, comp, pend, vals, compute_grad=True): if pend.shape[0] == 0: best = np.min(vals) cand = np.reshape(cand, (-1, comp.shape[1])) # The primary covariances for prediction. comp_cov = self.cov(comp) cand_cross = self.cov(comp, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.ls, comp, cand) # Predictive things. # Solve the linear systems. 
alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) if not compute_grad: return ei # Gradients of ei w.r.t. mean and variance g_ei_m = -ncdf g_ei_s2 = 0.5 * npdf / func_s # Apply covariance function grad_cross = np.squeeze(cand_cross_grad) grad_xp_m = np.dot(alpha.transpose(), grad_cross) grad_xp_v = np.dot( -2 * spla.cho_solve((obsv_chol, True), cand_cross).transpose(), grad_cross) grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m + grad_xp_v * g_ei_s2) ei = -np.sum(ei) return ei, grad_xp.flatten() else: # If there are pending experiments, fantasize their outcomes. cand = np.reshape(cand, (-1, comp.shape[1])) # Create a composite vector of complete and pending. comp_pend = np.concatenate((comp, pend)) # Compute the covariance and Cholesky decomposition. comp_pend_cov = (self.cov(comp_pend) + self.noise * np.eye(comp_pend.shape[0])) comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True) # Compute submatrices. pend_cross = self.cov(comp, pend) pend_kappa = self.cov(pend) # Use the sub-Cholesky. obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]] # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.cho_solve((obsv_chol, True), pend_cross) # Finding predictive means and variances. pend_m = np.dot(pend_cross.T, alpha) + self.mean pend_K = pend_kappa - np.dot(pend_cross.T, beta) # Take the Cholesky of the predictive covariance. pend_chol = spla.cholesky(pend_K, lower=True) # Make predictions. npr.set_state(self.randomstate) pend_fant = np.dot( pend_chol, npr.randn(pend.shape[0], self.pending_samples)) + pend_m[:, None] # Include the fantasies. fant_vals = np.concatenate( (np.tile(vals[:, np.newaxis], (1, self.pending_samples)), pend_fant)) # Compute bests over the fantasies. bests = np.min(fant_vals, axis=0) # Now generalize from these fantasies. cand_cross = self.cov(comp_pend, cand) cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.ls, comp_pend, cand) # Solve the linear systems. alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean) beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v[:, np.newaxis]) u = (bests[np.newaxis, :] - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) # Gradients of ei w.r.t. 
mean and variance g_ei_m = -ncdf g_ei_s2 = 0.5 * npdf / func_s # Apply covariance function # Squeeze can break the 1D case be careful if pend.shape[1] == 1: grad_cross = np.squeeze(cand_cross_grad, axis=(2, )) else: grad_cross = np.squeeze(cand_cross_grad) grad_xp_m = np.dot(alpha.transpose(), grad_cross) grad_xp_v = np.dot( -2 * spla.cho_solve( (comp_pend_chol, True), cand_cross).transpose(), grad_cross) grad_xp = 0.5 * self.amp2 * ( grad_xp_m * np.tile(g_ei_m, (comp.shape[1], 1)).T + (grad_xp_v.T * g_ei_s2).T) ei = -np.mean(ei, axis=1) grad_xp = np.mean(grad_xp, axis=0) return ei, grad_xp.flatten() def compute_ei(self, comp, pend, cand, vals): if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. best = np.min(vals) # The primary covariances for prediction. comp_cov = self.cov(comp) cand_cross = self.cov(comp, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) return ei else: # If there are pending experiments, fantasize their outcomes. # Create a composite vector of complete and pending. comp_pend = np.concatenate((comp, pend)) # Compute the covariance and Cholesky decomposition. comp_pend_cov = (self.cov(comp_pend) + self.noise * np.eye(comp_pend.shape[0])) comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True) # Compute submatrices. pend_cross = self.cov(comp, pend) pend_kappa = self.cov(pend) # Use the sub-Cholesky. obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]] # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.cho_solve((obsv_chol, True), pend_cross) # Finding predictive means and variances. pend_m = np.dot(pend_cross.T, alpha) + self.mean pend_K = pend_kappa - np.dot(pend_cross.T, beta) # Take the Cholesky of the predictive covariance. pend_chol = spla.cholesky(pend_K, lower=True) # Make predictions. npr.set_state(self.randomstate) pend_fant = np.dot( pend_chol, npr.randn(pend.shape[0], self.pending_samples)) + pend_m[:, None] # Include the fantasies. fant_vals = np.concatenate( (np.tile(vals[:, np.newaxis], (1, self.pending_samples)), pend_fant)) # Compute bests over the fantasies. bests = np.min(fant_vals, axis=0) # Now generalize from these fantasies. cand_cross = self.cov(comp_pend, cand) # Solve the linear systems. alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean) beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. 
func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v[:, np.newaxis]) u = (bests[np.newaxis, :] - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) return np.mean(ei, axis=1) def sample_hypers(self, comp, vals): if self.noiseless: self.noise = 1e-3 self._sample_noiseless(comp, vals) else: self._sample_noisy(comp, vals) self._sample_ls(comp, vals) self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) def _sample_ls(self, comp, vals): def logprob(ls): if np.any(ls < 0) or np.any(ls > self.max_ls): return -np.inf cov = ( self.amp2 * (self.cov_func(ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) + self.noise * np.eye(comp.shape[0])) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - self.mean) lp = (-np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - self.mean, solve)) return lp self.ls = util.slice_sample(self.ls, logprob, compwise=True) def _sample_noisy(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = hypers[2] # This is pretty hacky, but keeps things sane. if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0 or noise < 0: return -np.inf cov = (amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) + noise * np.eye(comp.shape[0])) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot( vals - mean, solve) # Roll in noise horseshoe prior. lp += np.log(np.log(1 + (self.noise_scale / noise)**2)) # Roll in amplitude lognormal prior lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.amp2_scale)**2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = hypers[2] def _sample_noiseless(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = 1e-3 # This is pretty hacky, but keeps things sane. if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0: return -np.inf cov = (amp2 * (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) + noise * np.eye(comp.shape[0])) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot( vals - mean, solve) # Roll in amplitude lognormal prior lp -= 0.5 * (np.log(np.sqrt(amp2)) / self.amp2_scale)**2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = 1e-3 def optimize_hypers(self, comp, vals): mygp = gp.GP(self.cov_func.__name__) mygp.real_init(comp.shape[1], vals) mygp.optimize_hypers(comp, vals) self.mean = mygp.mean self.ls = mygp.ls self.amp2 = mygp.amp2 self.noise = mygp.noise # Save hyperparameter samples self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) self.dump_hypers() return
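# A standalone sketch of the "fantasy" step used by compute_ei above when
# jobs are pending: draw joint samples from the GP predictive at the pending
# points and treat each column as one plausible set of completed outcomes.
# Illustrative only; the function name is not part of the original class.
import numpy as np
import numpy.random as npr
import scipy.linalg as spla

def fantasize_pending(pend_m, pend_K, n_fantasies):
    # pend_m: predictive mean at the pending points.
    # pend_K: predictive covariance at the pending points.
    chol = spla.cholesky(pend_K, lower=True)
    draws = npr.randn(pend_m.shape[0], n_fantasies)
    return pend_m[:, None] + np.dot(chol, draws)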
class ExperimentGrid: @staticmethod def job_running(expt_dir, id): expt_grid = ExperimentGrid(expt_dir) expt_grid.set_running(id) @staticmethod def job_complete(expt_dir, id, value, duration): expt_grid = ExperimentGrid(expt_dir) expt_grid.set_complete(id, value, duration) @staticmethod def job_broken(expt_dir, id): expt_grid = ExperimentGrid(expt_dir) expt_grid.set_broken(id) def __init__(self, expt_dir, variables=None, grid_size=None, grid_seed=1): self.expt_dir = expt_dir self.jobs_pkl = os.path.join(expt_dir, 'expt-grid.pkl') self.locker = Locker() # Only one process at a time is allowed to have access to this. sys.stderr.write("Waiting to lock grid...") self.locker.lock_wait(self.jobs_pkl) sys.stderr.write("...acquired\n") # Does this exist already? if variables is not None and not os.path.exists(self.jobs_pkl): # Set up the grid for the first time. self.seed = grid_seed self.vmap = GridMap(variables, grid_size) self.grid = self.hypercube_grid(self.vmap.card(), grid_size) self.status = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE self.values = np.zeros(grid_size) + np.nan self.durs = np.zeros(grid_size) + np.nan self.sgeids = np.zeros(grid_size, dtype=int) # Save this out. self._save_jobs() else: # Load in from the pickle. self._load_jobs() def __del__(self): self._save_jobs() if self.locker.unlock(self.jobs_pkl): sys.stderr.write("Released lock on job grid.\n") else: raise Exception("Could not release lock on job grid.\n") def get_grid(self): return self.grid, self.values, self.durs def get_candidates(self): return np.nonzero(self.status == CANDIDATE_STATE)[0] def get_pending(self): return np.nonzero((self.status == SUBMITTED_STATE) | (self.status == RUNNING_STATE))[0] def get_complete(self): return np.nonzero(self.status == COMPLETE_STATE)[0] def get_broken(self): return np.nonzero(self.status == BROKEN_STATE)[0] def get_params(self, index): return self.vmap.get_params(self.grid[index,:]) def get_best(self): finite = self.values[np.isfinite(self.values)] if len(finite) > 0: cur_min = np.min(finite) index = np.nonzero(self.values==cur_min)[0][0] return cur_min, index else: return np.nan, -1 def get_sgeid(self, id): return self.sgeids[id] def add_to_grid(self, candidate): # Set up the grid self.grid = np.vstack((self.grid, candidate)) self.status = np.append(self.status, np.zeros(1, dtype=int) + int(CANDIDATE_STATE)) self.values = np.append(self.values, np.zeros(1)+np.nan) self.durs = np.append(self.durs, np.zeros(1)+np.nan) self.sgeids = np.append(self.sgeids, np.zeros(1,dtype=int)) # Save this out. self._save_jobs() return self.grid.shape[0]-1 def set_candidate(self, id): self.status[id] = CANDIDATE_STATE self._save_jobs() def set_submitted(self, id, sgeid): self.status[id] = SUBMITTED_STATE self.sgeids[id] = sgeid self._save_jobs() def set_running(self, id): self.status[id] = RUNNING_STATE self._save_jobs() def set_complete(self, id, value, duration): self.status[id] = COMPLETE_STATE self.values[id] = value self.durs[id] = duration self._save_jobs() def set_broken(self, id): self.status[id] = BROKEN_STATE self._save_jobs() def _load_jobs(self): fh = open(self.jobs_pkl, 'r') jobs = cPickle.load(fh) fh.close() self.vmap = jobs['vmap'] self.grid = jobs['grid'] self.status = jobs['status'] self.values = jobs['values'] self.durs = jobs['durs'] self.sgeids = jobs['sgeids'] def _save_jobs(self): # Write everything to a temporary file first. 
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({ 'vmap'   : self.vmap,
                       'grid'   : self.grid,
                       'status' : self.status,
                       'values' : self.values,
                       'durs'   : self.durs,
                       'sgeids' : self.sgeids }, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.jobs_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

    def _hypercube_grid(self, dims, size):
        # Generate from a Sobol sequence.
        sobol_grid = np.transpose(i4_sobol_generate(dims, size, self.seed))
        return sobol_grid
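# Hypothetical round trip through the grid above, assuming the experiment
# pickle already exists; the directory path and scheduler id below are
# placeholders, not values from the original. Each setter re-pickles the
# grid via _save_jobs, so state survives across processes.
expt_grid = ExperimentGrid('/path/to/expt')   # placeholder directory
job_id = expt_grid.get_candidates()[0]        # next unclaimed grid point
expt_grid.set_submitted(job_id, 12345)        # placeholder SGE id
expt_grid.set_running(job_id)
expt_grid.set_complete(job_id, 0.31, 42.0)    # objective value, duration
best_val, best_index = expt_grid.get_best()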
def reset():
    # Kill running workers.
    if os.path.exists('expt-grid.pkl'):
        try:
            locker = Locker.Locker()
            locker.lock(
                os.path.join(os.path.realpath('.'), 'expt-grid.pkllock'))
            with open('expt-grid.pkl', 'r') as f:
                expt_grid = pickle.load(f)
            for proc_ind in xrange(expt_grid['sgeids'].shape[0]):
                if expt_grid['status'][proc_ind] == ExperimentGrid.RUNNING_STATE:
                    print 'Killing process with id: %s' % expt_grid['sgeids'][proc_ind]
                    try:
                        subprocess.check_call(
                            'taskkill /PID %s /F /T' % expt_grid['sgeids'][proc_ind])
                    except:
                        print 'Could not kill process with id: %s' % expt_grid['sgeids'][proc_ind]
        except Exception as e:
            print 'Could not clean up processes: %s.' % e

    # Clean up.
    # Jobs.
    if os.path.exists('jobs'):
        try:
            shutil.rmtree('jobs')
        except:
            print 'Could not remove jobs folder'

    # Outputs.
    if os.path.exists('output'):
        try:
            shutil.rmtree('output')
        except:
            try:
                time.sleep(5)
                shutil.rmtree('output')
            except:
                print 'Could not remove output folder'

    # Best result.
    if os.path.exists('best_job_and_result.txt'):
        try:
            os.remove('best_job_and_result.txt')
        except:
            print 'Could not remove best job file'

    # Experiment grid.
    if os.path.exists('expt-grid.pkl'):
        try:
            os.remove('expt-grid.pkl')
        except:
            print 'Could not remove experiment grid.'

    # GPEIOptChooser files.
    if os.path.exists('GPEIOptChooser.pkl'):
        try:
            os.remove('GPEIOptChooser.pkl')
            os.remove('GPEIOptChooser_hyperparameters.txt')
        except:
            print 'Could not remove GPEIOptChooser files.'

    # Trace.
    if os.path.exists('trace.csv'):
        try:
            os.remove('trace.csv')
        except:
            print 'Could not remove trace file'
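# The cleanup above repeats one try/remove/print pattern per artifact; a
# sketch of the same logic behind a single helper. The helper name is
# illustrative, not part of the original script, and it relies on the
# os/shutil imports already in scope here.
def remove_quietly(path):
    try:
        if os.path.isdir(path):
            shutil.rmtree(path)
        elif os.path.exists(path):
            os.remove(path)
    except OSError as e:
        print 'Could not remove %s: %s' % (path, e)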
class GPConstrainedEIChooser: def __init__( self, expt_dir, covar="Matern52", mcmc_iters=20, pending_samples=100, noiseless=False, burnin=100, grid_subset=20, constraint_violating_value=np.inf, verbosity=0, visualize2D=False, ): self.cov_func = getattr(gp, covar) self.locker = Locker() self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl") self.stats_file = os.path.join(expt_dir, self.__module__ + "_hyperparameters.txt") self.mcmc_iters = int(mcmc_iters) self.burnin = int(burnin) self.needs_burnin = True self.pending_samples = pending_samples self.D = -1 self.hyper_iters = 1 # Number of points to optimize EI over self.grid_subset = int(grid_subset) self.noiseless = bool(int(noiseless)) self.hyper_samples = [] self.constraint_hyper_samples = [] self.ff = None self.ff_samples = [] self.verbosity = int(verbosity) self.noise_scale = 0.1 # horseshoe prior self.amp2_scale = 1 # zero-mean log normal prior self.max_ls = 2 # top-hat prior on length scales self.constraint_noise_scale = 0.1 # horseshoe prior self.constraint_amp2_scale = 1 # zero-mean log normal prio self.constraint_gain = 1 # top-hat prior on length scales self.constraint_max_ls = 2 # top-hat prior on length scales self.bad_value = float(constraint_violating_value) self.visualize2D = visualize2D # A simple function to dump out hyperparameters to allow for a hot start # if the optimization is restarted. def dump_hypers(self): self.locker.lock_wait(self.state_pkl) # Write the hyperparameters out to a Pickle. fh = tempfile.NamedTemporaryFile(mode="wb", delete=False) pickle.dump( { "dims": self.D, "ls": self.ls, "amp2": self.amp2, "noise": self.noise, "mean": self.mean, "constraint_ls": self.constraint_ls, "constraint_amp2": self.constraint_amp2, "constraint_noise": self.constraint_noise, "constraint_mean": self.constraint_mean, }, fh, ) fh.close() # Use an atomic move for better NFS happiness. cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl) os.system(cmd) # TODO: Should check system-dependent return status. self.locker.unlock(self.state_pkl) # Write the hyperparameters out to a human readable file as well fh = open(self.stats_file, "w") fh.write("Mean Noise Amplitude <length scales>\n") fh.write("-----------ALL SAMPLES-------------\n") meanhyps = 0 * np.hstack(self.hyper_samples[0]) for i in self.hyper_samples: hyps = np.hstack(i) meanhyps += (1 / float(len(self.hyper_samples))) * hyps for j in hyps: fh.write(str(j) + " ") fh.write("\n") fh.write("-----------MEAN OF SAMPLES-------------\n") for j in meanhyps: fh.write(str(j) + " ") fh.write("\n") fh.close() def _real_init(self, dims, values, durations): self.locker.lock_wait(self.state_pkl) self.randomstate = npr.get_state() if os.path.exists(self.state_pkl): fh = open(self.state_pkl, "rb") state = pickle.load(fh) fh.close() self.D = state["dims"] self.ls = state["ls"] self.amp2 = state["amp2"] self.noise = state["noise"] self.mean = state["mean"] self.constraint_ls = state["constraint_ls"] self.constraint_amp2 = state["constraint_amp2"] self.constraint_noise = state["constraint_noise"] self.constraint_mean = state["constraint_mean"] self.constraint_gain = state["constraint_gain"] self.needs_burnin = False else: # Identify constraint violations # Note that we'll treat NaNs and Infs as these values as well # as an optional user defined value goodvals = np.nonzero(np.logical_and(values != self.bad_value, np.isfinite(values)))[0] # Input dimensionality. self.D = dims # Initial length scales. self.ls = np.ones(self.D) self.constraint_ls = np.ones(self.D) # Initial amplitude. 
self.amp2 = np.std(values[goodvals]) + 1e-4 self.constraint_amp2 = 1.0 # Initial observation noise. self.noise = 1e-3 self.constraint_noise = 1e-3 self.constraint_gain = 1 # Initial mean. self.mean = np.mean(values[goodvals]) self.constraint_mean = 0.5 self.locker.unlock(self.state_pkl) def cov(self, amp2, ls, x1, x2=None): if x2 is None: return amp2 * (self.cov_func(ls, x1, None) + 1e-6 * np.eye(x1.shape[0])) else: return amp2 * self.cov_func(ls, x1, x2) # Given a set of completed 'experiments' in the unit hypercube with # corresponding objective 'values', pick from the next experiment to # run according to the acquisition function. def next(self, grid, values, durations, candidates, pending, complete): # Don't bother using fancy GP stuff at first. if complete.shape[0] < 2: return int(candidates[0]) # Grab out the relevant sets. comp = grid[complete, :] cand = grid[candidates, :] pend = grid[pending, :] vals = values[complete] # Identify constraint violations # Note that we'll treat NaNs and Infs as these values as well # as an optional user defined value idx = np.logical_and(vals != self.bad_value, np.isfinite(vals)) goodvals = np.nonzero(idx)[0] badvals = np.nonzero(np.logical_not(idx))[0] print("Found %d constraint violating jobs" % (badvals.shape[0])) # There's no point regressing on one observation print("Received %d valid results" % (goodvals.shape[0])) if goodvals.shape[0] < 2: return int(candidates[0]) labels = np.zeros(vals.shape[0]) labels[goodvals] = 1 if np.sum(labels) < 2: return int(candidates[0]) # Perform the real initialization. if self.D == -1: self._real_init(grid.shape[1], values[complete], durations[complete]) # Spray a set of candidates around the min so far numcand = cand.shape[0] best_comp = np.argmin(vals) cand2 = np.vstack((np.random.randn(10, comp.shape[1]) * 0.001 + comp[best_comp, :], cand)) if self.mcmc_iters > 0: # Possibly burn in. if self.needs_burnin: for mcmc_iter in range(self.burnin): self.sample_constraint_hypers(comp, labels) self.sample_hypers(comp[goodvals, :], vals[goodvals]) log( "BURN %d/%d] mean: %.2f amp: %.2f " "noise: %.4f min_ls: %.4f max_ls: %.4f" % ( mcmc_iter + 1, self.burnin, self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls), ) ) self.needs_burnin = False # Sample from hyperparameters. 
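# Each MCMC sweep below slice-samples the objective-GP and constraint-GP
# hyperparameters and appends them to self.hyper_samples and
# self.constraint_hyper_samples, so EI can later be averaged over
# posterior hyperparameter samples rather than a single point estimate.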
# Adjust the candidates to hit ei/sec peaks self.hyper_samples = [] for mcmc_iter in range(self.mcmc_iters): self.sample_constraint_hypers(comp, labels) self.sample_hypers(comp[goodvals, :], vals[goodvals]) if self.verbosity > 0: log( "%d/%d] mean: %.2f amp: %.2f noise: %.4f " "min_ls: %.4f max_ls: %.4f" % ( mcmc_iter + 1, self.mcmc_iters, self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls), ) ) log( "%d/%d] constraint_mean: %.2f " "constraint_amp: %.2f " "constraint_gain: %.4f " "constraint_min_ls: %.4f " "constraint_max_ls: " "%.4f" % ( mcmc_iter + 1, self.mcmc_iters, self.constraint_mean, np.sqrt(self.constraint_amp2), self.constraint_gain, np.min(self.constraint_ls), np.max(self.constraint_ls), ) ) self.dump_hypers() comp_preds = np.zeros(labels.shape[0]).flatten() preds = self.pred_constraint_voilation(cand, comp, labels).flatten() for ii in range(self.mcmc_iters): constraint_hyper = self.constraint_hyper_samples[ii] self.ff = self.ff_samples[ii] self.constraint_mean = constraint_hyper[0] self.constraint_gain = constraint_hyper[1] self.constraint_amp2 = constraint_hyper[2] self.constraint_ls = constraint_hyper[3] comp_preds += self.pred_constraint_voilation(comp, comp, labels).flatten() comp_preds = comp_preds / float(self.mcmc_iters) print( "Predicting %.2f%% constraint violations (%d/%d): " % (np.mean(preds < 0.5) * 100, np.sum(preds < 0.5), preds.shape[0]) ) if self.verbosity > 0: print( "Prediction` %f%% train accuracy (%d/%d): " % (np.mean((comp_preds > 0.5) == labels), np.sum((comp_preds > 0.5) == labels), comp_preds.shape[0]) ) if self.visualize2D: delta = 0.025 x = np.arange(0, 1.0, delta) y = np.arange(0, 1.0, delta) X, Y = np.meshgrid(x, y) cpreds = np.zeros((X.shape[0], X.shape[1])) predei = np.zeros((X.shape[0], X.shape[1])) predei2 = np.zeros((X.shape[0], X.shape[1])) for ii in range(self.mcmc_iters): constraint_hyper = self.constraint_hyper_samples[ii] self.ff = self.ff_samples[ii] self.constraint_mean = constraint_hyper[0] self.constraint_gain = constraint_hyper[1] self.constraint_amp2 = constraint_hyper[2] self.constraint_ls = constraint_hyper[3] cpred = self.pred_constraint_voilation( np.hstack((X.flatten()[:, np.newaxis], Y.flatten()[:, np.newaxis])), comp, labels ) pei = self.compute_constrained_ei( comp, pend, np.hstack((X.flatten()[:, np.newaxis], Y.flatten()[:, np.newaxis])), vals, labels ) pei2 = self.compute_ei( comp, pend, np.hstack((X.flatten()[:, np.newaxis], Y.flatten()[:, np.newaxis])), vals, labels ) cpreds += np.reshape(cpred, (X.shape[0], X.shape[1])) predei += np.reshape(pei, (X.shape[0], X.shape[1])) predei2 += np.reshape(pei2, (X.shape[0], X.shape[1])) plt.figure(1) plt.clf() cpreds = cpreds / float(self.mcmc_iters) CS = plt.contour(X, Y, cpreds) plt.clabel(CS, inline=1, fontsize=10) plt.plot(comp[labels == 0, 0], comp[labels == 0, 1], "rx") plt.plot(comp[labels == 1, 0], comp[labels == 1, 1], "bx") plt.title("Contours of Classification GP (Prob of not being a " "constraint violation)") plt.legend(("Constraint Violations", "Good points"), "lower left") plt.savefig("constrained_ei_chooser_class_contour.pdf") plt.figure(2) plt.clf() predei = predei / float(self.mcmc_iters) CS = plt.contour(X, Y, predei) plt.clabel(CS, inline=1, fontsize=10) plt.plot(comp[labels == 0, 0], comp[labels == 0, 1], "rx") plt.plot(comp[labels == 1, 0], comp[labels == 1, 1], "bx") plt.title("Contours of EI*P(not violating constraint)") plt.legend(("Constraint Violations", "Good points"), "lower left") 
plt.savefig("constrained_ei_chooser_eitimesprob_contour.pdf") plt.figure(3) plt.clf() predei2 = predei2 / float(self.mcmc_iters) CS = plt.contour(X, Y, predei2) plt.clabel(CS, inline=1, fontsize=10) plt.plot(comp[labels == 0, 0], comp[labels == 0, 1], "rx") plt.plot(comp[labels == 1, 0], comp[labels == 1, 1], "bx") plt.title("Contours of EI") plt.legend(("Constraint Violations", "Good points"), "lower left") plt.savefig("constrained_ei_chooser_ei_contour.pdf") # plt.show() # Pick the top candidates to optimize over overall_ei = self.ei_over_hypers(comp, pend, cand2, vals, labels) inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset :] cand2 = cand2[inds, :] # Adjust the candidates to hit ei peaks b = [] # optimization bounds for i in range(0, cand.shape[1]): b.append((0, 1)) # Optimize each point in parallel pool = multiprocessing.Pool(self.grid_subset) results = [ pool.apply_async(optimize_pt, args=(c, b, comp, pend, vals, labels, copy.copy(self))) for c in cand2 ] for res in results: cand = np.vstack((cand, res.get(1024))) pool.close() # for i in xrange(0, cand2.shape[0]): # log("Optimizing candidate %d/%d\n" % # (i+1, cand2.shape[0])) # self.check_grad_ei(cand2[i,:], comp, pend, vals, labels) # ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers, # cand2[i,:].flatten(), # args=(comp,pend,vals,labels,True), # bounds=b, disp=0) # cand2[i,:] = ret[0] cand = np.vstack((cand, cand2)) overall_ei = self.ei_over_hypers(comp, pend, cand, vals, labels) best_cand = np.argmax(np.mean(overall_ei, axis=1)) self.dump_hypers() if best_cand >= numcand: return (int(numcand), cand[best_cand, :]) return int(candidates[best_cand]) else: print("This Chooser module permits only slice sampling with > 0 " "samples.") raise Exception("mcmc_iters <= 0") # Predict constraint voilating points def pred_constraint_voilation(self, cand, comp, vals): # The primary covariances for prediction. comp_cov = self.cov(self.constraint_amp2, self.constraint_ls, comp) cand_cross = self.cov(self.constraint_amp2, self.constraint_ls, comp, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.constraint_noise * np.eye(comp.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) cov_grad_func = getattr(gp, "grad_" + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.constraint_ls, comp, cand) # Predictive things. # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), self.ff) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. 
        func_m = np.dot(cand_cross.T, alpha)  # + self.constraint_mean
        func_m = sps.norm.cdf(func_m * self.constraint_gain)

        return func_m

    # Compute EI over hyperparameter samples
    def ei_over_hypers(self, comp, pend, cand, vals, labels):
        overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))
        for mcmc_iter in range(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            constraint_hyper = self.constraint_hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            self.constraint_mean = constraint_hyper[0]
            self.constraint_gain = constraint_hyper[1]
            self.constraint_amp2 = constraint_hyper[2]
            self.constraint_ls = constraint_hyper[3]
            overall_ei[:, mcmc_iter] = self.compute_constrained_ei(comp, pend, cand, vals, labels)
        return overall_ei

    # Adjust points by optimizing EI over a set of hyperparameter samples
    def grad_optimize_ei_over_hypers(self, cand, comp, pend, vals, labels, compute_grad=True):
        summed_ei = 0
        summed_grad_ei = np.zeros(cand.shape).flatten()
        for mcmc_iter in range(self.mcmc_iters):
            hyper = self.hyper_samples[mcmc_iter]
            constraint_hyper = self.constraint_hyper_samples[mcmc_iter]
            self.mean = hyper[0]
            self.noise = hyper[1]
            self.amp2 = hyper[2]
            self.ls = hyper[3]
            self.constraint_mean = constraint_hyper[0]
            self.constraint_gain = constraint_hyper[1]
            self.constraint_amp2 = constraint_hyper[2]
            self.constraint_ls = constraint_hyper[3]
            if compute_grad:
                (ei, g_ei) = self.grad_optimize_ei(cand, comp, pend, vals, labels, compute_grad)
                summed_grad_ei = summed_grad_ei + g_ei
            else:
                ei = self.grad_optimize_ei(cand, comp, pend, vals, labels, compute_grad)
            summed_ei += ei
        if compute_grad:
            return (summed_ei, summed_grad_ei)
        else:
            return summed_ei

    def check_grad_ei(self, cand, comp, pend, vals, labels):
        (ei, dx1) = self.grad_optimize_ei_over_hypers(cand, comp, pend, vals, labels)
        dx2 = dx1 * 0
        idx = np.zeros(cand.shape[0])
        for i in range(0, cand.shape[0]):
            idx[i] = 1e-6
            (ei1, tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, pend, vals, labels)
            (ei2, tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, pend, vals, labels)
            # Central finite difference using both perturbed evaluations.
            dx2[i] = (ei1 - ei2) / (2 * 1e-6)
            idx[i] = 0
        print("computed grads", dx1)
        print("finite diffs", dx2)
        print(dx1 / dx2)
        print(np.sum((dx1 - dx2) ** 2))
        time.sleep(2)

    def grad_optimize_ei(self, cand, comp, pend, vals, labels, compute_grad=True):
        # Dispatch on whether there are pending jobs, forwarding compute_grad
        # so that value-only callers get a scalar back rather than a tuple.
        if pend.shape[0] == 0:
            return self.grad_optimize_ei_nopend(cand, comp, vals, labels, compute_grad=compute_grad)
        else:
            return self.grad_optimize_ei_pend(cand, comp, pend, vals, labels, compute_grad=compute_grad)

    def grad_optimize_ei_pend(self, cand, comp, pend, vals, labels, compute_grad=True):
        # Here we have to compute the gradients for constrained ei
        # This means deriving through the two kernels, the one for predicting
        # constraint violations and the one predicting ei

        # First pull out violating points
        compfull = comp.copy()
        comp = comp[labels > 0, :]
        vals = vals[labels > 0]

        # Use standard EI if there aren't enough observations of either
        # positive or negative constraint violations
        use_vanilla_ei = np.all(labels > 0) or np.all(labels <= 0)

        best = np.min(vals)
        cand = np.reshape(cand, (-1, comp.shape[1]))
        func_constraint_m = 1

        if not use_vanilla_ei:
            # First we make predictions with the constraint classifier
            # Compute covariances
            comp_constraint_cov = self.cov(self.constraint_amp2, self.constraint_ls, compfull)
            cand_constraint_cross = self.cov(self.constraint_amp2, self.constraint_ls, compfull, cand)

            # Cholesky decompositions
            obsv_constraint_cov = comp_constraint_cov + self.constraint_noise * np.eye(compfull.shape[0])
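# Cholesky of the constraint GP prior at the observed points; solving it
# against the latent function ff below projects the classifier onto the
# candidates.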
obsv_constraint_chol = spla.cholesky(obsv_constraint_cov, lower=True) # Linear systems t_alpha = spla.cho_solve((obsv_constraint_chol, True), self.ff) # Predict marginal mean times and (possibly) variances ff = np.dot(cand_constraint_cross.T, t_alpha) # Squash through Gaussian cdf func_constraint_m = sps.norm.cdf(self.constraint_gain * ff) # Apply covariance function cov_grad_func = getattr(gp, "grad_" + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.constraint_ls, compfull, cand) grad_cross_t = np.squeeze(cand_cross_grad) # Now compute the gradients w.r.t. ei # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) cand_cross = self.cov(self.amp2, self.ls, comp, cand) comp_cov_full = self.cov(self.amp2, self.ls, compfull) cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand) # Create a composite vector of complete and pending. comp_pend = np.concatenate((comp, pend)) # Compute the covariance and Cholesky decomposition. comp_pend_cov = self.cov(self.amp2, self.ls, comp_pend) + self.noise * np.eye(comp_pend.shape[0]) comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True) # Compute submatrices. pend_cross = self.cov(self.amp2, self.ls, comp, pend) pend_kappa = self.cov(self.amp2, self.ls, pend) # Use the sub-Cholesky. obsv_chol = comp_pend_chol[: comp.shape[0], : comp.shape[0]] # Compute the required Cholesky. # obsv_cov = comp_cov + self.noise*np.eye(comp.shape[0]) # obsv_chol = spla.cholesky(obsv_cov, lower=True) obsv_cov_full = comp_cov_full + self.noise * np.eye(compfull.shape[0]) obsv_chol_full = spla.cholesky(obsv_cov_full, lower=True) # Predictive things. # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.cho_solve((obsv_chol, True), pend_cross) # Finding predictive means and variances. pend_m = np.dot(pend_cross.T, alpha) + self.mean pend_K = pend_kappa - np.dot(pend_cross.T, beta) # Take the Cholesky of the predictive covariance. pend_chol = spla.cholesky(pend_K, lower=True) # Make predictions. npr.set_state(self.randomstate) pend_fant = np.dot(pend_chol, npr.randn(pend.shape[0], self.pending_samples)) + pend_m[:, None] # Include the fantasies. fant_vals = np.concatenate((np.tile(vals[:, np.newaxis], (1, self.pending_samples)), pend_fant)) # Compute bests over the fantasies. bests = np.min(fant_vals, axis=0) # Now generalize from these fantasies. cand_cross = self.cov(self.amp2, self.ls, comp_pend, cand) cov_grad_func = getattr(gp, "grad_" + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.ls, comp_pend, cand) # Solve the linear systems. alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean) beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0) # beta = spla.solve_triangular(obsv_chol_full, cand_cross_full, # lower=True) # beta = spla.solve_triangular(obsv_chol, cand_cross, # lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) constrained_ei = -np.sum(ei * func_constraint_m) if not compute_grad: return constrained_ei # Gradients of ei w.r.t. 
mean and variance g_ei_m = -ncdf g_ei_s2 = 0.5 * npdf / func_s # Apply covariance function grad_cross = np.squeeze(cand_cross_grad) grad_xp_m = np.dot(alpha.transpose(), grad_cross) grad_xp_v = np.dot(-2 * spla.cho_solve((comp_pend_chol, True), cand_cross).transpose(), grad_cross) grad_xp = 0.5 * self.amp2 * (grad_xp_m * np.tile(g_ei_m, (comp.shape[1], 1)).T + (grad_xp_v.T * g_ei_s2).T) grad_xp = np.sum(grad_xp, axis=0) if use_vanilla_ei: return -np.sum(ei), grad_xp.flatten() grad_constraint_xp_m = np.dot(t_alpha.transpose(), grad_cross_t) grad_constraint_xp_m = ( 0.5 * self.constraint_amp2 * self.constraint_gain * grad_constraint_xp_m * sps.norm.pdf(self.constraint_gain * ff) ) grad_xp = func_constraint_m * grad_xp + np.sum(ei) * grad_constraint_xp_m return constrained_ei, grad_xp.flatten() def grad_optimize_ei_nopend(self, cand, comp, vals, labels, compute_grad=True): # Here we have to compute the gradients for constrained ei # This means deriving through the two kernels, the one for predicting # constraint violations and the one predicting ei # First pull out violating points compfull = comp.copy() comp = comp[labels > 0, :] vals = vals[labels > 0] # Use standard EI if there aren't enough observations of either # positive or negative constraint violations use_vanilla_ei = np.all(labels > 0) or np.all(labels <= 0) best = np.min(vals) cand = np.reshape(cand, (-1, comp.shape[1])) func_constraint_m = 1 if not use_vanilla_ei: # First we make predictions for the durations # Compute covariances comp_constraint_cov = self.cov(self.constraint_amp2, self.constraint_ls, compfull) cand_constraint_cross = self.cov(self.constraint_amp2, self.constraint_ls, compfull, cand) # Cholesky decompositions obsv_constraint_cov = comp_constraint_cov + self.constraint_noise * np.eye(compfull.shape[0]) obsv_constraint_chol = spla.cholesky(obsv_constraint_cov, lower=True) # Linear systems t_alpha = spla.cho_solve((obsv_constraint_chol, True), self.ff) # Predict marginal mean times and (possibly) variances ff = np.dot(cand_constraint_cross.T, t_alpha) # Squash through Gaussian cdf func_constraint_m = sps.norm.cdf(self.constraint_gain * ff) # Now compute the gradients w.r.t. ei # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) cand_cross = self.cov(self.amp2, self.ls, comp, cand) comp_cov_full = self.cov(self.amp2, self.ls, compfull) cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) obsv_cov_full = comp_cov_full + self.noise * np.eye(compfull.shape[0]) obsv_chol_full = spla.cholesky(obsv_cov_full, lower=True) # Predictive things. # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol_full, cand_cross_full, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) constrained_ei = -np.sum(ei * func_constraint_m) if not compute_grad: return constrained_ei # Gradients of ei w.r.t. 
mean and variance g_ei_m = -ncdf g_ei_s2 = 0.5 * npdf / func_s # Apply covariance function cov_grad_func = getattr(gp, "grad_" + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.ls, comp, cand) grad_cross = np.squeeze(cand_cross_grad) cand_cross_grad_full = cov_grad_func(self.ls, compfull, cand) grad_cross_full = np.squeeze(cand_cross_grad_full) grad_xp_m = np.dot(alpha.transpose(), grad_cross) grad_xp_v = np.dot(-2 * spla.cho_solve((obsv_chol_full, True), cand_cross_full).transpose(), grad_cross_full) grad_xp = 0.5 * self.amp2 * (grad_xp_m * g_ei_m + grad_xp_v * g_ei_s2) if use_vanilla_ei: return -np.sum(ei), grad_xp.flatten() # Apply constraint classifier cand_cross_grad = cov_grad_func(self.constraint_ls, compfull, cand) grad_cross_t = np.squeeze(cand_cross_grad) grad_constraint_xp_m = np.dot(t_alpha.transpose(), grad_cross_t) grad_constraint_xp_m = ( 0.5 * self.constraint_amp2 * self.constraint_gain * grad_constraint_xp_m * sps.norm.pdf(self.constraint_gain * ff) ) grad_xp = func_constraint_m * grad_xp + ei * grad_constraint_xp_m return constrained_ei, grad_xp.flatten() def compute_constrained_ei(self, comp, pend, cand, vals, labels): # First we make predictions for the durations as that # doesn't depend on pending experiments # First pull out violating points compfull = comp.copy() comp = comp[labels > 0, :] vals = vals[labels > 0] # Use standard EI if there aren't enough observations of either # positive or negative constraint violations if np.all(labels > 0) or np.all(labels <= 0): func_constraint_m = 1 else: # Compute covariances comp_constraint_cov = self.cov(self.constraint_amp2, self.constraint_ls, compfull) cand_constraint_cross = self.cov(self.constraint_amp2, self.constraint_ls, compfull, cand) # Cholesky decompositions obsv_constraint_cov = comp_constraint_cov + self.constraint_noise * np.eye(compfull.shape[0]) obsv_constraint_chol = spla.cholesky(obsv_constraint_cov, lower=True) # Linear systems t_alpha = spla.cho_solve((obsv_constraint_chol, True), self.ff) t_beta = spla.solve_triangular(obsv_constraint_chol, cand_constraint_cross, lower=True) # Predict marginal mean times and (possibly) variances func_constraint_m = np.dot(cand_constraint_cross.T, t_alpha) # Squash through a probit func_constraint_m = sps.norm.cdf(self.constraint_gain * func_constraint_m) if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. best = np.min(vals) # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) comp_cov_full = self.cov(self.amp2, self.ls, compfull) cand_cross = self.cov(self.amp2, self.ls, comp, cand) cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0]) obsv_cov_full = comp_cov_full + self.noise * np.eye(compfull.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) obsv_chol_full = spla.cholesky(obsv_cov_full, lower=True) # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # beta = spla.solve_triangular(obsv_chol_full, cand_cross_full, # lower=True) # Predict the marginal means and variances at candidates. 
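# Closed-form expected improvement: with u = (best - m) / s,
# EI = s * (u * Phi(u) + phi(u)), where Phi and phi are the standard
# normal cdf and pdf.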
func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) constrained_ei = ei * func_constraint_m return constrained_ei else: # If there are pending experiments, fantasize their outcomes. # Create a composite vector of complete and pending. comp_pend = np.concatenate((comp, pend)) # Compute the covariance and Cholesky decomposition. comp_pend_cov = self.cov(self.amp2, self.ls, comp_pend) + self.noise * np.eye(comp_pend.shape[0]) comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True) # Compute submatrices. pend_cross = self.cov(self.amp2, self.ls, comp, pend) pend_kappa = self.cov(self.amp2, self.ls, pend) # Use the sub-Cholesky. obsv_chol = comp_pend_chol[: comp.shape[0], : comp.shape[0]] # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.cho_solve((obsv_chol, True), pend_cross) # Finding predictive means and variances. pend_m = np.dot(pend_cross.T, alpha) + self.mean pend_K = pend_kappa - np.dot(pend_cross.T, beta) # Take the Cholesky of the predictive covariance. pend_chol = spla.cholesky(pend_K, lower=True) # Make predictions. pend_fant = np.dot(pend_chol, npr.randn(pend.shape[0], self.pending_samples)) + pend_m[:, None] # Include the fantasies. fant_vals = np.concatenate((np.tile(vals[:, np.newaxis], (1, self.pending_samples)), pend_fant)) # Compute bests over the fantasies. bests = np.min(fant_vals, axis=0) # Now generalize from these fantasies. cand_cross = self.cov(self.amp2, self.ls, comp_pend, cand) # Solve the linear systems. alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean) beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0) # Expected improvement func_s = np.sqrt(func_v[:, np.newaxis]) u = (bests[np.newaxis, :] - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) return np.mean(ei, axis=1) * func_constraint_m def compute_ei(self, comp, pend, cand, vals, labels): # First we make predictions for the durations as that # doesn't depend on pending experiments # First pull out violating points compfull = comp.copy() comp = comp[labels > 0, :] vals = vals[labels > 0] # Compute covariances comp_constraint_cov = self.cov(self.constraint_amp2, self.constraint_ls, compfull) cand_constraint_cross = self.cov(self.constraint_amp2, self.constraint_ls, compfull, cand) # Cholesky decompositions obsv_constraint_cov = comp_constraint_cov + self.constraint_noise * np.eye(compfull.shape[0]) obsv_constraint_chol = spla.cholesky(obsv_constraint_cov, lower=True) # Linear systems t_alpha = spla.cho_solve((obsv_constraint_chol, True), self.ff) # Predict marginal mean times and (possibly) variances func_constraint_m = np.dot(cand_constraint_cross.T, t_alpha) # Squash through a probit to get prob of not violating a constraint func_constraint_m = 1.0 / (1 + np.exp(-self.constraint_gain * func_constraint_m)) if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. best = np.min(vals) # The primary covariances for prediction. 
comp_cov = self.cov(self.amp2, self.ls, comp) comp_cov_full = self.cov(self.amp2, self.ls, compfull) cand_cross = self.cov(self.amp2, self.ls, comp, cand) cand_cross_full = self.cov(self.amp2, self.ls, compfull, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise * np.eye(comp.shape[0]) obsv_cov_full = comp_cov_full + self.noise * np.eye(compfull.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) obsv_chol_full = spla.cholesky(obsv_cov_full, lower=True) # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # beta = spla.solve_triangular(obsv_chol_full, cand_cross_full, # lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2 * (1 + 1e-6) - np.sum(beta ** 2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s * (u * ncdf + npdf) return ei else: return 0 def sample_constraint_hypers(self, comp, labels): # The latent GP projection # The latent GP projection if self.ff is None or self.ff.shape[0] < comp.shape[0]: self.ff_samples = [] comp_cov = self.cov(self.constraint_amp2, self.constraint_ls, comp) obsv_cov = comp_cov + 1e-6 * np.eye(comp.shape[0]) obsv_chol = spla.cholesky(obsv_cov, lower=True) self.ff = np.dot(obsv_chol, npr.randn(obsv_chol.shape[0])) self._sample_constraint_noisy(comp, labels) self._sample_constraint_ls(comp, labels) self.constraint_hyper_samples.append( (self.constraint_mean, self.constraint_gain, self.constraint_amp2, self.constraint_ls) ) self.ff_samples.append(self.ff) def sample_hypers(self, comp, vals): if self.noiseless: self.noise = 1e-3 self._sample_noiseless(comp, vals) else: self._sample_noisy(comp, vals) self._sample_ls(comp, vals) self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) def _sample_ls(self, comp, vals): def logprob(ls): if np.any(ls < 0) or np.any(ls > self.max_ls): return -np.inf cov = self.amp2 * (self.cov_func(ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) + self.noise * np.eye( comp.shape[0] ) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - self.mean) lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - self.mean, solve) return lp self.ls = util.slice_sample(self.ls, logprob, compwise=True) def _sample_constraint_ls(self, comp, vals): def lpProbit(ff, gain=self.constraint_gain): probs = sps.norm.cdf(ff * gain) probs[probs <= 0] = 1e-12 probs[probs >= 1] = 1 - 1e-12 llh = np.sum(vals * np.log(probs) + (1 - vals) * np.log(1 - probs)) return llh def lpSigmoid(ff, gain=self.constraint_gain): probs = 1.0 / (1.0 + np.exp(-gain * ff)) probs[probs <= 0] = 1e-12 probs[probs >= 1] = 1 - 1e-12 llh = np.sum(vals * np.log(probs) + (1 - vals) * np.log(1 - probs)) return llh def updateGain(gain): if gain < 0.01 or gain > 10: return -np.inf cov = self.constraint_amp2 * ( self.cov_func(self.constraint_ls, comp, None) + 1e-6 * np.eye(comp.shape[0]) ) + self.constraint_noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals) lp = lpProbit(self.ff, gain) return lp def logprob(ls): if np.any(ls < 0) or np.any(ls > self.constraint_max_ls): return -np.inf cov = self.constraint_amp2 * ( self.cov_func(ls, comp, None) + 1e-6 * np.eye(comp.shape[0]) ) + self.constraint_noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, 
True), self.ff) lp = lpProbit(self.ff) return lp hypers = util.slice_sample(self.constraint_ls, logprob, compwise=True) self.constraint_ls = hypers cov = self.constraint_amp2 * ( self.cov_func(self.constraint_ls, comp, None) + 1e-6 * np.eye(comp.shape[0]) ) + self.constraint_noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=False) ff = self.ff for jj in range(20): (ff, lpell) = self.elliptical_slice(ff, chol, lpProbit) self.ff = ff # Update gain hypers = util.slice_sample(np.array([self.constraint_gain]), updateGain, compwise=True) self.constraint_gain = hypers[0] def _sample_noisy(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = hypers[2] # This is pretty hacky, but keeps things sane. if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0 or noise < 0: return -np.inf cov = amp2 * ( (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) + noise * np.eye(comp.shape[0]) ) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - mean, solve) # Roll in noise horseshoe prior. lp += np.log(np.log(1 + (self.noise_scale / noise) ** 2)) # Roll in amplitude lognormal prior lp -= 0.5 * (np.log(amp2) / self.amp2_scale) ** 2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = hypers[2] def _sample_constraint_noisy(self, comp, vals): def lpProbit(ff, gain=self.constraint_gain): probs = sps.norm.cdf(ff * gain) probs[probs <= 0] = 1e-12 probs[probs >= 1] = 1 - 1e-12 llh = np.sum(vals * np.log(probs) + (1 - vals) * np.log(1 - probs)) if np.any(np.isnan(probs)): print(probs) return llh def lpSigmoid(ff, gain=self.constraint_gain): probs = 1.0 / (1.0 + np.exp(-gain * ff)) probs[probs <= 0] = 1e-12 probs[probs >= 1] = 1 - 1e-12 llh = np.sum(vals * np.log(probs) + (1 - vals) * np.log(1 - probs)) return llh def logprob(hypers): amp2 = hypers[0] ff = hypers[1:] if amp2 < 0: return -np.inf noise = self.constraint_noise cov = amp2 * ( self.cov_func(self.constraint_ls, comp, None) + 1e-6 * np.eye(comp.shape[0]) ) + noise * np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), ff) lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(ff, solve) # Roll in amplitude lognormal prior lp -= 0.5 * (np.log(amp2) / self.constraint_amp2_scale) ** 2 lp += lpProbit(ff, self.constraint_gain) return lp hypers = util.slice_sample(np.hstack((np.array([self.constraint_amp2]), self.ff)), logprob, compwise=False) self.constraint_amp2 = hypers[0] self.ff = hypers[1:] cov = self.constraint_amp2 * ( (self.cov_func(self.constraint_ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) + self.constraint_noise * np.eye(comp.shape[0]) ) chol = spla.cholesky(cov, lower=False) ff = self.ff for jj in range(50): (ff, lpell) = self.elliptical_slice(ff, chol, lpProbit) self.ff = ff def _sample_noiseless(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = 1e-3 # This is pretty hacky, but keeps things sane. 
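# Restricting the mean to the observed range acts as a crude uniform prior
# and keeps the slice sampler from wandering into degenerate regions.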
if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0: return -np.inf cov = amp2 * ( (self.cov_func(self.ls, comp, None) + 1e-6 * np.eye(comp.shape[0])) + noise * np.eye(comp.shape[0]) ) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol))) - 0.5 * np.dot(vals - mean, solve) # Roll in amplitude lognormal prior lp -= 0.5 * (np.log(amp2) / self.amp2_scale) ** 2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = 1e-3 def elliptical_slice(self, xx, chol_Sigma, log_like_fn, cur_log_like=None, angle_range=0): D = xx.shape[0] if cur_log_like is None: cur_log_like = log_like_fn(xx) nu = np.dot(chol_Sigma.T, np.random.randn(D, 1)).flatten() hh = np.log(np.random.rand()) + cur_log_like # Set up a bracket of angles and pick a first proposal. # "phi = (theta'-theta)" is a change in angle. if angle_range <= 0: # Bracket whole ellipse with both edges at first proposed point phi = np.random.rand() * 2 * math.pi phi_min = phi - 2 * math.pi phi_max = phi else: # Randomly center bracket on current point phi_min = -angle_range * np.random.rand() phi_max = phi_min + angle_range phi = np.random.rand() * (phi_max - phi_min) + phi_min # Slice sampling loop while True: # Compute xx for proposed angle difference # and check if it's on the slice xx_prop = xx * np.cos(phi) + nu * np.sin(phi) cur_log_like = log_like_fn(xx_prop) if cur_log_like > hh: # New point is on slice, ** EXIT LOOP ** break # Shrink slice to rejected point if phi > 0: phi_max = phi elif phi < 0: phi_min = phi else: raise Exception("BUG DETECTED: Shrunk to current position " "and still not acceptable.") # Propose new angle difference phi = np.random.rand() * (phi_max - phi_min) + phi_min xx = xx_prop return (xx, cur_log_like)
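# To make the acquisition concrete, a self-contained toy calculation (an
# added illustration with made-up numbers, not part of the original module)
# of the quantity compute_constrained_ei ranks candidates by: EI weighted by
# the probit probability of feasibility.
def _demo_constrained_ei():
    best = 0.10                              # best feasible value so far
    func_m = np.array([0.12, 0.08, 0.20])    # GP posterior means
    func_s = np.array([0.05, 0.01, 0.10])    # GP posterior std deviations
    p_valid = np.array([0.95, 0.30, 0.90])   # probit P(no constraint violation)
    u = (best - func_m) / func_s
    ei = func_s * (u * sps.norm.cdf(u) + sps.norm.pdf(u))
    # Candidate 0 wins: candidate 1 has the lowest posterior mean but is
    # probably infeasible, so its EI is discounted by p_valid.
    return ei * p_valid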
class GPEIperSecChooser: def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10, pending_samples=100, noiseless=False, burnin=100, grid_subset=20): self.cov_func = getattr(gp, covar) self.locker = Locker() self.state_pkl = os.path.join(expt_dir, self.__module__ + ".pkl") self.stats_file = os.path.join(expt_dir, self.__module__ + "_hyperparameters.txt") self.mcmc_iters = int(mcmc_iters) self.burnin = int(burnin) self.needs_burnin = True self.pending_samples = pending_samples self.D = -1 self.hyper_iters = 1 # Number of points to optimize EI over self.grid_subset = int(grid_subset) self.noiseless = bool(int(noiseless)) self.hyper_samples = [] self.time_hyper_samples = [] self.noise_scale = 0.1 # horseshoe prior self.amp2_scale = 1 # zero-mean log normal prior self.max_ls = 10 # top-hat prior on length scales self.time_noise_scale = 0.1 # horseshoe prior self.time_amp2_scale = 1 # zero-mean log normal prior self.time_max_ls = 10 # top-hat prior on length scales # A simple function to dump out hyperparameters to allow for a hot start # if the optimization is restarted. def dump_hypers(self): self.locker.lock_wait(self.state_pkl) # Write the hyperparameters out to a Pickle. fh = tempfile.NamedTemporaryFile(mode='w', delete=False) cPickle.dump({ 'dims' : self.D, 'ls' : self.ls, 'amp2' : self.amp2, 'noise' : self.noise, 'mean' : self.mean, 'time_ls' : self.time_ls, 'time_amp2' : self.time_amp2, 'time_noise' : self.time_noise, 'time_mean' : self.time_mean }, fh) fh.close() # Use an atomic move for better NFS happiness. cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl) os.system(cmd) # TODO: Should check system-dependent return status. self.locker.unlock(self.state_pkl) def _real_init(self, dims, values, durations): self.locker.lock_wait(self.state_pkl) if os.path.exists(self.state_pkl): fh = open(self.state_pkl, 'r') state = cPickle.load(fh) fh.close() self.D = state['dims'] self.ls = state['ls'] self.amp2 = state['amp2'] self.noise = state['noise'] self.mean = state['mean'] self.time_ls = state['time_ls'] self.time_amp2 = state['time_amp2'] self.time_noise = state['time_noise'] self.time_mean = state['time_mean'] else: # Input dimensionality. self.D = dims # Initial length scales. self.ls = np.ones(self.D) self.time_ls = np.ones(self.D) # Initial amplitude. self.amp2 = np.std(values)+1e-4 self.time_amp2 = np.std(durations)+1e-4 # Initial observation noise. self.noise = 1e-3 self.time_noise = 1e-3 # Initial mean. self.mean = np.mean(values) self.time_mean = np.mean(np.log(durations)) self.locker.unlock(self.state_pkl) def cov(self, amp2, ls, x1, x2=None): if x2 is None: return amp2 * (self.cov_func(ls, x1, None) + 1e-6*np.eye(x1.shape[0])) else: return amp2 * self.cov_func(ls, x1, x2) # Given a set of completed 'experiments' in the unit hypercube with # corresponding objective 'values', pick from the next experiment to # run according to the acquisition function. def next(self, grid, values, durations, candidates, pending, complete): # Don't bother using fancy GP stuff at first. if complete.shape[0] < 2: return int(candidates[0]) # Perform the real initialization. if self.D == -1: self._real_init(grid.shape[1], values[complete], durations[complete]) # Grab out the relevant sets. 
comp = grid[complete,:] cand = grid[candidates,:] pend = grid[pending,:] vals = values[complete] durs = durations[complete] # Bring time into the log domain before we do anything # to maintain strict positivity durs = np.log(durs) # Spray a set of candidates around the min so far numcand = cand.shape[0] best_comp = np.argmin(vals) cand2 = np.vstack((np.random.randn(10,comp.shape[1])*0.001 + comp[best_comp,:], cand)) if self.mcmc_iters > 0: # Possibly burn in. if self.needs_burnin: for mcmc_iter in range(self.burnin): self.sample_hypers(comp, vals, durs) log("BURN %d/%d] mean: %.2f amp: %.2f " "noise: %.4f min_ls: %.4f max_ls: %.4f" % (mcmc_iter+1, self.burnin, self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls))) self.needs_burnin = False # Sample from hyperparameters. # Adjust the candidates to hit ei/sec peaks self.hyper_samples = [] for mcmc_iter in range(self.mcmc_iters): self.sample_hypers(comp, vals, durs) log("%d/%d] mean: %.2f amp: %.2f noise: %.4f " "min_ls: %.4f max_ls: %.4f" % (mcmc_iter+1, self.mcmc_iters, self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls))) log("%d/%d] time_mean: %.2fs time_amp: %.2f time_noise: %.4f " "time_min_ls: %.4f time_max_ls: %.4f" % (mcmc_iter+1, self.mcmc_iters, np.exp(self.time_mean), np.sqrt(self.time_amp2), np.exp(self.time_noise), np.min(self.time_ls), np.max(self.time_ls))) self.dump_hypers() # Pick the top candidates to optimize over overall_ei = self.ei_over_hypers(comp,pend,cand2,vals,durs) inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:] cand2 = cand2[inds,:] # Adjust the candidates to hit ei peaks b = []# optimization bounds for i in range(0, cand.shape[1]): b.append((0, 1)) for i in range(0, cand2.shape[0]): log("Optimizing candidate %d/%d" % (i+1, cand2.shape[0])) ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei_over_hypers, cand2[i,:].flatten(), args=(comp,vals,durs,True), bounds=b, disp=0) cand2[i,:] = ret[0] cand = np.vstack((cand, cand2)) overall_ei = self.ei_over_hypers(comp,pend,cand,vals,durs) best_cand = np.argmax(np.mean(overall_ei, axis=1)) self.dump_hypers() if (best_cand >= numcand): return (int(numcand), cand[best_cand,:]) return int(candidates[best_cand]) else: # Optimize hyperparameters self.optimize_hypers(comp, vals, durs) log("mean: %f amp: %f noise: %f " "min_ls: %f max_ls: %f" % (self.mean, np.sqrt(self.amp2), self.noise, np.min(self.ls), np.max(self.ls))) # Pick the top candidates to optimize over ei = self.compute_ei_per_s(comp, pend, cand2, vals, durs) inds = np.argsort(np.mean(overall_ei, axis=1))[-self.grid_subset:] cand2 = cand2[inds,:] # Adjust the candidates to hit ei peaks b = []# optimization bounds for i in range(0, cand.shape[1]): b.append((0, 1)) for i in range(0, cand2.shape[0]): log("Optimizing candidate %d/%d" % (i+1, cand2.shape[0])) ret = spo.fmin_l_bfgs_b(self.grad_optimize_ei, cand2[i,:].flatten(), args=(comp,vals,durs,True), bounds=b, disp=0) cand2[i,:] = ret[0] cand = np.vstack((cand, cand2)) ei = self.compute_ei_per_s(comp, pend, cand, vals, durs) best_cand = np.argmax(ei) self.dump_hypers() if (best_cand >= numcand): return (int(numcand), cand[best_cand,:]) return int(candidates[best_cand]) # Compute EI over hyperparameter samples def ei_over_hypers(self,comp,pend,cand,vals,durs): overall_ei = np.zeros((cand.shape[0], self.mcmc_iters)) for mcmc_iter in range(self.mcmc_iters): hyper = self.hyper_samples[mcmc_iter] time_hyper = self.time_hyper_samples[mcmc_iter] self.mean = hyper[0] self.noise = hyper[1] self.amp2 = hyper[2] self.ls = 
hyper[3] self.time_mean = time_hyper[0] self.time_noise = time_hyper[1] self.time_amp2 = time_hyper[2] self.time_ls = time_hyper[3] overall_ei[:,mcmc_iter] = self.compute_ei_per_s(comp, pend, cand, vals, durs.squeeze()) return overall_ei def check_grad_ei_per(self, cand, comp, vals, durs): (ei,dx1) = self.grad_optimize_ei_over_hypers(cand, comp, vals, durs) dx2 = dx1*0 idx = np.zeros(cand.shape[0]) for i in range(0, cand.shape[0]): idx[i] = 1e-6 (ei1,tmp) = self.grad_optimize_ei_over_hypers(cand + idx, comp, vals, durs) (ei2,tmp) = self.grad_optimize_ei_over_hypers(cand - idx, comp, vals, durs) dx2[i] = (ei - ei2)/(2*1e-6) idx[i] = 0 print('computed grads', dx1) print('finite diffs', dx2) print(dx1/dx2) print(np.sum((dx1 - dx2)**2)) time.sleep(2) # Adjust points by optimizing EI over a set of hyperparameter samples def grad_optimize_ei_over_hypers(self, cand, comp, vals, durs, compute_grad=True): summed_ei = 0 summed_grad_ei = np.zeros(cand.shape).flatten() for mcmc_iter in range(self.mcmc_iters): hyper = self.hyper_samples[mcmc_iter] time_hyper = self.time_hyper_samples[mcmc_iter] self.mean = hyper[0] self.noise = hyper[1] self.amp2 = hyper[2] self.ls = hyper[3] self.time_mean = time_hyper[0] self.time_noise = time_hyper[1] self.time_amp2 = time_hyper[2] self.time_ls = time_hyper[3] if compute_grad: (ei,g_ei) = self.grad_optimize_ei(cand,comp,vals,durs,compute_grad) summed_grad_ei = summed_grad_ei + g_ei else: ei = self.grad_optimize_ei(cand,comp,vals,durs,compute_grad) summed_ei += ei if compute_grad: return (summed_ei, summed_grad_ei) else: return summed_ei def grad_optimize_ei(self, cand, comp, vals, durs, compute_grad=True): # Here we have to compute the gradients for ei per second # This means deriving through the two kernels, the one for predicting # time and the one predicting ei best = np.min(vals) cand = np.reshape(cand, (-1, comp.shape[1])) # First we make predictions for the durations # Compute covariances comp_time_cov = self.cov(self.time_amp2, self.time_ls, comp) cand_time_cross = self.cov(self.time_amp2, self.time_ls,comp,cand) # Cholesky decompositions obsv_time_cov = comp_time_cov + self.time_noise*np.eye(comp.shape[0]) obsv_time_chol = spla.cholesky( obsv_time_cov, lower=True ) # Linear systems t_alpha = spla.cho_solve((obsv_time_chol, True), durs - self.time_mean) # Predict marginal mean times and (possibly) variances func_time_m = np.dot(cand_time_cross.T, t_alpha) + self.time_mean # We don't really need the time variances now #func_time_v = self.time_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0) # Bring time out of the log domain func_time_m = np.exp(func_time_m) # Compute derivative of cross-distances. grad_cross_r = gp.grad_dist2(self.time_ls, comp, cand) # Apply covariance function cov_grad_func = getattr(gp, 'grad_' + self.cov_func.__name__) cand_cross_grad = cov_grad_func(self.time_ls, comp, cand) grad_cross_t = np.squeeze(cand_cross_grad) # Now compute the gradients w.r.t. ei # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) cand_cross = self.cov(self.amp2, self.ls, comp, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise*np.eye(comp.shape[0]) obsv_chol = spla.cholesky( obsv_cov, lower=True ) cand_cross_grad = cov_grad_func(self.ls, comp, cand) # Predictive things. # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. 
func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s*(u*ncdf + npdf) ei_per_s = -np.sum(ei/func_time_m) if not compute_grad: return ei grad_time_xp_m = np.dot(t_alpha.transpose(),grad_cross_t) # Gradients of ei w.r.t. mean and variance g_ei_m = -ncdf g_ei_s2 = 0.5*npdf / func_s # Apply covariance function grad_cross = np.squeeze(cand_cross_grad) grad_xp_m = np.dot(alpha.transpose(),grad_cross) grad_xp_v = np.dot(-2*spla.cho_solve((obsv_chol, True), cand_cross).transpose(),grad_cross) grad_xp = 0.5*self.amp2*(grad_xp_m*g_ei_m + grad_xp_v*g_ei_s2) grad_time_xp_m = 0.5*self.time_amp2*grad_time_xp_m*func_time_m grad_xp = (func_time_m*grad_xp - ei*grad_time_xp_m)/(func_time_m**2) return ei_per_s, grad_xp.flatten() def compute_ei_per_s(self, comp, pend, cand, vals, durs): # First we make predictions for the durations as that # doesn't depend on pending experiments # Compute covariances comp_time_cov = self.cov(self.time_amp2, self.time_ls, comp) cand_time_cross = self.cov(self.time_amp2, self.time_ls,comp,cand) # Cholesky decompositions obsv_time_cov = comp_time_cov + self.time_noise*np.eye(comp.shape[0]) obsv_time_chol = spla.cholesky( obsv_time_cov, lower=True ) # Linear systems t_alpha = spla.cho_solve((obsv_time_chol, True), durs - self.time_mean) #t_beta = spla.solve_triangular(obsv_time_chol, cand_time_cross, lower=True) # Predict marginal mean times and (possibly) variances func_time_m = np.dot(cand_time_cross.T, t_alpha) + self.time_mean # We don't really need the time variances now #func_time_v = self.time_amp2*(1+1e-6) - np.sum(t_beta**2, axis=0) # Bring time out of the log domain func_time_m = np.exp(func_time_m) if pend.shape[0] == 0: # If there are no pending, don't do anything fancy. # Current best. best = np.min(vals) # The primary covariances for prediction. comp_cov = self.cov(self.amp2, self.ls, comp) cand_cross = self.cov(self.amp2, self.ls, comp, cand) # Compute the required Cholesky. obsv_cov = comp_cov + self.noise*np.eye(comp.shape[0]) obsv_chol = spla.cholesky( obsv_cov, lower=True ) # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.solve_triangular(obsv_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v) u = (best - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s*( u*ncdf + npdf) ei_per_s = ei/func_time_m return ei_per_s else: # If there are pending experiments, fantasize their outcomes. # Create a composite vector of complete and pending. comp_pend = np.concatenate((comp, pend)) # Compute the covariance and Cholesky decomposition. comp_pend_cov = self.cov(self.amp2, self.ls, comp_pend) + self.noise*np.eye(comp_pend.shape[0]) comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True) # Compute submatrices. pend_cross = self.cov(self.amp2, self.ls, comp, pend) pend_kappa = self.cov(self.amp2, self.ls, pend) # Use the sub-Cholesky. obsv_chol = comp_pend_chol[:comp.shape[0],:comp.shape[0]] # Solve the linear systems. alpha = spla.cho_solve((obsv_chol, True), vals - self.mean) beta = spla.cho_solve((obsv_chol, True), pend_cross) # Finding predictive means and variances. 
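# GP predictive mean and covariance at the pending points; the Cholesky of
# pend_K below lets correlated fantasy outcomes be drawn with a single
# matrix multiply.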
pend_m = np.dot(pend_cross.T, alpha) + self.mean pend_K = pend_kappa - np.dot(pend_cross.T, beta) # Take the Cholesky of the predictive covariance. pend_chol = spla.cholesky(pend_K, lower=True) # Make predictions. pend_fant = np.dot(pend_chol, npr.randn(pend.shape[0],self.pending_samples)) + pend_m[:,None] # Include the fantasies. fant_vals = np.concatenate((np.tile(vals[:,np.newaxis], (1,self.pending_samples)), pend_fant)) # Compute bests over the fantasies. bests = np.min(fant_vals, axis=0) # Now generalize from these fantasies. cand_cross = self.cov(self.amp2, self.ls, comp_pend, cand) # Solve the linear systems. alpha = spla.cho_solve((comp_pend_chol, True), fant_vals - self.mean) beta = spla.solve_triangular(comp_pend_chol, cand_cross, lower=True) # Predict the marginal means and variances at candidates. func_m = np.dot(cand_cross.T, alpha) + self.mean func_v = self.amp2*(1+1e-6) - np.sum(beta**2, axis=0) # Expected improvement func_s = np.sqrt(func_v[:,np.newaxis]) u = (bests[np.newaxis,:] - func_m) / func_s ncdf = sps.norm.cdf(u) npdf = sps.norm.pdf(u) ei = func_s*( u*ncdf + npdf) return np.divide(np.mean(ei, axis=1), func_time_m) def sample_hypers(self, comp, vals, durs): if self.noiseless: self.noise = 1e-3 self._sample_noiseless(comp, vals) else: self._sample_noisy(comp, vals) self._sample_ls(comp, vals) self._sample_time_noisy(comp, durs.squeeze()) self._sample_time_ls(comp, durs.squeeze()) self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) self.time_hyper_samples.append((self.time_mean, self.time_noise, self.time_amp2, self.time_ls)) def _sample_ls(self, comp, vals): def logprob(ls): if np.any(ls < 0) or np.any(ls > self.max_ls): return -np.inf cov = self.amp2 * (self.cov_func(ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + self.noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - self.mean) lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-self.mean, solve) return lp self.ls = util.slice_sample(self.ls, logprob, compwise=True) def _sample_time_ls(self, comp, vals): def logprob(ls): if np.any(ls < 0) or np.any(ls > self.time_max_ls): return -np.inf cov = self.time_amp2 * (self.cov_func(ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + self.time_noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - self.time_mean) lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-self.time_mean, solve) return lp self.time_ls = util.slice_sample(self.time_ls, logprob, compwise=True) def _sample_noisy(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = hypers[2] # This is pretty hacky, but keeps things sane. if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0 or noise < 0: return -np.inf cov = amp2 * (self.cov_func(self.ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-mean, solve) # Roll in noise horseshoe prior. 
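# The horseshoe prior has no closed-form density; the expression below is
# a common approximation to its log density, log(log(1 + (scale/noise)^2)),
# up to an additive constant.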
lp += np.log(np.log(1 + (self.noise_scale/noise)**2)) #lp -= 0.5*(np.log(noise)/self.noise_scale)**2 # Roll in amplitude lognormal prior lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = hypers[2] def _sample_time_noisy(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = hypers[2] # This is pretty hacky, but keeps things sane. if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0 or noise < 0: return -np.inf cov = amp2 * (self.cov_func(self.time_ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-mean, solve) # Roll in noise horseshoe prior. lp += np.log(np.log(1 + (self.time_noise_scale/noise)**2)) #lp -= 0.5*(np.log(noise)/self.time_noise_scale)**2 # Roll in amplitude lognormal prior lp -= 0.5*(np.log(np.sqrt(amp2))/self.time_amp2_scale)**2 return lp hypers = util.slice_sample(np.array([self.time_mean, self.time_amp2, self.time_noise]), logprob, compwise=False) self.time_mean = hypers[0] self.time_amp2 = hypers[1] self.time_noise = hypers[2] def _sample_noiseless(self, comp, vals): def logprob(hypers): mean = hypers[0] amp2 = hypers[1] noise = 1e-3 # This is pretty hacky, but keeps things sane. if mean > np.max(vals) or mean < np.min(vals): return -np.inf if amp2 < 0: return -np.inf cov = amp2 * (self.cov_func(self.ls, comp, None) + 1e-6*np.eye(comp.shape[0])) + noise*np.eye(comp.shape[0]) chol = spla.cholesky(cov, lower=True) solve = spla.cho_solve((chol, True), vals - mean) lp = -np.sum(np.log(np.diag(chol)))-0.5*np.dot(vals-mean, solve) # Roll in amplitude lognormal prior lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2 return lp hypers = util.slice_sample(np.array([self.mean, self.amp2, self.noise]), logprob, compwise=False) self.mean = hypers[0] self.amp2 = hypers[1] self.noise = 1e-3 def optimize_hypers(self, comp, vals, durs): # First the GP to observations mygp = gp.GP(self.cov_func.__name__) mygp.real_init(comp.shape[1], vals) mygp.optimize_hypers(comp,vals) self.mean = mygp.mean self.ls = mygp.ls self.amp2 = mygp.amp2 self.noise = mygp.noise # Now the GP to times timegp = gp.GP(self.cov_func.__name__) timegp.real_init(comp.shape[1], durs) timegp.optimize_hypers(comp, durs) self.time_mean = timegp.mean self.time_amp2 = timegp.amp2 self.time_noise = timegp.noise self.time_ls = timegp.ls # Save hyperparameter samples self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls)) self.time_hyper_samples.append((self.time_mean, self.time_noise, self.time_amp2, self.time_ls)) self.dump_hypers()
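# A toy illustration (made-up numbers, not part of the original module) of
# why this chooser models log-durations: two candidates with identical EI
# are ranked very differently once EI is divided by the predicted runtime.
def _demo_ei_per_second():
    ei = np.array([0.02, 0.02])                          # identical raw EI
    log_dur_m = np.array([np.log(60.0), np.log(600.0)])  # predicted log-seconds
    func_time_m = np.exp(log_dur_m)                      # back out of the log domain
    return ei / func_time_m      # the one-minute job scores 10x higher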
class GPEIChooser:

    def __init__(self, expt_dir, covar="Matern52", mcmc_iters=10,
                 pending_samples=100, noiseless=False):
        self.cov_func        = getattr(gp, covar)
        self.locker          = Locker()
        self.state_pkl       = os.path.join(expt_dir,
                                            self.__module__ + ".pkl")
        self.mcmc_iters      = int(mcmc_iters)
        self.pending_samples = int(pending_samples)
        self.D               = -1
        self.hyper_iters     = 1
        self.noiseless       = bool(int(noiseless))
        self.noise_scale     = 0.1  # horseshoe prior
        self.amp2_scale      = 1    # zero-mean log normal prior
        self.max_ls          = 2    # top-hat prior on length scales
        self.ls              = None

    def __del__(self):
        self.locker.lock_wait(self.state_pkl)

        # Write the hyperparameters out to a Pickle.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'dims'  : self.D,
                      'ls'    : self.ls,
                      'amp2'  : self.amp2,
                      'noise' : self.noise,
                      'mean'  : self.mean}, fh)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s"' % (fh.name, self.state_pkl)
        os.system(cmd)  # TODO: Should check system-dependent return status.

        self.locker.unlock(self.state_pkl)

    def _real_init(self, dims, values):
        self.locker.lock_wait(self.state_pkl)

        if os.path.exists(self.state_pkl):
            # Restore the hyperparameters from the last run.
            fh    = open(self.state_pkl, 'r')
            state = cPickle.load(fh)
            fh.close()

            self.D     = state['dims']
            self.ls    = state['ls']
            self.amp2  = state['amp2']
            self.noise = state['noise']
            self.mean  = state['mean']
        else:
            # Input dimensionality.
            self.D = dims

            # Initial length scales.
            self.ls = np.ones(self.D)

            # Initial amplitude.
            self.amp2 = np.std(values) + 1e-4

            # Initial observation noise.
            self.noise = 1e-3

            # Initial mean.
            self.mean = np.mean(values)

        self.locker.unlock(self.state_pkl)

    def cov(self, x1, x2=None):
        if x2 is None:
            return self.amp2 * (self.cov_func(self.ls, x1, None)
                                + 1e-6*np.eye(x1.shape[0]))
        else:
            return self.amp2 * self.cov_func(self.ls, x1, x2)

    def next(self, grid, values, durations, candidates, pending, complete):
        # Don't bother using fancy GP stuff at first.
        if complete.shape[0] < 2:
            return int(candidates[0])

        # Perform the real initialization.
        if self.D == -1:
            self._real_init(grid.shape[1], values[complete])

        # Grab out the relevant sets.
        comp = grid[complete, :]
        cand = grid[candidates, :]
        pend = grid[pending, :]
        vals = values[complete]

        if self.mcmc_iters > 0:
            # Sample from the hyperparameter posterior and average EI
            # across the samples.
            overall_ei = np.zeros((cand.shape[0], self.mcmc_iters))

            for mcmc_iter in xrange(self.mcmc_iters):
                self.sample_hypers(comp, vals)
                log("mean: %f amp: %f noise: %f min_ls: %f max_ls: %f"
                    % (self.mean, np.sqrt(self.amp2), self.noise,
                       np.min(self.ls), np.max(self.ls)))
                overall_ei[:, mcmc_iter] = self.compute_ei(comp, pend,
                                                           cand, vals)

            best_cand = np.argmax(np.mean(overall_ei, axis=1))
            return int(candidates[best_cand])
        else:
            # Optimize the hyperparameters, falling back on defaults if
            # the optimization fails.
            try:
                self.optimize_hypers(comp, vals)
            except Exception:
                # Initial length scales.
                self.ls = np.ones(self.D)
                # Initial amplitude.
                self.amp2 = np.std(vals)
                # Initial observation noise.
                self.noise = 1e-3

            log("mean: %f amp: %f noise: %f min_ls: %f max_ls: %f"
                % (self.mean, np.sqrt(self.amp2), self.noise,
                   np.min(self.ls), np.max(self.ls)))

            ei = self.compute_ei(comp, pend, cand, vals)
            best_cand = np.argmax(ei)
            return int(candidates[best_cand])

    def compute_ei(self, comp, pend, cand, vals):
        if pend.shape[0] == 0:
            # If there are no pending experiments, don't do anything fancy.

            # Current best.
            best = np.min(vals)

            # The primary covariances for prediction.
            comp_cov   = self.cov(comp)
            cand_cross = self.cov(comp, cand)

            # Compute the required Cholesky.
            obsv_cov  = comp_cov + self.noise*np.eye(comp.shape[0])
            obsv_chol = spla.cholesky(obsv_cov, lower=True)

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta  = spla.solve_triangular(obsv_chol, cand_cross, lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement.
            func_s = np.sqrt(func_v)
            u      = (best - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*(u*ncdf + npdf)

            return ei
        else:
            # If there are pending experiments, fantasize their outcomes.

            # Create a composite vector of complete and pending.
            comp_pend = np.concatenate((comp, pend))

            # Compute the covariance and Cholesky decomposition.
            comp_pend_cov  = (self.cov(comp_pend)
                              + self.noise*np.eye(comp_pend.shape[0]))
            comp_pend_chol = spla.cholesky(comp_pend_cov, lower=True)

            # Compute submatrices.
            pend_cross = self.cov(comp, pend)
            pend_kappa = self.cov(pend)

            # Use the sub-Cholesky.
            obsv_chol = comp_pend_chol[:comp.shape[0], :comp.shape[0]]

            # Solve the linear systems.
            alpha = spla.cho_solve((obsv_chol, True), vals - self.mean)
            beta  = spla.cho_solve((obsv_chol, True), pend_cross)

            # Find the predictive means and variances of the pending points.
            pend_m = np.dot(pend_cross.T, alpha) + self.mean
            pend_K = pend_kappa - np.dot(pend_cross.T, beta)

            # Take the Cholesky of the predictive covariance.
            pend_chol = spla.cholesky(pend_K, lower=True)

            # Make predictions by sampling fantasized outcomes.
            pend_fant = (np.dot(pend_chol,
                                npr.randn(pend.shape[0],
                                          self.pending_samples))
                         + pend_m[:, None])

            # Include the fantasies.
            fant_vals = np.concatenate(
                (np.tile(vals[:, np.newaxis], (1, self.pending_samples)),
                 pend_fant))

            # Compute bests over the fantasies.
            bests = np.min(fant_vals, axis=0)

            # Now generalize from these fantasies.
            cand_cross = self.cov(comp_pend, cand)

            # Solve the linear systems.
            alpha = spla.cho_solve((comp_pend_chol, True),
                                   fant_vals - self.mean)
            beta  = spla.solve_triangular(comp_pend_chol, cand_cross,
                                          lower=True)

            # Predict the marginal means and variances at candidates.
            func_m = np.dot(cand_cross.T, alpha) + self.mean
            func_v = self.amp2*(1 + 1e-6) - np.sum(beta**2, axis=0)

            # Expected improvement, averaged over the fantasies.
            func_s = np.sqrt(func_v[:, np.newaxis])
            u      = (bests[np.newaxis, :] - func_m) / func_s
            ncdf   = sps.norm.cdf(u)
            npdf   = sps.norm.pdf(u)
            ei     = func_s*(u*ncdf + npdf)

            return np.mean(ei, axis=1)

    def sample_hypers(self, comp, vals):
        if self.noiseless:
            self.noise = 1e-3
            self._sample_noiseless(comp, vals)
        else:
            self._sample_noisy(comp, vals)
        self._sample_ls(comp, vals)

    def _sample_ls(self, comp, vals):
        def logprob(ls):
            if np.any(ls < 0) or np.any(ls > self.max_ls):
                return -np.inf

            cov = (self.amp2*(self.cov_func(ls, comp, None)
                              + 1e-6*np.eye(comp.shape[0]))
                   + self.noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - self.mean)
            lp    = (-np.sum(np.log(np.diag(chol)))
                     - 0.5*np.dot(vals - self.mean, solve))
            return lp

        self.ls = util.slice_sample(self.ls, logprob, compwise=True)

    def _sample_noisy(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = hypers[2]

            # This is pretty hacky, but keeps things sane.
            if mean > np.max(vals) or mean < np.min(vals):
                return -np.inf

            if amp2 < 0 or noise < 0:
                return -np.inf

            cov = (amp2*(self.cov_func(self.ls, comp, None)
                         + 1e-6*np.eye(comp.shape[0]))
                   + noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp    = (-np.sum(np.log(np.diag(chol)))
                     - 0.5*np.dot(vals - mean, solve))

            # Roll in noise horseshoe prior.
            lp += np.log(np.log(1 + (self.noise_scale/noise)**2))

            # Roll in amplitude lognormal prior.
            lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = hypers[2]

    def _sample_noiseless(self, comp, vals):
        def logprob(hypers):
            mean  = hypers[0]
            amp2  = hypers[1]
            noise = 1e-3

            if amp2 < 0:
                return -np.inf

            cov = (amp2*(self.cov_func(self.ls, comp, None)
                         + 1e-6*np.eye(comp.shape[0]))
                   + noise*np.eye(comp.shape[0]))
            chol  = spla.cholesky(cov, lower=True)
            solve = spla.cho_solve((chol, True), vals - mean)
            lp    = (-np.sum(np.log(np.diag(chol)))
                     - 0.5*np.dot(vals - mean, solve))

            # Roll in amplitude lognormal prior.
            lp -= 0.5*(np.log(amp2)/self.amp2_scale)**2

            return lp

        hypers = util.slice_sample(np.array([self.mean, self.amp2,
                                             self.noise]),
                                   logprob, compwise=False)
        self.mean  = hypers[0]
        self.amp2  = hypers[1]
        self.noise = 1e-3

    def optimize_hypers(self, comp, vals):
        mygp = gp.GP(self.cov_func.__name__)
        mygp.real_init(comp.shape[1], vals)
        mygp.optimize_hypers(comp, vals)
        self.mean  = mygp.mean
        self.ls    = mygp.ls
        self.amp2  = mygp.amp2
        self.noise = mygp.noise

        # Save hyperparameter samples
        #self.hyper_samples.append((self.mean, self.noise, self.amp2, self.ls))
        #self.dump_hypers()

        return
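# ---------------------------------------------------------------------------
# Usage sketch, not part of the original module. The experiment directory and
# index arrays below are hypothetical placeholders; `next` takes the full
# grid plus index arrays partitioning it into candidate/pending/complete
# jobs, and returns the grid index with the highest (averaged) expected
# improvement. Assumes the experiment directory already exists so the state
# pickle can be locked and written.

if __name__ == '__main__':
    import numpy as np
    import numpy.random as npr

    chooser    = GPEIChooser("/tmp/expt", covar="Matern52", mcmc_iters=10)
    grid       = npr.rand(50, 3)        # 50 points in a 3-dim unit hypercube
    values     = np.zeros(50) + np.nan
    complete   = np.arange(10)          # indices of finished jobs
    values[complete] = npr.rand(10)     # their observed objective values
    pending    = np.array([10, 11])     # indices of in-flight jobs
    candidates = np.arange(12, 50)      # indices not yet tried
    durations  = np.zeros(50) + np.nan  # unused by this chooser
    next_id = chooser.next(grid, values, durations, candidates,
                           pending, complete)
    print "next grid index:", next_id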
class ExperimentGrid:

    @staticmethod
    def job_running(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_running(id)

    @staticmethod
    def job_complete(expt_dir, id, value, duration):
        log("setting job %d complete" % id)
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_complete(id, value, duration)
        log("set...")

    @staticmethod
    def job_broken(expt_dir, id):
        expt_grid = ExperimentGrid(expt_dir)
        expt_grid.set_broken(id)

    def __init__(self, expt_dir, variables=None, grid_size=None,
                 grid_seed=1):
        self._ready   = False
        self.expt_dir = expt_dir
        self.jobs_pkl = os.path.join(expt_dir, EXPERIMENT_GRID_FILE)
        self.locker   = Locker()

        # Only one process at a time is allowed to have access to the grid.
        self.locker.lock_wait(self.jobs_pkl)

        # Set up the grid for the first time if it doesn't exist.
        if variables is not None and not os.path.exists(self.jobs_pkl):
            self.seed     = grid_seed
            self.vmap     = GridMap(variables, grid_size)
            self.grid     = self._hypercube_grid(self.vmap.card(), grid_size)
            self.status   = np.zeros(grid_size, dtype=int) + CANDIDATE_STATE
            self.values   = np.zeros(grid_size) + np.nan
            self.durs     = np.zeros(grid_size) + np.nan
            self.proc_ids = np.zeros(grid_size, dtype=int)
            self._ready   = True
            self._save_jobs()

        # Or load in the grid from the pickled file.
        else:
            self._load_jobs()

    def __del__(self):
        self._save_jobs()
        if not self.locker.unlock(self.jobs_pkl):
            raise Exception("Could not release lock on job grid.\n")

    def get_grid(self):
        return self.grid, self.values, self.durs

    def get_candidates(self):
        return np.nonzero(self.status == CANDIDATE_STATE)[0]

    def get_pending(self):
        return np.nonzero((self.status == SUBMITTED_STATE)
                          | (self.status == RUNNING_STATE))[0]

    def get_complete(self):
        return np.nonzero(self.status == COMPLETE_STATE)[0]

    def get_broken(self):
        return np.nonzero(self.status == BROKEN_STATE)[0]

    def get_params(self, index):
        return self.vmap.get_params(self.grid[index, :])

    def get_best(self):
        finite = self.values[np.isfinite(self.values)]
        if len(finite) > 0:
            cur_min = np.min(finite)
            index   = np.nonzero(self.values == cur_min)[0][0]
            return cur_min, index
        else:
            return np.nan, -1

    def get_proc_id(self, id):
        return self.proc_ids[id]

    def add_to_grid(self, candidate):
        # Checks to prevent numerical over/underflow from corrupting the grid.
        candidate[candidate > 1.0] = 1.0
        candidate[candidate < 0.0] = 0.0

        # Set up the grid.
        self.grid   = np.vstack((self.grid, candidate))
        self.status = np.append(self.status,
                                np.zeros(1, dtype=int)
                                + int(CANDIDATE_STATE))
        self.values   = np.append(self.values, np.zeros(1) + np.nan)
        self.durs     = np.append(self.durs, np.zeros(1) + np.nan)
        self.proc_ids = np.append(self.proc_ids, np.zeros(1, dtype=int))

        # Save this out.
        self._save_jobs()

        return self.grid.shape[0] - 1

    def set_candidate(self, id):
        self.status[id] = CANDIDATE_STATE
        self._save_jobs()

    def set_submitted(self, id, proc_id):
        self.status[id]   = SUBMITTED_STATE
        self.proc_ids[id] = proc_id
        self._save_jobs()

    def set_running(self, id):
        self.status[id] = RUNNING_STATE
        self._save_jobs()

    def set_complete(self, id, value, duration):
        self.status[id] = COMPLETE_STATE
        self.values[id] = value
        self.durs[id]   = duration
        self._save_jobs()

    def set_broken(self, id):
        self.status[id] = BROKEN_STATE
        self._save_jobs()

    def _load_jobs(self):
        fh   = open(self.jobs_pkl, 'r')
        jobs = cPickle.load(fh)
        fh.close()

        self.vmap     = jobs['vmap']
        self.grid     = jobs['grid']
        self.status   = jobs['status']
        self.values   = jobs['values']
        self.durs     = jobs['durs']
        self.proc_ids = jobs['proc_ids']
        self._ready   = True

    def _save_jobs(self):
        if not self._ready:
            return

        # Write everything to a temporary file first.
        fh = tempfile.NamedTemporaryFile(mode='w', delete=False)
        cPickle.dump({'vmap'     : self.vmap,
                      'grid'     : self.grid,
                      'status'   : self.status,
                      'values'   : self.values,
                      'durs'     : self.durs,
                      'proc_ids' : self.proc_ids},
                     fh, protocol=-1)
        fh.close()

        # Use an atomic move for better NFS happiness.
        cmd = 'mv "%s" "%s.new"' % (fh.name, self.jobs_pkl)
        assert os.system(cmd) == 0
        # TODO: Should check system-dependent return status.

        if os.path.exists(self.jobs_pkl):
            assert os.system('mv "%s" "%s.old"'
                             % (self.jobs_pkl, self.jobs_pkl)) == 0
        assert os.system('mv "%s.new" "%s"'
                         % (self.jobs_pkl, self.jobs_pkl)) == 0

    def _hypercube_grid(self, dims, size):
        # Generate quasi-random points from a Sobol sequence.
        sobol_grid = np.transpose(i4_sobol_generate(dims, size, self.seed))
        return sobol_grid
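# ---------------------------------------------------------------------------
# Illustrative aside, not part of the original module: _hypercube_grid
# returns a (size x dims) array of quasi-random Sobol points inside the unit
# hypercube. A minimal standalone check, assuming the project's bundled
# sobol_lib module provides i4_sobol_generate:

if __name__ == '__main__':
    import numpy as np
    from sobol_lib import i4_sobol_generate

    grid = np.transpose(i4_sobol_generate(3, 8, 1))  # 8 points, 3 dimensions
    assert grid.shape == (8, 3)
    assert np.all((grid >= 0) & (grid <= 1))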