def select_next_points_botorch(observed_X: List[List[float]], observed_y: List[float]) -> np.ndarray:
    """Generate the next sample to evaluate with XTB.

    Uses BoTorch to pick the next sample using an Upper Confidence Bound
    acquisition function whose exploration weight grows with the number of
    observations (following Eq. 5 of https://arxiv.org/pdf/1012.2599.pdf).

    Args:
        observed_X: Observed coordinates (each row is one point, in degrees)
        observed_y: Observed energies
    Returns:
        Next coordinates to try
    """
    # Clip the energies if needed: large positive outliers are softly capped
    # so they do not dominate the standardized training targets
    observed_y = np.clip(observed_y, -np.inf, 2 + np.log10(np.clip(observed_y, 1, np.inf)))

    # Convert inputs to torch arrays
    train_X = torch.tensor(observed_X, dtype=torch.float)
    train_y = torch.tensor(observed_y, dtype=torch.float)
    train_y = train_y[:, None]
    # Negate: BoTorch maximizes, and we seek the minimum-energy point
    train_y = standardize(-1 * train_y)

    # Make the GP: product of per-dimension periodic kernels, since each
    # coordinate is an angle with period ~360 degrees
    gp = SingleTaskGP(
        train_X, train_y,
        covar_module=gpykernels.ScaleKernel(
            gpykernels.ProductStructureKernel(
                num_dims=train_X.shape[1],
                base_kernel=gpykernels.PeriodicKernel(
                    period_length_prior=NormalPrior(360, 0.1)))))
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)

    # Solve the optimization problem
    # Following boss, we use Eq. 5 of https://arxiv.org/pdf/1012.2599.pdf with delta=0.1
    # NOTE(review): Eq. 5 is usually stated with a natural log; np.log10 is
    # kept as-is here -- confirm against the BOSS reference implementation.
    n_sampled, n_dim = train_X.shape
    kappa = np.sqrt(
        2 * np.log10(np.power(n_sampled, n_dim / 2 + 2) * np.pi**2 / (3.0 * 0.1)))  # Results in more exploration over time

    # Renamed from "ei": the acquisition is UCB, not Expected Improvement
    # (the old name and docstring contradicted the code).
    ucb = UpperConfidenceBound(gp, kappa)
    bounds = torch.zeros(2, train_X.shape[1])
    bounds[1, :] = 360
    candidate, acq_value = optimize_acqf(ucb, bounds=bounds, q=1, num_restarts=64, raw_samples=64)
    return candidate.detach().numpy()[0, :]
def initialize_model(self, train_X, train_Y, state_dict=None):
    """Build a SingleTaskGP and its marginal log likelihood for BO.

    Noise prior/constraint recipe taken from
    https://github.com/pytorch/botorch/issues/179.

    :param train_X: training inputs
    :param train_Y: training targets (standardized internally)
    :param state_dict: optional state dict to warm-start the GP
    :return: tuple of (mll, gp)
    """
    MIN_INFERRED_NOISE_LEVEL = 1e-3
    prior = GammaPrior(1.1, 0.05)
    # Mode of a Gamma(concentration, rate): (concentration - 1) / rate.
    prior_mode = (prior.concentration - 1) / prior.rate
    lik = GaussianLikelihood(
        noise_prior=prior,
        noise_constraint=GreaterThan(MIN_INFERRED_NOISE_LEVEL,
                                     transform=None,
                                     initial_value=prior_mode))

    gp = SingleTaskGP(train_X, standardize(train_Y), likelihood=lik)
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)

    # Warm-start from a previous fit when a state dict is supplied.
    if state_dict is not None:
        gp.load_state_dict(state_dict)
    return mll, gp
def main(benchmark_name, dataset_name, dimensions, method_name,
         num_runs, run_start, num_iterations,
         acquisition_name,
         # acquisition_optimizer_name,
         gamma, num_random_init, mc_samples, batch_size,
         num_fantasies, num_restarts, raw_samples, noise_variance_init,
         # use_ard,
         # use_input_warping,
         standardize_targets,
         input_dir, output_dir):
    """Run batched BO (q-KG or q-EI) on a benchmark and log per-run CSVs.

    For each run: a random warm-up phase of ``num_random_init`` points,
    then batches of ``batch_size`` candidates chosen by the selected
    acquisition function on a FixedNoiseGP. Results (loss, eval cost,
    adjusted wall-clock) are written to ``output_dir/<name>/<method>/NNN.csv``.
    """
    # TODO(LT): Turn into options
    # device = "cpu"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.double

    benchmark = make_benchmark(benchmark_name, dimensions=dimensions, dataset_name=dataset_name, input_dir=input_dir)
    name = make_name(benchmark_name, dimensions=dimensions, dataset_name=dataset_name)

    output_path = Path(output_dir).joinpath(name, method_name)
    output_path.mkdir(parents=True, exist_ok=True)

    # Persist the run configuration next to the result CSVs.
    options = dict(gamma=gamma, num_random_init=num_random_init,
                   acquisition_name=acquisition_name,
                   mc_samples=mc_samples, batch_size=batch_size,
                   num_restarts=num_restarts, raw_samples=raw_samples,
                   num_fantasies=num_fantasies,
                   noise_variance_init=noise_variance_init,
                   standardize_targets=standardize_targets)
    with output_path.joinpath("options.yaml").open('w') as f:
        yaml.dump(options, f)

    config_space = DenseConfigurationSpace(benchmark.get_config_space())
    bounds = create_bounds(config_space.get_bounds(), device=device, dtype=dtype)
    input_dim = config_space.get_dimensions()

    def func(tensor, *args, **kwargs):
        """
        Wrapper that receives and returns torch.Tensor
        """
        config = dict_from_tensor(tensor, cs=config_space)
        # turn into maximization problem
        res = -benchmark.evaluate(config).value
        return torch.tensor(res, device=device, dtype=dtype)

    for run_id in trange(run_start, num_runs, unit="run"):
        # Three clocks: real start, last real batch end, and the "adjusted"
        # batch end that pretends batch evaluations ran in parallel.
        run_begin_t = batch_end_t_adj = batch_end_t = datetime.now()

        frames = []
        features = []
        targets = []

        noise_variance = torch.tensor(noise_variance_init, device=device, dtype=dtype)
        state_dict = None
        with trange(num_iterations) as iterations:
            for batch in iterations:
                if len(targets) < num_random_init:
                    # click.echo(f"Completed {i}/{num_random_init} initial runs. "
                    #            "Suggesting random candidate...")
                    # TODO(LT): support random seed
                    X_batch = torch.rand(size=(batch_size, input_dim), device=device, dtype=dtype)
                else:
                    # construct dataset
                    X = torch.vstack(features)
                    y = torch.hstack(targets).unsqueeze(axis=-1)
                    y = standardize(y) if standardize_targets else y

                    # construct model
                    # model = FixedNoiseGP(X, standardize(y), noise_variance.expand_as(y),
                    model = FixedNoiseGP(X, y, noise_variance.expand_as(y),
                                         input_transform=None).to(X)
                    mll = ExactMarginalLogLikelihood(model.likelihood, model)

                    # Warm-start hyperparameters from the previous iteration.
                    if state_dict is not None:
                        model.load_state_dict(state_dict)

                    # update model
                    fit_gpytorch_model(mll)

                    # construct acquisition function
                    # tau is the (1 - gamma) quantile of targets, used as the
                    # incumbent threshold for q-EI.
                    tau = torch.quantile(y, q=1 - gamma)
                    iterations.set_postfix(tau=tau.item())

                    # NOTE(review): if acquisition_name is neither "q-KG" nor
                    # "q-EI", `acq` is unbound and the optimize_acqf call below
                    # raises NameError -- confirm callers validate this option.
                    if acquisition_name == "q-KG":
                        assert num_fantasies is not None and num_fantasies > 0
                        acq = qKnowledgeGradient(model, num_fantasies=num_fantasies)
                    elif acquisition_name == "q-EI":
                        assert mc_samples is not None and mc_samples > 0
                        qmc_sampler = SobolQMCNormalSampler(num_samples=mc_samples)
                        acq = qExpectedImprovement(model=model,
                                                   best_f=tau,
                                                   sampler=qmc_sampler)

                    # optimize acquisition function
                    X_batch, b = optimize_acqf(acq_function=acq, bounds=bounds,
                                               q=batch_size,
                                               num_restarts=num_restarts,
                                               raw_samples=raw_samples,
                                               options=dict(batch_limit=5, maxiter=200))

                    state_dict = model.state_dict()

                # begin batch evaluation
                batch_begin_t = datetime.now()
                decision_duration = batch_begin_t - batch_end_t
                batch_begin_t_adj = batch_end_t_adj + decision_duration

                eval_end_times = []

                # TODO(LT): Deliberately not doing broadcasting for now since
                # batch sizes are so small anyway. Can revisit later if there
                # is a compelling reason to do it.
                rows = []
                for j, x_next in enumerate(X_batch):
                    # eval begin time
                    eval_begin_t = datetime.now()

                    # evaluate blackbox objective
                    y_next = func(x_next)

                    # eval end time
                    eval_end_t = datetime.now()
                    # eval duration
                    eval_duration = eval_end_t - eval_begin_t
                    # adjusted eval end time is the duration added to the
                    # time at which batch eval was started
                    eval_end_t_adj = batch_begin_t_adj + eval_duration
                    eval_end_times.append(eval_end_t_adj)
                    elapsed = eval_end_t_adj - run_begin_t

                    # update dataset
                    features.append(x_next)
                    targets.append(y_next)

                    # Record the (minimization-convention) loss plus timings.
                    row = dict_from_tensor(x_next, cs=config_space)
                    row["loss"] = -y_next.item()
                    row["cost_eval"] = eval_duration.total_seconds()
                    row["finished"] = elapsed.total_seconds()
                    rows.append(row)

                batch_end_t = datetime.now()
                # Adjusted end: as if all batch evals ran concurrently.
                batch_end_t_adj = max(eval_end_times)

                frame = pd.DataFrame(data=rows) \
                    .assign(batch=batch, cost_decision=decision_duration.total_seconds())
                frames.append(frame)

        data = pd.concat(frames, axis="index", ignore_index=True)
        data.to_csv(output_path.joinpath(f"{run_id:03d}.csv"))
    return 0
def generate_outer_restart_points(self, acqf: ApxCVaRKG, w_samples: Tensor = None) -> Tensor:
    """
    Generates the restart points for acqf optimization.

    Draws constrained Sobol raw samples, solves the inner rho problem to find
    good inner solutions, then overwrites part of each raw sample with inner
    optimizers sampled proportionally to exp(eta * standardized value).

    :param acqf: The acquisition function being optimized
    :param w_samples: the list of w samples to use; if None, taken from
        acqf.fixed_samples or drawn uniformly at random
    :return: restart points
    """
    # Raw candidate draws over the full outer decision space.
    X = draw_constrained_sobol(
        bounds=self.outer_bounds,
        n=self.raw_samples,
        q=self.q,
        inequality_constraints=self.inequality_constraints,
    ).to(dtype=self.dtype, device=self.device)
    # get the optimizers of the inner problem
    if w_samples is None:
        w_samples = (acqf.fixed_samples
                     if acqf.fixed_samples is not None
                     else torch.rand(acqf.num_samples, acqf.dim_w,
                                     dtype=self.dtype, device=self.device))
    inner_rho = InnerRho(
        model=acqf.model,
        w_samples=w_samples,
        alpha=acqf.alpha,
        dim_x=acqf.dim_x,
        num_repetitions=acqf.num_repetitions,
        inner_seed=acqf.inner_seed,
        CVaR=acqf.CVaR,
        expectation=acqf.expectation,
        weights=getattr(acqf, "weights", None),
    )
    inner_solutions, inner_values = super().optimize_inner(inner_rho, False)
    # sample from the optimizers
    # n_value: number of fantasy slots filled from inner optimizers (the
    # remaining random_frac stays random).
    n_value = int((1 - self.random_frac) * self.num_fantasies)
    # Softmax-style weights over standardized inner values.
    weights = torch.exp(self.eta * standardize(inner_values))
    idx = torch.multinomial(weights, self.raw_samples * n_value, replacement=True)
    # set the respective raw samples to the sampled optimizers
    # we first get the corresponding beta values and merge them with sampled
    # optimizers. this avoids the need for complicated indexing
    betas = X[..., self.beta_idcs][..., -n_value:].reshape(self.raw_samples, -1, 1)
    X[..., -n_value * (self.dim_x + 1):] = torch.cat(
        [
            inner_solutions[idx, 0].reshape(self.raw_samples, n_value, self.dim_x),
            betas,
        ],
        dim=-1,
    ).view(self.raw_samples, 1, -1)
    # NOTE(review): w_samples is never None here (it was filled in above), so
    # this guard is always taken -- possibly a leftover from an earlier
    # version; confirm before simplifying.
    if w_samples is not None:
        w_ind = torch.randint(w_samples.shape[0], (self.raw_samples, self.q))
        if self.q > 1:
            raise NotImplementedError("This does not support q>1!")
        # Overwrite the w-block of each candidate with a randomly chosen
        # fixed w sample.
        X[..., self.dim_x:self.dim] = w_samples[w_ind, :]
    return self.generate_restart_points_from_samples(X, acqf)
def sample_arch(self, START_BO, g, steps, hyperparams, og_flops, full_val_loss, target_flops=0):
    """Sample a channel-pruning parameterization.

    Phases: slimmable baseline (args.slim), random warm-up (g < START_BO),
    full-width seed (g == START_BO), then multi-objective BO with two GPs
    (loss and FLOPs) combined by a random scalarization. With args.pas, a
    target FLOPs ratio is sampled from an approximate Pareto front and met
    via binary search over the scalarization weight.

    Returns (layer_budget, parameterization, normalized sampling weights).
    """
    # NOTE(review): on the slim/warm-up paths self.sampling_weights is never
    # assigned here -- the final return assumes it was set by an earlier
    # call; confirm initialization elsewhere.
    if args.slim:
        if target_flops == 0:
            parameterization = hyperparams.random_sample()
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        else:
            parameterization = np.ones(hyperparams.get_dim()) * args.lower_channel
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    else:
        # random sample to warmup history for MOBO
        if g < START_BO:
            if target_flops == 0:
                f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
            else:
                f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        # put the largest model into the history
        elif g == START_BO:
            if target_flops == 0:
                parameterization = np.ones(hyperparams.get_dim())
            else:
                f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        # MOBO
        else:
            # this is the scalarization (lambda_{FLOPs})
            rand = torch.rand(1).cuda()

            # standardize data for building Gaussian Processes
            train_X = torch.FloatTensor(self.X).cuda()
            train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
            train_Y_loss = standardize(train_Y_loss)

            train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
            train_Y_cost = standardize(train_Y_cost)

            new_train_X = train_X

            # GP for the cross entropy loss
            gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
            mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            # GP for FLOPs
            # we use add-gp since FLOPs has addive structure (not exactly though)
            # the parameters for ScaleKernel and MaternKernel simply follow the default
            covar_module = AdditiveStructureKernel(
                ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        num_dims=1
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                ),
                num_dims=train_X.shape[1]
            )
            gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            # Build acquisition functions
            UCB_loss = UpperConfidenceBound(gp_loss, beta=0.1).cuda()
            UCB_cost = UpperConfidenceBound(gp_cost, beta=0.1).cuda()

            # Combine them via augmented Tchebyshev scalarization
            self.mobo_obj = RandAcquisition(UCB_loss).cuda()
            self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

            # Bounds for the optimization variable (alpha)
            lower = torch.ones(new_train_X.shape[1])*args.lower_channel
            upper = torch.ones(new_train_X.shape[1])*args.upper_channel
            self.mobo_bounds = torch.stack([lower, upper]).cuda()

            # Pareto-aware sampling
            if args.pas:
                # Generate approximate Pareto front first
                costs = []
                for i in range(len(self.population_data)):
                    costs.append([self.population_data[i]['loss'], self.population_data[i]['ratio']])
                costs = np.array(costs)
                efficient_mask = is_pareto_efficient(costs)
                costs = costs[efficient_mask]
                loss = costs[:, 0]
                flops = costs[:, 1]
                sorted_idx = np.argsort(flops)
                loss = loss[sorted_idx]
                flops = flops[sorted_idx]
                # Anchor the front at the lower FLOPs bound.
                # NOTE(review): 8 is a hard-coded pessimistic loss anchor here
                # (other variants use empty_val_loss) -- confirm intent.
                if flops[0] > args.lower_flops:
                    flops = np.concatenate([[args.lower_flops], flops.reshape(-1)])
                    loss = np.concatenate([[8], loss.reshape(-1)])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                # Anchor the front at the upper FLOPs bound with the full
                # model's validation loss.
                if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                    flops = np.concatenate([flops.reshape(-1), [args.upper_flops]])
                    loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                # Equation (4) in paper
                areas = (flops[1:]-flops[:-1])*(loss[:-1]-loss[1:])

                # Quantize into 50 bins to sample from multinomial
                self.sampling_weights = np.zeros(50)
                k = 0
                while k < len(flops) and flops[k] < args.lower_flops:
                    k+=1
                for i in range(50):
                    lower = i/50.
                    upper = (i+1)/50.
                    if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                        continue
                    cnt = 1
                    while ((k+1) < len(flops)) and upper > flops[k+1]:
                        self.sampling_weights[i] += areas[k]
                        cnt += 1
                        k += 1
                    if k < len(areas):
                        self.sampling_weights[i] += areas[k]
                    self.sampling_weights[i] /= cnt
                # Degenerate front: fall back to uniform sampling.
                if np.sum(self.sampling_weights) == 0:
                    self.sampling_weights = np.ones(50)

                if target_flops == 0:
                    val = np.arange(0.01, 1, 0.02)
                    chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
                else:
                    chosen_target_flops = target_flops

                # Binary search is here: adjust the scalarization weight lmda
                # until the simulated FLOPs ratio hits the chosen target.
                lower_bnd, upper_bnd = 0, 1
                lmda = 0.5
                for i in range(10):
                    self.mobo_obj.rand = lmda

                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                    )

                    parameterization = parameterization[0].cpu().numpy()
                    layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                    sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget)
                    ratio = sim_flops/og_flops

                    # Within 2% of target: accept.
                    if np.abs(ratio - chosen_target_flops) <= 0.02:
                        break
                    # Search direction flips between baseline and our method.
                    if args.baseline > 0:
                        if ratio < chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                        elif ratio > chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                    else:
                        if ratio < chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                        elif ratio > chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                    rand[0] = lmda
                writer.add_scalar('Binary search trials', i, steps)
            else:
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
# excursionsearch. If not, see <http://www.gnu.org/licenses/>. # # import torch from botorch.models import SingleTaskGP from botorch.fit import fit_gpytorch_model from botorch.utils import standardize from gpytorch.mlls import ExactMarginalLogLikelihood from botorch.acquisition import UpperConfidenceBound from botorch.optim import optimize_acqf # Training data: train_X = torch.rand(10, 2) Y = 1 - torch.norm(train_X - 0.5, dim=-1, keepdim=True) Y = Y + 0.1 * torch.randn_like(Y) # add some noise train_Y = standardize(Y) # Fir the model: gp = SingleTaskGP(train_X, train_Y) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) fit_gpytorch_model(mll) print(mll) # Construct acquisition function: UCB = UpperConfidenceBound(gp, beta=0.1) print(UCB) bounds = torch.stack([torch.zeros(2), torch.ones(2)]) candidate, acq_value = optimize_acqf(
def sample_arch(self, START_BO, g, hyperparams, og_flops, empty_val_loss, full_val_loss, target_flops=0):
    """Sample a channel-pruning parameterization (SKI-kernel variant).

    Random warm-up for g < START_BO, full-width seed at g == START_BO, then
    MOBO with a loss GP (optionally SKI/additive kernels once g > 128) and an
    additive-structure cost GP, combined via a random scalarization. With
    args.pas, a FLOPs target is drawn from self.sampling_weights and met by
    binary search over the scalarization weight.

    Returns (layer_budget, parameterization, normalized sampling weights).
    """
    if g < START_BO:
        # Random width multiplier to warm up the history.
        if target_flops == 0:
            f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
        else:
            f = args.lower_channel
        parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    elif g == START_BO:
        # Seed the history with the full-width model.
        if target_flops == 0:
            parameterization = np.ones(hyperparams.get_dim())
        else:
            f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    else:
        # Random scalarization weight (lambda).
        rand = torch.rand(1).cuda()

        train_X = torch.FloatTensor(self.X).cuda()
        train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
        train_Y_loss = standardize(train_Y_loss)

        train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
        train_Y_cost = standardize(train_Y_cost)

        # Kernel for the loss GP: SKI (grid interpolation) kicks in only
        # after enough data (g > 128); additive structure is optional.
        covar_module = None
        if args.ski and g > 128:
            if args.additive:
                covar_module = AdditiveStructureKernel(
                    ScaleKernel(
                        GridInterpolationKernel(
                            MaternKernel(
                                nu=2.5,
                                lengthscale_prior=GammaPrior(3.0, 6.0),
                            ),
                            grid_size=128, num_dims=1, grid_bounds=[(0, 1)]
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    ),
                    num_dims=train_X.shape[1]
                )
            else:
                covar_module = ScaleKernel(
                    GridInterpolationKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                        ),
                        grid_size=128, num_dims=train_X.shape[1],
                        grid_bounds=[(0, 1) for _ in range(train_X.shape[1])]
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                )
        else:
            if args.additive:
                covar_module = AdditiveStructureKernel(
                    ScaleKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                            num_dims=1
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    ),
                    num_dims=train_X.shape[1]
                )
            else:
                covar_module = ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        num_dims=train_X.shape[1]
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                )

        new_train_X = train_X
        gp_loss = SingleTaskGP(new_train_X, train_Y_loss, covar_module=covar_module)
        mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Use add-gp for cost
        covar_module = AdditiveStructureKernel(
            ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                    num_dims=1
                ),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
            num_dims=train_X.shape[1]
        )
        gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
        mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Scalarized UCB objective over both GPs.
        UCB_loss = UpperConfidenceBound(gp_loss, beta=args.beta).cuda()
        UCB_cost = UpperConfidenceBound(gp_cost, beta=args.beta).cuda()
        self.mobo_obj = RandAcquisition(UCB_loss).cuda()
        self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

        # Box bounds for the channel multipliers.
        lower = torch.ones(new_train_X.shape[1])*args.lower_channel
        upper = torch.ones(new_train_X.shape[1])*args.upper_channel
        self.mobo_bounds = torch.stack([lower, upper]).cuda()

        if args.pas:
            # Sample a FLOPs target from the (precomputed) sampling weights.
            # NOTE(review): self.sampling_weights must have been set by a
            # prior call or elsewhere -- confirm initialization.
            val = np.linspace(args.lower_flops, 1, 50)
            chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))

            # Binary search over the scalarization weight to hit the target.
            lower_bnd, upper_bnd = 0, 1
            lmda = 0.5
            for i in range(10):
                self.mobo_obj.rand = lmda

                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                )

                parameterization = parameterization[0].cpu().numpy()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget, self.use_mem)
                ratio = sim_flops/og_flops

                # Within 2% of target: accept.
                if np.abs(ratio - chosen_target_flops) <= 0.02:
                    break
                # Search direction flips between baseline and our method.
                if args.baseline > 0:
                    if ratio < chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                    elif ratio > chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                else:
                    if ratio < chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                    elif ratio > chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
            writer.add_scalar('Binary search trials', i, g)
        else:
            parameterization, acq_value = optimize_acqf(
                self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
            )
            parameterization = parameterization[0].cpu().numpy()
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
def main(benchmark_name, dataset_name, dimensions, method_name,
         num_runs, run_start, num_iterations,
         # acquisition_name,
         # acquisition_optimizer_name,
         gamma, num_random_init,
         num_restarts, raw_samples, noise_variance_init,
         # use_ard,
         # use_input_warping,
         standardize_targets,
         input_dir, output_dir):
    """Run sequential BO with Expected Improvement on a benchmark.

    For each run: ``num_random_init`` random points, then one EI-chosen
    candidate per iteration on a FixedNoiseGP. Per-run results (loss and
    elapsed wall-clock) are written to ``output_dir/<name>/<method>/NNN.csv``.
    """
    # TODO(LT): Turn into options
    # device = "cpu"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.double

    benchmark = make_benchmark(benchmark_name, dimensions=dimensions, dataset_name=dataset_name, input_dir=input_dir)
    name = make_name(benchmark_name, dimensions=dimensions, dataset_name=dataset_name)

    output_path = Path(output_dir).joinpath(name, method_name)
    output_path.mkdir(parents=True, exist_ok=True)

    # Persist the run configuration next to the result CSVs.
    options = dict(gamma=gamma, num_random_init=num_random_init,
                   num_restarts=num_restarts, raw_samples=raw_samples,
                   noise_variance_init=noise_variance_init,
                   standardize_targets=standardize_targets)
    with output_path.joinpath("options.yaml").open('w') as f:
        yaml.dump(options, f)

    config_space = DenseConfigurationSpace(benchmark.get_config_space())
    bounds = create_bounds(config_space.get_bounds(), device=device, dtype=dtype)
    input_dim = config_space.get_dimensions()

    def func(tensor, *args, **kwargs):
        """
        Wrapper that receives and returns torch.Tensor
        """
        config = dict_from_tensor(tensor, cs=config_space)
        # turn into maximization problem
        res = - benchmark.evaluate(config).value
        return torch.tensor(res, device=device, dtype=dtype)

    for run_id in trange(run_start, num_runs, unit="run"):
        t_start = datetime.now()

        rows = []
        features = []
        targets = []

        noise_variance = torch.tensor(noise_variance_init, device=device, dtype=dtype)
        state_dict = None
        with trange(num_iterations) as iterations:
            for i in iterations:
                if len(targets) < num_random_init:
                    # click.echo(f"Completed {i}/{num_random_init} initial runs. "
                    #            "Suggesting random candidate...")
                    # TODO(LT): support random seed
                    x_new = torch.rand(size=(input_dim,), device=device, dtype=dtype)
                else:
                    # construct dataset
                    X = torch.vstack(features)
                    y = torch.hstack(targets).unsqueeze(axis=-1)
                    y = standardize(y) if standardize_targets else y

                    # construct model
                    # model = FixedNoiseGP(X, standardize(y), noise_variance.expand_as(y),
                    model = FixedNoiseGP(X, y, noise_variance.expand_as(y),
                                         input_transform=None).to(X)
                    mll = ExactMarginalLogLikelihood(model.likelihood, model)

                    # Warm-start hyperparameters from the previous iteration.
                    if state_dict is not None:
                        model.load_state_dict(state_dict)

                    # update model
                    fit_gpytorch_model(mll)

                    # construct acquisition function
                    # tau is the (1 - gamma) quantile of targets, used as
                    # the incumbent threshold for EI.
                    tau = torch.quantile(y, q=1-gamma)
                    iterations.set_postfix(tau=tau.item())
                    ei = ExpectedImprovement(model=model, best_f=tau)

                    # optimize acquisition function
                    X_batch, b = optimize_acqf(acq_function=ei, bounds=bounds, q=1,
                                               num_restarts=num_restarts,
                                               raw_samples=raw_samples,
                                               options=dict(batch_limit=5, maxiter=200))
                    x_new = X_batch.squeeze(axis=0)

                    state_dict = model.state_dict()

                # evaluate blackbox objective
                # t0 = datetime.now()
                y_new = func(x_new)
                t1 = datetime.now()

                # Elapsed time since the start of this run.
                delta = t1 - t_start

                # update dataset
                features.append(x_new)
                targets.append(y_new)

                # Record the (minimization-convention) loss plus timing.
                row = dict_from_tensor(x_new, cs=config_space)
                row["loss"] = - y_new.item()
                row["finished"] = delta.total_seconds()
                rows.append(row)

        data = pd.DataFrame(data=rows)
        data.to_csv(output_path.joinpath(f"{run_id:03d}.csv"))
    return 0
def observe(self, X, y):
    """Send an observation of a suggestion back to the optimizer.

    Parameters
    ----------
    X : list of dict-like
        Places where the objective function has already been evaluated.
        Each suggestion is a dictionary where each key corresponds to a
        parameter being optimized.
    y : array-like, shape (n,)
        Corresponding values where objective has been evaluated
    """
    try:
        assert len(X) == len(y)

        for idx, (cand, fval) in enumerate(zip(X, y)):
            # Archive keeps every solution ever seen; fitness is negated
            # because BoTorch solves a maximization problem.
            self.archive.append(cand)
            self.arc_fitness.append(-fval)

            if self.iter == 1:
                # First iteration: just fill the population.
                self.population.append(cand)
                self.fitness.append(fval)
            elif fval <= self.fitness[idx]:
                # Later iterations: keep the better (lower) of old vs. new.
                self.population[idx] = cand
                self.fitness[idx] = fval

            # Just ignore, any inf observations we got, unclear if right thing
            if np.isfinite(fval):
                self._observe(cand, fval)

        # Rebuild training tensors over everything observed so far.
        warped = self.search_space.warp(self.archive)
        train_x = normalize(torch.from_numpy(warped), bounds=self.torch_bounds)
        fitness_col = np.array(self.arc_fitness).reshape(len(self.arc_fitness), 1)
        train_y = standardize(torch.from_numpy(fitness_col))

        # Fit the GP based on the actual observed values; warm-start from
        # the previous model's state dict after the first iteration.
        if self.iter == 1:
            self.model, mll = self.make_model(train_x, train_y)
        else:
            self.model, mll = self.make_model(train_x, train_y,
                                              self.model.state_dict())
        fit_gpytorch_model(mll)

        # qEI acquisition with a QMC sampler and the best value seen so far.
        sampler = SobolQMCNormalSampler(num_samples=512)
        self.acquisition = qExpectedImprovement(model=self.model,
                                                best_f=train_y.max(),
                                                sampler=sampler)
    except Exception as e:
        print('Error: {} in observe()'.format(e))
def sample_arch(self, START_BO, g, hyperparams, og_flops, empty_val_loss, full_val_loss, target_flops=0):
    """Sample a channel-pruning parameterization (MOBO-RS variant).

    Random warm-up for g < START_BO, full-width seed at g == START_BO, then
    MOBO with a default-kernel loss GP and an additive-structure cost GP,
    combined via a random scalarization. With args.pas, a FLOPs target is
    sampled from an approximate Pareto front and met via binary search over
    the scalarization weight.

    Returns (layer_budget, parameterization, normalized sampling weights).
    """
    # Warming up the history with a single width-multiplier
    if g < START_BO:
        if target_flops == 0:
            f = np.random.rand(1) * (args.upper_channel - args.
                                     lower_channel) + args.lower_channel
        else:
            f = args.lower_channel
        parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(
            parameterization, self.mask_pruner)
    # Put largest model into the history
    elif g == START_BO:
        if target_flops == 0:
            parameterization = np.ones(hyperparams.get_dim())
        else:
            f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(
            parameterization, self.mask_pruner)
    # MOBO-RS
    else:
        # Random scalarization weight (lambda).
        rand = torch.rand(1).cuda()

        train_X = torch.FloatTensor(self.X).cuda()
        train_Y_loss = torch.FloatTensor(
            np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
        train_Y_loss = standardize(train_Y_loss)

        train_Y_cost = torch.FloatTensor(
            np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
        train_Y_cost = standardize(train_Y_cost)

        new_train_X = train_X
        # GP for the loss objective (default kernel).
        gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
        mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Use add-gp for cost
        covar_module = AdditiveStructureKernel(ScaleKernel(
            MaternKernel(nu=2.5,
                         lengthscale_prior=GammaPrior(3.0, 6.0),
                         num_dims=1),
            outputscale_prior=GammaPrior(2.0, 0.15),
        ),
                                               num_dims=train_X.shape[1])
        gp_cost = SingleTaskGP(new_train_X,
                               train_Y_cost,
                               covar_module=covar_module)
        mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Scalarized UCB objective over both GPs (default beta here,
        # unlike the variants that pass an explicit beta).
        UCB_loss = UpperConfidenceBound(gp_loss).cuda()
        UCB_cost = UpperConfidenceBound(gp_cost).cuda()
        self.mobo_obj = RandAcquisition(UCB_loss).cuda()
        self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

        # Box bounds for the channel multipliers.
        lower = torch.ones(new_train_X.shape[1]) * args.lower_channel
        upper = torch.ones(new_train_X.shape[1]) * args.upper_channel
        self.mobo_bounds = torch.stack([lower, upper]).cuda()

        if args.pas:
            # Build the approximate Pareto front from the population data.
            costs = []
            for i in range(len(self.population_data)):
                costs.append([
                    self.population_data[i]['loss'],
                    self.population_data[i]['ratio']
                ])
            costs = np.array(costs)
            efficient_mask = is_pareto_efficient(costs)
            costs = costs[efficient_mask]
            loss = costs[:, 0]
            flops = costs[:, 1]
            sorted_idx = np.argsort(flops)
            loss = loss[sorted_idx]
            flops = flops[sorted_idx]
            # Anchor the front at the lower FLOPs bound with the loss of
            # the empty (fully pruned) model.
            if flops[0] > args.lower_flops:
                flops = np.concatenate([[args.lower_flops],
                                        flops.reshape(-1)])
                loss = np.concatenate([[empty_val_loss], loss.reshape(-1)])
            else:
                flops = flops.reshape(-1)
                loss = loss.reshape(-1)

            # Anchor the front at the upper FLOPs bound with the full
            # model's validation loss.
            if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                flops = np.concatenate(
                    [flops.reshape(-1), [args.upper_flops]])
                loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
            else:
                flops = flops.reshape(-1)
                loss = loss.reshape(-1)

            # Area under each Pareto segment drives the sampling weights.
            areas = (flops[1:] - flops[:-1]) * (loss[:-1] - loss[1:])

            # Quantize into 50 bins to sample from a multinomial.
            self.sampling_weights = np.zeros(50)
            k = 0
            while k < len(flops) and flops[k] < args.lower_flops:
                k += 1
            for i in range(50):
                lower = i / 50.
                upper = (i + 1) / 50.
                if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                    continue
                cnt = 1
                while ((k + 1) < len(flops)) and upper > flops[k + 1]:
                    self.sampling_weights[i] += areas[k]
                    cnt += 1
                    k += 1
                if k < len(areas):
                    self.sampling_weights[i] += areas[k]
                self.sampling_weights[i] /= cnt
            # Degenerate front: fall back to uniform sampling.
            if np.sum(self.sampling_weights) == 0:
                self.sampling_weights = np.ones(50)

            if target_flops == 0:
                val = np.arange(0.01, 1, 0.02)
                chosen_target_flops = np.random.choice(
                    val,
                    p=(self.sampling_weights /
                       np.sum(self.sampling_weights)))
            else:
                chosen_target_flops = target_flops

            # Binary search over the scalarization weight lmda until the
            # simulated FLOPs ratio hits the chosen target.
            lower_bnd, upper_bnd = 0, 1
            lmda = 0.5
            for i in range(10):
                self.mobo_obj.rand = lmda

                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj,
                    bounds=self.mobo_bounds,
                    q=1,
                    num_restarts=5,
                    raw_samples=1000,
                )

                parameterization = parameterization[0].cpu().numpy()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(
                    parameterization, self.mask_pruner)
                sim_flops = self.mask_pruner.simulate_and_count_flops(
                    layer_budget)
                ratio = sim_flops / og_flops

                # Within 2% of target: accept.
                if np.abs(ratio - chosen_target_flops) <= 0.02:
                    break
                # Search direction flips between baseline and our method.
                if args.baseline > 0:
                    if ratio < chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                    elif ratio > chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                else:
                    if ratio < chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                    elif ratio > chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
            writer.add_scalar('Binary search trials', i, g)
        else:
            # NOTE(review): this path leaves self.sampling_weights untouched;
            # the final return assumes it was set elsewhere -- confirm.
            parameterization, acq_value = optimize_acqf(
                self.mobo_obj,
                bounds=self.mobo_bounds,
                q=1,
                num_restarts=5,
                raw_samples=1000,
            )
            parameterization = parameterization[0].cpu().numpy()
            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
    return layer_budget, parameterization, self.sampling_weights / np.sum(
        self.sampling_weights)