def __init__(
    self, decomposition: Dict[str, List[int]], batch_shape: torch.Size
) -> None:
    """Initialize the per-context decomposition kernel.

    Args:
        decomposition: Maps each context name to the list of parameter
            indices active in that context. Every context must list the
            same number of parameters.
        batch_shape: Batch shape for the underlying kernels.
    """
    super().__init__(batch_shape=batch_shape)
    self.decomposition = decomposition
    # Every context must be defined over the same number of parameters.
    expected_len = len(next(iter(decomposition.values())))
    if any(len(params) != expected_len for params in decomposition.values()):
        raise ValueError(
            "num of parameters needs to be same across all contexts"
        )
    # Index tensors used to slice out each context's active parameters.
    self._indexers = {
        name: torch.tensor(indices)
        for name, indices in self.decomposition.items()
    }
    # A single Matern-2.5 base kernel shared across all contexts.
    self.base_kernel = MaternKernel(
        nu=2.5,
        ard_num_dims=expected_len,
        batch_shape=batch_shape,
        lengthscale_prior=GammaPrior(3.0, 6.0),
    )
    # Per-context output scale on top of the shared base kernel.
    self.kernel_dict = ModuleDict(
        {
            name: ScaleKernel(
                base_kernel=self.base_kernel,
                outputscale_prior=GammaPrior(2.0, 15.0),
            )
            for name in decomposition
        }
    )
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    nu: float = 2.5,
    train_iteration_fidelity: bool = True,
    train_data_fidelity: bool = True,
    likelihood: Optional[Likelihood] = None,
) -> None:
    r"""A multi-fidelity GP using a scaled `LinearTruncatedFidelityKernel`.

    Args:
        train_X: A tensor of training features; the fidelity parameters are
            expected among its columns (handled by the kernel).
        train_Y: A tensor of training observations.
        nu: Smoothness parameter forwarded to the fidelity kernel.
        train_iteration_fidelity: If True, the iteration fidelity is trained.
        train_data_fidelity: If True, the data fidelity is trained.
        likelihood: A likelihood. NOTE(review): this argument is accepted but
            never referenced in this constructor — it is not forwarded to
            `super().__init__`; confirm whether it should be.

    Raises:
        UnsupportedError: If both fidelity flags are False.
    """
    # At least one fidelity dimension must be active for this kernel.
    if not train_iteration_fidelity and not train_data_fidelity:
        raise UnsupportedError(
            "You should have at least one fidelity parameter.")
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    kernel = LinearTruncatedFidelityKernel(
        nu=nu,
        dimension=train_X.shape[-1],
        train_iteration_fidelity=train_iteration_fidelity,
        train_data_fidelity=train_data_fidelity,
        batch_shape=self._aug_batch_shape,
        power_prior=GammaPrior(3.0, 3.0),
    )
    covar_module = ScaleKernel(
        kernel,
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    super().__init__(train_X=train_X, train_Y=train_Y, covar_module=covar_module)
    # Move hyperparameters to the dtype/device of the training data.
    self.to(train_X)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    likelihood: Optional[Likelihood] = None,
    covar_module: Optional[Module] = None,
) -> None:
    r"""Construct a single-task exact GP model.

    Args:
        train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of
            training features.
        train_Y: A `n x m` or `batch_shape x n x m` (batch mode) tensor of
            training observations.
        likelihood: A likelihood. If omitted, a standard `GaussianLikelihood`
            with inferred noise level is used.
        covar_module: The covariance (kernel) matrix. If omitted, a scaled
            `MaternKernel` is used.

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
        >>> model = SingleTaskGP(train_X, train_Y)
    """
    # Validate the raw inputs, then normalize them into batched form.
    validate_input_scaling(train_X=train_X, train_Y=train_Y)
    self._validate_tensor_args(X=train_X, Y=train_Y)
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y)
    if likelihood is None:
        # Default likelihood with an inferred-noise Gamma prior.
        noise_prior = GammaPrior(1.1, 0.05)
        prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
        noise_constraint = GreaterThan(
            MIN_INFERRED_NOISE_LEVEL,
            transform=None,
            initial_value=prior_mode,
        )
        likelihood = GaussianLikelihood(
            noise_prior=noise_prior,
            batch_shape=self._aug_batch_shape,
            noise_constraint=noise_constraint,
        )
    else:
        self._is_custom_likelihood = True
    ExactGP.__init__(self, train_X, train_Y, likelihood)
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    if covar_module is not None:
        self.covar_module = covar_module
    else:
        # Default covariance: scaled ARD Matern-2.5 kernel.
        base_kernel = MaternKernel(
            nu=2.5,
            ard_num_dims=train_X.shape[-1],
            batch_shape=self._aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        )
        self.covar_module = ScaleKernel(
            base_kernel,
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
    self.to(train_X)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    train_Yvar: Tensor,
    outcome_transform: Optional[OutcomeTransform] = None,
) -> None:
    r"""A single-task exact GP model using fixed noise levels.

    Args:
        train_X: A `batch_shape x n x d` tensor of training features.
        train_Y: A `batch_shape x n x m` tensor of training observations.
        train_Yvar: A `batch_shape x n x m` tensor of observed measurement
            noise.
        outcome_transform: An outcome transform that is applied to the
            training data during instantiation and to the posterior during
            inference (that is, the `Posterior` obtained by calling
            `.posterior` on the model will be on the original scale).

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
        >>> train_Yvar = torch.full_like(train_Y, 0.2)
        >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar)
    """
    # Transform outcomes BEFORE validation so scaling checks see the
    # transformed values that the model is actually trained on.
    if outcome_transform is not None:
        train_Y, train_Yvar = outcome_transform(train_Y, train_Yvar)
    validate_input_scaling(train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar)
    self._validate_tensor_args(X=train_X, Y=train_Y, Yvar=train_Yvar)
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    train_X, train_Y, train_Yvar = self._transform_tensor_args(
        X=train_X, Y=train_Y, Yvar=train_Yvar)
    # Observed noise levels are fixed (not inferred).
    likelihood = FixedNoiseGaussianLikelihood(
        noise=train_Yvar, batch_shape=self._aug_batch_shape)
    ExactGP.__init__(self,
                     train_inputs=train_X,
                     train_targets=train_Y,
                     likelihood=likelihood)
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    self.covar_module = ScaleKernel(
        base_kernel=MaternKernel(
            nu=2.5,
            ard_num_dims=train_X.shape[-1],
            batch_shape=self._aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        ),
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    if outcome_transform is not None:
        self.outcome_transform = outcome_transform
    # Batch dimensions of each parameter, used when subsetting outputs.
    self._subset_batch_dict = {
        "mean_module.constant": -2,
        "covar_module.raw_outputscale": -1,
        "covar_module.base_kernel.raw_lengthscale": -3,
    }
    self.to(train_X)
def test_sample_all_priors(self, cuda=False):
    """Test `sample_all_priors`: hyperparameters are resampled from their
    priors, a warning is emitted for priors lacking `rsample`, and a
    RuntimeError is raised when a prior has no setting closure."""
    device = torch.device("cuda" if cuda else "cpu")
    for dtype in (torch.float, torch.double):
        train_X = torch.rand(3, 5, device=device, dtype=dtype)
        train_Y = torch.rand(3, 1, device=device, dtype=dtype)
        model = SingleTaskGP(train_X=train_X, train_Y=train_Y)
        mll = ExactMarginalLogLikelihood(model.likelihood, model)
        mll.to(device=device, dtype=dtype)
        original_state_dict = dict(deepcopy(mll.model.state_dict()))
        sample_all_priors(model)

        # make sure one of the hyperparameters changed
        self.assertTrue(
            dict(model.state_dict())["likelihood.noise_covar.raw_noise"]
            != original_state_dict["likelihood.noise_covar.raw_noise"])
        # check that lengthscales are all different.
        # BUGFIX: previously written as
        # `assertTrue(all(ls[0] != ls[i]) for i in range(...))`, which passed
        # an (always truthy) generator to assertTrue, so the check never ran.
        ls = model.covar_module.base_kernel.raw_lengthscale.view(
            -1).tolist()
        self.assertTrue(all(ls[0] != ls[i] for i in range(1, len(ls))))

        # change one of the priors to SmoothedBoxPrior
        model.covar_module = ScaleKernel(
            MaternKernel(
                nu=2.5,
                ard_num_dims=model.train_inputs[0].shape[-1],
                batch_shape=model._aug_batch_shape,
                lengthscale_prior=SmoothedBoxPrior(3.0, 6.0),
            ),
            batch_shape=model._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        original_state_dict = dict(deepcopy(mll.model.state_dict()))
        with warnings.catch_warnings(
                record=True) as ws, settings.debug(True):
            sample_all_priors(model)
            self.assertEqual(len(ws), 1)
            self.assertTrue("rsample" in str(ws[0].message))

        # the lengthscale should not have changed because sampling is
        # not implemented for SmoothedBoxPrior
        self.assertTrue(
            torch.equal(
                dict(model.state_dict())
                ["covar_module.base_kernel.raw_lengthscale"],
                original_state_dict[
                    "covar_module.base_kernel.raw_lengthscale"],
            ))

        # set setting_closure to None and make sure RuntimeError is raised
        prior_tuple = model.likelihood.noise_covar._priors["noise_prior"]
        model.likelihood.noise_covar._priors["noise_prior"] = (
            prior_tuple[0],
            prior_tuple[1],
            None,
        )
        with self.assertRaises(RuntimeError):
            sample_all_priors(model)
def __init__(
    self, B: Tensor, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor
) -> None:
    """Initialize a fixed-noise GP whose covariance is a scaled ALEBO kernel.

    Args:
        B: Projection matrix defining the ALEBO embedding.
        train_X: Training features.
        train_Y: Training observations.
        train_Yvar: Observed measurement noise.
    """
    super().__init__(train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar)
    # Replace the default covariance with the ALEBO embedding kernel,
    # wrapped in a learnable output scale.
    embedding_kernel = ALEBOKernel(B=B, batch_shape=self._aug_batch_shape)
    self.covar_module = ScaleKernel(
        base_kernel=embedding_kernel,
        batch_shape=self._aug_batch_shape,
    )
    self.to(train_X)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    train_iteration_fidelity: bool = True,
    train_data_fidelity: bool = True,
    likelihood: Optional[Likelihood] = None,
) -> None:
    r"""A multi-fidelity GP combining an RBF kernel over design dimensions
    with exponential-decay and/or downsampling kernels over the fidelity
    columns (assumed to be the LAST one or two columns of `train_X`).

    Args:
        train_X: Training features; fidelity parameters occupy the last
            `train_iteration_fidelity + train_data_fidelity` columns.
        train_Y: Training observations.
        train_iteration_fidelity: If True, include an `ExpDecayKernel` over
            the iteration-fidelity column.
        train_data_fidelity: If True, include a `DownsamplingKernel` over
            the data-fidelity column.
        likelihood: A likelihood. NOTE(review): accepted but never referenced
            in this constructor — not forwarded to `super().__init__`;
            confirm whether it should be.

    Raises:
        UnsupportedError: If both fidelity flags are False.
    """
    train_X, train_Y, _ = self._set_dimensions(train_X=train_X, train_Y=train_Y)
    # True/False flags sum to the number of fidelity columns (0, 1, or 2).
    num_fidelity = train_iteration_fidelity + train_data_fidelity
    ard_num_dims = train_X.shape[-1] - num_fidelity
    # Design dimensions are everything except the trailing fidelity columns.
    active_dimsX = list(range(train_X.shape[-1] - num_fidelity))
    rbf_kernel = RBFKernel(
        ard_num_dims=ard_num_dims,
        batch_shape=self._aug_batch_shape,
        lengthscale_prior=GammaPrior(3.0, 6.0),
        active_dims=active_dimsX,
    )
    exp_kernel = ExpDecayKernel(
        batch_shape=self._aug_batch_shape,
        lengthscale_prior=GammaPrior(3.0, 6.0),
        offset_prior=GammaPrior(3.0, 6.0),
        power_prior=GammaPrior(3.0, 6.0),
    )
    ds_kernel = DownsamplingKernel(
        batch_shape=self._aug_batch_shape,
        offset_prior=GammaPrior(3.0, 6.0),
        power_prior=GammaPrior(3.0, 6.0),
    )
    if train_iteration_fidelity and train_data_fidelity:
        # Both fidelities: iteration fidelity is the last column, data
        # fidelity the second-to-last; product of all three kernels.
        active_dimsS1 = [train_X.shape[-1] - 1]
        active_dimsS2 = [train_X.shape[-1] - 2]
        exp_kernel.active_dims = torch.tensor(active_dimsS1)
        ds_kernel.active_dims = torch.tensor(active_dimsS2)
        kernel = rbf_kernel * exp_kernel * ds_kernel
    elif train_iteration_fidelity or train_data_fidelity:
        # Exactly one fidelity: it occupies the last column.
        active_dimsS = [train_X.shape[-1] - 1]
        if train_iteration_fidelity:
            exp_kernel.active_dims = torch.tensor(active_dimsS)
            kernel = rbf_kernel * exp_kernel
        else:
            ds_kernel.active_dims = torch.tensor(active_dimsS)
            kernel = rbf_kernel * ds_kernel
    else:
        raise UnsupportedError(
            "You should have at least one fidelity parameter.")
    covar_module = ScaleKernel(
        kernel,
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    super().__init__(train_X=train_X, train_Y=train_Y, covar_module=covar_module)
    self.to(train_X)
def __init__(self,
             train_X: Tensor,
             train_Y: Tensor,
             likelihood: Optional[Likelihood] = None) -> None:
    r"""A single-task exact GP model.

    Args:
        train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of
            training features.
        train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of
            training observations.
        likelihood: A likelihood. If omitted, use a standard
            GaussianLikelihood with inferred noise level.

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1])
        >>> model = SingleTaskGP(train_X, train_Y)
    """
    # Capture the input dimension BEFORE reshaping to batch mode.
    ard_num_dims = train_X.shape[-1]
    train_X, train_Y, _ = self._set_dimensions(train_X=train_X, train_Y=train_Y)
    # Fold multiple outputs into an extra batch dimension.
    train_X, train_Y, _ = multioutput_to_batch_mode_transform(
        train_X=train_X, train_Y=train_Y, num_outputs=self._num_outputs)
    if likelihood is None:
        # Default likelihood: inferred noise with a Gamma prior.
        noise_prior = GammaPrior(1.1, 0.05)
        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
        likelihood = GaussianLikelihood(
            noise_prior=noise_prior,
            batch_shape=self._aug_batch_shape,
            noise_constraint=GreaterThan(
                MIN_INFERRED_NOISE_LEVEL,
                transform=None,
                initial_value=noise_prior_mode,
            ),
        )
    else:
        # Keep a copy of the custom likelihood's initial state.
        self._likelihood_state_dict = deepcopy(likelihood.state_dict())
    ExactGP.__init__(self, train_X, train_Y, likelihood)
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    self.covar_module = ScaleKernel(
        MaternKernel(
            nu=2.5,
            ard_num_dims=ard_num_dims,
            batch_shape=self._aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        ),
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    self.to(train_X)
def __init__(
    self,
    train_x: torch.Tensor,
    train_y: torch.Tensor,
    inducing_points: torch.Tensor,
    scales: Union[torch.Tensor, float] = 1.0,
    mean_module: Optional[Mean] = None,
    covar_module: Optional[Kernel] = None,
    fixed_prior_mean: Optional[float] = None,
) -> None:
    """Initialize a variational GP over mixed function/derivative observations.

    Args:
        train_x: Training inputs. NOTE(review): presumably the last column is
            a derivative indicator (0 for f(x), i for df/dx_i), as in the
            documented variant of this constructor — confirm.
        train_y: Training targets.
        inducing_points: Inducing points for the variational strategy.
        scales: Typical scale of each input dimension; used to set the
            lengthscale prior rate (6.0 / scales).
        mean_module: Optional mean supporting derivative indexes; defaults
            to a constant mean over partially observed gradients.
        covar_module: Optional covariance kernel; defaults to a scaled RBF
            kernel over partially observed gradients.
        fixed_prior_mean: If given, the constant mean is fixed at this value
            (not trained).
    """
    variational_distribution = CholeskyVariationalDistribution(
        inducing_points.size(0))
    variational_distribution.to(train_x)
    variational_strategy = VariationalStrategy(
        model=self,
        inducing_points=inducing_points,
        variational_distribution=variational_distribution,
        learn_inducing_locations=False,
    )
    super(MixedDerivativeVariationalGP,
          self).__init__(variational_strategy)
    # Use the supplied mean module if given, else a constant mean.
    if mean_module is None:
        self.mean_module = ConstantMeanPartialObsGrad()
    else:
        self.mean_module = mean_module
    if fixed_prior_mean is not None:
        # Freeze the constant mean at the requested prior value.
        self.mean_module.constant.requires_grad_(False)
        self.mean_module.constant.copy_(
            torch.tensor([fixed_prior_mean], dtype=train_x.dtype))
    if covar_module is None:
        # ard_num_dims excludes the trailing derivative-indicator column.
        self.base_kernel = RBFKernelPartialObsGrad(
            ard_num_dims=train_x.shape[-1] - 1,
            lengthscale_prior=GammaPrior(3.0, 6.0 / scales),
        )
        self.covar_module = ScaleKernel(self.base_kernel,
                                        outputscale_prior=GammaPrior(
                                            2.0, 0.15))
    else:
        self.covar_module = covar_module
    self._num_outputs = 1
    self.train_inputs = (train_x, )
    self.train_targets = train_y
    self(train_x)  # Necessary for CholeskyVariationalDistribution
def __init__(self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor) -> None:
    r"""A single-task exact GP model using fixed noise levels.

    Args:
        train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of
            training features.
        train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of
            training observations.
        train_Yvar: A `n x (o)` or `batch_shape x n x (o)` (batch mode)
            tensor of observed measurement noise.

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1])
        >>> train_Yvar = torch.full_like(train_Y, 0.2)
        >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar)
    """
    # Capture the input dimension BEFORE reshaping to batch mode.
    ard_num_dims = train_X.shape[-1]
    train_X, train_Y, train_Yvar = self._set_dimensions(
        train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar
    )
    # Fold multiple outputs into an extra batch dimension.
    train_X, train_Y, train_Yvar = multioutput_to_batch_mode_transform(
        train_X=train_X,
        train_Y=train_Y,
        num_outputs=self._num_outputs,
        train_Yvar=train_Yvar,
    )
    # Observed noise levels are fixed (not inferred).
    likelihood = FixedNoiseGaussianLikelihood(
        noise=train_Yvar, batch_shape=self._aug_batch_shape
    )
    ExactGP.__init__(
        self, train_inputs=train_X, train_targets=train_Y, likelihood=likelihood
    )
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    self.covar_module = ScaleKernel(
        base_kernel=MaternKernel(
            nu=2.5,
            ard_num_dims=ard_num_dims,
            batch_shape=self._aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        ),
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    self.to(train_X)
def __init__(self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor) -> None:
    """Fixed-noise exact GP with a constant mean and a scaled ARD RBF kernel.

    Args:
        train_X: Training features.
        train_Y: Training observations.
        train_Yvar: Observed measurement noise (fixed, not inferred).
    """
    # Validate inputs, then normalize them into the augmented batch format.
    self._validate_tensor_args(X=train_X, Y=train_Y, Yvar=train_Yvar)
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    train_X, train_Y, train_Yvar = self._transform_tensor_args(
        X=train_X, Y=train_Y, Yvar=train_Yvar
    )
    # Noise levels are observed, so use a fixed-noise likelihood.
    noise_model = FixedNoiseGaussianLikelihood(
        noise=train_Yvar, batch_shape=self._aug_batch_shape
    )
    ExactGP.__init__(
        self,
        train_inputs=train_X,
        train_targets=train_Y,
        likelihood=noise_model,
    )
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    rbf = RBFKernel(
        ard_num_dims=train_X.shape[-1],
        batch_shape=self._aug_batch_shape,
    )
    self.covar_module = ScaleKernel(
        base_kernel=rbf,
        batch_shape=self._aug_batch_shape,
    )
    self.to(train_X)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    cat_dims: List[int],
    # Annotation fixed: call sites pass (batch_shape, ard_num_dims, active_dims).
    cont_kernel_factory: Optional[
        Callable[[torch.Size, int, List[int]], Kernel]
    ] = None,
    likelihood: Optional[Likelihood] = None,
    outcome_transform: Optional[OutcomeTransform] = None,  # TODO
    input_transform: Optional[InputTransform] = None,  # TODO
) -> None:
    r"""A single-task exact GP model supporting categorical parameters.

    Args:
        train_X: A `batch_shape x n x d` tensor of training features.
        train_Y: A `batch_shape x n x m` tensor of training observations.
        cat_dims: A list of indices corresponding to the columns of
            the input `X` that should be considered categorical features.
        cont_kernel_factory: A method that accepts `batch_shape`,
            `ard_num_dims`, and `active_dims` arguments and returns an
            instantiated GPyTorch `Kernel` object to be used as the base
            kernel for the continuous dimensions. If omitted, this model
            uses a Matern-2.5 kernel as the kernel for the ordinal
            parameters.
        likelihood: A likelihood. If omitted, use a standard
            GaussianLikelihood with inferred noise level.
        # outcome_transform: An outcome transform that is applied to the
        #     training data during instantiation and to the posterior during
        #     inference (that is, the `Posterior` obtained by calling
        #     `.posterior` on the model will be on the original scale).
        # input_transform: An input transform that is applied in the model's
        #     forward pass.

    Example:
        >>> train_X = torch.cat(
                [torch.rand(20, 2), torch.randint(3, (20, 1))], dim=-1
            )
        >>> train_Y = (
                torch.sin(train_X[..., :-1]).sum(dim=1, keepdim=True)
                + train_X[..., -1:]
            )
        >>> model = MixedSingleTaskGP(train_X, train_Y, cat_dims=[-1])
    """
    # Transforms are declared but not yet implemented for this model.
    if outcome_transform is not None:
        raise UnsupportedError("outcome transforms not yet supported")
    if input_transform is not None:
        raise UnsupportedError("input transforms not yet supported")
    if len(cat_dims) == 0:
        raise ValueError(
            "Must specify categorical dimensions for MixedSingleTaskGP"
        )
    input_batch_shape, aug_batch_shape = self.get_batch_dimensions(
        train_X=train_X, train_Y=train_Y
    )

    if cont_kernel_factory is None:

        def cont_kernel_factory(
            batch_shape: torch.Size, ard_num_dims: int, active_dims: List[int]
        ) -> MaternKernel:
            # Default continuous kernel: ARD Matern-2.5.
            return MaternKernel(
                nu=2.5,
                batch_shape=batch_shape,
                ard_num_dims=ard_num_dims,
                active_dims=active_dims,
            )

    if likelihood is None:
        # This Gamma prior is quite close to the Horseshoe prior
        min_noise = 1e-5 if train_X.dtype == torch.float else 1e-6
        likelihood = GaussianLikelihood(
            batch_shape=aug_batch_shape,
            noise_constraint=GreaterThan(
                min_noise, transform=None, initial_value=1e-3
            ),
            noise_prior=GammaPrior(0.9, 10.0),
        )

    d = train_X.shape[-1]
    # Normalize (possibly negative) categorical indices; ordinal dims are
    # the complement.
    cat_dims = normalize_indices(indices=cat_dims, d=d)
    ord_dims = sorted(set(range(d)) - set(cat_dims))
    if len(ord_dims) == 0:
        # Purely categorical inputs: a single scaled categorical kernel.
        covar_module = ScaleKernel(
            CategoricalKernel(
                batch_shape=aug_batch_shape,
                ard_num_dims=len(cat_dims),
            )
        )
    else:
        # Mixed inputs: sum-of-kernels plus product-of-kernels structure
        # over the continuous and categorical subspaces.
        sum_kernel = ScaleKernel(
            cont_kernel_factory(
                batch_shape=aug_batch_shape,
                ard_num_dims=len(ord_dims),
                active_dims=ord_dims,
            )
            + ScaleKernel(
                CategoricalKernel(
                    batch_shape=aug_batch_shape,
                    ard_num_dims=len(cat_dims),
                    active_dims=cat_dims,
                )
            )
        )
        prod_kernel = ScaleKernel(
            cont_kernel_factory(
                batch_shape=aug_batch_shape,
                ard_num_dims=len(ord_dims),
                active_dims=ord_dims,
            )
            * CategoricalKernel(
                batch_shape=aug_batch_shape,
                ard_num_dims=len(cat_dims),
                active_dims=cat_dims,
            )
        )
        covar_module = sum_kernel + prod_kernel

    super().__init__(
        train_X=train_X,
        train_Y=train_Y,
        likelihood=likelihood,
        covar_module=covar_module,
        outcome_transform=outcome_transform,
        input_transform=input_transform,
    )
def sample_arch(self, START_BO, g, hyperparams, og_flops, empty_val_loss, full_val_loss, target_flops=0):
    """Sample a channel-pruning architecture (layer budget + parameterization).

    Phase 1 (g < START_BO): random (or fixed) width multiplier to warm up the
    history. Phase 2 (g == START_BO): the full-width model. Phase 3: fit GPs
    to loss and cost and optimize a randomized-scalarization acquisition,
    optionally binary-searching the scalarization weight to hit a target
    FLOPs ratio.

    Args:
        START_BO: Generation index at which BO starts.
        g: Current generation index.
        hyperparams: Provides `get_dim()` and
            `get_layer_budget_from_parameterization(...)`.
        og_flops: FLOPs of the unpruned model (denominator for ratios).
        empty_val_loss: Unused in this variant (kept for interface parity).
        full_val_loss: Unused in this variant (kept for interface parity).
        target_flops: If nonzero, pin the width multiplier / FLOPs target.

    Returns:
        Tuple of (layer_budget, parameterization, normalized sampling weights).
    """
    # Warm-up phase: a single random or fixed width multiplier.
    if g < START_BO:
        if target_flops == 0:
            f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
        else:
            f = args.lower_channel
        parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    # Seed the history with the largest (full-width) model.
    elif g == START_BO:
        if target_flops == 0:
            parameterization = np.ones(hyperparams.get_dim())
        else:
            f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    # BO phase: fit loss/cost GPs and optimize the acquisition.
    else:
        rand = torch.rand(1).cuda()

        train_X = torch.FloatTensor(self.X).cuda()
        # Column 0 of Y is loss, column 1 is cost; standardize both.
        train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
        train_Y_loss = standardize(train_Y_loss)

        train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
        train_Y_cost = standardize(train_Y_cost)

        # Choose the loss-GP kernel: SKI (grid interpolation) once enough
        # data has accumulated (g > 128), optionally with additive structure.
        covar_module = None
        if args.ski and g > 128:
            if args.additive:
                covar_module = AdditiveStructureKernel(
                    ScaleKernel(
                        GridInterpolationKernel(
                            MaternKernel(
                                nu=2.5,
                                lengthscale_prior=GammaPrior(3.0, 6.0),
                            ),
                            grid_size=128, num_dims=1, grid_bounds=[(0, 1)]
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    ),
                    num_dims=train_X.shape[1]
                )
            else:
                covar_module = ScaleKernel(
                    GridInterpolationKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                        ),
                        grid_size=128, num_dims=train_X.shape[1],
                        grid_bounds=[(0, 1) for _ in range(train_X.shape[1])]
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                )
        else:
            if args.additive:
                covar_module = AdditiveStructureKernel(
                    ScaleKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                            num_dims=1
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    ),
                    num_dims=train_X.shape[1]
                )
            else:
                covar_module = ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        num_dims=train_X.shape[1]
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                )

        new_train_X = train_X
        # Fit the loss GP.
        gp_loss = SingleTaskGP(new_train_X, train_Y_loss, covar_module=covar_module)
        mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Use add-gp for cost
        covar_module = AdditiveStructureKernel(
            ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                    num_dims=1
                ),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
            num_dims=train_X.shape[1]
        )
        gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
        mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Randomized scalarization of the two UCB acquisitions.
        UCB_loss = UpperConfidenceBound(gp_loss, beta=args.beta).cuda()
        UCB_cost = UpperConfidenceBound(gp_cost, beta=args.beta).cuda()
        self.mobo_obj = RandAcquisition(UCB_loss).cuda()
        self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

        # Box bounds on the per-layer width multipliers.
        lower = torch.ones(new_train_X.shape[1])*args.lower_channel
        upper = torch.ones(new_train_X.shape[1])*args.upper_channel
        self.mobo_bounds = torch.stack([lower, upper]).cuda()

        if args.pas:
            # Sample a target FLOPs ratio, then binary-search the
            # scalarization weight lmda so the optimized architecture's
            # FLOPs ratio lands within 0.02 of the target.
            val = np.linspace(args.lower_flops, 1, 50)
            chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
            lower_bnd, upper_bnd = 0, 1
            lmda = 0.5
            for i in range(10):
                self.mobo_obj.rand = lmda

                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                )

                parameterization = parameterization[0].cpu().numpy()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget, self.use_mem)
                ratio = sim_flops/og_flops

                if np.abs(ratio - chosen_target_flops) <= 0.02:
                    break
                # The search direction flips depending on the baseline mode.
                if args.baseline > 0:
                    if ratio < chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                    elif ratio > chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                else:
                    if ratio < chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                    elif ratio > chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
            writer.add_scalar('Binary search trials', i, g)
        else:
            parameterization, acq_value = optimize_acqf(
                self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
            )
            parameterization = parameterization[0].cpu().numpy()
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    likelihood: Optional[Likelihood] = None,
    covar_module: Optional[Module] = None,
    outcome_transform: Optional[OutcomeTransform] = None,
) -> None:
    r"""A single-task exact GP model.

    Args:
        train_X: A `batch_shape x n x d` tensor of training features.
        train_Y: A `batch_shape x n x m` tensor of training observations.
        likelihood: A likelihood. If omitted, use a standard
            GaussianLikelihood with inferred noise level.
        covar_module: The module computing the covariance (Kernel) matrix.
            If omitted, use a `MaternKernel`.
        outcome_transform: An outcome transform that is applied to the
            training data during instantiation and to the posterior during
            inference (that is, the `Posterior` obtained by calling
            `.posterior` on the model will be on the original scale).

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
        >>> model = SingleTaskGP(train_X, train_Y)
    """
    # Transform outcomes BEFORE validation so scaling checks see the values
    # the model is actually trained on.
    if outcome_transform is not None:
        train_Y, _ = outcome_transform(train_Y)
    validate_input_scaling(train_X=train_X, train_Y=train_Y)
    self._validate_tensor_args(X=train_X, Y=train_Y)
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y)
    if likelihood is None:
        # Default likelihood: inferred noise with a Gamma prior, initialized
        # at the prior mode.
        noise_prior = GammaPrior(1.1, 0.05)
        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
        likelihood = GaussianLikelihood(
            noise_prior=noise_prior,
            batch_shape=self._aug_batch_shape,
            noise_constraint=GreaterThan(
                MIN_INFERRED_NOISE_LEVEL,
                transform=None,
                initial_value=noise_prior_mode,
            ),
        )
    else:
        self._is_custom_likelihood = True
    ExactGP.__init__(self, train_X, train_Y, likelihood)
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    if covar_module is None:
        self.covar_module = ScaleKernel(
            MaternKernel(
                nu=2.5,
                ard_num_dims=train_X.shape[-1],
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        # Batch dimensions of each parameter, used when subsetting outputs;
        # only known for the default covariance module.
        self._subset_batch_dict = {
            "likelihood.noise_covar.raw_noise": -2,
            "mean_module.constant": -2,
            "covar_module.raw_outputscale": -1,
            "covar_module.base_kernel.raw_lengthscale": -3,
        }
    else:
        self.covar_module = covar_module
        # TODO: Allow subsetting of other covar modules
    if outcome_transform is not None:
        self.outcome_transform = outcome_transform
    self.to(train_X)
def sample_arch(self, START_BO, g, hyperparams, og_flops, empty_val_loss, full_val_loss, target_flops=0):
    """Sample a channel-pruning architecture (layer budget + parameterization).

    Phase 1 (g < START_BO): random (or fixed) width multiplier to warm up the
    history. Phase 2 (g == START_BO): the full-width model. Phase 3: fit GPs
    to loss and cost, derive FLOPs-target sampling weights from the Pareto
    front of the population, and binary-search the scalarization weight to
    hit the chosen target FLOPs ratio.

    Args:
        START_BO: Generation index at which BO starts.
        g: Current generation index.
        hyperparams: Provides `get_dim()` and
            `get_layer_budget_from_parameterization(...)`.
        og_flops: FLOPs of the unpruned model (denominator for ratios).
        empty_val_loss: Validation loss of the fully pruned model; anchors
            the low-FLOPs end of the Pareto front.
        full_val_loss: Validation loss of the unpruned model; anchors the
            high-FLOPs end of the Pareto front.
        target_flops: If nonzero, pin the width multiplier / FLOPs target.

    Returns:
        Tuple of (layer_budget, parameterization, normalized sampling weights).
    """
    # Warming up the history with a single width-multiplier
    if g < START_BO:
        if target_flops == 0:
            f = np.random.rand(1) * (args.upper_channel - args.
                                     lower_channel) + args.lower_channel
        else:
            f = args.lower_channel
        parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(
            parameterization, self.mask_pruner)
    # Put largest model into the history
    elif g == START_BO:
        if target_flops == 0:
            parameterization = np.ones(hyperparams.get_dim())
        else:
            f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(
            parameterization, self.mask_pruner)
    # MOBO-RS
    else:
        rand = torch.rand(1).cuda()

        train_X = torch.FloatTensor(self.X).cuda()
        # Column 0 of Y is loss, column 1 is cost; standardize both.
        train_Y_loss = torch.FloatTensor(
            np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
        train_Y_loss = standardize(train_Y_loss)

        train_Y_cost = torch.FloatTensor(
            np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
        train_Y_cost = standardize(train_Y_cost)

        new_train_X = train_X
        # Fit the loss GP (default kernel).
        gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
        mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Use add-gp for cost
        covar_module = AdditiveStructureKernel(ScaleKernel(
            MaternKernel(nu=2.5,
                         lengthscale_prior=GammaPrior(3.0, 6.0),
                         num_dims=1),
            outputscale_prior=GammaPrior(2.0, 0.15),
        ), num_dims=train_X.shape[1])
        gp_cost = SingleTaskGP(new_train_X,
                               train_Y_cost,
                               covar_module=covar_module)
        mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Randomized scalarization of the two UCB acquisitions.
        UCB_loss = UpperConfidenceBound(gp_loss).cuda()
        UCB_cost = UpperConfidenceBound(gp_cost).cuda()
        self.mobo_obj = RandAcquisition(UCB_loss).cuda()
        self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

        # Box bounds on the per-layer width multipliers.
        lower = torch.ones(new_train_X.shape[1]) * args.lower_channel
        upper = torch.ones(new_train_X.shape[1]) * args.upper_channel
        self.mobo_bounds = torch.stack([lower, upper]).cuda()

        if args.pas:
            # Build the (loss, FLOPs-ratio) Pareto front of the population.
            costs = []
            for i in range(len(self.population_data)):
                costs.append([
                    self.population_data[i]['loss'],
                    self.population_data[i]['ratio']
                ])
            costs = np.array(costs)
            efficient_mask = is_pareto_efficient(costs)
            costs = costs[efficient_mask]
            loss = costs[:, 0]
            flops = costs[:, 1]
            sorted_idx = np.argsort(flops)
            loss = loss[sorted_idx]
            flops = flops[sorted_idx]
            # Anchor the front at the fully pruned model's loss if needed.
            if flops[0] > args.lower_flops:
                flops = np.concatenate([[args.lower_flops],
                                        flops.reshape(-1)])
                loss = np.concatenate([[empty_val_loss], loss.reshape(-1)])
            else:
                flops = flops.reshape(-1)
                loss = loss.reshape(-1)
            # Anchor the front at the full model's loss if needed.
            if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                flops = np.concatenate(
                    [flops.reshape(-1), [args.upper_flops]])
                loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
            else:
                flops = flops.reshape(-1)
                loss = loss.reshape(-1)

            # Rectangular areas under the front measure potential
            # improvement per FLOPs segment; spread them over 50 bins.
            areas = (flops[1:] - flops[:-1]) * (loss[:-1] - loss[1:])
            self.sampling_weights = np.zeros(50)
            k = 0
            while k < len(flops) and flops[k] < args.lower_flops:
                k += 1
            for i in range(50):
                lower = i / 50.
                upper = (i + 1) / 50.
                if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                    continue
                cnt = 1
                while ((k + 1) < len(flops)) and upper > flops[k + 1]:
                    self.sampling_weights[i] += areas[k]
                    cnt += 1
                    k += 1
                if k < len(areas):
                    self.sampling_weights[i] += areas[k]
                self.sampling_weights[i] /= cnt
            # Fall back to uniform weights if nothing accumulated.
            if np.sum(self.sampling_weights) == 0:
                self.sampling_weights = np.ones(50)

            if target_flops == 0:
                val = np.arange(0.01, 1, 0.02)
                chosen_target_flops = np.random.choice(
                    val,
                    p=(self.sampling_weights /
                       np.sum(self.sampling_weights)))
            else:
                chosen_target_flops = target_flops

            # Binary-search the scalarization weight lmda so the optimized
            # architecture's FLOPs ratio lands within 0.02 of the target.
            lower_bnd, upper_bnd = 0, 1
            lmda = 0.5
            for i in range(10):
                self.mobo_obj.rand = lmda

                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj,
                    bounds=self.mobo_bounds,
                    q=1,
                    num_restarts=5,
                    raw_samples=1000,
                )

                parameterization = parameterization[0].cpu().numpy()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(
                    parameterization, self.mask_pruner)
                sim_flops = self.mask_pruner.simulate_and_count_flops(
                    layer_budget)
                ratio = sim_flops / og_flops

                if np.abs(ratio - chosen_target_flops) <= 0.02:
                    break
                # The search direction flips depending on the baseline mode.
                if args.baseline > 0:
                    if ratio < chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                    elif ratio > chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                else:
                    if ratio < chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                    elif ratio > chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
            writer.add_scalar('Binary search trials', i, g)
        else:
            parameterization, acq_value = optimize_acqf(
                self.mobo_obj,
                bounds=self.mobo_bounds,
                q=1,
                num_restarts=5,
                raw_samples=1000,
            )
            parameterization = parameterization[0].cpu().numpy()
            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
    return layer_budget, parameterization, self.sampling_weights / np.sum(
        self.sampling_weights)
def __init__(self, x, y, likelihood, kernel, mean):
    """Exact GP with a user-supplied mean and a scaled version of `kernel`.

    Args:
        x: Training inputs.
        y: Training targets.
        likelihood: Likelihood passed through to the parent GP.
        kernel: Base kernel; wrapped in a `ScaleKernel`.
        mean: Mean module used as-is.
    """
    super().__init__(x, y, likelihood)
    self.mean_module = mean
    # Add a learnable output scale on top of the supplied base kernel.
    self.covar_module = ScaleKernel(kernel)
    # Keep the model and its likelihood in single precision.
    self.float()
    likelihood.float()
def __init__(
    self,
    train_x: torch.Tensor,
    train_y: torch.Tensor,
    inducing_points: torch.Tensor,
    scales: Union[torch.Tensor, float] = 1.0,
    mean_module: Optional[Mean] = None,
    covar_module: Optional[Kernel] = None,
    fixed_prior_mean: Optional[float] = None,
) -> None:
    """Initialize MixedDerivativeVariationalGP

    Args:
        train_x (torch.Tensor): Training x points. The last column of x is
            the derivative indicator: 0 if it is an observation of f(x),
            and i if it is an observation of df/dx_i.
        train_y (torch.Tensor): Training y points
        inducing_points (torch.Tensor): Inducing points to use
        scales (Union[torch.Tensor, float], optional): Typical scale of each
            dimension of input space (this is used to set the lengthscale
            prior). Defaults to 1.0.
        mean_module (Mean, optional): A mean class that supports derivative
            indexes as the final dim. Defaults to a constant mean.
        covar_module (Kernel, optional): A covariance kernel class that
            supports derivative indexes as the final dim. Defaults to RBF
            kernel.
        fixed_prior_mean (float, optional): A prior mean value to use with
            the constant mean. Often setting this to the target threshold
            speeds up experiments. Defaults to None, in which case the mean
            will be inferred.
    """
    variational_distribution = CholeskyVariationalDistribution(
        inducing_points.size(0))
    variational_distribution.to(train_x)
    variational_strategy = VariationalStrategy(
        model=self,
        inducing_points=inducing_points,
        variational_distribution=variational_distribution,
        learn_inducing_locations=False,
    )
    super(MixedDerivativeVariationalGP,
          self).__init__(variational_strategy)
    # Use the supplied mean module if given, else a constant mean.
    if mean_module is None:
        self.mean_module = ConstantMeanPartialObsGrad()
    else:
        self.mean_module = mean_module
    if fixed_prior_mean is not None:
        # Freeze the constant mean at the requested prior value.
        self.mean_module.constant.requires_grad_(False)
        self.mean_module.constant.copy_(
            torch.tensor([fixed_prior_mean], dtype=train_x.dtype))
    if covar_module is None:
        # ard_num_dims excludes the trailing derivative-indicator column.
        self.base_kernel = RBFKernelPartialObsGrad(
            ard_num_dims=train_x.shape[-1] - 1,
            lengthscale_prior=GammaPrior(3.0, 6.0 / scales),
        )
        self.covar_module = ScaleKernel(self.base_kernel,
                                        outputscale_prior=GammaPrior(
                                            2.0, 0.15))
    else:
        self.covar_module = covar_module
    self._num_outputs = 1
    self.train_inputs = (train_x, )
    self.train_targets = train_y
    self(train_x)  # Necessary for CholeskyVariationalDistribution
def __init__(
    self,
    datapoints: Tensor,
    comparisons: Tensor,
    covar_module: Optional[Module] = None,
    input_transform: Optional[InputTransform] = None,
    **kwargs,
) -> None:
    r"""A probit-likelihood GP with Laplace approximation model that learns via
    pairwise comparison data. By default it uses a scaled RBF kernel.

    Args:
        datapoints: A `batch_shape x n x d` tensor of training features.
        comparisons: A `batch_shape x m x 2` training comparisons;
            comparisons[i] is a noisy indicator suggesting the utility value
            of comparisons[i, 0]-th is greater than comparisons[i, 1]-th.
        covar_module: Covariance module.
        input_transform: An input transform that is applied in the model's
            forward pass.

    Keyword Args:
        jitter: Jitter added for numerical stability (default 1e-6).
        zlim: Clamp limit for z in the probit computation (default 3).
        xtol: Stopping tolerance for scipy.optimize.fsolve in _update().
        maxfev: Max function evaluations for scipy.optimize.fsolve.
    """
    super().__init__()
    if input_transform is not None:
        input_transform.to(datapoints)
        # input transformation is applied in set_train_data
        self.input_transform = input_transform

    # Compatibility variable with fit_gpytorch_*: dummy likelihood.
    # The likelihood is tightly tied to this model and it doesn't make
    # much sense to keep it separate.
    self.likelihood = None

    # TODO: remove these variables from `state_dict()` so that when calling
    # `load_state_dict()`, only the hyperparameters are copied over
    self.register_buffer("datapoints", None)
    self.register_buffer("comparisons", None)
    self.register_buffer("D", None)
    self.register_buffer("DT", None)
    self.register_buffer("utility", None)
    self.register_buffer("covar_chol", None)
    self.register_buffer("likelihood_hess", None)
    self.register_buffer("hlcov_eye", None)
    self.register_buffer("covar", None)
    self.register_buffer("covar_inv", None)

    self.train_inputs = []
    self.train_targets = None

    self.pred_cov_fac_need_update = True
    self.dim = None

    # See set_train_data for additional compatibility variables.
    # Note that the datapoints here are not transformed even if
    # input_transform is not None, to avoid double transformation during
    # model fitting; self.transform_inputs is called in `forward`.
    self.set_train_data(datapoints, comparisons, update_model=False)

    # Set optional parameters.
    # Jitter to add for numerical stability.
    self._jitter = kwargs.get("jitter", 1e-6)
    # Clamping z limit for better numerical stability; see self._calc_z for
    # detail. norm_cdf(z=3) ~= 0.999, i.e. the top 0.1%.
    self._zlim = kwargs.get("zlim", 3)
    # Stopping criteria in scipy.optimize.fsolve used to find f_map in
    # _update(). If None, set to 1e-6 by default in _update.
    self._xtol = kwargs.get("xtol")
    # The maximum number of calls to the function in scipy.optimize.fsolve.
    # If None, set to 100 by default in _update.
    # If zero, then 100*(N+1) is used by default by fsolve.
    self._maxfev = kwargs.get("maxfev")

    # Set hyperparameters.
    # Do not set the batch_shape explicitly so mean_module can operate in
    # both modes; once fsolve used in _update can run in batch mode, we
    # should explicitly set the batch shape here.
    self.mean_module = ConstantMean()
    # Do not optimize the constant mean prior.
    for param in self.mean_module.parameters():
        param.requires_grad = False

    # Set covariance module.
    # The default outputscale here is only a rule of thumb, meant to keep
    # estimates away from scale values that would make Phi(f(x)) saturate
    # at 0 or 1.
    if covar_module is None:
        ls_prior = GammaPrior(1.2, 0.5)
        # Mode of a Gamma(concentration, rate): (concentration - 1) / rate.
        ls_prior_mode = (ls_prior.concentration - 1) / ls_prior.rate
        covar_module = ScaleKernel(
            RBFKernel(
                batch_shape=self.batch_shape,
                ard_num_dims=self.dim,
                lengthscale_prior=ls_prior,
                # Initialize the lengthscale at the prior mode.
                lengthscale_constraint=Positive(
                    transform=None, initial_value=ls_prior_mode),
            ),
            outputscale_prior=SmoothedBoxPrior(a=1, b=4),
        )
    self.covar_module = covar_module

    self._x0 = None  # will store temporary results for warm-starting
    if self.datapoints is not None and self.comparisons is not None:
        self.to(dtype=self.datapoints.dtype, device=self.datapoints.device)
        # Find f_map for initial parameters with transformed datapoints.
        transformed_dp = self.transform_inputs(datapoints)
        self._update(transformed_dp)

    self.to(self.datapoints)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    task_feature: int,
    output_tasks: Optional[List[int]] = None,
    rank: Optional[int] = None,
) -> None:
    r"""Multi-Task GP model using an ICM kernel, inferring observation noise.

    Args:
        train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor
            of training data. One of the columns should contain the task
            features (see `task_feature` argument).
        train_Y: A `n` or `b x n` (batch mode) tensor of training
            observations.
        task_feature: The index of the task feature
            (`-d <= task_feature <= d`).
        output_tasks: A list of task indices for which to compute model
            outputs for. If omitted, return outputs for all task indices.
        rank: The rank to be used for the index kernel. If omitted, use a
            full rank (i.e. number of tasks) kernel.

    Raises:
        ValueError: If `train_X` is not 2-dimensional or `task_feature` is
            out of range.
        RuntimeError: If `output_tasks` contains a task not present in the
            training data.

    Example:
        >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
        >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
        >>> train_X = torch.stack([
        >>>     torch.cat([X1, i1], -1), torch.cat([X2, i2], -1),
        >>> ])
        >>> train_Y = torch.cat(f1(X1), f2(X2))
        >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1)
    """
    if train_X.ndimension() != 2:
        # Currently, batch mode MTGPs are blocked upstream in GPyTorch
        raise ValueError(f"Unsupported shape {train_X.shape} for train_X.")
    # Number of non-task input dimensions (one column is the task feature).
    d = train_X.shape[-1] - 1
    if not (-d <= task_feature <= d):
        raise ValueError(f"Must have that -{d} <= task_feature <= {d}")
    # Task ids present in the data, as plain Python ints.
    all_tasks = train_X[:, task_feature].unique().to(
        dtype=torch.long).tolist()
    if output_tasks is None:
        output_tasks = all_tasks
    else:
        if any(t not in all_tasks for t in output_tasks):
            raise RuntimeError(
                "All output tasks must be present in input data.")
    self._output_tasks = output_tasks
    # TODO (T41270962): Support task-specific noise levels in likelihood
    likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05))
    # Construct indexer to be used in forward: maps the d data dimensions
    # to their column positions in train_X, skipping the task column.
    self._task_feature = task_feature
    self._base_idxr = torch.arange(d)
    self._base_idxr[task_feature:] += 1  # exclude task feature
    super().__init__(train_inputs=train_X,
                     train_targets=train_Y,
                     likelihood=likelihood)
    self.mean_module = ConstantMean()
    self.covar_module = ScaleKernel(
        base_kernel=MaternKernel(nu=2.5,
                                 ard_num_dims=d,
                                 lengthscale_prior=GammaPrior(3.0, 6.0)),
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    num_tasks = len(all_tasks)
    # Fall back to a full-rank index kernel if no rank was requested.
    self._rank = rank if rank is not None else num_tasks
    # TODO: Add LKJ prior for the index kernel
    self.task_covar_module = IndexKernel(num_tasks=num_tasks,
                                         rank=self._rank)
    self.to(train_X)
def sample_arch(self, START_BO, g, steps, hyperparams, og_flops, full_val_loss, target_flops=0):
    """Sample a layer budget (channel parameterization) for the next architecture.

    Strategy depends on the generation counter `g`: random sampling before
    `START_BO`, the full model at exactly `START_BO`, and multi-objective
    Bayesian optimization (MOBO) afterwards.

    Args:
        START_BO: Number of warm-up generations before BO kicks in.
        g: Current generation index.
        steps: Global step counter (used for tensorboard logging).
        hyperparams: Helper exposing `random_sample`, `get_dim`, and
            `get_layer_budget_from_parameterization`.
        og_flops: FLOPs of the original (unpruned) model, used to compute
            the pruning ratio.
        full_val_loss: Validation loss of the full model (anchors the upper
            end of the approximated Pareto front).
        target_flops: If nonzero, a specific FLOPs ratio to target;
            otherwise one is sampled (or left unconstrained).

    Returns:
        Tuple of (layer_budget, parameterization, normalized sampling
        weights). NOTE(review): `self.sampling_weights` must already exist
        when the MOBO/`args.pas` path is not taken — presumably initialized
        elsewhere; verify against the caller.
    """
    if args.slim:
        if target_flops == 0:
            parameterization = hyperparams.random_sample()
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        else:
            parameterization = np.ones(hyperparams.get_dim()) * args.lower_channel
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    else:
        # random sample to warmup history for MOBO
        if g < START_BO:
            if target_flops == 0:
                # Uniform random channel fraction in [lower_channel, upper_channel].
                f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
            else:
                f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        # put the largest model into the history
        elif g == START_BO:
            if target_flops == 0:
                parameterization = np.ones(hyperparams.get_dim())
            else:
                f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        # MOBO
        else:
            # this is the scalarization (lambda_{FLOPs})
            rand = torch.rand(1).cuda()

            # standardize data for building Gaussian Processes
            train_X = torch.FloatTensor(self.X).cuda()
            # Column 0 of self.Y holds the loss objective.
            train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
            train_Y_loss = standardize(train_Y_loss)
            # Column 1 of self.Y holds the cost (FLOPs) objective.
            train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
            train_Y_cost = standardize(train_Y_cost)

            new_train_X = train_X
            # GP for the cross entropy loss
            gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
            mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            # GP for FLOPs
            # we use add-gp since FLOPs has additive structure (not exactly though)
            # the parameters for ScaleKernel and MaternKernel simply follow the default
            covar_module = AdditiveStructureKernel(
                ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        num_dims=1
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                ),
                num_dims=train_X.shape[1]
            )
            gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            # Build acquisition functions
            UCB_loss = UpperConfidenceBound(gp_loss, beta=0.1).cuda()
            UCB_cost = UpperConfidenceBound(gp_cost, beta=0.1).cuda()

            # Combine them via augmented Tchebyshev scalarization
            self.mobo_obj = RandAcquisition(UCB_loss).cuda()
            self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

            # Bounds for the optimization variable (alpha)
            lower = torch.ones(new_train_X.shape[1])*args.lower_channel
            upper = torch.ones(new_train_X.shape[1])*args.upper_channel
            self.mobo_bounds = torch.stack([lower, upper]).cuda()

            # Pareto-aware sampling
            if args.pas:
                # Generate approximate Pareto front first
                costs = []
                for i in range(len(self.population_data)):
                    costs.append([self.population_data[i]['loss'], self.population_data[i]['ratio']])
                costs = np.array(costs)
                efficient_mask = is_pareto_efficient(costs)
                costs = costs[efficient_mask]
                loss = costs[:, 0]
                flops = costs[:, 1]
                # Sort the front by increasing FLOPs ratio.
                sorted_idx = np.argsort(flops)
                loss = loss[sorted_idx]
                flops = flops[sorted_idx]
                # Anchor the low-FLOPs end of the front if it is missing.
                if flops[0] > args.lower_flops:
                    flops = np.concatenate([[args.lower_flops], flops.reshape(-1)])
                    loss = np.concatenate([[8], loss.reshape(-1)])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)
                # Anchor the high-FLOPs end with the full model's loss.
                if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                    flops = np.concatenate([flops.reshape(-1), [args.upper_flops]])
                    loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                # Equation (4) in paper
                areas = (flops[1:]-flops[:-1])*(loss[:-1]-loss[1:])

                # Quantize into 50 bins to sample from multinomial
                self.sampling_weights = np.zeros(50)
                k = 0
                # Skip front segments below the allowed FLOPs range.
                while k < len(flops) and flops[k] < args.lower_flops:
                    k+=1
                for i in range(50):
                    lower = i/50.
                    upper = (i+1)/50.
                    if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                        continue
                    cnt = 1
                    # Accumulate the area of all front segments falling in this bin.
                    while ((k+1) < len(flops)) and upper > flops[k+1]:
                        self.sampling_weights[i] += areas[k]
                        cnt += 1
                        k += 1
                    if k < len(areas):
                        self.sampling_weights[i] += areas[k]
                    self.sampling_weights[i] /= cnt
                # Degenerate front: fall back to uniform sampling weights.
                if np.sum(self.sampling_weights) == 0:
                    self.sampling_weights = np.ones(50)

                if target_flops == 0:
                    val = np.arange(0.01, 1, 0.02)
                    chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
                else:
                    chosen_target_flops = target_flops

                # Binary search over the scalarization weight to hit the
                # chosen target FLOPs ratio (at most 10 trials).
                lower_bnd, upper_bnd = 0, 1
                lmda = 0.5
                for i in range(10):
                    self.mobo_obj.rand = lmda

                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                    )

                    parameterization = parameterization[0].cpu().numpy()
                    layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                    sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget)
                    ratio = sim_flops/og_flops

                    # Close enough to the target: stop searching.
                    if np.abs(ratio - chosen_target_flops) <= 0.02:
                        break
                    # NOTE(review): the bisection direction flips depending on
                    # args.baseline — presumably the scalarization's sign differs
                    # between the baseline and proposed objectives; verify.
                    if args.baseline > 0:
                        if ratio < chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                        elif ratio > chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                    else:
                        if ratio < chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                        elif ratio > chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
                writer.add_scalar('Binary search trials', i, steps)

            else:
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    iteration_fidelity: Optional[int] = None,
    data_fidelity: Optional[int] = None,
    linear_truncated: bool = True,
    nu: float = 2.5,
    likelihood: Optional[Likelihood] = None,
) -> None:
    r"""A single-task multi-fidelity GP model.

    Args:
        train_X: A tensor of training features that includes the fidelity
            column(s) indicated by `iteration_fidelity` / `data_fidelity`.
        train_Y: A tensor of training observations.
        iteration_fidelity: The column index of the training iteration
            fidelity parameter (optional). Negative indices count from the
            last column.
        data_fidelity: The column index of the downsampling fidelity
            parameter (optional). Negative indices count from the last
            column.
        linear_truncated: If True, use a `LinearTruncatedFidelityKernel`
            over all dimensions; otherwise combine an RBF kernel over the
            non-fidelity dimensions with per-fidelity kernels via a product.
        nu: The smoothness parameter for the Matern kernel: either 1/2,
            3/2, or 5/2. Only used when `linear_truncated=True`.
        likelihood: A likelihood. If omitted, the superclass constructs its
            default (inferred-noise) likelihood.

    Raises:
        UnsupportedError: If neither fidelity parameter is provided.
    """
    if iteration_fidelity is None and data_fidelity is None:
        raise UnsupportedError(
            "SingleTaskMultiFidelityGP requires at least one fidelity parameter."
        )
    # Normalize negative fidelity indices to positive column positions.
    if iteration_fidelity is not None and iteration_fidelity < 0:
        iteration_fidelity = train_X.size(-1) + iteration_fidelity
    if data_fidelity is not None and data_fidelity < 0:
        data_fidelity = train_X.size(-1) + data_fidelity
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    if linear_truncated:
        fidelity_dims = [
            i for i in (iteration_fidelity, data_fidelity) if i is not None
        ]
        kernel = LinearTruncatedFidelityKernel(
            fidelity_dims=fidelity_dims,
            dimension=train_X.size(-1),
            nu=nu,
            batch_shape=self._aug_batch_shape,
            power_prior=GammaPrior(3.0, 3.0),
        )
    else:
        # RBF kernel over the non-fidelity input dimensions only.
        active_dimsX = [
            i for i in range(train_X.size(-1))
            if i not in {iteration_fidelity, data_fidelity}
        ]
        kernel = RBFKernel(
            ard_num_dims=len(active_dimsX),
            batch_shape=self._aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
            active_dims=active_dimsX,
        )
        additional_kernels = []
        if iteration_fidelity is not None:
            exp_kernel = ExponentialDecayKernel(
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                offset_prior=GammaPrior(3.0, 6.0),
                power_prior=GammaPrior(3.0, 6.0),
                active_dims=[iteration_fidelity],
            )
            additional_kernels.append(exp_kernel)
        if data_fidelity is not None:
            ds_kernel = DownsamplingKernel(
                batch_shape=self._aug_batch_shape,
                offset_prior=GammaPrior(3.0, 6.0),
                power_prior=GammaPrior(3.0, 6.0),
                active_dims=[data_fidelity],
            )
            additional_kernels.append(ds_kernel)
        kernel = ProductKernel(kernel, *additional_kernels)
    covar_module = ScaleKernel(
        kernel,
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    # Fix: forward the user-supplied likelihood to the parent constructor.
    # Previously `likelihood` was accepted but silently ignored, so custom
    # likelihoods never took effect. Passing None preserves the parent's
    # default behavior, so this is backward-compatible.
    super().__init__(
        train_X=train_X,
        train_Y=train_Y,
        likelihood=likelihood,
        covar_module=covar_module,
    )
    self.to(train_X)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    task_feature: int,
    covar_module: Optional[Module] = None,
    task_covar_prior: Optional[Prior] = None,
    output_tasks: Optional[List[int]] = None,
    rank: Optional[int] = None,
    input_transform: Optional[InputTransform] = None,
    outcome_transform: Optional[OutcomeTransform] = None,
) -> None:
    r"""Multi-Task GP model using an ICM kernel, inferring observation noise.

    Args:
        train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor
            of training data. One of the columns should contain the task
            features (see `task_feature` argument).
        train_Y: A `n x 1` or `b x n x 1` (batch mode) tensor of training
            observations.
        task_feature: The index of the task feature
            (`-d <= task_feature <= d`).
        covar_module: Covariance module for the data (non-task) features.
            If omitted, use a scaled Matern-5/2 kernel.
        task_covar_prior : A Prior on the task covariance matrix. Must
            operate on p.s.d. matrices. A common prior for this is the
            `LKJ` prior.
        output_tasks: A list of task indices for which to compute model
            outputs for. If omitted, return outputs for all task indices.
        rank: The rank to be used for the index kernel. If omitted, use a
            full rank (i.e. number of tasks) kernel.
        input_transform: An input transform that is applied in the model's
            forward pass.
        outcome_transform: An outcome transform applied to `train_Y` during
            instantiation and stored on the model.

    Example:
        >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
        >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
        >>> train_X = torch.cat([
        >>>     torch.cat([X1, i1], -1), torch.cat([X2, i2], -1),
        >>> ])
        >>> train_Y = torch.cat(f1(X1), f2(X2)).unsqueeze(-1)
        >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1)
    """
    with torch.no_grad():
        # Apply the input transform only for validation / task extraction;
        # the untransformed train_X is what gets stored on the model.
        transformed_X = self.transform_inputs(
            X=train_X, input_transform=input_transform)
    self._validate_tensor_args(X=transformed_X, Y=train_Y)
    all_tasks, task_feature, d = self.get_all_tasks(
        transformed_X, task_feature, output_tasks)
    if outcome_transform is not None:
        train_Y, _ = outcome_transform(train_Y)

    # squeeze output dim
    train_Y = train_Y.squeeze(-1)
    if output_tasks is None:
        output_tasks = all_tasks
    else:
        if set(output_tasks) - set(all_tasks):
            raise RuntimeError(
                "All output tasks must be present in input data.")
    self._output_tasks = output_tasks
    self._num_outputs = len(output_tasks)

    # TODO (T41270962): Support task-specific noise levels in likelihood
    likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05))

    # Construct indexer to be used in forward: maps the d data dimensions
    # to their column positions in train_X, skipping the task column.
    self._task_feature = task_feature
    self._base_idxr = torch.arange(d)
    self._base_idxr[task_feature:] += 1  # exclude task feature

    super().__init__(train_inputs=train_X,
                     train_targets=train_Y,
                     likelihood=likelihood)
    self.mean_module = ConstantMean()
    if covar_module is None:
        self.covar_module = ScaleKernel(
            base_kernel=MaternKernel(nu=2.5,
                                     ard_num_dims=d,
                                     lengthscale_prior=GammaPrior(
                                         3.0, 6.0)),
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
    else:
        self.covar_module = covar_module

    num_tasks = len(all_tasks)
    # Fall back to a full-rank index kernel if no rank was requested.
    self._rank = rank if rank is not None else num_tasks

    self.task_covar_module = IndexKernel(num_tasks=num_tasks,
                                         rank=self._rank,
                                         prior=task_covar_prior)
    if input_transform is not None:
        self.input_transform = input_transform
    if outcome_transform is not None:
        self.outcome_transform = outcome_transform
    self.to(train_X)
def _setup_multifidelity_covar_module(
    dim: int,
    aug_batch_shape: torch.Size,
    iteration_fidelity: Optional[int],
    data_fidelity: Optional[int],
    linear_truncated: bool,
    nu: float,
) -> Tuple[ScaleKernel, Dict]:
    """Helper function to get the covariance module and associated
    subset_batch_dict for the multifidelity setting.

    Args:
        dim: The dimensionality of the training data.
        aug_batch_shape: The output-augmented batch shape as defined in
            `BatchedMultiOutputGPyTorchModel`.
        iteration_fidelity: The column index for the training iteration
            fidelity parameter (optional). Negative indices count from the
            last column.
        data_fidelity: The column index for the downsampling fidelity
            parameter (optional). Negative indices count from the last
            column.
        linear_truncated: If True, use a `LinearTruncatedFidelityKernel`
            instead of the default kernel.
        nu: The smoothness parameter for the Matern kernel: either 1/2, 3/2,
            or 5/2. Only used when `linear_truncated=True`.

    Returns:
        The covariance module and subset_batch_dict.
    """
    # Normalize negative fidelity indices to positive column positions.
    if iteration_fidelity is not None and iteration_fidelity < 0:
        iteration_fidelity = dim + iteration_fidelity
    if data_fidelity is not None and data_fidelity < 0:
        data_fidelity = dim + data_fidelity

    if linear_truncated:
        fidelity_dims = [
            i for i in (iteration_fidelity, data_fidelity) if i is not None
        ]
        kernel = LinearTruncatedFidelityKernel(
            fidelity_dims=fidelity_dims,
            dimension=dim,
            nu=nu,
            batch_shape=aug_batch_shape,
            power_prior=GammaPrior(3.0, 3.0),
        )
    else:
        # RBF kernel over the non-fidelity input dimensions only.
        active_dimsX = [
            i for i in range(dim)
            if i not in {iteration_fidelity, data_fidelity}
        ]
        kernel = RBFKernel(
            ard_num_dims=len(active_dimsX),
            batch_shape=aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
            active_dims=active_dimsX,
        )
        additional_kernels = []
        if iteration_fidelity is not None:
            exp_kernel = ExponentialDecayKernel(
                batch_shape=aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                offset_prior=GammaPrior(3.0, 6.0),
                power_prior=GammaPrior(3.0, 6.0),
                active_dims=[iteration_fidelity],
            )
            additional_kernels.append(exp_kernel)
        if data_fidelity is not None:
            ds_kernel = DownsamplingKernel(
                batch_shape=aug_batch_shape,
                offset_prior=GammaPrior(3.0, 6.0),
                power_prior=GammaPrior(3.0, 6.0),
                active_dims=[data_fidelity],
            )
            additional_kernels.append(ds_kernel)
        kernel = ProductKernel(kernel, *additional_kernels)

    covar_module = ScaleKernel(kernel,
                               batch_shape=aug_batch_shape,
                               outputscale_prior=GammaPrior(2.0, 0.15))

    # Map raw parameter names to the batch dimension to subset on; used by
    # `BatchedMultiOutputGPyTorchModel` when subsetting outputs.
    if linear_truncated:
        subset_batch_dict = {
            "covar_module.base_kernel.raw_power": -2,
            "covar_module.base_kernel.covar_module_unbiased.raw_lengthscale": -3,
            "covar_module.base_kernel.covar_module_biased.raw_lengthscale": -3,
        }
    else:
        # NOTE(review): the `kernels.1` / `kernels.2` indices below assume the
        # product is [RBF, exp, ds]. When only `data_fidelity` is given, the
        # product is [RBF, ds], so the `kernels.2` keys added in the
        # `data_fidelity` branch would not exist — verify against callers.
        subset_batch_dict = {
            "covar_module.base_kernel.kernels.0.raw_lengthscale": -3,
            "covar_module.base_kernel.kernels.1.raw_power": -2,
            "covar_module.base_kernel.kernels.1.raw_offset": -2,
        }
        if iteration_fidelity is not None:
            subset_batch_dict = {
                "covar_module.base_kernel.kernels.1.raw_lengthscale": -3,
                **subset_batch_dict,
            }
        if data_fidelity is not None:
            subset_batch_dict = {
                "covar_module.base_kernel.kernels.2.raw_power": -2,
                "covar_module.base_kernel.kernels.2.raw_offset": -2,
                **subset_batch_dict,
            }
    return covar_module, subset_batch_dict
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    iteration_fidelity: Optional[int] = None,
    data_fidelity: Optional[int] = None,
    linear_truncated: bool = True,
    nu: float = 2.5,
    likelihood: Optional[Likelihood] = None,
    outcome_transform: Optional[OutcomeTransform] = None,
) -> None:
    r"""A single-task multi-fidelity GP model with an outcome transform.

    Args:
        train_X: A tensor of training features that includes the fidelity
            column(s) indicated by `iteration_fidelity` / `data_fidelity`.
        train_Y: A tensor of training observations.
        iteration_fidelity: The column index of the training iteration
            fidelity parameter (optional). Negative indices count from the
            last column.
        data_fidelity: The column index of the downsampling fidelity
            parameter (optional). Negative indices count from the last
            column.
        linear_truncated: If True, use a `LinearTruncatedFidelityKernel`
            over all dimensions; otherwise combine an RBF kernel over the
            non-fidelity dimensions with per-fidelity kernels via a product.
        nu: The smoothness parameter for the Matern kernel: either 1/2,
            3/2, or 5/2. Only used when `linear_truncated=True`.
        likelihood: A likelihood. If omitted, the superclass constructs its
            default (inferred-noise) likelihood.
        outcome_transform: An outcome transform that is applied to the
            training data and passed through to the superclass.

    Raises:
        UnsupportedError: If neither fidelity parameter is provided.
    """
    # Recorded for downstream reconstruction of equivalent models.
    self._init_args = {
        "iteration_fidelity": iteration_fidelity,
        "data_fidelity": data_fidelity,
        "linear_truncated": linear_truncated,
        "nu": nu,
        "outcome_transform": outcome_transform,
    }
    if iteration_fidelity is None and data_fidelity is None:
        raise UnsupportedError(
            "SingleTaskMultiFidelityGP requires at least one fidelity parameter."
        )
    # Normalize negative fidelity indices to positive column positions.
    if iteration_fidelity is not None and iteration_fidelity < 0:
        iteration_fidelity = train_X.size(-1) + iteration_fidelity
    if data_fidelity is not None and data_fidelity < 0:
        data_fidelity = train_X.size(-1) + data_fidelity
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    if linear_truncated:
        fidelity_dims = [
            i for i in (iteration_fidelity, data_fidelity) if i is not None
        ]
        kernel = LinearTruncatedFidelityKernel(
            fidelity_dims=fidelity_dims,
            dimension=train_X.size(-1),
            nu=nu,
            batch_shape=self._aug_batch_shape,
            power_prior=GammaPrior(3.0, 3.0),
        )
    else:
        # RBF kernel over the non-fidelity input dimensions only.
        active_dimsX = [
            i for i in range(train_X.size(-1))
            if i not in {iteration_fidelity, data_fidelity}
        ]
        kernel = RBFKernel(
            ard_num_dims=len(active_dimsX),
            batch_shape=self._aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
            active_dims=active_dimsX,
        )
        additional_kernels = []
        if iteration_fidelity is not None:
            exp_kernel = ExponentialDecayKernel(
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                offset_prior=GammaPrior(3.0, 6.0),
                power_prior=GammaPrior(3.0, 6.0),
                active_dims=[iteration_fidelity],
            )
            additional_kernels.append(exp_kernel)
        if data_fidelity is not None:
            ds_kernel = DownsamplingKernel(
                batch_shape=self._aug_batch_shape,
                offset_prior=GammaPrior(3.0, 6.0),
                power_prior=GammaPrior(3.0, 6.0),
                active_dims=[data_fidelity],
            )
            additional_kernels.append(ds_kernel)
        kernel = ProductKernel(kernel, *additional_kernels)
    covar_module = ScaleKernel(
        kernel,
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    # Fix: forward the user-supplied likelihood to the parent constructor.
    # Previously `likelihood` was accepted but silently ignored, so custom
    # likelihoods never took effect. Passing None preserves the parent's
    # default behavior, so this is backward-compatible.
    super().__init__(
        train_X=train_X,
        train_Y=train_Y,
        likelihood=likelihood,
        covar_module=covar_module,
        outcome_transform=outcome_transform,
    )
    # Map raw parameter names to the batch dimension to subset on; used by
    # `BatchedMultiOutputGPyTorchModel` when subsetting outputs.
    if linear_truncated:
        subset_batch_dict = {
            "covar_module.base_kernel.raw_power": -2,
            "covar_module.base_kernel.covar_module_unbiased.raw_lengthscale": -3,
            "covar_module.base_kernel.covar_module_biased.raw_lengthscale": -3,
        }
    else:
        subset_batch_dict = {
            "covar_module.base_kernel.kernels.0.raw_lengthscale": -3,
            "covar_module.base_kernel.kernels.1.raw_power": -2,
            "covar_module.base_kernel.kernels.1.raw_offset": -2,
        }
        if iteration_fidelity is not None:
            subset_batch_dict = {
                "covar_module.base_kernel.kernels.1.raw_lengthscale": -3,
                **subset_batch_dict,
            }
        if data_fidelity is not None:
            subset_batch_dict = {
                "covar_module.base_kernel.kernels.2.raw_power": -2,
                "covar_module.base_kernel.kernels.2.raw_offset": -2,
                **subset_batch_dict,
            }
    # NOTE: the likelihood entry assumes the default inferred-noise
    # likelihood; with a custom likelihood the key may simply be absent
    # (matching the original behavior).
    self._subset_batch_dict = {
        "likelihood.noise_covar.raw_noise": -2,
        "mean_module.constant": -2,
        "covar_module.raw_outputscale": -1,
        **subset_batch_dict,
    }
    self.to(train_X)