def test_compute_linear_truncated_kernel_no_batch(self):
    x1 = torch.tensor([[1, 0.1, 0.2], [2, 0.3, 0.4]])
    x2 = torch.tensor([[3, 0.5, 0.6], [4, 0.7, 0.8]])
    t_1 = torch.tensor([[0.3584, 0.1856], [0.2976, 0.1584]])
    for nu, fidelity_dims in itertools.product({0.5, 1.5, 2.5}, ([2], [1, 2])):
        kernel = LinearTruncatedFidelityKernel(
            fidelity_dims=fidelity_dims, dimension=3, nu=nu
        )
        kernel.power = 1
        n_fid = len(fidelity_dims)
        if n_fid > 1:
            active_dimsM = [0]
            t_2 = torch.tensor([[0.4725, 0.2889], [0.4025, 0.2541]])
            t_3 = torch.tensor([[0.1685, 0.0531], [0.1168, 0.0386]])
            t = 1 + t_1 + t_2 + t_3
        else:
            active_dimsM = [0, 1]
            t = 1 + t_1
        matern_ker = MaternKernel(nu=nu, active_dims=active_dimsM)
        matern_term = matern_ker(x1, x2).evaluate()
        actual = t * matern_term
        res = kernel(x1, x2).evaluate()
        self.assertLess(torch.norm(res - actual), 1e-4)
        # test diagonal mode
        res_diag = kernel(x1, x2, diag=True)
        self.assertLess(torch.norm(res_diag - actual.diag()), 1e-4)
    # make sure that we error out if last_dim_is_batch=True
    with self.assertRaises(NotImplementedError):
        kernel(x1, x2, diag=True, last_dim_is_batch=True)
def test_compute_linear_truncated_kernel_no_batch(self):
    x1 = torch.tensor([1, 0.1, 0.2, 2, 0.3, 0.4], dtype=torch.float).view(2, 3)
    x2 = torch.tensor([3, 0.5, 0.6, 4, 0.7, 0.8], dtype=torch.float).view(2, 3)
    t_1 = torch.tensor(
        [0.3584, 0.1856, 0.2976, 0.1584], dtype=torch.float
    ).view(2, 2)
    for nu in {0.5, 1.5, 2.5}:
        for fidelity_dims in ([2], [1, 2]):
            kernel = LinearTruncatedFidelityKernel(
                fidelity_dims=fidelity_dims, dimension=3, nu=nu
            )
            kernel.power = 1
            if len(fidelity_dims) > 1:
                active_dimsM = [0]
                t_2 = torch.tensor(
                    [0.4725, 0.2889, 0.4025, 0.2541], dtype=torch.float
                ).view(2, 2)
                t_3 = torch.tensor(
                    [0.1685, 0.0531, 0.1168, 0.0386], dtype=torch.float
                ).view(2, 2)
                t = 1 + t_1 + t_2 + t_3
            else:
                active_dimsM = [0, 1]
                t = 1 + t_1
            matern_ker = MaternKernel(nu=nu, active_dims=active_dimsM)
            matern_term = matern_ker(x1, x2).evaluate()
            actual = t * matern_term
            res = kernel(x1, x2).evaluate()
            self.assertLess(torch.norm(res - actual), 1e-4)
            # test diagonal mode
            res_diag = kernel(x1, x2, diag=True)
            self.assertLess(torch.norm(res_diag - actual.diag()), 1e-4)
    # make sure that we error out if last_dim_is_batch=True
    with self.assertRaises(NotImplementedError):
        kernel(x1, x2, diag=True, last_dim_is_batch=True)
def test_compute_linear_truncated_kernel_no_batch(self):
    x1 = torch.tensor([1, 0.1, 0.2, 2, 0.3, 0.4], dtype=torch.float).view(2, 3)
    x2 = torch.tensor([3, 0.5, 0.6, 4, 0.7, 0.8], dtype=torch.float).view(2, 3)
    t_1 = torch.tensor(
        [0.3584, 0.1856, 0.2976, 0.1584], dtype=torch.float
    ).view(2, 2)
    for nu in {0.5, 1.5, 2.5}:
        for train_data_fidelity in {False, True}:
            kernel = LinearTruncatedFidelityKernel(
                nu=nu, dimension=3, train_data_fidelity=train_data_fidelity
            )
            kernel.power = 1
            if train_data_fidelity:
                active_dimsM = [0]
                t_2 = torch.tensor(
                    [0.4725, 0.2889, 0.4025, 0.2541], dtype=torch.float
                ).view(2, 2)
                t_3 = torch.tensor(
                    [0.1685, 0.0531, 0.1168, 0.0386], dtype=torch.float
                ).view(2, 2)
                t = 1 + t_1 + t_2 + t_3
            else:
                active_dimsM = [0, 1]
                t = 1 + t_1
            matern_ker = MaternKernel(nu=nu, active_dims=active_dimsM)
            matern_term = matern_ker(x1, x2).evaluate()
            actual = t * matern_term
            res = kernel(x1, x2).evaluate()
            self.assertLess(torch.norm(res - actual), 1e-4)
def __init__(
    self, decomposition: Dict[str, List[int]], batch_shape: torch.Size
) -> None:
    super().__init__(batch_shape=batch_shape)
    self.decomposition = decomposition
    num_param = len(next(iter(decomposition.values())))
    for active_parameters in decomposition.values():
        # check that the number of parameters is the same in each decomposition
        if len(active_parameters) != num_param:
            raise ValueError(
                "num of parameters needs to be same across all contexts"
            )
    self._indexers = {
        context: torch.tensor(active_params)
        for context, active_params in self.decomposition.items()
    }
    self.base_kernel = MaternKernel(
        nu=2.5,
        ard_num_dims=num_param,
        batch_shape=batch_shape,
        lengthscale_prior=GammaPrior(3.0, 6.0),
    )
    self.kernel_dict = {}  # scaled kernel for each parameter space partition
    for context in list(decomposition.keys()):
        self.kernel_dict[context] = ScaleKernel(
            base_kernel=self.base_kernel, outputscale_prior=GammaPrior(2.0, 15.0)
        )
    self.kernel_dict = ModuleDict(self.kernel_dict)
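# Hedged illustration (not part of the original source): the decomposition above maps
# each context name to the indices of its active parameters, and every context must
# have the same number of active parameters, which sets `ard_num_dims` of the shared
# Matern base kernel. The context names below are hypothetical.
import torch
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.priors import GammaPrior

decomposition = {"context_a": [0, 1], "context_b": [2, 3]}
num_param = len(next(iter(decomposition.values())))  # 2 active parameters per context
base_kernel = MaternKernel(
    nu=2.5, ard_num_dims=num_param, lengthscale_prior=GammaPrior(3.0, 6.0)
)
kernel_dict = {
    context: ScaleKernel(
        base_kernel=base_kernel, outputscale_prior=GammaPrior(2.0, 15.0)
    )
    for context in decomposition
}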
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    train_Yvar: Tensor,
    outcome_transform: Optional[OutcomeTransform] = None,
) -> None:
    r"""A single-task exact GP model using fixed noise levels.

    Args:
        train_X: A `batch_shape x n x d` tensor of training features.
        train_Y: A `batch_shape x n x m` tensor of training observations.
        train_Yvar: A `batch_shape x n x m` tensor of observed measurement
            noise.
        outcome_transform: An outcome transform that is applied to the
            training data during instantiation and to the posterior during
            inference (that is, the `Posterior` obtained by calling
            `.posterior` on the model will be on the original scale).

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
        >>> train_Yvar = torch.full_like(train_Y, 0.2)
        >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar)
    """
    if outcome_transform is not None:
        train_Y, train_Yvar = outcome_transform(train_Y, train_Yvar)
    validate_input_scaling(
        train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar
    )
    self._validate_tensor_args(X=train_X, Y=train_Y, Yvar=train_Yvar)
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    train_X, train_Y, train_Yvar = self._transform_tensor_args(
        X=train_X, Y=train_Y, Yvar=train_Yvar
    )
    likelihood = FixedNoiseGaussianLikelihood(
        noise=train_Yvar, batch_shape=self._aug_batch_shape
    )
    ExactGP.__init__(
        self, train_inputs=train_X, train_targets=train_Y, likelihood=likelihood
    )
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    self.covar_module = ScaleKernel(
        base_kernel=MaternKernel(
            nu=2.5,
            ard_num_dims=train_X.shape[-1],
            batch_shape=self._aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        ),
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    if outcome_transform is not None:
        self.outcome_transform = outcome_transform
    self._subset_batch_dict = {
        "mean_module.constant": -2,
        "covar_module.raw_outputscale": -1,
        "covar_module.base_kernel.raw_lengthscale": -3,
    }
    self.to(train_X)
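# Hedged usage sketch (not part of the original source): fitting the FixedNoiseGP
# defined above with the standard BoTorch workflow and querying its posterior.
import torch
from botorch.fit import fit_gpytorch_model
from botorch.models import FixedNoiseGP
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(20, 2)
train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
train_Yvar = torch.full_like(train_Y, 0.2)  # known observation noise variances
model = FixedNoiseGP(train_X, train_Y, train_Yvar)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_model(mll)  # maximize the marginal log likelihood
posterior = model.posterior(torch.rand(5, 2))  # predictive posterior at new points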
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    likelihood: Optional[Likelihood] = None,
    covar_module: Optional[Module] = None,
) -> None:
    r"""A single-task exact GP model.

    Args:
        train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of
            training features.
        train_Y: A `n x m` or `batch_shape x n x m` (batch mode) tensor of
            training observations.
        likelihood: A likelihood. If omitted, use a standard
            GaussianLikelihood with inferred noise level.
        covar_module: The covariance (kernel) matrix. If omitted, use the
            MaternKernel.

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
        >>> model = SingleTaskGP(train_X, train_Y)
    """
    validate_input_scaling(train_X=train_X, train_Y=train_Y)
    self._validate_tensor_args(X=train_X, Y=train_Y)
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y)
    if likelihood is None:
        noise_prior = GammaPrior(1.1, 0.05)
        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
        likelihood = GaussianLikelihood(
            noise_prior=noise_prior,
            batch_shape=self._aug_batch_shape,
            noise_constraint=GreaterThan(
                MIN_INFERRED_NOISE_LEVEL,
                transform=None,
                initial_value=noise_prior_mode,
            ),
        )
    else:
        self._is_custom_likelihood = True
    ExactGP.__init__(self, train_X, train_Y, likelihood)
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    if covar_module is None:
        self.covar_module = ScaleKernel(
            MaternKernel(
                nu=2.5,
                ard_num_dims=train_X.shape[-1],
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
    else:
        self.covar_module = covar_module
    self.to(train_X)
def cont_kernel_factory(
    batch_shape: torch.Size, ard_num_dims: int, active_dims: List[int]
) -> MaternKernel:
    return MaternKernel(
        nu=2.5,
        batch_shape=batch_shape,
        ard_num_dims=ard_num_dims,
        active_dims=active_dims,
    )
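# Hedged usage sketch (not part of the original source): a factory like this is
# typically passed to a mixed-input model so it can build the kernel over the
# continuous dimensions; here we simply call it and evaluate the resulting kernel.
import torch

kernel = cont_kernel_factory(
    batch_shape=torch.Size([]), ard_num_dims=2, active_dims=[0, 1]
)
X = torch.rand(5, 3)  # the kernel only uses dimensions 0 and 1
K = kernel(X, X).evaluate()  # 5 x 5 covariance matrix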
def test_compute_linear_truncated_kernel_with_batch(self):
    x1 = torch.tensor(
        [1, 0.1, 0.2, 3, 0.3, 0.4, 5, 0.5, 0.6, 7, 0.7, 0.8], dtype=torch.float
    ).view(2, 2, 3)
    x2 = torch.tensor(
        [2, 0.8, 0.7, 4, 0.6, 0.5, 6, 0.4, 0.3, 8, 0.2, 0.1], dtype=torch.float
    ).view(2, 2, 3)
    t_1 = torch.tensor(
        [0.2736, 0.44, 0.2304, 0.36, 0.3304, 0.3816, 0.1736, 0.1944],
        dtype=torch.float,
    ).view(2, 2, 2)
    batch_shape = torch.Size([2])
    for nu in {0.5, 1.5, 2.5}:
        for fidelity_dims in ([2], [1, 2]):
            kernel = LinearTruncatedFidelityKernel(
                fidelity_dims=fidelity_dims,
                dimension=3,
                nu=nu,
                batch_shape=batch_shape,
            )
            kernel.power = 1
            if len(fidelity_dims) > 1:
                active_dimsM = [0]
                t_2 = torch.tensor(
                    [0.0527, 0.167, 0.0383, 0.1159, 0.1159, 0.167, 0.0383, 0.0527],
                    dtype=torch.float,
                ).view(2, 2, 2)
                t_3 = torch.tensor(
                    [0.1944, 0.3816, 0.1736, 0.3304, 0.36, 0.44, 0.2304, 0.2736],
                    dtype=torch.float,
                ).view(2, 2, 2)
                t = 1 + t_1 + t_2 + t_3
            else:
                active_dimsM = [0, 1]
                t = 1 + t_1
            matern_ker = MaternKernel(
                nu=nu, active_dims=active_dimsM, batch_shape=batch_shape
            )
            matern_term = matern_ker(x1, x2).evaluate()
            actual = t * matern_term
            res = kernel(x1, x2).evaluate()
            self.assertLess(torch.norm(res - actual), 1e-4)
            # test diagonal mode
            res_diag = kernel(x1, x2, diag=True)
            self.assertLess(
                torch.norm(res_diag - torch.diagonal(actual, dim1=-1, dim2=-2)),
                1e-4,
            )
    # make sure that we error out if last_dim_is_batch=True
    with self.assertRaises(NotImplementedError):
        kernel(x1, x2, diag=True, last_dim_is_batch=True)
def test_sample_all_priors(self, cuda=False):
    device = torch.device("cuda" if cuda else "cpu")
    for dtype in (torch.float, torch.double):
        train_X = torch.rand(3, 5, device=device, dtype=dtype)
        train_Y = torch.rand(3, 1, device=device, dtype=dtype)
        model = SingleTaskGP(train_X=train_X, train_Y=train_Y)
        mll = ExactMarginalLogLikelihood(model.likelihood, model)
        mll.to(device=device, dtype=dtype)
        original_state_dict = dict(deepcopy(mll.model.state_dict()))
        sample_all_priors(model)
        # make sure one of the hyperparameters changed
        self.assertTrue(
            dict(model.state_dict())["likelihood.noise_covar.raw_noise"]
            != original_state_dict["likelihood.noise_covar.raw_noise"]
        )
        # check that lengthscales are all different
        ls = model.covar_module.base_kernel.raw_lengthscale.view(-1).tolist()
        self.assertTrue(all(ls[0] != ls[i] for i in range(1, len(ls))))
        # change one of the priors to SmoothedBoxPrior
        model.covar_module = ScaleKernel(
            MaternKernel(
                nu=2.5,
                ard_num_dims=model.train_inputs[0].shape[-1],
                batch_shape=model._aug_batch_shape,
                lengthscale_prior=SmoothedBoxPrior(3.0, 6.0),
            ),
            batch_shape=model._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        original_state_dict = dict(deepcopy(mll.model.state_dict()))
        with warnings.catch_warnings(record=True) as ws, settings.debug(True):
            sample_all_priors(model)
            self.assertEqual(len(ws), 1)
            self.assertTrue("rsample" in str(ws[0].message))
        # the lengthscale should not have changed because sampling is
        # not implemented for SmoothedBoxPrior
        self.assertTrue(
            torch.equal(
                dict(model.state_dict())[
                    "covar_module.base_kernel.raw_lengthscale"
                ],
                original_state_dict["covar_module.base_kernel.raw_lengthscale"],
            )
        )
        # set setting_closure to None and make sure RuntimeError is raised
        prior_tuple = model.likelihood.noise_covar._priors["noise_prior"]
        model.likelihood.noise_covar._priors["noise_prior"] = (
            prior_tuple[0],
            prior_tuple[1],
            None,
        )
        with self.assertRaises(RuntimeError):
            sample_all_priors(model)
def cont_kernel_factory(
    batch_shape: torch.Size,
    ard_num_dims: int,
    active_dims: List[int],
) -> MaternKernel:
    return MaternKernel(
        nu=2.5,
        batch_shape=batch_shape,
        ard_num_dims=ard_num_dims,
        active_dims=active_dims,
        lengthscale_constraint=GreaterThan(1e-04),
    )
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    likelihood: Optional[Likelihood] = None,
) -> None:
    r"""A single-task exact GP model.

    Args:
        train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of
            training features.
        train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of
            training observations.
        likelihood: A likelihood. If omitted, use a standard
            GaussianLikelihood with inferred noise level.

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1])
        >>> model = SingleTaskGP(train_X, train_Y)
    """
    ard_num_dims = train_X.shape[-1]
    train_X, train_Y, _ = self._set_dimensions(train_X=train_X, train_Y=train_Y)
    train_X, train_Y, _ = multioutput_to_batch_mode_transform(
        train_X=train_X, train_Y=train_Y, num_outputs=self._num_outputs
    )
    if likelihood is None:
        noise_prior = GammaPrior(1.1, 0.05)
        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
        likelihood = GaussianLikelihood(
            noise_prior=noise_prior,
            batch_shape=self._aug_batch_shape,
            noise_constraint=GreaterThan(
                MIN_INFERRED_NOISE_LEVEL,
                transform=None,
                initial_value=noise_prior_mode,
            ),
        )
    else:
        self._likelihood_state_dict = deepcopy(likelihood.state_dict())
    ExactGP.__init__(self, train_X, train_Y, likelihood)
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    self.covar_module = ScaleKernel(
        MaternKernel(
            nu=2.5,
            ard_num_dims=ard_num_dims,
            batch_shape=self._aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        ),
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    self.to(train_X)
def test_compute_linear_truncated_kernel_with_batch(self):
    x1 = torch.tensor(
        [[[1.0, 0.1, 0.2], [3.0, 0.3, 0.4]], [[5.0, 0.5, 0.6], [7.0, 0.7, 0.8]]]
    )
    x2 = torch.tensor(
        [[[2.0, 0.8, 0.7], [4.0, 0.6, 0.5]], [[6.0, 0.4, 0.3], [8.0, 0.2, 0.1]]]
    )
    t_1 = torch.tensor(
        [[[0.2736, 0.4400], [0.2304, 0.3600]], [[0.3304, 0.3816], [0.1736, 0.1944]]]
    )
    batch_shape = torch.Size([2])
    for nu, fidelity_dims in itertools.product({0.5, 1.5, 2.5}, ([2], [1, 2])):
        kernel = LinearTruncatedFidelityKernel(
            fidelity_dims=fidelity_dims, dimension=3, nu=nu, batch_shape=batch_shape
        )
        kernel.power = 1
        if len(fidelity_dims) > 1:
            active_dimsM = [0]
            t_2 = torch.tensor(
                [
                    [[0.0527, 0.1670], [0.0383, 0.1159]],
                    [[0.1159, 0.1670], [0.0383, 0.0527]],
                ]
            )
            t_3 = torch.tensor(
                [
                    [[0.1944, 0.3816], [0.1736, 0.3304]],
                    [[0.3600, 0.4400], [0.2304, 0.2736]],
                ]
            )
            t = 1 + t_1 + t_2 + t_3
        else:
            active_dimsM = [0, 1]
            t = 1 + t_1
        matern_ker = MaternKernel(
            nu=nu, active_dims=active_dimsM, batch_shape=batch_shape
        )
        matern_term = matern_ker(x1, x2).evaluate()
        actual = t * matern_term
        res = kernel(x1, x2).evaluate()
        self.assertLess(torch.norm(res - actual), 1e-4)
        # test diagonal mode
        res_diag = kernel(x1, x2, diag=True)
        self.assertLess(
            torch.norm(res_diag - torch.diagonal(actual, dim1=-1, dim2=-2)), 1e-4
        )
    # make sure that we error out if last_dim_is_batch=True
    with self.assertRaises(NotImplementedError):
        kernel(x1, x2, diag=True, last_dim_is_batch=True)
def test_compute_linear_truncated_kernel_with_batch(self):
    x1 = torch.tensor(
        [1, 0.1, 0.2, 3, 0.3, 0.4, 5, 0.5, 0.6, 7, 0.7, 0.8], dtype=torch.float
    ).view(2, 2, 3)
    x2 = torch.tensor(
        [2, 0.8, 0.7, 4, 0.6, 0.5, 6, 0.4, 0.3, 8, 0.2, 0.1], dtype=torch.float
    ).view(2, 2, 3)
    t_1 = torch.tensor(
        [0.2736, 0.44, 0.2304, 0.36, 0.3304, 0.3816, 0.1736, 0.1944],
        dtype=torch.float,
    ).view(2, 2, 2)
    batch_shape = torch.Size([2])
    dimension = 3
    for nu in {0.5, 1.5, 2.5}:
        for train_data_fidelity in {False, True}:
            kernel = LinearTruncatedFidelityKernel(
                nu=nu,
                dimension=dimension,
                train_data_fidelity=train_data_fidelity,
                batch_shape=batch_shape,
            )
            kernel.power = 1
            kernel.train_data_fidelity = train_data_fidelity
            if train_data_fidelity:
                active_dimsM = [0]
                t_2 = torch.tensor(
                    [0.0527, 0.167, 0.0383, 0.1159, 0.1159, 0.167, 0.0383, 0.0527],
                    dtype=torch.float,
                ).view(2, 2, 2)
                t_3 = torch.tensor(
                    [0.1944, 0.3816, 0.1736, 0.3304, 0.36, 0.44, 0.2304, 0.2736],
                    dtype=torch.float,
                ).view(2, 2, 2)
                t = 1 + t_1 + t_2 + t_3
            else:
                active_dimsM = [0, 1]
                t = 1 + t_1
            matern_ker = MaternKernel(
                nu=nu, active_dims=active_dimsM, batch_shape=batch_shape
            )
            matern_term = matern_ker(x1, x2).evaluate()
            actual = t * matern_term
            res = kernel(x1, x2).evaluate()
            self.assertLess(torch.norm(res - actual), 1e-4)
def __init__(self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor) -> None:
    r"""A single-task exact GP model using fixed noise levels.

    Args:
        train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of
            training features.
        train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of
            training observations.
        train_Yvar: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor
            of observed measurement noise.

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1])
        >>> train_Yvar = torch.full_like(train_Y, 0.2)
        >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar)
    """
    ard_num_dims = train_X.shape[-1]
    train_X, train_Y, train_Yvar = self._set_dimensions(
        train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar
    )
    train_X, train_Y, train_Yvar = multioutput_to_batch_mode_transform(
        train_X=train_X,
        train_Y=train_Y,
        num_outputs=self._num_outputs,
        train_Yvar=train_Yvar,
    )
    likelihood = FixedNoiseGaussianLikelihood(
        noise=train_Yvar, batch_shape=self._aug_batch_shape
    )
    ExactGP.__init__(
        self, train_inputs=train_X, train_targets=train_Y, likelihood=likelihood
    )
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    self.covar_module = ScaleKernel(
        base_kernel=MaternKernel(
            nu=2.5,
            ard_num_dims=ard_num_dims,
            batch_shape=self._aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        ),
        batch_shape=self._aug_batch_shape,
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    self.to(train_X)
def __init__(
    self,
    decomposition: Dict[str, List[int]],
    batch_shape: torch.Size,
    train_embedding: bool = True,
    cat_feature_dict: Optional[Dict] = None,
    embs_feature_dict: Optional[Dict] = None,
    embs_dim_list: Optional[List[int]] = None,
    context_weight_dict: Optional[Dict] = None,
    device: Optional[torch.device] = None,
) -> None:
    super().__init__(batch_shape=batch_shape)
    self.decomposition = decomposition
    self.batch_shape = batch_shape
    self.train_embedding = train_embedding
    self.device = device
    num_param = len(next(iter(decomposition.values())))
    self.context_list = list(decomposition.keys())
    self.num_contexts = len(self.context_list)
    # get parameter space decomposition
    for active_parameters in decomposition.values():
        # check that the number of parameters is the same in each decomposition
        if len(active_parameters) != num_param:
            raise ValueError(
                "num of parameters needs to be same across all contexts"
            )
    self._indexers = {
        context: torch.tensor(active_params, device=self.device)
        for context, active_params in self.decomposition.items()
    }
    # get context features and set emb dim
    self.context_cat_feature = None
    self.context_emb_feature = None
    self.n_embs = 0
    self.emb_weight_matrix_list = None
    self.emb_dims = None
    self._set_context_features(
        cat_feature_dict=cat_feature_dict,
        embs_feature_dict=embs_feature_dict,
        embs_dim_list=embs_dim_list,
    )
    # construct embedding layer
    if train_embedding:
        self._set_emb_layers()
    # task covariance matrix
    self.task_covar_module = MaternKernel(
        nu=2.5,
        ard_num_dims=self.n_embs,
        batch_shape=batch_shape,
        lengthscale_prior=GammaPrior(3.0, 6.0),
    )
    # base kernel
    self.base_kernel = MaternKernel(
        nu=2.5,
        ard_num_dims=num_param,
        batch_shape=batch_shape,
        lengthscale_prior=GammaPrior(3.0, 6.0),
    )
    # outputscales for each context (note this is like sqrt of outputscale)
    self.context_weight = None
    if context_weight_dict is None:
        outputscale_list = torch.zeros(
            *batch_shape, self.num_contexts, device=self.device
        )
    else:
        outputscale_list = torch.zeros(*batch_shape, 1, device=self.device)
        self.context_weight = torch.tensor(
            [context_weight_dict[c] for c in self.context_list], device=self.device
        )
    self.register_parameter(
        name="raw_outputscale_list",
        parameter=torch.nn.Parameter(outputscale_list),
    )
    self.register_prior(
        "outputscale_list_prior",
        GammaPrior(2.0, 15.0),
        lambda m: m.outputscale_list,
        lambda m, v: m._set_outputscale_list(v),
    )
    self.register_constraint("raw_outputscale_list", Positive())
def __init__(  # noqa C901
    self,
    fidelity_dims: List[int],
    dimension: Optional[int] = None,
    power_prior: Optional[Prior] = None,
    power_constraint: Optional[Interval] = None,
    nu: float = 2.5,
    lengthscale_prior_unbiased: Optional[Prior] = None,
    lengthscale_prior_biased: Optional[Prior] = None,
    lengthscale_constraint_unbiased: Optional[Interval] = None,
    lengthscale_constraint_biased: Optional[Interval] = None,
    covar_module_unbiased: Optional[Kernel] = None,
    covar_module_biased: Optional[Kernel] = None,
    **kwargs: Any,
) -> None:
    if dimension is None and kwargs.get("active_dims") is None:
        raise UnsupportedError(
            "Must specify dimension when not specifying active_dims."
        )
    n_fidelity = len(fidelity_dims)
    if len(set(fidelity_dims)) != n_fidelity:
        raise ValueError("fidelity_dims must not have repeated elements")
    if n_fidelity not in {1, 2}:
        raise UnsupportedError(
            "LinearTruncatedFidelityKernel accepts either one or two "
            "fidelity parameters."
        )
    if nu not in {0.5, 1.5, 2.5}:
        raise ValueError("nu must be one of 0.5, 1.5, or 2.5")
    super().__init__(**kwargs)
    self.fidelity_dims = fidelity_dims
    if power_constraint is None:
        power_constraint = Positive()
    if lengthscale_prior_unbiased is None:
        lengthscale_prior_unbiased = GammaPrior(3, 6)
    if lengthscale_prior_biased is None:
        lengthscale_prior_biased = GammaPrior(6, 2)
    if lengthscale_constraint_unbiased is None:
        lengthscale_constraint_unbiased = Positive()
    if lengthscale_constraint_biased is None:
        lengthscale_constraint_biased = Positive()
    self.register_parameter(
        name="raw_power",
        parameter=torch.nn.Parameter(torch.zeros(*self.batch_shape, 1)),
    )
    self.register_constraint("raw_power", power_constraint)
    if power_prior is not None:
        self.register_prior(
            "power_prior",
            power_prior,
            lambda: self.power,
            lambda v: self._set_power(v),
        )
    if self.active_dims is not None:
        dimension = len(self.active_dims)
    if covar_module_unbiased is None:
        covar_module_unbiased = MaternKernel(
            nu=nu,
            batch_shape=self.batch_shape,
            lengthscale_prior=lengthscale_prior_unbiased,
            ard_num_dims=dimension - n_fidelity,
            lengthscale_constraint=lengthscale_constraint_unbiased,
        )
    if covar_module_biased is None:
        covar_module_biased = MaternKernel(
            nu=nu,
            batch_shape=self.batch_shape,
            lengthscale_prior=lengthscale_prior_biased,
            ard_num_dims=dimension - n_fidelity,
            lengthscale_constraint=lengthscale_constraint_biased,
        )
    self.covar_module_unbiased = covar_module_unbiased
    self.covar_module_biased = covar_module_biased
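# Hedged usage sketch (not part of the original source): constructing the kernel over
# 3-dimensional inputs whose last column is a fidelity parameter in [0, 1] and
# evaluating it on a small set of points. The import path follows BoTorch's layout.
import torch
from botorch.models.kernels.linear_truncated_fidelity import (
    LinearTruncatedFidelityKernel,
)

kernel = LinearTruncatedFidelityKernel(fidelity_dims=[2], dimension=3, nu=2.5)
X = torch.rand(4, 3)  # columns 0-1 are design parameters, column 2 is the fidelity
K = kernel(X, X).evaluate()  # 4 x 4 covariance matrix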
def sample_arch(self, START_BO, g, hyperparams, og_flops, empty_val_loss, full_val_loss, target_flops=0):
    if g < START_BO:
        if target_flops == 0:
            f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
        else:
            f = args.lower_channel
        parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    elif g == START_BO:
        if target_flops == 0:
            parameterization = np.ones(hyperparams.get_dim())
        else:
            f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    else:
        rand = torch.rand(1).cuda()

        train_X = torch.FloatTensor(self.X).cuda()
        train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
        train_Y_loss = standardize(train_Y_loss)

        train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
        train_Y_cost = standardize(train_Y_cost)

        covar_module = None
        if args.ski and g > 128:
            if args.additive:
                covar_module = AdditiveStructureKernel(
                    ScaleKernel(
                        GridInterpolationKernel(
                            MaternKernel(
                                nu=2.5,
                                lengthscale_prior=GammaPrior(3.0, 6.0),
                            ),
                            grid_size=128, num_dims=1, grid_bounds=[(0, 1)]
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    ),
                    num_dims=train_X.shape[1]
                )
            else:
                covar_module = ScaleKernel(
                    GridInterpolationKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                        ),
                        grid_size=128, num_dims=train_X.shape[1],
                        grid_bounds=[(0, 1) for _ in range(train_X.shape[1])]
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                )
        else:
            if args.additive:
                covar_module = AdditiveStructureKernel(
                    ScaleKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                            num_dims=1
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    ),
                    num_dims=train_X.shape[1]
                )
            else:
                covar_module = ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        num_dims=train_X.shape[1]
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                )

        new_train_X = train_X
        gp_loss = SingleTaskGP(new_train_X, train_Y_loss, covar_module=covar_module)
        mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Use add-gp for cost
        covar_module = AdditiveStructureKernel(
            ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                    num_dims=1
                ),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
            num_dims=train_X.shape[1]
        )
        gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
        mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        UCB_loss = UpperConfidenceBound(gp_loss, beta=args.beta).cuda()
        UCB_cost = UpperConfidenceBound(gp_cost, beta=args.beta).cuda()
        self.mobo_obj = RandAcquisition(UCB_loss).cuda()
        self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

        lower = torch.ones(new_train_X.shape[1])*args.lower_channel
        upper = torch.ones(new_train_X.shape[1])*args.upper_channel
        self.mobo_bounds = torch.stack([lower, upper]).cuda()

        if args.pas:
            val = np.linspace(args.lower_flops, 1, 50)
            chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))

            lower_bnd, upper_bnd = 0, 1
            lmda = 0.5
            for i in range(10):
                self.mobo_obj.rand = lmda

                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                )

                parameterization = parameterization[0].cpu().numpy()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget, self.use_mem)
                ratio = sim_flops/og_flops

                if np.abs(ratio - chosen_target_flops) <= 0.02:
                    break

                if args.baseline > 0:
                    if ratio < chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                    elif ratio > chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                else:
                    if ratio < chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                    elif ratio > chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
            writer.add_scalar('Binary search trials', i, g)
        else:
            parameterization, acq_value = optimize_acqf(
                self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
            )
            parameterization = parameterization[0].cpu().numpy()
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)

    return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    likelihood: Optional[Likelihood] = None,
    covar_module: Optional[Module] = None,
    outcome_transform: Optional[OutcomeTransform] = None,
) -> None:
    r"""A single-task exact GP model.

    Args:
        train_X: A `batch_shape x n x d` tensor of training features.
        train_Y: A `batch_shape x n x m` tensor of training observations.
        likelihood: A likelihood. If omitted, use a standard
            GaussianLikelihood with inferred noise level.
        covar_module: The module computing the covariance (Kernel) matrix.
            If omitted, use a `MaternKernel`.
        outcome_transform: An outcome transform that is applied to the
            training data during instantiation and to the posterior during
            inference (that is, the `Posterior` obtained by calling
            `.posterior` on the model will be on the original scale).

    Example:
        >>> train_X = torch.rand(20, 2)
        >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
        >>> model = SingleTaskGP(train_X, train_Y)
    """
    if outcome_transform is not None:
        train_Y, _ = outcome_transform(train_Y)
    validate_input_scaling(train_X=train_X, train_Y=train_Y)
    self._validate_tensor_args(X=train_X, Y=train_Y)
    self._set_dimensions(train_X=train_X, train_Y=train_Y)
    train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y)
    if likelihood is None:
        noise_prior = GammaPrior(1.1, 0.05)
        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
        likelihood = GaussianLikelihood(
            noise_prior=noise_prior,
            batch_shape=self._aug_batch_shape,
            noise_constraint=GreaterThan(
                MIN_INFERRED_NOISE_LEVEL,
                transform=None,
                initial_value=noise_prior_mode,
            ),
        )
    else:
        self._is_custom_likelihood = True
    ExactGP.__init__(self, train_X, train_Y, likelihood)
    self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
    if covar_module is None:
        self.covar_module = ScaleKernel(
            MaternKernel(
                nu=2.5,
                ard_num_dims=train_X.shape[-1],
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        self._subset_batch_dict = {
            "likelihood.noise_covar.raw_noise": -2,
            "mean_module.constant": -2,
            "covar_module.raw_outputscale": -1,
            "covar_module.base_kernel.raw_lengthscale": -3,
        }
    else:
        self.covar_module = covar_module
        # TODO: Allow subsetting of other covar modules
    if outcome_transform is not None:
        self.outcome_transform = outcome_transform
    self.to(train_X)
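# Hedged usage sketch (not part of the original source): constructing the model with a
# `Standardize` outcome transform so that training targets are standardized internally
# while posteriors are returned on the original scale, then fitting it.
import torch
from botorch.fit import fit_gpytorch_model
from botorch.models import SingleTaskGP
from botorch.models.transforms.outcome import Standardize
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(20, 2)
train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
model = SingleTaskGP(train_X, train_Y, outcome_transform=Standardize(m=1))
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_model(mll)
posterior = model.posterior(torch.rand(5, 2))  # predictions on the original scale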
def __init__(
    self,
    dimension: int = 3,
    nu: float = 2.5,
    train_iteration_fidelity: bool = True,
    train_data_fidelity: bool = True,
    lengthscale_prior: Optional[Prior] = None,
    power_prior: Optional[Prior] = None,
    power_constraint: Optional[Interval] = None,
    lengthscale_2_prior: Optional[Prior] = None,
    lengthscale_2_constraint: Optional[Interval] = None,
    lengthscale_constraint: Optional[Interval] = None,
    covar_module_1: Optional[Kernel] = None,
    covar_module_2: Optional[Kernel] = None,
    **kwargs: Any,
):
    if not train_iteration_fidelity and not train_data_fidelity:
        raise UnsupportedError("You should have at least one fidelity parameter.")
    if nu not in {0.5, 1.5, 2.5}:
        raise ValueError("nu expected to be 0.5, 1.5, or 2.5")
    super().__init__(**kwargs)
    self.train_iteration_fidelity = train_iteration_fidelity
    self.train_data_fidelity = train_data_fidelity
    if power_constraint is None:
        power_constraint = Positive()
    if lengthscale_prior is None:
        lengthscale_prior = GammaPrior(3, 6)
    if lengthscale_2_prior is None:
        lengthscale_2_prior = GammaPrior(6, 2)
    if lengthscale_constraint is None:
        lengthscale_constraint = Positive()
    if lengthscale_2_constraint is None:
        lengthscale_2_constraint = Positive()
    self.register_parameter(
        name="raw_power",
        parameter=torch.nn.Parameter(torch.zeros(*self.batch_shape, 1)),
    )
    if power_prior is not None:
        self.register_prior(
            "power_prior",
            power_prior,
            lambda: self.power,
            lambda v: self._set_power(v),
        )
    self.register_constraint("raw_power", power_constraint)
    m = self.train_iteration_fidelity + self.train_data_fidelity
    if self.active_dims is not None:
        dimension = len(self.active_dims)
    if covar_module_1 is None:
        self.covar_module_1 = MaternKernel(
            nu=nu,
            batch_shape=self.batch_shape,
            lengthscale_prior=lengthscale_prior,
            ard_num_dims=dimension - m,
            lengthscale_constraint=lengthscale_constraint,
        )
    else:
        self.covar_module_1 = covar_module_1
    if covar_module_2 is None:
        self.covar_module_2 = MaternKernel(
            nu=nu,
            batch_shape=self.batch_shape,
            lengthscale_prior=lengthscale_2_prior,
            ard_num_dims=dimension - m,
            lengthscale_constraint=lengthscale_2_constraint,
        )
    else:
        self.covar_module_2 = covar_module_2
def sample_arch(self, START_BO, g, steps, hyperparams, og_flops, full_val_loss, target_flops=0):
    if args.slim:
        if target_flops == 0:
            parameterization = hyperparams.random_sample()
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        else:
            parameterization = np.ones(hyperparams.get_dim()) * args.lower_channel
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
    else:
        # random sample to warmup history for MOBO
        if g < START_BO:
            if target_flops == 0:
                f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
            else:
                f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        # put the largest model into the history
        elif g == START_BO:
            if target_flops == 0:
                parameterization = np.ones(hyperparams.get_dim())
            else:
                f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        # MOBO
        else:
            # this is the scalarization (lambda_{FLOPs})
            rand = torch.rand(1).cuda()

            # standardize data for building Gaussian Processes
            train_X = torch.FloatTensor(self.X).cuda()
            train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
            train_Y_loss = standardize(train_Y_loss)

            train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
            train_Y_cost = standardize(train_Y_cost)

            new_train_X = train_X

            # GP for the cross entropy loss
            gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
            mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            # GP for FLOPs
            # we use add-gp since FLOPs has additive structure (not exactly though)
            # the parameters for ScaleKernel and MaternKernel simply follow the default
            covar_module = AdditiveStructureKernel(
                ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        num_dims=1
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                ),
                num_dims=train_X.shape[1]
            )
            gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            # Build acquisition functions
            UCB_loss = UpperConfidenceBound(gp_loss, beta=0.1).cuda()
            UCB_cost = UpperConfidenceBound(gp_cost, beta=0.1).cuda()

            # Combine them via augmented Tchebyshev scalarization
            self.mobo_obj = RandAcquisition(UCB_loss).cuda()
            self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

            # Bounds for the optimization variable (alpha)
            lower = torch.ones(new_train_X.shape[1])*args.lower_channel
            upper = torch.ones(new_train_X.shape[1])*args.upper_channel
            self.mobo_bounds = torch.stack([lower, upper]).cuda()

            # Pareto-aware sampling
            if args.pas:
                # Generate approximate Pareto front first
                costs = []
                for i in range(len(self.population_data)):
                    costs.append([self.population_data[i]['loss'], self.population_data[i]['ratio']])
                costs = np.array(costs)
                efficient_mask = is_pareto_efficient(costs)
                costs = costs[efficient_mask]
                loss = costs[:, 0]
                flops = costs[:, 1]
                sorted_idx = np.argsort(flops)
                loss = loss[sorted_idx]
                flops = flops[sorted_idx]
                if flops[0] > args.lower_flops:
                    flops = np.concatenate([[args.lower_flops], flops.reshape(-1)])
                    loss = np.concatenate([[8], loss.reshape(-1)])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                    flops = np.concatenate([flops.reshape(-1), [args.upper_flops]])
                    loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                # Equation (4) in paper
                areas = (flops[1:]-flops[:-1])*(loss[:-1]-loss[1:])

                # Quantize into 50 bins to sample from multinomial
                self.sampling_weights = np.zeros(50)
                k = 0
                while k < len(flops) and flops[k] < args.lower_flops:
                    k += 1
                for i in range(50):
                    lower = i/50.
                    upper = (i+1)/50.
                    if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                        continue
                    cnt = 1
                    while ((k+1) < len(flops)) and upper > flops[k+1]:
                        self.sampling_weights[i] += areas[k]
                        cnt += 1
                        k += 1
                    if k < len(areas):
                        self.sampling_weights[i] += areas[k]
                    self.sampling_weights[i] /= cnt
                if np.sum(self.sampling_weights) == 0:
                    self.sampling_weights = np.ones(50)

                if target_flops == 0:
                    val = np.arange(0.01, 1, 0.02)
                    chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
                else:
                    chosen_target_flops = target_flops

                # Binary search is here
                lower_bnd, upper_bnd = 0, 1
                lmda = 0.5
                for i in range(10):
                    self.mobo_obj.rand = lmda

                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                    )

                    parameterization = parameterization[0].cpu().numpy()
                    layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                    sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget)
                    ratio = sim_flops/og_flops

                    if np.abs(ratio - chosen_target_flops) <= 0.02:
                        break

                    if args.baseline > 0:
                        if ratio < chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                        elif ratio > chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                    else:
                        if ratio < chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                        elif ratio > chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                    rand[0] = lmda
                writer.add_scalar('Binary search trials', i, steps)
            else:
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)

    return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    likelihood: Optional[MultitaskGaussianLikelihood] = None,
    data_covar_module: Optional[Module] = None,
    task_covar_prior: Optional[Prior] = None,
    rank: Optional[int] = None,
    input_transform: Optional[InputTransform] = None,
    outcome_transform: Optional[OutcomeTransform] = None,
    **kwargs: Any,
) -> None:
    r"""Multi-task GP with Kronecker structure, using a simple ICM kernel.

    Args:
        train_X: A `batch_shape x n x d` tensor of training features.
        train_Y: A `batch_shape x n x m` tensor of training observations.
        likelihood: A `MultitaskGaussianLikelihood`. If omitted, uses a
            `MultitaskGaussianLikelihood` with a `GammaPrior(1.1, 0.05)`
            noise prior.
        data_covar_module: The module computing the covariance (Kernel)
            matrix in data space. If omitted, use a `MaternKernel`.
        task_covar_prior: A Prior on the task covariance matrix. Must operate
            on p.s.d. matrices. A common prior for this is the `LKJ` prior. If
            omitted, uses `LKJCovariancePrior` with `eta` parameter as
            specified in the keyword arguments (if not specified, use
            `eta=1.5`).
        rank: The rank of the ICM kernel. If omitted, use a full rank kernel.
        kwargs: Additional arguments to override default settings of priors,
            including:
            - eta: The eta parameter on the default LKJ task_covar_prior. A
              value of 1.0 is uninformative, values <1.0 favor stronger
              correlations (in magnitude), correlations vanish as eta -> inf.
            - sd_prior: A scalar prior over nonnegative numbers, which is used
              for the default LKJCovariancePrior task_covar_prior.
            - likelihood_rank: The rank of the task covariance matrix to fit.
              Defaults to 0 (which corresponds to a diagonal covariance
              matrix).

    Example:
        >>> train_X = torch.rand(10, 2)
        >>> train_Y = torch.cat([f_1(X), f_2(X)], dim=-1)
        >>> model = KroneckerMultiTaskGP(train_X, train_Y)
    """
    with torch.no_grad():
        transformed_X = self.transform_inputs(
            X=train_X, input_transform=input_transform
        )
    if outcome_transform is not None:
        train_Y, _ = outcome_transform(train_Y)

    self._validate_tensor_args(X=transformed_X, Y=train_Y)
    self._num_outputs = train_Y.shape[-1]
    batch_shape, ard_num_dims = train_X.shape[:-2], train_X.shape[-1]
    num_tasks = train_Y.shape[-1]

    if rank is None:
        rank = num_tasks
    if likelihood is None:
        noise_prior = GammaPrior(1.1, 0.05)
        noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate
        likelihood = MultitaskGaussianLikelihood(
            num_tasks=num_tasks,
            batch_shape=batch_shape,
            noise_prior=noise_prior,
            noise_constraint=GreaterThan(
                MIN_INFERRED_NOISE_LEVEL,
                transform=None,
                initial_value=noise_prior_mode,
            ),
            rank=kwargs.get("likelihood_rank", 0),
        )
    if task_covar_prior is None:
        task_covar_prior = LKJCovariancePrior(
            n=num_tasks,
            eta=torch.tensor(kwargs.get("eta", 1.5)).to(train_X),
            sd_prior=kwargs.get(
                "sd_prior",
                SmoothedBoxPrior(math.exp(-6), math.exp(1.25), 0.05),
            ),
        )
    super().__init__(train_X, train_Y, likelihood)
    self.mean_module = MultitaskMean(
        base_means=ConstantMean(batch_shape=batch_shape), num_tasks=num_tasks
    )
    if data_covar_module is None:
        data_covar_module = MaternKernel(
            nu=2.5,
            ard_num_dims=ard_num_dims,
            lengthscale_prior=GammaPrior(3.0, 6.0),
            batch_shape=batch_shape,
        )
    else:
        data_covar_module = data_covar_module
    self.covar_module = MultitaskKernel(
        data_covar_module=data_covar_module,
        num_tasks=num_tasks,
        rank=rank,
        batch_shape=batch_shape,
        task_covar_prior=task_covar_prior,
    )
    if outcome_transform is not None:
        self.outcome_transform = outcome_transform
    if input_transform is not None:
        self.input_transform = input_transform
    self.to(train_X)
def sample_arch(self, START_BO, g, hyperparams, og_flops, empty_val_loss, full_val_loss, target_flops=0):
    # Warming up the history with a single width-multiplier
    if g < START_BO:
        if target_flops == 0:
            f = np.random.rand(1) * (args.upper_channel - args.lower_channel) + args.lower_channel
        else:
            f = args.lower_channel
        parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(
            parameterization, self.mask_pruner)
    # Put largest model into the history
    elif g == START_BO:
        if target_flops == 0:
            parameterization = np.ones(hyperparams.get_dim())
        else:
            f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
        layer_budget = hyperparams.get_layer_budget_from_parameterization(
            parameterization, self.mask_pruner)
    # MOBO-RS
    else:
        rand = torch.rand(1).cuda()
        train_X = torch.FloatTensor(self.X).cuda()
        train_Y_loss = torch.FloatTensor(
            np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
        train_Y_loss = standardize(train_Y_loss)
        train_Y_cost = torch.FloatTensor(
            np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
        train_Y_cost = standardize(train_Y_cost)

        new_train_X = train_X
        gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
        mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        # Use add-gp for cost
        covar_module = AdditiveStructureKernel(ScaleKernel(
            MaternKernel(nu=2.5,
                         lengthscale_prior=GammaPrior(3.0, 6.0),
                         num_dims=1),
            outputscale_prior=GammaPrior(2.0, 0.15),
        ), num_dims=train_X.shape[1])
        gp_cost = SingleTaskGP(new_train_X, train_Y_cost,
                               covar_module=covar_module)
        mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
        mll = mll.to('cuda')
        fit_gpytorch_model(mll)

        UCB_loss = UpperConfidenceBound(gp_loss).cuda()
        UCB_cost = UpperConfidenceBound(gp_cost).cuda()
        self.mobo_obj = RandAcquisition(UCB_loss).cuda()
        self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

        lower = torch.ones(new_train_X.shape[1]) * args.lower_channel
        upper = torch.ones(new_train_X.shape[1]) * args.upper_channel
        self.mobo_bounds = torch.stack([lower, upper]).cuda()

        if args.pas:
            costs = []
            for i in range(len(self.population_data)):
                costs.append([
                    self.population_data[i]['loss'],
                    self.population_data[i]['ratio']
                ])
            costs = np.array(costs)
            efficient_mask = is_pareto_efficient(costs)
            costs = costs[efficient_mask]
            loss = costs[:, 0]
            flops = costs[:, 1]
            sorted_idx = np.argsort(flops)
            loss = loss[sorted_idx]
            flops = flops[sorted_idx]
            if flops[0] > args.lower_flops:
                flops = np.concatenate([[args.lower_flops], flops.reshape(-1)])
                loss = np.concatenate([[empty_val_loss], loss.reshape(-1)])
            else:
                flops = flops.reshape(-1)
                loss = loss.reshape(-1)

            if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                flops = np.concatenate(
                    [flops.reshape(-1), [args.upper_flops]])
                loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
            else:
                flops = flops.reshape(-1)
                loss = loss.reshape(-1)

            areas = (flops[1:] - flops[:-1]) * (loss[:-1] - loss[1:])

            self.sampling_weights = np.zeros(50)
            k = 0
            while k < len(flops) and flops[k] < args.lower_flops:
                k += 1
            for i in range(50):
                lower = i / 50.
                upper = (i + 1) / 50.
                if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                    continue
                cnt = 1
                while ((k + 1) < len(flops)) and upper > flops[k + 1]:
                    self.sampling_weights[i] += areas[k]
                    cnt += 1
                    k += 1
                if k < len(areas):
                    self.sampling_weights[i] += areas[k]
                self.sampling_weights[i] /= cnt
            if np.sum(self.sampling_weights) == 0:
                self.sampling_weights = np.ones(50)

            if target_flops == 0:
                val = np.arange(0.01, 1, 0.02)
                chosen_target_flops = np.random.choice(
                    val,
                    p=(self.sampling_weights / np.sum(self.sampling_weights)))
            else:
                chosen_target_flops = target_flops

            lower_bnd, upper_bnd = 0, 1
            lmda = 0.5
            for i in range(10):
                self.mobo_obj.rand = lmda
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj,
                    bounds=self.mobo_bounds,
                    q=1,
                    num_restarts=5,
                    raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(
                    parameterization, self.mask_pruner)
                sim_flops = self.mask_pruner.simulate_and_count_flops(
                    layer_budget)
                ratio = sim_flops / og_flops
                if np.abs(ratio - chosen_target_flops) <= 0.02:
                    break
                if args.baseline > 0:
                    if ratio < chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                    elif ratio > chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                else:
                    if ratio < chosen_target_flops:
                        upper_bnd = lmda
                        lmda = (lmda + lower_bnd) / 2
                    elif ratio > chosen_target_flops:
                        lower_bnd = lmda
                        lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
            writer.add_scalar('Binary search trials', i, g)
        else:
            parameterization, acq_value = optimize_acqf(
                self.mobo_obj,
                bounds=self.mobo_bounds,
                q=1,
                num_restarts=5,
                raw_samples=1000,
            )
            parameterization = parameterization[0].cpu().numpy()
            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
    return layer_budget, parameterization, self.sampling_weights / np.sum(
        self.sampling_weights)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    task_feature: int,
    covar_module: Optional[Module] = None,
    task_covar_prior: Optional[Prior] = None,
    output_tasks: Optional[List[int]] = None,
    rank: Optional[int] = None,
    input_transform: Optional[InputTransform] = None,
    outcome_transform: Optional[OutcomeTransform] = None,
) -> None:
    r"""Multi-Task GP model using an ICM kernel, inferring observation noise.

    Args:
        train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor
            of training data. One of the columns should contain the task
            features (see `task_feature` argument).
        train_Y: A `n x 1` or `b x n x 1` (batch mode) tensor of training
            observations.
        task_feature: The index of the task feature
            (`-d <= task_feature <= d`).
        output_tasks: A list of task indices for which to compute model
            outputs. If omitted, return outputs for all task indices.
        rank: The rank to be used for the index kernel. If omitted, use a
            full rank (i.e. number of tasks) kernel.
        task_covar_prior: A Prior on the task covariance matrix. Must operate
            on p.s.d. matrices. A common prior for this is the `LKJ` prior.
        input_transform: An input transform that is applied in the model's
            forward pass.

    Example:
        >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
        >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
        >>> train_X = torch.cat([
        >>>     torch.cat([X1, i1], -1), torch.cat([X2, i2], -1),
        >>> ])
        >>> train_Y = torch.cat(f1(X1), f2(X2)).unsqueeze(-1)
        >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1)
    """
    with torch.no_grad():
        transformed_X = self.transform_inputs(
            X=train_X, input_transform=input_transform
        )
    self._validate_tensor_args(X=transformed_X, Y=train_Y)
    all_tasks, task_feature, d = self.get_all_tasks(
        transformed_X, task_feature, output_tasks
    )
    if outcome_transform is not None:
        train_Y, _ = outcome_transform(train_Y)

    # squeeze output dim
    train_Y = train_Y.squeeze(-1)
    if output_tasks is None:
        output_tasks = all_tasks
    else:
        if set(output_tasks) - set(all_tasks):
            raise RuntimeError("All output tasks must be present in input data.")
    self._output_tasks = output_tasks
    self._num_outputs = len(output_tasks)

    # TODO (T41270962): Support task-specific noise levels in likelihood
    likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05))

    # construct indexer to be used in forward
    self._task_feature = task_feature
    self._base_idxr = torch.arange(d)
    self._base_idxr[task_feature:] += 1  # exclude task feature

    super().__init__(
        train_inputs=train_X, train_targets=train_Y, likelihood=likelihood
    )
    self.mean_module = ConstantMean()
    if covar_module is None:
        self.covar_module = ScaleKernel(
            base_kernel=MaternKernel(
                nu=2.5, ard_num_dims=d, lengthscale_prior=GammaPrior(3.0, 6.0)
            ),
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
    else:
        self.covar_module = covar_module

    num_tasks = len(all_tasks)
    self._rank = rank if rank is not None else num_tasks

    self.task_covar_module = IndexKernel(
        num_tasks=num_tasks, rank=self._rank, prior=task_covar_prior
    )
    if input_transform is not None:
        self.input_transform = input_transform
    if outcome_transform is not None:
        self.outcome_transform = outcome_transform
    self.to(train_X)
def __init__(
    self,
    train_X: Tensor,
    train_Y: Tensor,
    task_feature: int,
    output_tasks: Optional[List[int]] = None,
    rank: Optional[int] = None,
) -> None:
    r"""Multi-Task GP model using an ICM kernel, inferring observation noise.

    Args:
        train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor
            of training data. One of the columns should contain the task
            features (see `task_feature` argument).
        train_Y: A `n` or `b x n` (batch mode) tensor of training
            observations.
        task_feature: The index of the task feature
            (`-d <= task_feature <= d`).
        output_tasks: A list of task indices for which to compute model
            outputs. If omitted, return outputs for all task indices.
        rank: The rank to be used for the index kernel. If omitted, use a
            full rank (i.e. number of tasks) kernel.

    Example:
        >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
        >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
        >>> train_X = torch.stack([
        >>>     torch.cat([X1, i1], -1), torch.cat([X2, i2], -1),
        >>> ])
        >>> train_Y = torch.cat(f1(X1), f2(X2))
        >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1)
    """
    if train_X.ndimension() != 2:
        # Currently, batch mode MTGPs are blocked upstream in GPyTorch
        raise ValueError(f"Unsupported shape {train_X.shape} for train_X.")
    d = train_X.shape[-1] - 1
    if not (-d <= task_feature <= d):
        raise ValueError(f"Must have that -{d} <= task_feature <= {d}")
    all_tasks = train_X[:, task_feature].unique().to(dtype=torch.long).tolist()
    if output_tasks is None:
        output_tasks = all_tasks
    else:
        if any(t not in all_tasks for t in output_tasks):
            raise RuntimeError("All output tasks must be present in input data.")
    self._output_tasks = output_tasks

    # TODO (T41270962): Support task-specific noise levels in likelihood
    likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05))

    # construct indexer to be used in forward
    self._task_feature = task_feature
    self._base_idxr = torch.arange(d)
    self._base_idxr[task_feature:] += 1  # exclude task feature

    super().__init__(
        train_inputs=train_X, train_targets=train_Y, likelihood=likelihood
    )
    self.mean_module = ConstantMean()
    self.covar_module = ScaleKernel(
        base_kernel=MaternKernel(
            nu=2.5, ard_num_dims=d, lengthscale_prior=GammaPrior(3.0, 6.0)
        ),
        outputscale_prior=GammaPrior(2.0, 0.15),
    )
    num_tasks = len(all_tasks)
    self._rank = rank if rank is not None else num_tasks
    # TODO: Add LKJ prior for the index kernel
    self.task_covar_module = IndexKernel(num_tasks=num_tasks, rank=self._rank)
    self.to(train_X)