def __init__( self, decomposition: Dict[str, List[int]], batch_shape: torch.Size ) -> None: super().__init__(batch_shape=batch_shape) self.decomposition = decomposition num_param = len(next(iter(decomposition.values()))) for active_parameters in decomposition.values(): # check that the number of parameters is the same in each decomposition if len(active_parameters) != num_param: raise ValueError( "num of parameters needs to be same across all contexts" ) self._indexers = { context: torch.tensor(active_params) for context, active_params in self.decomposition.items() } self.base_kernel = MaternKernel( nu=2.5, ard_num_dims=num_param, batch_shape=batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ) self.kernel_dict = {} # scaled kernel for each parameter space partition for context in list(decomposition.keys()): self.kernel_dict[context] = ScaleKernel( base_kernel=self.base_kernel, outputscale_prior=GammaPrior(2.0, 15.0) ) self.kernel_dict = ModuleDict(self.kernel_dict)
def get_warping_transform( d: int, task_feature: Optional[int] = None, ) -> Warp: """Construct input warping transform. Args: d: The dimension of the input, including task features task_feature: the index of the task feature Returns: The input warping transform. """ indices = list(range(d)) # apply warping to all non-task features, including fidelity features if task_feature is not None: del indices[task_feature] # Note: this currently uses the same warping functions for all tasks tf = Warp( indices=indices, # use an uninformative prior with maximum log probability at 1 concentration1_prior=GammaPrior(1.01, 0.01), concentration0_prior=GammaPrior(1.01, 0.01), ) return tf
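# Usage sketch for get_warping_transform above: warp all non-task columns of a 3-dimensional
# input whose task feature sits in the last column. Columns 0 and 1 get the learnable
# (Kumaraswamy-CDF) warping; column 2, the task feature, is passed through unchanged.
warp_tf = get_warping_transform(d=3, task_feature=2)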
def __init__( self, train_X: Tensor, train_Y: Tensor, nu: float = 2.5, train_iteration_fidelity: bool = True, train_data_fidelity: bool = True, likelihood: Optional[Likelihood] = None, ) -> None: if not train_iteration_fidelity and not train_data_fidelity: raise UnsupportedError( "You should have at least one fidelity parameter.") self._set_dimensions(train_X=train_X, train_Y=train_Y) kernel = LinearTruncatedFidelityKernel( nu=nu, dimension=train_X.shape[-1], train_iteration_fidelity=train_iteration_fidelity, train_data_fidelity=train_data_fidelity, batch_shape=self._aug_batch_shape, power_prior=GammaPrior(3.0, 3.0), ) covar_module = ScaleKernel( kernel, batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) super().__init__(train_X=train_X, train_Y=train_Y, covar_module=covar_module) self.to(train_X)
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[Likelihood] = None, covar_module: Optional[Module] = None, ) -> None: r"""A single-task exact GP model. Args: train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training features. train_Y: A `n x m` or `batch_shape x n x m` (batch mode) tensor of training observations. likelihood: A likelihood. If omitted, use a standard GaussianLikelihood with inferred noise level. covar_module: The module computing the covariance (kernel) matrix. If omitted, use a `MaternKernel`. Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True) >>> model = SingleTaskGP(train_X, train_Y) """ validate_input_scaling(train_X=train_X, train_Y=train_Y) self._validate_tensor_args(X=train_X, Y=train_Y) self._set_dimensions(train_X=train_X, train_Y=train_Y) train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y) if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=self._aug_batch_shape, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) else: self._is_custom_likelihood = True ExactGP.__init__(self, train_X, train_Y, likelihood) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) if covar_module is None: self.covar_module = ScaleKernel( MaternKernel( nu=2.5, ard_num_dims=train_X.shape[-1], batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) else: self.covar_module = covar_module self.to(train_X)
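# Quick numeric check (a sketch, not part of the constructor above): for a Gamma(concentration, rate)
# prior with concentration > 1, the mode is (concentration - 1) / rate, so the GammaPrior(1.1, 0.05)
# noise prior used above has its mode at 0.1 / 0.05 = 2.0, which is exactly the initial_value
# handed to the GreaterThan noise constraint.
from gpytorch.priors import GammaPrior

noise_prior = GammaPrior(1.1, 0.05)
print(float((noise_prior.concentration - 1) / noise_prior.rate))  # ~2.0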
def __init__( self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor, outcome_transform: Optional[OutcomeTransform] = None, ) -> None: r"""A single-task exact GP model using fixed noise levels. Args: train_X: A `batch_shape x n x d` tensor of training features. train_Y: A `batch_shape x n x m` tensor of training observations. train_Yvar: A `batch_shape x n x m` tensor of observed measurement noise. outcome_transform: An outcome transform that is applied to the training data during instantiation and to the posterior during inference (that is, the `Posterior` obtained by calling `.posterior` on the model will be on the original scale). Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True) >>> train_Yvar = torch.full_like(train_Y, 0.2) >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar) """ if outcome_transform is not None: train_Y, train_Yvar = outcome_transform(train_Y, train_Yvar) validate_input_scaling(train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar) self._validate_tensor_args(X=train_X, Y=train_Y, Yvar=train_Yvar) self._set_dimensions(train_X=train_X, train_Y=train_Y) train_X, train_Y, train_Yvar = self._transform_tensor_args( X=train_X, Y=train_Y, Yvar=train_Yvar) likelihood = FixedNoiseGaussianLikelihood( noise=train_Yvar, batch_shape=self._aug_batch_shape) ExactGP.__init__(self, train_inputs=train_X, train_targets=train_Y, likelihood=likelihood) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) self.covar_module = ScaleKernel( base_kernel=MaternKernel( nu=2.5, ard_num_dims=train_X.shape[-1], batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) if outcome_transform is not None: self.outcome_transform = outcome_transform self._subset_batch_dict = { "mean_module.constant": -2, "covar_module.raw_outputscale": -1, "covar_module.base_kernel.raw_lengthscale": -3, } self.to(train_X)
def construct_inputs(cls, training_data: TrainingData, **kwargs) -> Dict[str, Any]: r"""Construct kwargs for the `Model` from `TrainingData` and other options. Args: training_data: `TrainingData` container with data for single outcome or for multiple outcomes for batched multi-output case. **kwargs: Additional options for the model that pertain to the training data, including: - `task_features`: Indices of the input columns containing the task features (expected list of length 1), - `task_covar_prior`: A GPyTorch `Prior` object to use as prior on the cross-task covariance matrix, - `prior_config`: A dict representing a prior config, should only be used if `task_covar_prior` is not passed directly. Should contain: `use_LKJ_prior` (whether to use LKJ prior) and `eta` (eta value, float), - `rank`: The rank of the cross-task covariance matrix. """ task_features = kwargs.pop("task_features", None) if task_features is None: raise ValueError(f"`task_features` required for {cls.__name__}.") task_feature = task_features[0] inputs = { "train_X": training_data.X, "train_Y": training_data.Y, "task_feature": task_feature, "rank": kwargs.get("rank"), } prior = kwargs.get("task_covar_prior") prior_config = kwargs.get("prior_config") if prior and prior_config: raise ValueError( "Only one of `task_covar_prior` and `prior_config` arguments expected.") if prior_config: if not prior_config.get("use_LKJ_prior"): raise ValueError( "Currently only config for LKJ prior is supported.") all_tasks, _, _ = MultiTaskGP.get_all_tasks( training_data.X, task_feature) num_tasks = len(all_tasks) sd_prior = GammaPrior(1.0, 0.15) sd_prior._event_shape = torch.Size([num_tasks]) eta = prior_config.get("eta", 0.5) if not isinstance(eta, float) and not isinstance(eta, int): raise ValueError( f"eta must be a real number, your eta was {eta}.") prior = LKJCovariancePrior(num_tasks, eta, sd_prior) inputs["task_covar_prior"] = prior return inputs
def __init__(self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[Likelihood] = None) -> None: r"""A single-task exact GP model. Args: train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training features. train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of training observations. likelihood: A likelihood. If omitted, use a standard GaussianLikelihood with inferred noise level. Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1]) >>> model = SingleTaskGP(train_X, train_Y) """ ard_num_dims = train_X.shape[-1] train_X, train_Y, _ = self._set_dimensions(train_X=train_X, train_Y=train_Y) train_X, train_Y, _ = multioutput_to_batch_mode_transform( train_X=train_X, train_Y=train_Y, num_outputs=self._num_outputs) if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=self._aug_batch_shape, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) else: self._likelihood_state_dict = deepcopy(likelihood.state_dict()) ExactGP.__init__(self, train_X, train_Y, likelihood) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) self.covar_module = ScaleKernel( MaternKernel( nu=2.5, ard_num_dims=ard_num_dims, batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) self.to(train_X)
def __init__( self, train_x: torch.Tensor, train_y: torch.Tensor, inducing_points: torch.Tensor, scales: Union[torch.Tensor, float] = 1.0, mean_module: Optional[Mean] = None, covar_module: Optional[Kernel] = None, fixed_prior_mean: Optional[float] = None, ) -> None: variational_distribution = CholeskyVariationalDistribution( inducing_points.size(0)) variational_distribution.to(train_x) variational_strategy = VariationalStrategy( model=self, inducing_points=inducing_points, variational_distribution=variational_distribution, learn_inducing_locations=False, ) super(MixedDerivativeVariationalGP, self).__init__(variational_strategy) # Set the mean module, using the default if not specified if mean_module is None: self.mean_module = ConstantMeanPartialObsGrad() else: self.mean_module = mean_module if fixed_prior_mean is not None: self.mean_module.constant.requires_grad_(False) self.mean_module.constant.copy_( torch.tensor([fixed_prior_mean], dtype=train_x.dtype)) if covar_module is None: self.base_kernel = RBFKernelPartialObsGrad( ard_num_dims=train_x.shape[-1] - 1, lengthscale_prior=GammaPrior(3.0, 6.0 / scales), ) self.covar_module = ScaleKernel(self.base_kernel, outputscale_prior=GammaPrior( 2.0, 0.15)) else: self.covar_module = covar_module self._num_outputs = 1 self.train_inputs = (train_x, ) self.train_targets = train_y self(train_x) # Necessary for CholeskyVariationalDistribution
def __init__(self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor) -> None: r"""A single-task exact GP model using fixed noise levels. Args: train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training features. train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of training observations. train_Yvar: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of observed measurement noise. Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1]) >>> train_Yvar = torch.full_like(train_Y, 0.2) >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar) """ ard_num_dims = train_X.shape[-1] train_X, train_Y, train_Yvar = self._set_dimensions( train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar ) train_X, train_Y, train_Yvar = multioutput_to_batch_mode_transform( train_X=train_X, train_Y=train_Y, num_outputs=self._num_outputs, train_Yvar=train_Yvar, ) likelihood = FixedNoiseGaussianLikelihood( noise=train_Yvar, batch_shape=self._aug_batch_shape ) ExactGP.__init__( self, train_inputs=train_X, train_targets=train_Y, likelihood=likelihood ) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) self.covar_module = ScaleKernel( base_kernel=MaternKernel( nu=2.5, ard_num_dims=ard_num_dims, batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) self.to(train_X)
def test_sample_all_priors(self, cuda=False): device = torch.device("cuda" if cuda else "cpu") for dtype in (torch.float, torch.double): train_X = torch.rand(3, 5, device=device, dtype=dtype) train_Y = torch.rand(3, 1, device=device, dtype=dtype) model = SingleTaskGP(train_X=train_X, train_Y=train_Y) mll = ExactMarginalLogLikelihood(model.likelihood, model) mll.to(device=device, dtype=dtype) original_state_dict = dict(deepcopy(mll.model.state_dict())) sample_all_priors(model) # make sure one of the hyperparameters changed self.assertTrue( dict(model.state_dict())["likelihood.noise_covar.raw_noise"] != original_state_dict["likelihood.noise_covar.raw_noise"]) # check that lengthscales are all different ls = model.covar_module.base_kernel.raw_lengthscale.view( -1).tolist() self.assertTrue(all(ls[0] != ls[i] for i in range(1, len(ls)))) # change one of the priors to SmoothedBoxPrior model.covar_module = ScaleKernel( MaternKernel( nu=2.5, ard_num_dims=model.train_inputs[0].shape[-1], batch_shape=model._aug_batch_shape, lengthscale_prior=SmoothedBoxPrior(3.0, 6.0), ), batch_shape=model._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) original_state_dict = dict(deepcopy(mll.model.state_dict())) with warnings.catch_warnings( record=True) as ws, settings.debug(True): sample_all_priors(model) self.assertEqual(len(ws), 1) self.assertTrue("rsample" in str(ws[0].message)) # the lengthscale should not have changed because sampling is # not implemented for SmoothedBoxPrior self.assertTrue( torch.equal( dict(model.state_dict()) ["covar_module.base_kernel.raw_lengthscale"], original_state_dict[ "covar_module.base_kernel.raw_lengthscale"], )) # set setting_closure to None and make sure RuntimeError is raised prior_tuple = model.likelihood.noise_covar._priors["noise_prior"] model.likelihood.noise_covar._priors["noise_prior"] = ( prior_tuple[0], prior_tuple[1], None, ) with self.assertRaises(RuntimeError): sample_all_priors(model)
def __init__( self, train_X: Tensor, train_Y: Tensor, train_iteration_fidelity: bool = True, train_data_fidelity: bool = True, likelihood: Optional[Likelihood] = None, ) -> None: train_X, train_Y, _ = self._set_dimensions(train_X=train_X, train_Y=train_Y) num_fidelity = train_iteration_fidelity + train_data_fidelity ard_num_dims = train_X.shape[-1] - num_fidelity active_dimsX = list(range(train_X.shape[-1] - num_fidelity)) rbf_kernel = RBFKernel( ard_num_dims=ard_num_dims, batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), active_dims=active_dimsX, ) exp_kernel = ExpDecayKernel( batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), ) ds_kernel = DownsamplingKernel( batch_shape=self._aug_batch_shape, offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), ) if train_iteration_fidelity and train_data_fidelity: active_dimsS1 = [train_X.shape[-1] - 1] active_dimsS2 = [train_X.shape[-1] - 2] exp_kernel.active_dims = torch.tensor(active_dimsS1) ds_kernel.active_dims = torch.tensor(active_dimsS2) kernel = rbf_kernel * exp_kernel * ds_kernel elif train_iteration_fidelity or train_data_fidelity: active_dimsS = [train_X.shape[-1] - 1] if train_iteration_fidelity: exp_kernel.active_dims = torch.tensor(active_dimsS) kernel = rbf_kernel * exp_kernel else: ds_kernel.active_dims = torch.tensor(active_dimsS) kernel = rbf_kernel * ds_kernel else: raise UnsupportedError( "You should have at least one fidelity parameter.") covar_module = ScaleKernel( kernel, batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) super().__init__(train_X=train_X, train_Y=train_Y, covar_module=covar_module) self.to(train_X)
def __init__(self, ard_dims, kernel_transform): # Default values from botorch/models/gp_regression.py # GammaPrior(3.0, 6.0) means alpha=3, beta=6 # nu=2.5 means Matern 5/2 kernel (1/2, 3/2, 5/2) # E[X~Gamma(alpha,beta)] = alpha/beta # Var[X~Gamma(alpha,beta)] = alpha/(beta^2) core_kernel = gpytorch.kernels.MaternKernel( nu=2.5, ard_num_dims=ard_dims, lengthscale_prior=GammaPrior(3.0, 6.0), param_transform=None) super(MMaternKernel, self).__init__() self.kernel_transform = kernel_transform
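# Sanity check (sketch) of the comments above: a Gamma(alpha=3, beta=6) prior has mean
# alpha/beta = 0.5 and variance alpha/beta^2 ≈ 0.083, so the default lengthscale prior
# concentrates its mass around 0.5 on normalized inputs.
from gpytorch.priors import GammaPrior

ls_prior = GammaPrior(3.0, 6.0)
print(float(ls_prior.mean), float(ls_prior.variance))  # 0.5 0.0833...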
def fit_gp(self) -> None: """ Re-fits the GP using the most up to date data. """ noise_prior = GammaPrior(1.1, 0.5) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=[], noise_constraint=GreaterThan( # 0.000005, # minimum observation noise assumed in the GP model 0.0001, transform=None, initial_value=noise_prior_mode, ), ) self.model = SingleTaskGP( self.X, self.Y, likelihood, outcome_transform=Standardize(m=1) ) mll = ExactMarginalLogLikelihood(self.model.likelihood, self.model) fit_gpytorch_model(mll) # dummy computation to be safe with gp fit try: dummy = torch.rand( (1, self.q, self.dim), dtype=self.dtype, device=self.device ) _ = self.model.posterior(dummy).mean except RuntimeError as err: if self.fit_count < 5: self.fit_count += 1 self.Y = self.Y + torch.randn_like(self.Y) * 0.001 self.fit_gp() else: raise err self.fit_count = 0 self.passed = False
def make_kernel(kernel_type, ard_dims, svae=None): kernel_transform = None if svae is not None: assert (kernel_type in ['KL', 'uKL', 'LKL'] or kernel_type.startswith('SVAE')) latent = False if kernel_type == 'KL' else True kernel_transform = KernelTransform(svae, latent=latent) min_outsc = GPConstants.MIN_OUTPUTSCALE max_outsc = GPConstants.MAX_OUTPUTSCALE hypers_unrestricted = (kernel_type == 'uSE' or kernel_type == 'uMatern' or kernel_type == 'uKL') if hypers_unrestricted: min_outsc = GPConstants.MIN_UNRESTR_OUTPUTSCALE max_outsc = GPConstants.MAX_UNRESTR_OUTPUTSCALE outsc_constr = SaneInterval( lower_bound=torch.as_tensor(min_outsc), upper_bound=torch.as_tensor(max_outsc), transform=torch.sigmoid, initial_value=inv_sigmoid(torch.as_tensor(0.9999 / max_outsc))) # ~1.0 if kernel_type.endswith('SE'): covar_module = gpytorch.kernels.ScaleKernel( RRBFKernel(ard_dims, kernel_transform, hypers_unrestricted), outputscale_constraint=outsc_constr) elif kernel_type.endswith('Matern'): covar_module = gpytorch.kernels.ScaleKernel( MMaternKernel(ard_dims, kernel_transform), outputscale_prior=GammaPrior(2.0, 0.15), outputscale_constraint=outsc_constr) elif kernel_type.endswith( 'KL'): # symmetrized KL (in latent space for LKL) assert (kernel_type == 'KL' or kernel_type == 'uKL' or kernel_type == 'LKL') covar_module = gpytorch.kernels.ScaleKernel( KLKernel(kernel_transform), outputscale_constraint=outsc_constr) else: logging.error('Unsupported kernel type: %s', kernel_type) assert (False) return covar_module, kernel_transform
def initialize_model(self, train_X, train_Y, state_dict=None): """Initialise model for BO.""" # From: https://github.com/pytorch/botorch/issues/179 noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate MIN_INFERRED_NOISE_LEVEL = 1e-3 likelihood = GaussianLikelihood( noise_prior=noise_prior, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) # train_x = self.scale_to_0_1_bounds(train_X) train_Y = standardize(train_Y) gp = SingleTaskGP(train_X, train_Y, likelihood=likelihood) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) # load state dict if it is passed if state_dict is not None: gp.load_state_dict(state_dict) return mll, gp
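# Usage sketch for initialize_model above. `opt` is a hypothetical instance of the surrounding
# optimizer class; fit_gpytorch_model is botorch's standard fitting helper (also used elsewhere
# in these snippets).
import torch
from botorch.fit import fit_gpytorch_model

train_X = torch.rand(20, 3)
train_Y = torch.rand(20, 1)
mll, gp = opt.initialize_model(train_X, train_Y)
fit_gpytorch_model(mll)
posterior_mean = gp.posterior(torch.rand(5, 3)).mean  # 5 x 1 tensor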
def __init__( self, train_X: Tensor, train_Y: Tensor, task_feature: int, covar_module: Optional[Module] = None, task_covar_prior: Optional[Prior] = None, output_tasks: Optional[List[int]] = None, rank: Optional[int] = None, input_transform: Optional[InputTransform] = None, outcome_transform: Optional[OutcomeTransform] = None, ) -> None: r"""Multi-Task GP model using an ICM kernel, inferring observation noise. Args: train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor of training data. One of the columns should contain the task features (see `task_feature` argument). train_Y: A `n x 1` or `b x n x 1` (batch mode) tensor of training observations. task_feature: The index of the task feature (`-d <= task_feature <= d`). output_tasks: A list of task indices for which to compute model outputs. If omitted, return outputs for all task indices. rank: The rank to be used for the index kernel. If omitted, use a full rank (i.e. number of tasks) kernel. task_covar_prior: A Prior on the task covariance matrix. Must operate on p.s.d. matrices. A common prior for this is the `LKJ` prior. input_transform: An input transform that is applied in the model's forward pass. Example: >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2) >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1) >>> train_X = torch.cat([ >>> torch.cat([X1, i1], -1), torch.cat([X2, i2], -1), >>> ]) >>> train_Y = torch.cat([f1(X1), f2(X2)]).unsqueeze(-1) >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1) """ with torch.no_grad(): transformed_X = self.transform_inputs( X=train_X, input_transform=input_transform) self._validate_tensor_args(X=transformed_X, Y=train_Y) all_tasks, task_feature, d = self.get_all_tasks( transformed_X, task_feature, output_tasks) if outcome_transform is not None: train_Y, _ = outcome_transform(train_Y) # squeeze output dim train_Y = train_Y.squeeze(-1) if output_tasks is None: output_tasks = all_tasks else: if set(output_tasks) - set(all_tasks): raise RuntimeError( "All output tasks must be present in input data.") self._output_tasks = output_tasks self._num_outputs = len(output_tasks) # TODO (T41270962): Support task-specific noise levels in likelihood likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05)) # construct indexer to be used in forward self._task_feature = task_feature self._base_idxr = torch.arange(d) self._base_idxr[task_feature:] += 1 # exclude task feature super().__init__(train_inputs=train_X, train_targets=train_Y, likelihood=likelihood) self.mean_module = ConstantMean() if covar_module is None: self.covar_module = ScaleKernel( base_kernel=MaternKernel(nu=2.5, ard_num_dims=d, lengthscale_prior=GammaPrior( 3.0, 6.0)), outputscale_prior=GammaPrior(2.0, 0.15), ) else: self.covar_module = covar_module num_tasks = len(all_tasks) self._rank = rank if rank is not None else num_tasks self.task_covar_module = IndexKernel(num_tasks=num_tasks, rank=self._rank, prior=task_covar_prior) if input_transform is not None: self.input_transform = input_transform if outcome_transform is not None: self.outcome_transform = outcome_transform self.to(train_X)
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[MultitaskGaussianLikelihood] = None, data_covar_module: Optional[Module] = None, task_covar_prior: Optional[Prior] = None, rank: Optional[int] = None, input_transform: Optional[InputTransform] = None, outcome_transform: Optional[OutcomeTransform] = None, **kwargs: Any, ) -> None: r"""Multi-task GP with Kronecker structure, using a simple ICM kernel. Args: train_X: A `batch_shape x n x d` tensor of training features. train_Y: A `batch_shape x n x m` tensor of training observations. likelihood: A `MultitaskGaussianLikelihood`. If omitted, uses a `MultitaskGaussianLikelihood` with a `GammaPrior(1.1, 0.05)` noise prior. data_covar_module: The module computing the covariance (Kernel) matrix in data space. If omitted, use a `MaternKernel`. task_covar_prior : A Prior on the task covariance matrix. Must operate on p.s.d. matrices. A common prior for this is the `LKJ` prior. If omitted, uses `LKJCovariancePrior` with `eta` parameter as specified in the keyword arguments (if not specified, use `eta=1.5`). rank: The rank of the ICM kernel. If omitted, use a full rank kernel. kwargs: Additional arguments to override default settings of priors, including: - eta: The eta parameter on the default LKJ task_covar_prior. A value of 1.0 is uninformative, values <1.0 favor stronger correlations (in magnitude), correlations vanish as eta -> inf. - sd_prior: A scalar prior over nonnegative numbers, which is used for the default LKJCovariancePrior task_covar_prior. - likelihood_rank: The rank of the task covariance matrix to fit. Defaults to 0 (which corresponds to a diagonal covariance matrix). Example: >>> train_X = torch.rand(10, 2) >>> train_Y = torch.cat([f_1(X), f_2(X)], dim=-1) >>> model = KroneckerMultiTaskGP(train_X, train_Y) """ with torch.no_grad(): transformed_X = self.transform_inputs( X=train_X, input_transform=input_transform) if outcome_transform is not None: train_Y, _ = outcome_transform(train_Y) self._validate_tensor_args(X=transformed_X, Y=train_Y) self._num_outputs = train_Y.shape[-1] batch_shape, ard_num_dims = train_X.shape[:-2], train_X.shape[-1] num_tasks = train_Y.shape[-1] if rank is None: rank = num_tasks if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = MultitaskGaussianLikelihood( num_tasks=num_tasks, batch_shape=batch_shape, noise_prior=noise_prior, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), rank=kwargs.get("likelihood_rank", 0), ) if task_covar_prior is None: task_covar_prior = LKJCovariancePrior( n=num_tasks, eta=torch.tensor(kwargs.get("eta", 1.5)).to(train_X), sd_prior=kwargs.get( "sd_prior", SmoothedBoxPrior(math.exp(-6), math.exp(1.25), 0.05), ), ) super().__init__(train_X, train_Y, likelihood) self.mean_module = MultitaskMean( base_means=ConstantMean(batch_shape=batch_shape), num_tasks=num_tasks) if data_covar_module is None: data_covar_module = MaternKernel( nu=2.5, ard_num_dims=ard_num_dims, lengthscale_prior=GammaPrior(3.0, 6.0), batch_shape=batch_shape, ) else: data_covar_module = data_covar_module self.covar_module = MultitaskKernel( data_covar_module=data_covar_module, num_tasks=num_tasks, rank=rank, batch_shape=batch_shape, task_covar_prior=task_covar_prior, ) if outcome_transform is not None: self.outcome_transform = outcome_transform if input_transform is not None: self.input_transform = input_transform self.to(train_X)
def __init__( self, decomposition: Dict[str, List[int]], batch_shape: torch.Size, train_embedding: bool = True, cat_feature_dict: Optional[Dict] = None, embs_feature_dict: Optional[Dict] = None, embs_dim_list: Optional[List[int]] = None, context_weight_dict: Optional[Dict] = None, device: Optional[torch.device] = None, ) -> None: super().__init__(batch_shape=batch_shape) self.decomposition = decomposition self.batch_shape = batch_shape self.train_embedding = train_embedding self.device = device num_param = len(next(iter(decomposition.values()))) self.context_list = list(decomposition.keys()) self.num_contexts = len(self.context_list) # get parameter space decomposition for active_parameters in decomposition.values(): # check that the number of parameters is the same in each decomposition if len(active_parameters) != num_param: raise ValueError( "num of parameters needs to be same across all contexts") self._indexers = { context: torch.tensor(active_params, device=self.device) for context, active_params in self.decomposition.items() } # get context features and set emb dim self.context_cat_feature = None self.context_emb_feature = None self.n_embs = 0 self.emb_weight_matrix_list = None self.emb_dims = None self._set_context_features( cat_feature_dict=cat_feature_dict, embs_feature_dict=embs_feature_dict, embs_dim_list=embs_dim_list, ) # construct embedding layer if train_embedding: self._set_emb_layers() # task covariance matrix self.task_covar_module = MaternKernel( nu=2.5, ard_num_dims=self.n_embs, batch_shape=batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ) # base kernel self.base_kernel = MaternKernel( nu=2.5, ard_num_dims=num_param, batch_shape=batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ) # outputscales for each context (note this is like sqrt of outputscale) self.context_weight = None if context_weight_dict is None: outputscale_list = torch.zeros(*batch_shape, self.num_contexts, device=self.device) else: outputscale_list = torch.zeros(*batch_shape, 1, device=self.device) self.context_weight = torch.tensor( [context_weight_dict[c] for c in self.context_list], device=self.device) self.register_parameter(name="raw_outputscale_list", parameter=torch.nn.Parameter(outputscale_list)) self.register_prior( "outputscale_list_prior", GammaPrior(2.0, 15.0), lambda m: m.outputscale_list, lambda m, v: m._set_outputscale_list(v), ) self.register_constraint("raw_outputscale_list", Positive())
def __init__( # noqa C901 self, fidelity_dims: List[int], dimension: Optional[int] = None, power_prior: Optional[Prior] = None, power_constraint: Optional[Interval] = None, nu: float = 2.5, lengthscale_prior_unbiased: Optional[Prior] = None, lengthscale_prior_biased: Optional[Prior] = None, lengthscale_constraint_unbiased: Optional[Interval] = None, lengthscale_constraint_biased: Optional[Interval] = None, covar_module_unbiased: Optional[Kernel] = None, covar_module_biased: Optional[Kernel] = None, **kwargs: Any, ) -> None: if dimension is None and kwargs.get("active_dims") is None: raise UnsupportedError( "Must specify dimension when not specifying active_dims.") n_fidelity = len(fidelity_dims) if len(set(fidelity_dims)) != n_fidelity: raise ValueError("fidelity_dims must not have repeated elements") if n_fidelity not in {1, 2}: raise UnsupportedError( "LinearTruncatedFidelityKernel accepts either one or two " "fidelity parameters.") if nu not in {0.5, 1.5, 2.5}: raise ValueError("nu must be one of 0.5, 1.5, or 2.5") super().__init__(**kwargs) self.fidelity_dims = fidelity_dims if power_constraint is None: power_constraint = Positive() if lengthscale_prior_unbiased is None: lengthscale_prior_unbiased = GammaPrior(3, 6) if lengthscale_prior_biased is None: lengthscale_prior_biased = GammaPrior(6, 2) if lengthscale_constraint_unbiased is None: lengthscale_constraint_unbiased = Positive() if lengthscale_constraint_biased is None: lengthscale_constraint_biased = Positive() self.register_parameter( name="raw_power", parameter=torch.nn.Parameter(torch.zeros(*self.batch_shape, 1)), ) self.register_constraint("raw_power", power_constraint) if power_prior is not None: self.register_prior( "power_prior", power_prior, lambda: self.power, lambda v: self._set_power(v), ) if self.active_dims is not None: dimension = len(self.active_dims) if covar_module_unbiased is None: covar_module_unbiased = MaternKernel( nu=nu, batch_shape=self.batch_shape, lengthscale_prior=lengthscale_prior_unbiased, ard_num_dims=dimension - n_fidelity, lengthscale_constraint=lengthscale_constraint_unbiased, ) if covar_module_biased is None: covar_module_biased = MaternKernel( nu=nu, batch_shape=self.batch_shape, lengthscale_prior=lengthscale_prior_biased, ard_num_dims=dimension - n_fidelity, lengthscale_constraint=lengthscale_constraint_biased, ) self.covar_module_unbiased = covar_module_unbiased self.covar_module_biased = covar_module_biased
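# Instantiation sketch for the kernel above: a 6-dimensional input whose last two columns are
# fidelity parameters, using the default Matern-5/2 sub-kernels and the GammaPrior(3, 6) /
# GammaPrior(6, 2) lengthscale priors set up in __init__.
kernel = LinearTruncatedFidelityKernel(fidelity_dims=[4, 5], dimension=6, nu=2.5)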
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[Likelihood] = None, covar_modules: Optional[List[Kernel]] = None, num_latent_dims: Optional[List[int]] = None, learn_latent_pars: bool = True, latent_init: str = "default", outcome_transform: Optional[OutcomeTransform] = None, input_transform: Optional[InputTransform] = None, ): r"""A HigherOrderGP model for high-dim output regression. Args: train_X: A `batch_shape x n x d`-dim tensor of training inputs. train_Y: A `batch_shape x n x output_shape`-dim tensor of training targets. likelihood: Gaussian likelihood for the model. covar_modules: List of kernels for each output structure. num_latent_dims: Sizes for the latent dimensions. learn_latent_pars: If true, learn the latent parameters. latent_init: [default or gp] how to initialize the latent parameters. """ if input_transform is not None: input_transform.to(train_X) # infer the dimension of `output_shape`. num_output_dims = train_Y.dim() - train_X.dim() + 1 batch_shape = train_X.shape[:-2] if len(batch_shape) > 1: raise NotImplementedError( "HigherOrderGP currently only supports 1-dim `batch_shape`." ) if outcome_transform is not None: if isinstance(outcome_transform, Standardize) and not isinstance( outcome_transform, FlattenedStandardize ): warnings.warn( "HigherOrderGP does not support the outcome_transform " "`Standardize`! Using `FlattenedStandardize` with `output_shape=" f"{train_Y.shape[- num_output_dims:]} and batch_shape=" f"{batch_shape} instead.", RuntimeWarning, ) outcome_transform = FlattenedStandardize( output_shape=train_Y.shape[-num_output_dims:], batch_shape=batch_shape, ) train_Y, _ = outcome_transform(train_Y) self._aug_batch_shape = batch_shape self._num_dimensions = num_output_dims + 1 self._num_outputs = train_Y.shape[0] if batch_shape else 1 self.target_shape = train_Y.shape[-num_output_dims:] self._input_batch_shape = batch_shape if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=self._aug_batch_shape, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) else: self._is_custom_likelihood = True super().__init__( train_X, train_Y.view(*self._aug_batch_shape, -1), likelihood=likelihood, ) if covar_modules is not None: self.covar_modules = ModuleList(covar_modules) else: self.covar_modules = ModuleList( [ MaternKernel( nu=2.5, lengthscale_prior=GammaPrior(3.0, 6.0), batch_shape=self._aug_batch_shape, ard_num_dims=1 if dim > 0 else train_X.shape[-1], ) for dim in range(self._num_dimensions) ] ) if num_latent_dims is None: num_latent_dims = [1] * (self._num_dimensions - 1) self.to(train_X.device) self._initialize_latents( latent_init=latent_init, num_latent_dims=num_latent_dims, learn_latent_pars=learn_latent_pars, device=train_Y.device, dtype=train_Y.dtype, ) if outcome_transform is not None: self.outcome_transform = outcome_transform if input_transform is not None: self.input_transform = input_transform
def __init__( self, datapoints: Tensor, comparisons: Tensor, covar_module: Optional[Module] = None, input_transform: Optional[InputTransform] = None, **kwargs, ) -> None: r"""A probit-likelihood GP with Laplace approximation model that learns via pairwise comparison data. By default it uses a scaled RBF kernel. Args: datapoints: A `batch_shape x n x d` tensor of training features. comparisons: A `batch_shape x m x 2` training comparisons; comparisons[i] is a noisy indicator suggesting the utility value of comparisons[i, 0]-th is greater than comparisons[i, 1]-th. covar_module: Covariance module. input_transform: An input transform that is applied in the model's forward pass. """ super().__init__() if input_transform is not None: input_transform.to(datapoints) # input transformation is applied in set_train_data self.input_transform = input_transform # Compatibility variables with fit_gpytorch_*: Dummy likelihood # Likelihood is tightly tied with this model and # it doesn't make much sense to keep it separate self.likelihood = None # TODO: remove these variables from `state_dict()` so that when calling # `load_state_dict()`, only the hyperparameters are copied over self.register_buffer("datapoints", None) self.register_buffer("comparisons", None) self.register_buffer("D", None) self.register_buffer("DT", None) self.register_buffer("utility", None) self.register_buffer("covar_chol", None) self.register_buffer("likelihood_hess", None) self.register_buffer("hlcov_eye", None) self.register_buffer("covar", None) self.register_buffer("covar_inv", None) self.train_inputs = [] self.train_targets = None self.pred_cov_fac_need_update = True self.dim = None # See set_train_data for additional compatibility variables. # Note that the datapoints here are not transformed even if input_transform # is not None, to avoid double transformation during model fitting. # self.transform_inputs is called in `forward` self.set_train_data(datapoints, comparisons, update_model=False) # Set optional parameters # jitter to add for numerical stability self._jitter = kwargs.get("jitter", 1e-6) # Clamping z lim for better numerical stability. See self._calc_z for detail # norm_cdf(z=3) ~= 0.999, top 0.1% self._zlim = kwargs.get("zlim", 3) # Stopping criteria in scipy.optimize.fsolve used to find f_map in _update() # If None, set to 1e-6 by default in _update self._xtol = kwargs.get("xtol") # The maximum number of calls to the function in scipy.optimize.fsolve # If None, set to 100 by default in _update # If zero, then 100*(N+1) is used by default by fsolve; self._maxfev = kwargs.get("maxfev") # Set hyperparameters # Do not set the batch_shape explicitly so mean_module can operate in both modes # once fsolve used in _update can run in batch mode, we should explicitly set # the batch shape here self.mean_module = ConstantMean() # Do not optimize constant mean prior for param in self.mean_module.parameters(): param.requires_grad = False # set covariance module # the default outputscale here is only a rule of thumb, meant to keep # estimates away from scale value that would make Phi(f(x)) saturate # at 0 or 1 if covar_module is None: ls_prior = GammaPrior(1.2, 0.5) ls_prior_mode = (ls_prior.concentration - 1) / ls_prior.rate covar_module = ScaleKernel( RBFKernel( batch_shape=self.batch_shape, ard_num_dims=self.dim, lengthscale_prior=ls_prior, lengthscale_constraint=Positive( transform=None, initial_value=ls_prior_mode), ), outputscale_prior=SmoothedBoxPrior(a=1, b=4), ) self.covar_module = covar_module self._x0 = None # will store temporary results for warm-starting if self.datapoints is not None and self.comparisons is not None: self.to(dtype=self.datapoints.dtype, device=self.datapoints.device) # Find f_map for initial parameters with transformed datapoints transformed_dp = self.transform_inputs(datapoints) self._update(transformed_dp) self.to(self.datapoints)
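# Instantiation sketch, assuming the class above is botorch's PairwiseGP: ten 2-d datapoints and
# five comparisons, where each row [i, j] asserts that datapoint i is preferred over datapoint j.
import torch
from botorch.models.pairwise_gp import PairwiseGP

datapoints = torch.rand(10, 2)
comparisons = torch.tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
model = PairwiseGP(datapoints, comparisons)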
def sample_arch(self, START_BO, g, steps, hyperparams, og_flops, full_val_loss, target_flops=0): if args.slim: if target_flops == 0: parameterization = hyperparams.random_sample() layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) else: parameterization = np.ones(hyperparams.get_dim()) * args.lower_channel layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) else: # random sample to warmup history for MOBO if g < START_BO: if target_flops == 0: f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel else: f = args.lower_channel parameterization = np.ones(hyperparams.get_dim()) * f layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) # put the largest model into the history elif g == START_BO: if target_flops == 0: parameterization = np.ones(hyperparams.get_dim()) else: f = args.lower_channel parameterization = np.ones(hyperparams.get_dim()) * f layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) # MOBO else: # this is the scalarization (lambda_{FLOPs}) rand = torch.rand(1).cuda() # standardize data for building Gaussian Processes train_X = torch.FloatTensor(self.X).cuda() train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda() train_Y_loss = standardize(train_Y_loss) train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda() train_Y_cost = standardize(train_Y_cost) new_train_X = train_X # GP for the cross entropy loss gp_loss = SingleTaskGP(new_train_X, train_Y_loss) mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss) mll = mll.to('cuda') fit_gpytorch_model(mll) # GP for FLOPs # we use add-gp since FLOPs has additive structure (not exactly though) # the parameters for ScaleKernel and MaternKernel simply follow the default covar_module = AdditiveStructureKernel( ScaleKernel( MaternKernel( nu=2.5, lengthscale_prior=GammaPrior(3.0, 6.0), num_dims=1 ), outputscale_prior=GammaPrior(2.0, 0.15), ), num_dims=train_X.shape[1] ) gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module) mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost) mll = mll.to('cuda') fit_gpytorch_model(mll) # Build acquisition functions UCB_loss = UpperConfidenceBound(gp_loss, beta=0.1).cuda() UCB_cost = UpperConfidenceBound(gp_cost, beta=0.1).cuda() # Combine them via augmented Tchebyshev scalarization self.mobo_obj = RandAcquisition(UCB_loss).cuda() self.mobo_obj.setup(UCB_loss, UCB_cost, rand) # Bounds for the optimization variable (alpha) lower = torch.ones(new_train_X.shape[1])*args.lower_channel upper = torch.ones(new_train_X.shape[1])*args.upper_channel self.mobo_bounds = torch.stack([lower, upper]).cuda() # Pareto-aware sampling if args.pas: # Generate approximate Pareto front first costs = [] for i in range(len(self.population_data)): costs.append([self.population_data[i]['loss'], self.population_data[i]['ratio']]) costs = np.array(costs) efficient_mask = is_pareto_efficient(costs) costs = costs[efficient_mask] loss = costs[:, 0] flops = costs[:, 1] sorted_idx = np.argsort(flops) loss = loss[sorted_idx] flops = flops[sorted_idx] if flops[0] > args.lower_flops: flops = np.concatenate([[args.lower_flops], flops.reshape(-1)]) loss = np.concatenate([[8], loss.reshape(-1)]) else: flops = flops.reshape(-1) loss = loss.reshape(-1) if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss): flops = np.concatenate([flops.reshape(-1), [args.upper_flops]]) loss = np.concatenate([loss.reshape(-1), [full_val_loss]]) else: flops = flops.reshape(-1) loss = loss.reshape(-1) # Equation (4) in paper areas = (flops[1:]-flops[:-1])*(loss[:-1]-loss[1:]) # Quantize into 50 bins to sample from multinomial self.sampling_weights = np.zeros(50) k = 0 while k < len(flops) and flops[k] < args.lower_flops: k+=1 for i in range(50): lower = i/50. upper = (i+1)/50. if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops: continue cnt = 1 while ((k+1) < len(flops)) and upper > flops[k+1]: self.sampling_weights[i] += areas[k] cnt += 1 k += 1 if k < len(areas): self.sampling_weights[i] += areas[k] self.sampling_weights[i] /= cnt if np.sum(self.sampling_weights) == 0: self.sampling_weights = np.ones(50) if target_flops == 0: val = np.arange(0.01, 1, 0.02) chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights))) else: chosen_target_flops = target_flops # Binary search is here lower_bnd, upper_bnd = 0, 1 lmda = 0.5 for i in range(10): self.mobo_obj.rand = lmda parameterization, acq_value = optimize_acqf( self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000, ) parameterization = parameterization[0].cpu().numpy() layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget) ratio = sim_flops/og_flops if np.abs(ratio - chosen_target_flops) <= 0.02: break if args.baseline > 0: if ratio < chosen_target_flops: lower_bnd = lmda lmda = (lmda + upper_bnd) / 2 elif ratio > chosen_target_flops: upper_bnd = lmda lmda = (lmda + lower_bnd) / 2 else: if ratio < chosen_target_flops: upper_bnd = lmda lmda = (lmda + lower_bnd) / 2 elif ratio > chosen_target_flops: lower_bnd = lmda lmda = (lmda + upper_bnd) / 2 rand[0] = lmda writer.add_scalar('Binary search trials', i, steps) else: parameterization, acq_value = optimize_acqf( self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000, ) parameterization = parameterization[0].cpu().numpy() layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
def test_initialize_offset_prior(self): kernel = DownsamplingKernel() kernel.offset_prior = NormalPrior(1, 1) self.assertTrue(isinstance(kernel.offset_prior, NormalPrior)) kernel2 = DownsamplingKernel(offset_prior=GammaPrior(1, 1)) self.assertTrue(isinstance(kernel2.offset_prior, GammaPrior))
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[Likelihood] = None, covar_module: Optional[Module] = None, outcome_transform: Optional[OutcomeTransform] = None, ) -> None: r"""A single-task exact GP model. Args: train_X: A `batch_shape x n x d` tensor of training features. train_Y: A `batch_shape x n x m` tensor of training observations. likelihood: A likelihood. If omitted, use a standard GaussianLikelihood with inferred noise level. covar_module: The module computing the covariance (Kernel) matrix. If omitted, use a `MaternKernel`. outcome_transform: An outcome transform that is applied to the training data during instantiation and to the posterior during inference (that is, the `Posterior` obtained by calling `.posterior` on the model will be on the original scale). Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True) >>> model = SingleTaskGP(train_X, train_Y) """ if outcome_transform is not None: train_Y, _ = outcome_transform(train_Y) validate_input_scaling(train_X=train_X, train_Y=train_Y) self._validate_tensor_args(X=train_X, Y=train_Y) self._set_dimensions(train_X=train_X, train_Y=train_Y) train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y) if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=self._aug_batch_shape, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) else: self._is_custom_likelihood = True ExactGP.__init__(self, train_X, train_Y, likelihood) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) if covar_module is None: self.covar_module = ScaleKernel( MaternKernel( nu=2.5, ard_num_dims=train_X.shape[-1], batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) self._subset_batch_dict = { "likelihood.noise_covar.raw_noise": -2, "mean_module.constant": -2, "covar_module.raw_outputscale": -1, "covar_module.base_kernel.raw_lengthscale": -3, } else: self.covar_module = covar_module # TODO: Allow subsetting of other covar modules if outcome_transform is not None: self.outcome_transform = outcome_transform self.to(train_X)
def test_initialize_offset_prior(self): kernel = ExponentialDecayKernel() kernel.offset_prior = NormalPrior(1, 1) self.assertTrue(isinstance(kernel.offset_prior, NormalPrior)) kernel2 = ExponentialDecayKernel(offset_prior=GammaPrior(1, 1)) self.assertTrue(isinstance(kernel2.offset_prior, GammaPrior))
def test_initialize_power_prior(self): kernel = ExpDecayKernel() kernel.power_prior = NormalPrior(1, 1) self.assertTrue(isinstance(kernel.power_prior, NormalPrior)) kernel2 = ExpDecayKernel(power_prior=GammaPrior(1, 1)) self.assertTrue(isinstance(kernel2.power_prior, GammaPrior))
def _setup_multifidelity_covar_module( dim: int, aug_batch_shape: torch.Size, iteration_fidelity: Optional[int], data_fidelity: Optional[int], linear_truncated: bool, nu: float, ) -> Tuple[ScaleKernel, Dict]: """Helper function to get the covariance module and associated subset_batch_dict for the multifidelity setting. Args: dim: The dimensionality of the training data. aug_batch_shape: The output-augmented batch shape as defined in `BatchedMultiOutputGPyTorchModel`. iteration_fidelity: The column index for the training iteration fidelity parameter (optional). data_fidelity: The column index for the downsampling fidelity parameter (optional). linear_truncated: If True, use a `LinearTruncatedFidelityKernel` instead of the default kernel. nu: The smoothness parameter for the Matern kernel: either 1/2, 3/2, or 5/2. Only used when `linear_truncated=True`. Returns: The covariance module and subset_batch_dict. """ if iteration_fidelity is not None and iteration_fidelity < 0: iteration_fidelity = dim + iteration_fidelity if data_fidelity is not None and data_fidelity < 0: data_fidelity = dim + data_fidelity if linear_truncated: fidelity_dims = [ i for i in (iteration_fidelity, data_fidelity) if i is not None ] kernel = LinearTruncatedFidelityKernel( fidelity_dims=fidelity_dims, dimension=dim, nu=nu, batch_shape=aug_batch_shape, power_prior=GammaPrior(3.0, 3.0), ) else: active_dimsX = [ i for i in range(dim) if i not in {iteration_fidelity, data_fidelity} ] kernel = RBFKernel( ard_num_dims=len(active_dimsX), batch_shape=aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), active_dims=active_dimsX, ) additional_kernels = [] if iteration_fidelity is not None: exp_kernel = ExponentialDecayKernel( batch_shape=aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), active_dims=[iteration_fidelity], ) additional_kernels.append(exp_kernel) if data_fidelity is not None: ds_kernel = DownsamplingKernel( batch_shape=aug_batch_shape, offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), active_dims=[data_fidelity], ) additional_kernels.append(ds_kernel) kernel = ProductKernel(kernel, *additional_kernels) covar_module = ScaleKernel(kernel, batch_shape=aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15)) if linear_truncated: subset_batch_dict = { "covar_module.base_kernel.raw_power": -2, "covar_module.base_kernel.covar_module_unbiased.raw_lengthscale": -3, "covar_module.base_kernel.covar_module_biased.raw_lengthscale": -3, } else: subset_batch_dict = { "covar_module.base_kernel.kernels.0.raw_lengthscale": -3, "covar_module.base_kernel.kernels.1.raw_power": -2, "covar_module.base_kernel.kernels.1.raw_offset": -2, } if iteration_fidelity is not None: subset_batch_dict = { "covar_module.base_kernel.kernels.1.raw_lengthscale": -3, **subset_batch_dict, } if data_fidelity is not None: subset_batch_dict = { "covar_module.base_kernel.kernels.2.raw_power": -2, "covar_module.base_kernel.kernels.2.raw_offset": -2, **subset_batch_dict, } return covar_module, subset_batch_dict
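# Sketch of calling the helper above (assumes it is in scope): a 5-dimensional input whose last
# column is a downsampling (data) fidelity, with the default linear truncated kernel and nu=2.5.
import torch

covar_module, subset_batch_dict = _setup_multifidelity_covar_module(
    dim=5,
    aug_batch_shape=torch.Size([]),
    iteration_fidelity=None,
    data_fidelity=4,
    linear_truncated=True,
    nu=2.5,
)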
def __init__( self, train_X: Tensor, train_Y: Tensor, iteration_fidelity: Optional[int] = None, data_fidelity: Optional[int] = None, linear_truncated: bool = True, nu: float = 2.5, likelihood: Optional[Likelihood] = None, outcome_transform: Optional[OutcomeTransform] = None, ) -> None: self._init_args = { "iteration_fidelity": iteration_fidelity, "data_fidelity": data_fidelity, "linear_truncated": linear_truncated, "nu": nu, "outcome_transform": outcome_transform, } if iteration_fidelity is None and data_fidelity is None: raise UnsupportedError( "SingleTaskMultiFidelityGP requires at least one fidelity parameter." ) if iteration_fidelity is not None and iteration_fidelity < 0: iteration_fidelity = train_X.size(-1) + iteration_fidelity if data_fidelity is not None and data_fidelity < 0: data_fidelity = train_X.size(-1) + data_fidelity self._set_dimensions(train_X=train_X, train_Y=train_Y) if linear_truncated: fidelity_dims = [ i for i in (iteration_fidelity, data_fidelity) if i is not None ] kernel = LinearTruncatedFidelityKernel( fidelity_dims=fidelity_dims, dimension=train_X.size(-1), nu=nu, batch_shape=self._aug_batch_shape, power_prior=GammaPrior(3.0, 3.0), ) else: active_dimsX = [ i for i in range(train_X.size(-1)) if i not in {iteration_fidelity, data_fidelity} ] kernel = RBFKernel( ard_num_dims=len(active_dimsX), batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), active_dims=active_dimsX, ) additional_kernels = [] if iteration_fidelity is not None: exp_kernel = ExponentialDecayKernel( batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), active_dims=[iteration_fidelity], ) additional_kernels.append(exp_kernel) if data_fidelity is not None: ds_kernel = DownsamplingKernel( batch_shape=self._aug_batch_shape, offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), active_dims=[data_fidelity], ) additional_kernels.append(ds_kernel) kernel = ProductKernel(kernel, *additional_kernels) covar_module = ScaleKernel( kernel, batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) super().__init__( train_X=train_X, train_Y=train_Y, covar_module=covar_module, outcome_transform=outcome_transform, ) if linear_truncated: subset_batch_dict = { "covar_module.base_kernel.raw_power": -2, "covar_module.base_kernel.covar_module_unbiased.raw_lengthscale": -3, "covar_module.base_kernel.covar_module_biased.raw_lengthscale": -3, } else: subset_batch_dict = { "covar_module.base_kernel.kernels.0.raw_lengthscale": -3, "covar_module.base_kernel.kernels.1.raw_power": -2, "covar_module.base_kernel.kernels.1.raw_offset": -2, } if iteration_fidelity is not None: subset_batch_dict = { "covar_module.base_kernel.kernels.1.raw_lengthscale": -3, **subset_batch_dict, } if data_fidelity is not None: subset_batch_dict = { "covar_module.base_kernel.kernels.2.raw_power": -2, "covar_module.base_kernel.kernels.2.raw_offset": -2, **subset_batch_dict, } self._subset_batch_dict = { "likelihood.noise_covar.raw_noise": -2, "mean_module.constant": -2, "covar_module.raw_outputscale": -1, **subset_batch_dict, } self.to(train_X)
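# Instantiation sketch for the model above: a 4-dimensional problem whose last column is a data
# (downsampling) fidelity in [0, 1].
import torch

train_X = torch.rand(20, 4)
train_Y = torch.sin(train_X[:, :3]).sum(dim=-1, keepdim=True)
model = SingleTaskMultiFidelityGP(train_X, train_Y, data_fidelity=3)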
def __init__( self, datapoints: Tensor, comparisons: Tensor, covar_module: Optional[Module] = None, noise_module: Optional[HomoskedasticNoise] = None, **kwargs, ) -> None: r"""A probit-likelihood GP with Laplace approximation model. A probit-likelihood GP with Laplace approximation model that learns via pairwise comparison data. By default it uses a scaled-RBF kernel. Args: datapoints: A `batch_shape x n x d` tensor of training features. comparisons: A `batch_shape x m x 2` training comparisons; comparisons[i] is a noisy indicator suggesting the utility value of comparisons[i, 0]-th is greater than comparisons[i, 1]-th. covar_module: Covariance module noise_module: Noise module """ super().__init__() # Compatibility variables with fit_gpytorch_*: Dummy likelihood # Likelihood is tightly tied with this model and # it doesn't make much sense to keep it separate self.likelihood = None # TODO: remove these variables from `state_dict()` so that when calling # `load_state_dict()`, only the hyperparameters are copied over self.register_buffer("datapoints", None) self.register_buffer("comparisons", None) self.register_buffer("utility", None) self.register_buffer("covar_chol", None) self.register_buffer("likelihood_hess", None) self.register_buffer("hlcov_eye", None) self.register_buffer("covar", None) self.register_buffer("covar_inv", None) self.train_inputs = [] self.train_targets = None self.pred_cov_fac_need_update = True self._input_batch_shape = torch.Size() self.dim = None # will be set to match datapoints' dtype and device # since scipy.optimize.fsolve only works on cpu, it'd be the # fastest to fit the model on cpu and take samples on gpu to avoid # overhead of moving data back and forth during fitting time self.tkwargs = {} # See set_train_data for additional compatibility variables self.set_train_data(datapoints, comparisons, update_model=False) # Set optional parameters # jitter to add for numerical stability self._jitter = kwargs.get("jitter", 1e-6) # Clamping z lim for better numerical stability. See self._calc_z for detail # norm_cdf(z=3) ~= 0.999, top 0.1% self._zlim = kwargs.get("zlim", 3) # Stopping criteria in scipy.optimize.fsolve used to find f_map in _update() # If None, set to 1e-6 by default in _update self._xtol = kwargs.get("xtol") # The maximum number of calls to the function in scipy.optimize.fsolve # If None, set to 100 by default in _update # If zero, then 100*(N+1) is used by default by fsolve; self._maxfev = kwargs.get("maxfev") # Set hyperparameters # Do not set the batch_shape explicitly so mean_module can operate in both modes # once fsolve used in _update can run in batch mode, we should explicitly set # the batch shape here self.mean_module = ConstantMean() # Do not optimize constant mean prior for param in self.mean_module.parameters(): param.requires_grad = False # set noise module if noise_module is None: noise_module = HomoskedasticNoise( noise_prior=SmoothedBoxPrior(-5, 5, 0.5, transform=torch.log), noise_constraint=GreaterThan(1e-4), # if None, 1e-4 by default batch_shape=self._input_batch_shape, ) self.noise_module = noise_module # set covariance module if covar_module is None: ls_prior = GammaPrior(1.2, 0.5) ls_prior_mode = (ls_prior.concentration - 1) / ls_prior.rate covar_module = RBFKernel( batch_shape=self._input_batch_shape, ard_num_dims=self.dim, lengthscale_prior=ls_prior, lengthscale_constraint=Positive(transform=None, initial_value=ls_prior_mode), ) self.covar_module = covar_module self._x0 = None # will store temporary results for warm-starting if self.datapoints is not None and self.comparisons is not None: self.to(dtype=self.datapoints.dtype, device=self.datapoints.device) self._update() # Find f_map for initial parameters self.to(self.datapoints)
def __init__( self, train_X: Tensor, train_Y: Tensor, task_feature: int, output_tasks: Optional[List[int]] = None, rank: Optional[int] = None, ) -> None: r"""Multi-Task GP model using an ICM kernel, inferring observation noise. Args: train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor of training data. One of the columns should contain the task features (see `task_feature` argument). train_Y: A `n` or `b x n` (batch mode) tensor of training observations. task_feature: The index of the task feature (`-d <= task_feature <= d`). output_tasks: A list of task indices for which to compute model outputs. If omitted, return outputs for all task indices. rank: The rank to be used for the index kernel. If omitted, use a full rank (i.e. number of tasks) kernel. Example: >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2) >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1) >>> train_X = torch.cat([ >>> torch.cat([X1, i1], -1), torch.cat([X2, i2], -1), >>> ]) >>> train_Y = torch.cat([f1(X1), f2(X2)]) >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1) """ if train_X.ndimension() != 2: # Currently, batch mode MTGPs are blocked upstream in GPyTorch raise ValueError(f"Unsupported shape {train_X.shape} for train_X.") d = train_X.shape[-1] - 1 if not (-d <= task_feature <= d): raise ValueError(f"Must have that -{d} <= task_feature <= {d}") all_tasks = train_X[:, task_feature].unique().to( dtype=torch.long).tolist() if output_tasks is None: output_tasks = all_tasks else: if any(t not in all_tasks for t in output_tasks): raise RuntimeError( "All output tasks must be present in input data.") self._output_tasks = output_tasks # TODO (T41270962): Support task-specific noise levels in likelihood likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05)) # construct indexer to be used in forward self._task_feature = task_feature self._base_idxr = torch.arange(d) self._base_idxr[task_feature:] += 1 # exclude task feature super().__init__(train_inputs=train_X, train_targets=train_Y, likelihood=likelihood) self.mean_module = ConstantMean() self.covar_module = ScaleKernel( base_kernel=MaternKernel(nu=2.5, ard_num_dims=d, lengthscale_prior=GammaPrior(3.0, 6.0)), outputscale_prior=GammaPrior(2.0, 0.15), ) num_tasks = len(all_tasks) self._rank = rank if rank is not None else num_tasks # TODO: Add LKJ prior for the index kernel self.task_covar_module = IndexKernel(num_tasks=num_tasks, rank=self._rank) self.to(train_X)