def __init__( self, decomposition: Dict[str, List[int]], batch_shape: torch.Size ) -> None: super().__init__(batch_shape=batch_shape) self.decomposition = decomposition num_param = len(next(iter(decomposition.values()))) for active_parameters in decomposition.values(): # check that the number of parameters is the same in each decomposition if len(active_parameters) != num_param: raise ValueError( "num of parameters needs to be same across all contexts" ) self._indexers = { context: torch.tensor(active_params) for context, active_params in self.decomposition.items() } self.base_kernel = MaternKernel( nu=2.5, ard_num_dims=num_param, batch_shape=batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ) self.kernel_dict = {} # scaled kernel for each parameter space partition for context in list(decomposition.keys()): self.kernel_dict[context] = ScaleKernel( base_kernel=self.base_kernel, outputscale_prior=GammaPrior(2.0, 15.0) ) self.kernel_dict = ModuleDict(self.kernel_dict)
def get_warping_transform( d: int, task_feature: Optional[int] = None, ) -> Warp: """Construct input warping transform. Args: d: The dimension of the input, including task features task_feature: the index of the task feature Returns: The input warping transform. """ indices = list(range(d)) # apply warping to all non-task features, including fidelity features if task_feature is not None: del indices[task_feature] # Note: this currently uses the same warping functions for all tasks tf = Warp( indices=indices, # use an uninformative prior with maximum log probability at 1 concentration1_prior=GammaPrior(1.01, 0.01), concentration0_prior=GammaPrior(1.01, 0.01), ) return tf
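# Usage sketch for get_warping_transform above: warp all non-task columns of a 3-dimensional
# input whose task feature sits in the last column. Columns 0 and 1 get the learnable
# (Kumaraswamy-CDF) warping; column 2, the task feature, is passed through unchanged.
warp_tf = get_warping_transform(d=3, task_feature=2)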
def __init__( self, train_X: Tensor, train_Y: Tensor, nu: float = 2.5, train_iteration_fidelity: bool = True, train_data_fidelity: bool = True, likelihood: Optional[Likelihood] = None, ) -> None: if not train_iteration_fidelity and not train_data_fidelity: raise UnsupportedError( "You should have at least one fidelity parameter.") self._set_dimensions(train_X=train_X, train_Y=train_Y) kernel = LinearTruncatedFidelityKernel( nu=nu, dimension=train_X.shape[-1], train_iteration_fidelity=train_iteration_fidelity, train_data_fidelity=train_data_fidelity, batch_shape=self._aug_batch_shape, power_prior=GammaPrior(3.0, 3.0), ) covar_module = ScaleKernel( kernel, batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) super().__init__(train_X=train_X, train_Y=train_Y, covar_module=covar_module) self.to(train_X)
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[Likelihood] = None, covar_module: Optional[Module] = None, ) -> None: r"""A single-task exact GP model. Args: train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training features. train_Y: A `n x m` or `batch_shape x n x m` (batch mode) tensor of training observations. likelihood: A likelihood. If omitted, use a standard GaussianLikelihood with inferred noise level. covar_module: The module computing the covariance (kernel) matrix. If omitted, use a `MaternKernel`. Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True) >>> model = SingleTaskGP(train_X, train_Y) """ validate_input_scaling(train_X=train_X, train_Y=train_Y) self._validate_tensor_args(X=train_X, Y=train_Y) self._set_dimensions(train_X=train_X, train_Y=train_Y) train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y) if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=self._aug_batch_shape, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) else: self._is_custom_likelihood = True ExactGP.__init__(self, train_X, train_Y, likelihood) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) if covar_module is None: self.covar_module = ScaleKernel( MaternKernel( nu=2.5, ard_num_dims=train_X.shape[-1], batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) else: self.covar_module = covar_module self.to(train_X)
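# Quick numeric check (a sketch, not part of the constructor above): for a Gamma(concentration, rate)
# prior with concentration > 1, the mode is (concentration - 1) / rate, so the GammaPrior(1.1, 0.05)
# noise prior used above has its mode at 0.1 / 0.05 = 2.0, which is exactly the initial_value
# handed to the GreaterThan noise constraint.
from gpytorch.priors import GammaPrior

noise_prior = GammaPrior(1.1, 0.05)
print(float((noise_prior.concentration - 1) / noise_prior.rate))  # ~2.0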
def __init__( self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor, outcome_transform: Optional[OutcomeTransform] = None, ) -> None: r"""A single-task exact GP model using fixed noise levels. Args: train_X: A `batch_shape x n x d` tensor of training features. train_Y: A `batch_shape x n x m` tensor of training observations. train_Yvar: A `batch_shape x n x m` tensor of observed measurement noise. outcome_transform: An outcome transform that is applied to the training data during instantiation and to the posterior during inference (that is, the `Posterior` obtained by calling `.posterior` on the model will be on the original scale). Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True) >>> train_Yvar = torch.full_like(train_Y, 0.2) >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar) """ if outcome_transform is not None: train_Y, train_Yvar = outcome_transform(train_Y, train_Yvar) validate_input_scaling(train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar) self._validate_tensor_args(X=train_X, Y=train_Y, Yvar=train_Yvar) self._set_dimensions(train_X=train_X, train_Y=train_Y) train_X, train_Y, train_Yvar = self._transform_tensor_args( X=train_X, Y=train_Y, Yvar=train_Yvar) likelihood = FixedNoiseGaussianLikelihood( noise=train_Yvar, batch_shape=self._aug_batch_shape) ExactGP.__init__(self, train_inputs=train_X, train_targets=train_Y, likelihood=likelihood) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) self.covar_module = ScaleKernel( base_kernel=MaternKernel( nu=2.5, ard_num_dims=train_X.shape[-1], batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) if outcome_transform is not None: self.outcome_transform = outcome_transform self._subset_batch_dict = { "mean_module.constant": -2, "covar_module.raw_outputscale": -1, "covar_module.base_kernel.raw_lengthscale": -3, } self.to(train_X)
def construct_inputs(cls, training_data: TrainingData, **kwargs) -> Dict[str, Any]: r"""Construct kwargs for the `Model` from `TrainingData` and other options. Args: training_data: `TrainingData` container with data for single outcome or for multiple outcomes for batched multi-output case. **kwargs: Additional options for the model that pertain to the training data, including: - `task_features`: Indices of the input columns containing the task features (expected list of length 1), - `task_covar_prior`: A GPyTorch `Prior` object to use as prior on the cross-task covariance matrix, - `prior_config`: A dict representing a prior config, should only be used if `task_covar_prior` is not passed directly. Should contain: `use_LKJ_prior` (whether to use LKJ prior) and `eta` (eta value, float), - `rank`: The rank of the cross-task covariance matrix. """ task_features = kwargs.pop("task_features", None) if task_features is None: raise ValueError(f"`task_features` required for {cls.__name__}.") task_feature = task_features[0] inputs = { "train_X": training_data.X, "train_Y": training_data.Y, "task_feature": task_feature, "rank": kwargs.get("rank"), } prior = kwargs.get("task_covar_prior") prior_config = kwargs.get("prior_config") if prior and prior_config: raise ValueError( "Only one of `task_covar_prior` and `prior_config` arguments expected.") if prior_config: if not prior_config.get("use_LKJ_prior"): raise ValueError( "Currently only config for LKJ prior is supported.") all_tasks, _, _ = MultiTaskGP.get_all_tasks( training_data.X, task_feature) num_tasks = len(all_tasks) sd_prior = GammaPrior(1.0, 0.15) sd_prior._event_shape = torch.Size([num_tasks]) eta = prior_config.get("eta", 0.5) if not isinstance(eta, float) and not isinstance(eta, int): raise ValueError( f"eta must be a real number, your eta was {eta}.") prior = LKJCovariancePrior(num_tasks, eta, sd_prior) inputs["task_covar_prior"] = prior return inputs
def __init__(self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[Likelihood] = None) -> None: r"""A single-task exact GP model. Args: train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training features. train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of training observations. likelihood: A likelihood. If omitted, use a standard GaussianLikelihood with inferred noise level. Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1]) >>> model = SingleTaskGP(train_X, train_Y) """ ard_num_dims = train_X.shape[-1] train_X, train_Y, _ = self._set_dimensions(train_X=train_X, train_Y=train_Y) train_X, train_Y, _ = multioutput_to_batch_mode_transform( train_X=train_X, train_Y=train_Y, num_outputs=self._num_outputs) if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=self._aug_batch_shape, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) else: self._likelihood_state_dict = deepcopy(likelihood.state_dict()) ExactGP.__init__(self, train_X, train_Y, likelihood) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) self.covar_module = ScaleKernel( MaternKernel( nu=2.5, ard_num_dims=ard_num_dims, batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) self.to(train_X)
def __init__( self, train_x: torch.Tensor, train_y: torch.Tensor, inducing_points: torch.Tensor, scales: Union[torch.Tensor, float] = 1.0, mean_module: Optional[Mean] = None, covar_module: Optional[Kernel] = None, fixed_prior_mean: Optional[float] = None, ) -> None: variational_distribution = CholeskyVariationalDistribution( inducing_points.size(0)) variational_distribution.to(train_x) variational_strategy = VariationalStrategy( model=self, inducing_points=inducing_points, variational_distribution=variational_distribution, learn_inducing_locations=False, ) super(MixedDerivativeVariationalGP, self).__init__(variational_strategy) # Set the mean module, using the default if not specified if mean_module is None: self.mean_module = ConstantMeanPartialObsGrad() else: self.mean_module = mean_module if fixed_prior_mean is not None: self.mean_module.constant.requires_grad_(False) self.mean_module.constant.copy_( torch.tensor([fixed_prior_mean], dtype=train_x.dtype)) if covar_module is None: self.base_kernel = RBFKernelPartialObsGrad( ard_num_dims=train_x.shape[-1] - 1, lengthscale_prior=GammaPrior(3.0, 6.0 / scales), ) self.covar_module = ScaleKernel(self.base_kernel, outputscale_prior=GammaPrior( 2.0, 0.15)) else: self.covar_module = covar_module self._num_outputs = 1 self.train_inputs = (train_x, ) self.train_targets = train_y self(train_x) # Necessary for CholeskyVariationalDistribution
def __init__(self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor) -> None: r"""A single-task exact GP model using fixed noise levels. Args: train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training features. train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of training observations. train_Yvar: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of observed measurement noise. Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1]) >>> train_Yvar = torch.full_like(train_Y, 0.2) >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar) """ ard_num_dims = train_X.shape[-1] train_X, train_Y, train_Yvar = self._set_dimensions( train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar ) train_X, train_Y, train_Yvar = multioutput_to_batch_mode_transform( train_X=train_X, train_Y=train_Y, num_outputs=self._num_outputs, train_Yvar=train_Yvar, ) likelihood = FixedNoiseGaussianLikelihood( noise=train_Yvar, batch_shape=self._aug_batch_shape ) ExactGP.__init__( self, train_inputs=train_X, train_targets=train_Y, likelihood=likelihood ) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) self.covar_module = ScaleKernel( base_kernel=MaternKernel( nu=2.5, ard_num_dims=ard_num_dims, batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) self.to(train_X)
def test_sample_all_priors(self, cuda=False): device = torch.device("cuda" if cuda else "cpu") for dtype in (torch.float, torch.double): train_X = torch.rand(3, 5, device=device, dtype=dtype) train_Y = torch.rand(3, 1, device=device, dtype=dtype) model = SingleTaskGP(train_X=train_X, train_Y=train_Y) mll = ExactMarginalLogLikelihood(model.likelihood, model) mll.to(device=device, dtype=dtype) original_state_dict = dict(deepcopy(mll.model.state_dict())) sample_all_priors(model) # make sure one of the hyperparameters changed self.assertTrue( dict(model.state_dict())["likelihood.noise_covar.raw_noise"] != original_state_dict["likelihood.noise_covar.raw_noise"]) # check that lengthscales are all different ls = model.covar_module.base_kernel.raw_lengthscale.view( -1).tolist() self.assertTrue(all(ls[0] != ls[i] for i in range(1, len(ls)))) # change one of the priors to SmoothedBoxPrior model.covar_module = ScaleKernel( MaternKernel( nu=2.5, ard_num_dims=model.train_inputs[0].shape[-1], batch_shape=model._aug_batch_shape, lengthscale_prior=SmoothedBoxPrior(3.0, 6.0), ), batch_shape=model._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) original_state_dict = dict(deepcopy(mll.model.state_dict())) with warnings.catch_warnings( record=True) as ws, settings.debug(True): sample_all_priors(model) self.assertEqual(len(ws), 1) self.assertTrue("rsample" in str(ws[0].message)) # the lengthscale should not have changed because sampling is # not implemented for SmoothedBoxPrior self.assertTrue( torch.equal( dict(model.state_dict()) ["covar_module.base_kernel.raw_lengthscale"], original_state_dict[ "covar_module.base_kernel.raw_lengthscale"], )) # set setting_closure to None and make sure RuntimeError is raised prior_tuple = model.likelihood.noise_covar._priors["noise_prior"] model.likelihood.noise_covar._priors["noise_prior"] = ( prior_tuple[0], prior_tuple[1], None, ) with self.assertRaises(RuntimeError): sample_all_priors(model)
def __init__( self, train_X: Tensor, train_Y: Tensor, train_iteration_fidelity: bool = True, train_data_fidelity: bool = True, likelihood: Optional[Likelihood] = None, ) -> None: train_X, train_Y, _ = self._set_dimensions(train_X=train_X, train_Y=train_Y) num_fidelity = train_iteration_fidelity + train_data_fidelity ard_num_dims = train_X.shape[-1] - num_fidelity active_dimsX = list(range(train_X.shape[-1] - num_fidelity)) rbf_kernel = RBFKernel( ard_num_dims=ard_num_dims, batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), active_dims=active_dimsX, ) exp_kernel = ExpDecayKernel( batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), ) ds_kernel = DownsamplingKernel( batch_shape=self._aug_batch_shape, offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), ) if train_iteration_fidelity and train_data_fidelity: active_dimsS1 = [train_X.shape[-1] - 1] active_dimsS2 = [train_X.shape[-1] - 2] exp_kernel.active_dims = torch.tensor(active_dimsS1) ds_kernel.active_dims = torch.tensor(active_dimsS2) kernel = rbf_kernel * exp_kernel * ds_kernel elif train_iteration_fidelity or train_data_fidelity: active_dimsS = [train_X.shape[-1] - 1] if train_iteration_fidelity: exp_kernel.active_dims = torch.tensor(active_dimsS) kernel = rbf_kernel * exp_kernel else: ds_kernel.active_dims = torch.tensor(active_dimsS) kernel = rbf_kernel * ds_kernel else: raise UnsupportedError( "You should have at least one fidelity parameter.") covar_module = ScaleKernel( kernel, batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) super().__init__(train_X=train_X, train_Y=train_Y, covar_module=covar_module) self.to(train_X)
def __init__(self, ard_dims, kernel_transform): # Default values from botorch/models/gp_regression.py # GammaPrior(3.0, 6.0) means alpha=3, beta=6 # nu=2.5 means Matern 5/2 kernel (1/2, 3/2, 5/2) # E[X~Gamma(alpha,beta)] = alpha/beta # Var[X~Gamma(alpha,beta)] = alpha/(beta^2) core_kernel = gpytorch.kernels.MaternKernel( nu=2.5, ard_num_dims=ard_dims, lengthscale_prior=GammaPrior(3.0, 6.0), param_transform=None) super(MMaternKernel, self).__init__() self.kernel_transform = kernel_transform
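# Sanity check (sketch) of the comments above: a Gamma(alpha=3, beta=6) prior has mean
# alpha/beta = 0.5 and variance alpha/beta^2 ≈ 0.083, so the default lengthscale prior
# concentrates its mass around 0.5 on normalized inputs.
from gpytorch.priors import GammaPrior

ls_prior = GammaPrior(3.0, 6.0)
print(float(ls_prior.mean), float(ls_prior.variance))  # 0.5 0.0833...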
def fit_gp(self) -> None: """ Re-fits the GP using the most up to date data. """ noise_prior = GammaPrior(1.1, 0.5) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=[], noise_constraint=GreaterThan( # 0.000005, # minimum observation noise assumed in the GP model 0.0001, transform=None, initial_value=noise_prior_mode, ), ) self.model = SingleTaskGP( self.X, self.Y, likelihood, outcome_transform=Standardize(m=1) ) mll = ExactMarginalLogLikelihood(self.model.likelihood, self.model) fit_gpytorch_model(mll) # dummy computation to be safe with gp fit try: dummy = torch.rand( (1, self.q, self.dim), dtype=self.dtype, device=self.device ) _ = self.model.posterior(dummy).mean except RuntimeError as err: if self.fit_count < 5: self.fit_count += 1 self.Y = self.Y + torch.randn_like(self.Y) * 0.001 self.fit_gp() else: raise err self.fit_count = 0 self.passed = False
def make_kernel(kernel_type, ard_dims, svae=None): kernel_transform = None if svae is not None: assert (kernel_type in ['KL', 'uKL', 'LKL'] or kernel_type.startswith('SVAE')) latent = False if kernel_type == 'KL' else True kernel_transform = KernelTransform(svae, latent=latent) min_outsc = GPConstants.MIN_OUTPUTSCALE max_outsc = GPConstants.MAX_OUTPUTSCALE hypers_unrestricted = (kernel_type == 'uSE' or kernel_type == 'uMatern' or kernel_type == 'uKL') if hypers_unrestricted: min_outsc = GPConstants.MIN_UNRESTR_OUTPUTSCALE max_outsc = GPConstants.MAX_UNRESTR_OUTPUTSCALE outsc_constr = SaneInterval( lower_bound=torch.as_tensor(min_outsc), upper_bound=torch.as_tensor(max_outsc), transform=torch.sigmoid, initial_value=inv_sigmoid(torch.as_tensor(0.9999 / max_outsc))) # ~1.0 if kernel_type.endswith('SE'): covar_module = gpytorch.kernels.ScaleKernel( RRBFKernel(ard_dims, kernel_transform, hypers_unrestricted), outputscale_constraint=outsc_constr) elif kernel_type.endswith('Matern'): covar_module = gpytorch.kernels.ScaleKernel( MMaternKernel(ard_dims, kernel_transform), outputscale_prior=GammaPrior(2.0, 0.15), outputscale_constraint=outsc_constr) elif kernel_type.endswith( 'KL'): # symmetrized KL (in latent space for LKL) assert (kernel_type == 'KL' or kernel_type == 'uKL' or kernel_type == 'LKL') covar_module = gpytorch.kernels.ScaleKernel( KLKernel(kernel_transform), outputscale_constraint=outsc_constr) else: logging.error('Unsupported kernel type: %s', kernel_type) assert (False) return covar_module, kernel_transform
def initialize_model(self, train_X, train_Y, state_dict=None): """Initialise model for BO.""" # From: https://github.com/pytorch/botorch/issues/179 noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate MIN_INFERRED_NOISE_LEVEL = 1e-3 likelihood = GaussianLikelihood( noise_prior=noise_prior, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) # train_x = self.scale_to_0_1_bounds(train_X) train_Y = standardize(train_Y) gp = SingleTaskGP(train_X, train_Y, likelihood=likelihood) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) # load state dict if it is passed if state_dict is not None: gp.load_state_dict(state_dict) return mll, gp
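# Usage sketch for initialize_model above. `opt` is a hypothetical instance of the surrounding
# optimizer class; fit_gpytorch_model is botorch's standard fitting helper (also used elsewhere
# in these snippets).
import torch
from botorch.fit import fit_gpytorch_model

train_X = torch.rand(20, 3)
train_Y = torch.rand(20, 1)
mll, gp = opt.initialize_model(train_X, train_Y)
fit_gpytorch_model(mll)
posterior_mean = gp.posterior(torch.rand(5, 3)).mean  # 5 x 1 tensor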
def __init__( self, train_X: Tensor, train_Y: Tensor, task_feature: int, covar_module: Optional[Module] = None, task_covar_prior: Optional[Prior] = None, output_tasks: Optional[List[int]] = None, rank: Optional[int] = None, input_transform: Optional[InputTransform] = None, outcome_transform: Optional[OutcomeTransform] = None, ) -> None: r"""Multi-Task GP model using an ICM kernel, inferring observation noise. Args: train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor of training data. One of the columns should contain the task features (see `task_feature` argument). train_Y: A `n x 1` or `b x n x 1` (batch mode) tensor of training observations. task_feature: The index of the task feature (`-d <= task_feature <= d`). output_tasks: A list of task indices for which to compute model outputs. If omitted, return outputs for all task indices. rank: The rank to be used for the index kernel. If omitted, use a full rank (i.e. number of tasks) kernel. task_covar_prior: A Prior on the task covariance matrix. Must operate on p.s.d. matrices. A common prior for this is the `LKJ` prior. input_transform: An input transform that is applied in the model's forward pass. Example: >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2) >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1) >>> train_X = torch.cat([ >>> torch.cat([X1, i1], -1), torch.cat([X2, i2], -1), >>> ]) >>> train_Y = torch.cat([f1(X1), f2(X2)]).unsqueeze(-1) >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1) """ with torch.no_grad(): transformed_X = self.transform_inputs( X=train_X, input_transform=input_transform) self._validate_tensor_args(X=transformed_X, Y=train_Y) all_tasks, task_feature, d = self.get_all_tasks( transformed_X, task_feature, output_tasks) if outcome_transform is not None: train_Y, _ = outcome_transform(train_Y) # squeeze output dim train_Y = train_Y.squeeze(-1) if output_tasks is None: output_tasks = all_tasks else: if set(output_tasks) - set(all_tasks): raise RuntimeError( "All output tasks must be present in input data.") self._output_tasks = output_tasks self._num_outputs = len(output_tasks) # TODO (T41270962): Support task-specific noise levels in likelihood likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05)) # construct indexer to be used in forward self._task_feature = task_feature self._base_idxr = torch.arange(d) self._base_idxr[task_feature:] += 1 # exclude task feature super().__init__(train_inputs=train_X, train_targets=train_Y, likelihood=likelihood) self.mean_module = ConstantMean() if covar_module is None: self.covar_module = ScaleKernel( base_kernel=MaternKernel(nu=2.5, ard_num_dims=d, lengthscale_prior=GammaPrior( 3.0, 6.0)), outputscale_prior=GammaPrior(2.0, 0.15), ) else: self.covar_module = covar_module num_tasks = len(all_tasks) self._rank = rank if rank is not None else num_tasks self.task_covar_module = IndexKernel(num_tasks=num_tasks, rank=self._rank, prior=task_covar_prior) if input_transform is not None: self.input_transform = input_transform if outcome_transform is not None: self.outcome_transform = outcome_transform self.to(train_X)
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[MultitaskGaussianLikelihood] = None, data_covar_module: Optional[Module] = None, task_covar_prior: Optional[Prior] = None, rank: Optional[int] = None, input_transform: Optional[InputTransform] = None, outcome_transform: Optional[OutcomeTransform] = None, **kwargs: Any, ) -> None: r"""Multi-task GP with Kronecker structure, using a simple ICM kernel. Args: train_X: A `batch_shape x n x d` tensor of training features. train_Y: A `batch_shape x n x m` tensor of training observations. likelihood: A `MultitaskGaussianLikelihood`. If omitted, uses a `MultitaskGaussianLikelihood` with a `GammaPrior(1.1, 0.05)` noise prior. data_covar_module: The module computing the covariance (Kernel) matrix in data space. If omitted, use a `MaternKernel`. task_covar_prior : A Prior on the task covariance matrix. Must operate on p.s.d. matrices. A common prior for this is the `LKJ` prior. If omitted, uses `LKJCovariancePrior` with `eta` parameter as specified in the keyword arguments (if not specified, use `eta=1.5`). rank: The rank of the ICM kernel. If omitted, use a full rank kernel. kwargs: Additional arguments to override default settings of priors, including: - eta: The eta parameter on the default LKJ task_covar_prior. A value of 1.0 is uninformative, values <1.0 favor stronger correlations (in magnitude), correlations vanish as eta -> inf. - sd_prior: A scalar prior over nonnegative numbers, which is used for the default LKJCovariancePrior task_covar_prior. - likelihood_rank: The rank of the task covariance matrix to fit. Defaults to 0 (which corresponds to a diagonal covariance matrix). Example: >>> train_X = torch.rand(10, 2) >>> train_Y = torch.cat([f_1(X), f_2(X)], dim=-1) >>> model = KroneckerMultiTaskGP(train_X, train_Y) """ with torch.no_grad(): transformed_X = self.transform_inputs( X=train_X, input_transform=input_transform) if outcome_transform is not None: train_Y, _ = outcome_transform(train_Y) self._validate_tensor_args(X=transformed_X, Y=train_Y) self._num_outputs = train_Y.shape[-1] batch_shape, ard_num_dims = train_X.shape[:-2], train_X.shape[-1] num_tasks = train_Y.shape[-1] if rank is None: rank = num_tasks if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = MultitaskGaussianLikelihood( num_tasks=num_tasks, batch_shape=batch_shape, noise_prior=noise_prior, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), rank=kwargs.get("likelihood_rank", 0), ) if task_covar_prior is None: task_covar_prior = LKJCovariancePrior( n=num_tasks, eta=torch.tensor(kwargs.get("eta", 1.5)).to(train_X), sd_prior=kwargs.get( "sd_prior", SmoothedBoxPrior(math.exp(-6), math.exp(1.25), 0.05), ), ) super().__init__(train_X, train_Y, likelihood) self.mean_module = MultitaskMean( base_means=ConstantMean(batch_shape=batch_shape), num_tasks=num_tasks) if data_covar_module is None: data_covar_module = MaternKernel( nu=2.5, ard_num_dims=ard_num_dims, lengthscale_prior=GammaPrior(3.0, 6.0), batch_shape=batch_shape, ) else: data_covar_module = data_covar_module self.covar_module = MultitaskKernel( data_covar_module=data_covar_module, num_tasks=num_tasks, rank=rank, batch_shape=batch_shape, task_covar_prior=task_covar_prior, ) if outcome_transform is not None: self.outcome_transform = outcome_transform if input_transform is not None: self.input_transform = input_transform self.to(train_X)
def __init__( self, decomposition: Dict[str, List[int]], batch_shape: torch.Size, train_embedding: bool = True, cat_feature_dict: Optional[Dict] = None, embs_feature_dict: Optional[Dict] = None, embs_dim_list: Optional[List[int]] = None, context_weight_dict: Optional[Dict] = None, device: Optional[torch.device] = None, ) -> None: super().__init__(batch_shape=batch_shape) self.decomposition = decomposition self.batch_shape = batch_shape self.train_embedding = train_embedding self.device = device num_param = len(next(iter(decomposition.values()))) self.context_list = list(decomposition.keys()) self.num_contexts = len(self.context_list) # get parameter space decomposition for active_parameters in decomposition.values(): # check that the number of parameters is the same in each decomposition if len(active_parameters) != num_param: raise ValueError( "num of parameters needs to be same across all contexts") self._indexers = { context: torch.tensor(active_params, device=self.device) for context, active_params in self.decomposition.items() } # get context features and set emb dim self.context_cat_feature = None self.context_emb_feature = None self.n_embs = 0 self.emb_weight_matrix_list = None self.emb_dims = None self._set_context_features( cat_feature_dict=cat_feature_dict, embs_feature_dict=embs_feature_dict, embs_dim_list=embs_dim_list, ) # construct embedding layer if train_embedding: self._set_emb_layers() # task covariance matrix self.task_covar_module = MaternKernel( nu=2.5, ard_num_dims=self.n_embs, batch_shape=batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ) # base kernel self.base_kernel = MaternKernel( nu=2.5, ard_num_dims=num_param, batch_shape=batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ) # outputscales for each context (note this is like sqrt of outputscale) self.context_weight = None if context_weight_dict is None: outputscale_list = torch.zeros(*batch_shape, self.num_contexts, device=self.device) else: outputscale_list = torch.zeros(*batch_shape, 1, device=self.device) self.context_weight = torch.tensor( [context_weight_dict[c] for c in self.context_list], device=self.device) self.register_parameter(name="raw_outputscale_list", parameter=torch.nn.Parameter(outputscale_list)) self.register_prior( "outputscale_list_prior", GammaPrior(2.0, 15.0), lambda m: m.outputscale_list, lambda m, v: m._set_outputscale_list(v), ) self.register_constraint("raw_outputscale_list", Positive())
def __init__( # noqa C901 self, fidelity_dims: List[int], dimension: Optional[int] = None, power_prior: Optional[Prior] = None, power_constraint: Optional[Interval] = None, nu: float = 2.5, lengthscale_prior_unbiased: Optional[Prior] = None, lengthscale_prior_biased: Optional[Prior] = None, lengthscale_constraint_unbiased: Optional[Interval] = None, lengthscale_constraint_biased: Optional[Interval] = None, covar_module_unbiased: Optional[Kernel] = None, covar_module_biased: Optional[Kernel] = None, **kwargs: Any, ) -> None: if dimension is None and kwargs.get("active_dims") is None: raise UnsupportedError( "Must specify dimension when not specifying active_dims.") n_fidelity = len(fidelity_dims) if len(set(fidelity_dims)) != n_fidelity: raise ValueError("fidelity_dims must not have repeated elements") if n_fidelity not in {1, 2}: raise UnsupportedError( "LinearTruncatedFidelityKernel accepts either one or two " "fidelity parameters.") if nu not in {0.5, 1.5, 2.5}: raise ValueError("nu must be one of 0.5, 1.5, or 2.5") super().__init__(**kwargs) self.fidelity_dims = fidelity_dims if power_constraint is None: power_constraint = Positive() if lengthscale_prior_unbiased is None: lengthscale_prior_unbiased = GammaPrior(3, 6) if lengthscale_prior_biased is None: lengthscale_prior_biased = GammaPrior(6, 2) if lengthscale_constraint_unbiased is None: lengthscale_constraint_unbiased = Positive() if lengthscale_constraint_biased is None: lengthscale_constraint_biased = Positive() self.register_parameter( name="raw_power", parameter=torch.nn.Parameter(torch.zeros(*self.batch_shape, 1)), ) self.register_constraint("raw_power", power_constraint) if power_prior is not None: self.register_prior( "power_prior", power_prior, lambda: self.power, lambda v: self._set_power(v), ) if self.active_dims is not None: dimension = len(self.active_dims) if covar_module_unbiased is None: covar_module_unbiased = MaternKernel( nu=nu, batch_shape=self.batch_shape, lengthscale_prior=lengthscale_prior_unbiased, ard_num_dims=dimension - n_fidelity, lengthscale_constraint=lengthscale_constraint_unbiased, ) if covar_module_biased is None: covar_module_biased = MaternKernel( nu=nu, batch_shape=self.batch_shape, lengthscale_prior=lengthscale_prior_biased, ard_num_dims=dimension - n_fidelity, lengthscale_constraint=lengthscale_constraint_biased, ) self.covar_module_unbiased = covar_module_unbiased self.covar_module_biased = covar_module_biased
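# Instantiation sketch for the kernel above: a 6-dimensional input whose last two columns are
# fidelity parameters, using the default Matern-5/2 sub-kernels and the GammaPrior(3, 6) /
# GammaPrior(6, 2) lengthscale priors set up in __init__.
kernel = LinearTruncatedFidelityKernel(fidelity_dims=[4, 5], dimension=6, nu=2.5)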
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[Likelihood] = None, covar_modules: Optional[List[Kernel]] = None, num_latent_dims: Optional[List[int]] = None, learn_latent_pars: bool = True, latent_init: str = "default", outcome_transform: Optional[OutcomeTransform] = None, input_transform: Optional[InputTransform] = None, ): r"""A HigherOrderGP model for high-dim output regression. Args: train_X: A `batch_shape x n x d`-dim tensor of training inputs. train_Y: A `batch_shape x n x output_shape`-dim tensor of training targets. likelihood: Gaussian likelihood for the model. covar_modules: List of kernels for each output structure. num_latent_dims: Sizes for the latent dimensions. learn_latent_pars: If true, learn the latent parameters. latent_init: [default or gp] how to initialize the latent parameters. """ if input_transform is not None: input_transform.to(train_X) # infer the dimension of `output_shape`. num_output_dims = train_Y.dim() - train_X.dim() + 1 batch_shape = train_X.shape[:-2] if len(batch_shape) > 1: raise NotImplementedError( "HigherOrderGP currently only supports 1-dim `batch_shape`." ) if outcome_transform is not None: if isinstance(outcome_transform, Standardize) and not isinstance( outcome_transform, FlattenedStandardize ): warnings.warn( "HigherOrderGP does not support the outcome_transform " "`Standardize`! Using `FlattenedStandardize` with `output_shape=" f"{train_Y.shape[- num_output_dims:]} and batch_shape=" f"{batch_shape} instead.", RuntimeWarning, ) outcome_transform = FlattenedStandardize( output_shape=train_Y.shape[-num_output_dims:], batch_shape=batch_shape, ) train_Y, _ = outcome_transform(train_Y) self._aug_batch_shape = batch_shape self._num_dimensions = num_output_dims + 1 self._num_outputs = train_Y.shape[0] if batch_shape else 1 self.target_shape = train_Y.shape[-num_output_dims:] self._input_batch_shape = batch_shape if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=self._aug_batch_shape, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) else: self._is_custom_likelihood = True super().__init__( train_X, train_Y.view(*self._aug_batch_shape, -1), likelihood=likelihood, ) if covar_modules is not None: self.covar_modules = ModuleList(covar_modules) else: self.covar_modules = ModuleList( [ MaternKernel( nu=2.5, lengthscale_prior=GammaPrior(3.0, 6.0), batch_shape=self._aug_batch_shape, ard_num_dims=1 if dim > 0 else train_X.shape[-1], ) for dim in range(self._num_dimensions) ] ) if num_latent_dims is None: num_latent_dims = [1] * (self._num_dimensions - 1) self.to(train_X.device) self._initialize_latents( latent_init=latent_init, num_latent_dims=num_latent_dims, learn_latent_pars=learn_latent_pars, device=train_Y.device, dtype=train_Y.dtype, ) if outcome_transform is not None: self.outcome_transform = outcome_transform if input_transform is not None: self.input_transform = input_transform
def __init__( self, datapoints: Tensor, comparisons: Tensor, covar_module: Optional[Module] = None, input_transform: Optional[InputTransform] = None, **kwargs, ) -> None: r"""A probit-likelihood GP with Laplace approximation model that learns via pairwise comparison data. By default it uses a scaled RBF kernel. Args: datapoints: A `batch_shape x n x d` tensor of training features. comparisons: A `batch_shape x m x 2` training comparisons; comparisons[i] is a noisy indicator suggesting the utility value of comparisons[i, 0]-th is greater than comparisons[i, 1]-th. covar_module: Covariance module. input_transform: An input transform that is applied in the model's forward pass. """ super().__init__() if input_transform is not None: input_transform.to(datapoints) # input transformation is applied in set_train_data self.input_transform = input_transform # Compatibility variables with fit_gpytorch_*: Dummy likelihood # Likelihood is tightly tied with this model and # it doesn't make much sense to keep it separate self.likelihood = None # TODO: remove these variables from `state_dict()` so that when calling # `load_state_dict()`, only the hyperparameters are copied over self.register_buffer("datapoints", None) self.register_buffer("comparisons", None) self.register_buffer("D", None) self.register_buffer("DT", None) self.register_buffer("utility", None) self.register_buffer("covar_chol", None) self.register_buffer("likelihood_hess", None) self.register_buffer("hlcov_eye", None) self.register_buffer("covar", None) self.register_buffer("covar_inv", None) self.train_inputs = [] self.train_targets = None self.pred_cov_fac_need_update = True self.dim = None # See set_train_data for additional compatibility variables. # Note that the datapoints here are not transformed even if input_transform # is not None, to avoid double transformation during model fitting. # self.transform_inputs is called in `forward` self.set_train_data(datapoints, comparisons, update_model=False) # Set optional parameters # jitter to add for numerical stability self._jitter = kwargs.get("jitter", 1e-6) # Clamping z lim for better numerical stability. See self._calc_z for detail # norm_cdf(z=3) ~= 0.999, top 0.1% self._zlim = kwargs.get("zlim", 3) # Stopping criteria in scipy.optimize.fsolve used to find f_map in _update() # If None, set to 1e-6 by default in _update self._xtol = kwargs.get("xtol") # The maximum number of calls to the function in scipy.optimize.fsolve # If None, set to 100 by default in _update # If zero, then 100*(N+1) is used by default by fsolve; self._maxfev = kwargs.get("maxfev") # Set hyperparameters # Do not set the batch_shape explicitly so mean_module can operate in both modes # once fsolve used in _update can run in batch mode, we should explicitly set # the batch shape here self.mean_module = ConstantMean() # Do not optimize constant mean prior for param in self.mean_module.parameters(): param.requires_grad = False # set covariance module # the default outputscale here is only a rule of thumb, meant to keep # estimates away from scale value that would make Phi(f(x)) saturate # at 0 or 1 if covar_module is None: ls_prior = GammaPrior(1.2, 0.5) ls_prior_mode = (ls_prior.concentration - 1) / ls_prior.rate covar_module = ScaleKernel( RBFKernel( batch_shape=self.batch_shape, ard_num_dims=self.dim, lengthscale_prior=ls_prior, lengthscale_constraint=Positive( transform=None, initial_value=ls_prior_mode), ), outputscale_prior=SmoothedBoxPrior(a=1, b=4), ) self.covar_module = covar_module self._x0 = None # will store temporary results for warm-starting if self.datapoints is not None and self.comparisons is not None: self.to(dtype=self.datapoints.dtype, device=self.datapoints.device) # Find f_map for initial parameters with transformed datapoints transformed_dp = self.transform_inputs(datapoints) self._update(transformed_dp) self.to(self.datapoints)
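# Instantiation sketch, assuming the class above is botorch's PairwiseGP: ten 2-d datapoints and
# five comparisons, where each row [i, j] asserts that datapoint i is preferred over datapoint j.
import torch
from botorch.models.pairwise_gp import PairwiseGP

datapoints = torch.rand(10, 2)
comparisons = torch.tensor([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
model = PairwiseGP(datapoints, comparisons)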
def sample_arch(self, START_BO, g, steps, hyperparams, og_flops, full_val_loss, target_flops=0): if args.slim: if target_flops == 0: parameterization = hyperparams.random_sample() layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) else: parameterization = np.ones(hyperparams.get_dim()) * args.lower_channel layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) else: # random sample to warmup history for MOBO if g < START_BO: if target_flops == 0: f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel else: f = args.lower_channel parameterization = np.ones(hyperparams.get_dim()) * f layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) # put the largest model into the history elif g == START_BO: if target_flops == 0: parameterization = np.ones(hyperparams.get_dim()) else: f = args.lower_channel parameterization = np.ones(hyperparams.get_dim()) * f layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) # MOBO else: # this is the scalarization (lambda_{FLOPs}) rand = torch.rand(1).cuda() # standardize data for building Gaussian Processes train_X = torch.FloatTensor(self.X).cuda() train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda() train_Y_loss = standardize(train_Y_loss) train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda() train_Y_cost = standardize(train_Y_cost) new_train_X = train_X # GP for the cross entropy loss gp_loss = SingleTaskGP(new_train_X, train_Y_loss) mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss) mll = mll.to('cuda') fit_gpytorch_model(mll) # GP for FLOPs # we use add-gp since FLOPs has additive structure (not exactly though) # the parameters for ScaleKernel and MaternKernel simply follow the default covar_module = AdditiveStructureKernel( ScaleKernel( MaternKernel( nu=2.5, lengthscale_prior=GammaPrior(3.0, 6.0), num_dims=1 ), outputscale_prior=GammaPrior(2.0, 0.15), ), num_dims=train_X.shape[1] ) gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module) mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost) mll = mll.to('cuda') fit_gpytorch_model(mll) # Build acquisition functions UCB_loss = UpperConfidenceBound(gp_loss, beta=0.1).cuda() UCB_cost = UpperConfidenceBound(gp_cost, beta=0.1).cuda() # Combine them via augmented Tchebyshev scalarization self.mobo_obj = RandAcquisition(UCB_loss).cuda() self.mobo_obj.setup(UCB_loss, UCB_cost, rand) # Bounds for the optimization variable (alpha) lower = torch.ones(new_train_X.shape[1])*args.lower_channel upper = torch.ones(new_train_X.shape[1])*args.upper_channel self.mobo_bounds = torch.stack([lower, upper]).cuda() # Pareto-aware sampling if args.pas: # Generate approximate Pareto front first costs = [] for i in range(len(self.population_data)): costs.append([self.population_data[i]['loss'], self.population_data[i]['ratio']]) costs = np.array(costs) efficient_mask = is_pareto_efficient(costs) costs = costs[efficient_mask] loss = costs[:, 0] flops = costs[:, 1] sorted_idx = np.argsort(flops) loss = loss[sorted_idx] flops = flops[sorted_idx] if flops[0] > args.lower_flops: flops = np.concatenate([[args.lower_flops], flops.reshape(-1)]) loss = np.concatenate([[8], loss.reshape(-1)]) else: flops = flops.reshape(-1) loss = loss.reshape(-1) if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss): flops = np.concatenate([flops.reshape(-1), [args.upper_flops]]) loss = np.concatenate([loss.reshape(-1), [full_val_loss]]) else: flops = flops.reshape(-1) loss = loss.reshape(-1) # Equation (4) in paper areas = (flops[1:]-flops[:-1])*(loss[:-1]-loss[1:]) # Quantize into 50 bins to sample from multinomial self.sampling_weights = np.zeros(50) k = 0 while k < len(flops) and flops[k] < args.lower_flops: k+=1 for i in range(50): lower = i/50. upper = (i+1)/50. if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops: continue cnt = 1 while ((k+1) < len(flops)) and upper > flops[k+1]: self.sampling_weights[i] += areas[k] cnt += 1 k += 1 if k < len(areas): self.sampling_weights[i] += areas[k] self.sampling_weights[i] /= cnt if np.sum(self.sampling_weights) == 0: self.sampling_weights = np.ones(50) if target_flops == 0: val = np.arange(0.01, 1, 0.02) chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights))) else: chosen_target_flops = target_flops # Binary search is here lower_bnd, upper_bnd = 0, 1 lmda = 0.5 for i in range(10): self.mobo_obj.rand = lmda parameterization, acq_value = optimize_acqf( self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000, ) parameterization = parameterization[0].cpu().numpy() layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget) ratio = sim_flops/og_flops if np.abs(ratio - chosen_target_flops) <= 0.02: break if args.baseline > 0: if ratio < chosen_target_flops: lower_bnd = lmda lmda = (lmda + upper_bnd) / 2 elif ratio > chosen_target_flops: upper_bnd = lmda lmda = (lmda + lower_bnd) / 2 else: if ratio < chosen_target_flops: upper_bnd = lmda lmda = (lmda + lower_bnd) / 2 elif ratio > chosen_target_flops: lower_bnd = lmda lmda = (lmda + upper_bnd) / 2 rand[0] = lmda writer.add_scalar('Binary search trials', i, steps) else: parameterization, acq_value = optimize_acqf( self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000, ) parameterization = parameterization[0].cpu().numpy() layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner) return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
def test_initialize_offset_prior(self): kernel = DownsamplingKernel() kernel.offset_prior = NormalPrior(1, 1) self.assertTrue(isinstance(kernel.offset_prior, NormalPrior)) kernel2 = DownsamplingKernel(offset_prior=GammaPrior(1, 1)) self.assertTrue(isinstance(kernel2.offset_prior, GammaPrior))
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[Likelihood] = None, covar_module: Optional[Module] = None, outcome_transform: Optional[OutcomeTransform] = None, ) -> None: r"""A single-task exact GP model. Args: train_X: A `batch_shape x n x d` tensor of training features. train_Y: A `batch_shape x n x m` tensor of training observations. likelihood: A likelihood. If omitted, use a standard GaussianLikelihood with inferred noise level. covar_module: The module computing the covariance (Kernel) matrix. If omitted, use a `MaternKernel`. outcome_transform: An outcome transform that is applied to the training data during instantiation and to the posterior during inference (that is, the `Posterior` obtained by calling `.posterior` on the model will be on the original scale). Example: >>> train_X = torch.rand(20, 2) >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True) >>> model = SingleTaskGP(train_X, train_Y) """ if outcome_transform is not None: train_Y, _ = outcome_transform(train_Y) validate_input_scaling(train_X=train_X, train_Y=train_Y) self._validate_tensor_args(X=train_X, Y=train_Y) self._set_dimensions(train_X=train_X, train_Y=train_Y) train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y) if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=self._aug_batch_shape, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) else: self._is_custom_likelihood = True ExactGP.__init__(self, train_X, train_Y, likelihood) self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape) if covar_module is None: self.covar_module = ScaleKernel( MaternKernel( nu=2.5, ard_num_dims=train_X.shape[-1], batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), ), batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) self._subset_batch_dict = { "likelihood.noise_covar.raw_noise": -2, "mean_module.constant": -2, "covar_module.raw_outputscale": -1, "covar_module.base_kernel.raw_lengthscale": -3, } else: self.covar_module = covar_module # TODO: Allow subsetting of other covar modules if outcome_transform is not None: self.outcome_transform = outcome_transform self.to(train_X)
def test_initialize_offset_prior(self): kernel = ExponentialDecayKernel() kernel.offset_prior = NormalPrior(1, 1) self.assertTrue(isinstance(kernel.offset_prior, NormalPrior)) kernel2 = ExponentialDecayKernel(offset_prior=GammaPrior(1, 1)) self.assertTrue(isinstance(kernel2.offset_prior, GammaPrior))
def test_initialize_power_prior(self): kernel = ExpDecayKernel() kernel.power_prior = NormalPrior(1, 1) self.assertTrue(isinstance(kernel.power_prior, NormalPrior)) kernel2 = ExpDecayKernel(power_prior=GammaPrior(1, 1)) self.assertTrue(isinstance(kernel2.power_prior, GammaPrior))
def _setup_multifidelity_covar_module( dim: int, aug_batch_shape: torch.Size, iteration_fidelity: Optional[int], data_fidelity: Optional[int], linear_truncated: bool, nu: float, ) -> Tuple[ScaleKernel, Dict]: """Helper function to get the covariance module and associated subset_batch_dict for the multifidelity setting. Args: dim: The dimensionality of the training data. aug_batch_shape: The output-augmented batch shape as defined in `BatchedMultiOutputGPyTorchModel`. iteration_fidelity: The column index for the training iteration fidelity parameter (optional). data_fidelity: The column index for the downsampling fidelity parameter (optional). linear_truncated: If True, use a `LinearTruncatedFidelityKernel` instead of the default kernel. nu: The smoothness parameter for the Matern kernel: either 1/2, 3/2, or 5/2. Only used when `linear_truncated=True`. Returns: The covariance module and subset_batch_dict. """ if iteration_fidelity is not None and iteration_fidelity < 0: iteration_fidelity = dim + iteration_fidelity if data_fidelity is not None and data_fidelity < 0: data_fidelity = dim + data_fidelity if linear_truncated: fidelity_dims = [ i for i in (iteration_fidelity, data_fidelity) if i is not None ] kernel = LinearTruncatedFidelityKernel( fidelity_dims=fidelity_dims, dimension=dim, nu=nu, batch_shape=aug_batch_shape, power_prior=GammaPrior(3.0, 3.0), ) else: active_dimsX = [ i for i in range(dim) if i not in {iteration_fidelity, data_fidelity} ] kernel = RBFKernel( ard_num_dims=len(active_dimsX), batch_shape=aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), active_dims=active_dimsX, ) additional_kernels = [] if iteration_fidelity is not None: exp_kernel = ExponentialDecayKernel( batch_shape=aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), active_dims=[iteration_fidelity], ) additional_kernels.append(exp_kernel) if data_fidelity is not None: ds_kernel = DownsamplingKernel( batch_shape=aug_batch_shape, offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), active_dims=[data_fidelity], ) additional_kernels.append(ds_kernel) kernel = ProductKernel(kernel, *additional_kernels) covar_module = ScaleKernel(kernel, batch_shape=aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15)) if linear_truncated: subset_batch_dict = { "covar_module.base_kernel.raw_power": -2, "covar_module.base_kernel.covar_module_unbiased.raw_lengthscale": -3, "covar_module.base_kernel.covar_module_biased.raw_lengthscale": -3, } else: subset_batch_dict = { "covar_module.base_kernel.kernels.0.raw_lengthscale": -3, "covar_module.base_kernel.kernels.1.raw_power": -2, "covar_module.base_kernel.kernels.1.raw_offset": -2, } if iteration_fidelity is not None: subset_batch_dict = { "covar_module.base_kernel.kernels.1.raw_lengthscale": -3, **subset_batch_dict, } if data_fidelity is not None: subset_batch_dict = { "covar_module.base_kernel.kernels.2.raw_power": -2, "covar_module.base_kernel.kernels.2.raw_offset": -2, **subset_batch_dict, } return covar_module, subset_batch_dict
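# Sketch of calling the helper above (assumes it is in scope): a 5-dimensional input whose last
# column is a downsampling (data) fidelity, with the default linear truncated kernel and nu=2.5.
import torch

covar_module, subset_batch_dict = _setup_multifidelity_covar_module(
    dim=5,
    aug_batch_shape=torch.Size([]),
    iteration_fidelity=None,
    data_fidelity=4,
    linear_truncated=True,
    nu=2.5,
)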
def __init__( self, train_X: Tensor, train_Y: Tensor, iteration_fidelity: Optional[int] = None, data_fidelity: Optional[int] = None, linear_truncated: bool = True, nu: float = 2.5, likelihood: Optional[Likelihood] = None, outcome_transform: Optional[OutcomeTransform] = None, ) -> None: self._init_args = { "iteration_fidelity": iteration_fidelity, "data_fidelity": data_fidelity, "linear_truncated": linear_truncated, "nu": nu, "outcome_transform": outcome_transform, } if iteration_fidelity is None and data_fidelity is None: raise UnsupportedError( "SingleTaskMultiFidelityGP requires at least one fidelity parameter." ) if iteration_fidelity is not None and iteration_fidelity < 0: iteration_fidelity = train_X.size(-1) + iteration_fidelity if data_fidelity is not None and data_fidelity < 0: data_fidelity = train_X.size(-1) + data_fidelity self._set_dimensions(train_X=train_X, train_Y=train_Y) if linear_truncated: fidelity_dims = [ i for i in (iteration_fidelity, data_fidelity) if i is not None ] kernel = LinearTruncatedFidelityKernel( fidelity_dims=fidelity_dims, dimension=train_X.size(-1), nu=nu, batch_shape=self._aug_batch_shape, power_prior=GammaPrior(3.0, 3.0), ) else: active_dimsX = [ i for i in range(train_X.size(-1)) if i not in {iteration_fidelity, data_fidelity} ] kernel = RBFKernel( ard_num_dims=len(active_dimsX), batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), active_dims=active_dimsX, ) additional_kernels = [] if iteration_fidelity is not None: exp_kernel = ExponentialDecayKernel( batch_shape=self._aug_batch_shape, lengthscale_prior=GammaPrior(3.0, 6.0), offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), active_dims=[iteration_fidelity], ) additional_kernels.append(exp_kernel) if data_fidelity is not None: ds_kernel = DownsamplingKernel( batch_shape=self._aug_batch_shape, offset_prior=GammaPrior(3.0, 6.0), power_prior=GammaPrior(3.0, 6.0), active_dims=[data_fidelity], ) additional_kernels.append(ds_kernel) kernel = ProductKernel(kernel, *additional_kernels) covar_module = ScaleKernel( kernel, batch_shape=self._aug_batch_shape, outputscale_prior=GammaPrior(2.0, 0.15), ) super().__init__( train_X=train_X, train_Y=train_Y, covar_module=covar_module, outcome_transform=outcome_transform, ) if linear_truncated: subset_batch_dict = { "covar_module.base_kernel.raw_power": -2, "covar_module.base_kernel.covar_module_unbiased.raw_lengthscale": -3, "covar_module.base_kernel.covar_module_biased.raw_lengthscale": -3, } else: subset_batch_dict = { "covar_module.base_kernel.kernels.0.raw_lengthscale": -3, "covar_module.base_kernel.kernels.1.raw_power": -2, "covar_module.base_kernel.kernels.1.raw_offset": -2, } if iteration_fidelity is not None: subset_batch_dict = { "covar_module.base_kernel.kernels.1.raw_lengthscale": -3, **subset_batch_dict, } if data_fidelity is not None: subset_batch_dict = { "covar_module.base_kernel.kernels.2.raw_power": -2, "covar_module.base_kernel.kernels.2.raw_offset": -2, **subset_batch_dict, } self._subset_batch_dict = { "likelihood.noise_covar.raw_noise": -2, "mean_module.constant": -2, "covar_module.raw_outputscale": -1, **subset_batch_dict, } self.to(train_X)
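# Instantiation sketch for the model above: a 4-dimensional problem whose last column is a data
# (downsampling) fidelity in [0, 1].
import torch

train_X = torch.rand(20, 4)
train_Y = torch.sin(train_X[:, :3]).sum(dim=-1, keepdim=True)
model = SingleTaskMultiFidelityGP(train_X, train_Y, data_fidelity=3)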
def __init__( self, datapoints: Tensor, comparisons: Tensor, covar_module: Optional[Module] = None, noise_module: Optional[HomoskedasticNoise] = None, **kwargs, ) -> None: r"""A probit-likelihood GP with Laplace approximation model. A probit-likelihood GP with Laplace approximation model that learns via pairwise comparison data. By default it uses a scaled-RBF kernel. Args: datapoints: A `batch_shape x n x d` tensor of training features. comparisons: A `batch_shape x m x 2` training comparisons; comparisons[i] is a noisy indicator suggesting the utility value of comparisons[i, 0]-th is greater than comparisons[i, 1]-th. covar_module: Covariance module noise_module: Noise module """ super().__init__() # Compatibility variables with fit_gpytorch_*: Dummy likelihood # Likelihood is tightly tied with this model and # it doesn't make much sense to keep it separate self.likelihood = None # TODO: remove these variables from `state_dict()` so that when calling # `load_state_dict()`, only the hyperparameters are copied over self.register_buffer("datapoints", None) self.register_buffer("comparisons", None) self.register_buffer("utility", None) self.register_buffer("covar_chol", None) self.register_buffer("likelihood_hess", None) self.register_buffer("hlcov_eye", None) self.register_buffer("covar", None) self.register_buffer("covar_inv", None) self.train_inputs = [] self.train_targets = None self.pred_cov_fac_need_update = True self._input_batch_shape = torch.Size() self.dim = None # will be set to match datapoints' dtype and device # since scipy.optimize.fsolve only works on cpu, it'd be the # fastest to fit the model on cpu and take samples on gpu to avoid # overhead of moving data back and forth during fitting time self.tkwargs = {} # See set_train_data for additional compatibility variables self.set_train_data(datapoints, comparisons, update_model=False) # Set optional parameters # jitter to add for numerical stability self._jitter = kwargs.get("jitter", 1e-6) # Clamping z lim for better numerical stability. See self._calc_z for detail # norm_cdf(z=3) ~= 0.999, top 0.1% self._zlim = kwargs.get("zlim", 3) # Stopping criteria in scipy.optimize.fsolve used to find f_map in _update() # If None, set to 1e-6 by default in _update self._xtol = kwargs.get("xtol") # The maximum number of calls to the function in scipy.optimize.fsolve # If None, set to 100 by default in _update # If zero, then 100*(N+1) is used by default by fsolve; self._maxfev = kwargs.get("maxfev") # Set hyperparameters # Do not set the batch_shape explicitly so mean_module can operate in both modes # once fsolve used in _update can run in batch mode, we should explicitly set # the batch shape here self.mean_module = ConstantMean() # Do not optimize constant mean prior for param in self.mean_module.parameters(): param.requires_grad = False # set noise module if noise_module is None: noise_module = HomoskedasticNoise( noise_prior=SmoothedBoxPrior(-5, 5, 0.5, transform=torch.log), noise_constraint=GreaterThan(1e-4), # if None, 1e-4 by default batch_shape=self._input_batch_shape, ) self.noise_module = noise_module # set covariance module if covar_module is None: ls_prior = GammaPrior(1.2, 0.5) ls_prior_mode = (ls_prior.concentration - 1) / ls_prior.rate covar_module = RBFKernel( batch_shape=self._input_batch_shape, ard_num_dims=self.dim, lengthscale_prior=ls_prior, lengthscale_constraint=Positive(transform=None, initial_value=ls_prior_mode), ) self.covar_module = covar_module self._x0 = None # will store temporary results for warm-starting if self.datapoints is not None and self.comparisons is not None: self.to(dtype=self.datapoints.dtype, device=self.datapoints.device) self._update() # Find f_map for initial parameters self.to(self.datapoints)
def __init__( self, train_X: Tensor, train_Y: Tensor, task_feature: int, output_tasks: Optional[List[int]] = None, rank: Optional[int] = None, ) -> None: r"""Multi-Task GP model using an ICM kernel, inferring observation noise. Args: train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor of training data. One of the columns should contain the task features (see `task_feature` argument). train_Y: A `n` or `b x n` (batch mode) tensor of training observations. task_feature: The index of the task feature (`-d <= task_feature <= d`). output_tasks: A list of task indices for which to compute model outputs. If omitted, return outputs for all task indices. rank: The rank to be used for the index kernel. If omitted, use a full rank (i.e. number of tasks) kernel. Example: >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2) >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1) >>> train_X = torch.cat([ >>> torch.cat([X1, i1], -1), torch.cat([X2, i2], -1), >>> ]) >>> train_Y = torch.cat([f1(X1), f2(X2)]) >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1) """ if train_X.ndimension() != 2: # Currently, batch mode MTGPs are blocked upstream in GPyTorch raise ValueError(f"Unsupported shape {train_X.shape} for train_X.") d = train_X.shape[-1] - 1 if not (-d <= task_feature <= d): raise ValueError(f"Must have that -{d} <= task_feature <= {d}") all_tasks = train_X[:, task_feature].unique().to( dtype=torch.long).tolist() if output_tasks is None: output_tasks = all_tasks else: if any(t not in all_tasks for t in output_tasks): raise RuntimeError( "All output tasks must be present in input data.") self._output_tasks = output_tasks # TODO (T41270962): Support task-specific noise levels in likelihood likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05)) # construct indexer to be used in forward self._task_feature = task_feature self._base_idxr = torch.arange(d) self._base_idxr[task_feature:] += 1 # exclude task feature super().__init__(train_inputs=train_X, train_targets=train_Y, likelihood=likelihood) self.mean_module = ConstantMean() self.covar_module = ScaleKernel( base_kernel=MaternKernel(nu=2.5, ard_num_dims=d, lengthscale_prior=GammaPrior(3.0, 6.0)), outputscale_prior=GammaPrior(2.0, 0.15), ) num_tasks = len(all_tasks) self._rank = rank if rank is not None else num_tasks # TODO: Add LKJ prior for the index kernel self.task_covar_module = IndexKernel(num_tasks=num_tasks, rank=self._rank) self.to(train_X)