def test_compute_linear_truncated_kernel_no_batch(self):
        x1 = torch.tensor([[1, 0.1, 0.2], [2, 0.3, 0.4]])
        x2 = torch.tensor([[3, 0.5, 0.6], [4, 0.7, 0.8]])
        t_1 = torch.tensor([[0.3584, 0.1856], [0.2976, 0.1584]])
        for nu, fidelity_dims in itertools.product({0.5, 1.5, 2.5}, ([2], [1, 2])):
            kernel = LinearTruncatedFidelityKernel(
                fidelity_dims=fidelity_dims, dimension=3, nu=nu
            )
            kernel.power = 1
            n_fid = len(fidelity_dims)
            if n_fid > 1:
                active_dimsM = [0]
                t_2 = torch.tensor([[0.4725, 0.2889], [0.4025, 0.2541]])
                t_3 = torch.tensor([[0.1685, 0.0531], [0.1168, 0.0386]])
                t = 1 + t_1 + t_2 + t_3
            else:
                active_dimsM = [0, 1]
                t = 1 + t_1

            matern_ker = MaternKernel(nu=nu, active_dims=active_dimsM)
            matern_term = matern_ker(x1, x2).evaluate()
            actual = t * matern_term
            res = kernel(x1, x2).evaluate()
            self.assertLess(torch.norm(res - actual), 1e-4)
            # test diagonal mode
            res_diag = kernel(x1, x2, diag=True)
            self.assertLess(torch.norm(res_diag - actual.diag()), 1e-4)
        # make sure that we error out if last_dim_is_batch=True
        with self.assertRaises(NotImplementedError):
            kernel(x1, x2, diag=True, last_dim_is_batch=True)
Example #2
    def test_compute_linear_truncated_kernel_no_batch(self):
        x1 = torch.tensor([1, 0.1, 0.2, 2, 0.3, 0.4],
                          dtype=torch.float).view(2, 3)
        x2 = torch.tensor([3, 0.5, 0.6, 4, 0.7, 0.8],
                          dtype=torch.float).view(2, 3)
        t_1 = torch.tensor([0.3584, 0.1856, 0.2976, 0.1584],
                           dtype=torch.float).view(2, 2)
        for nu in {0.5, 1.5, 2.5}:
            for fidelity_dims in ([2], [1, 2]):
                kernel = LinearTruncatedFidelityKernel(
                    fidelity_dims=fidelity_dims, dimension=3, nu=nu)
                kernel.power = 1
                if len(fidelity_dims) > 1:
                    active_dimsM = [0]
                    t_2 = torch.tensor([0.4725, 0.2889, 0.4025, 0.2541],
                                       dtype=torch.float).view(2, 2)
                    t_3 = torch.tensor([0.1685, 0.0531, 0.1168, 0.0386],
                                       dtype=torch.float).view(2, 2)
                    t = 1 + t_1 + t_2 + t_3
                else:
                    active_dimsM = [0, 1]
                    t = 1 + t_1

                matern_ker = MaternKernel(nu=nu, active_dims=active_dimsM)
                matern_term = matern_ker(x1, x2).evaluate()
                actual = t * matern_term
                res = kernel(x1, x2).evaluate()
                self.assertLess(torch.norm(res - actual), 1e-4)
                # test diagonal mode
                res_diag = kernel(x1, x2, diag=True)
                self.assertLess(torch.norm(res_diag - actual.diag()), 1e-4)
        # make sure that we error out if last_dim_is_batch=True
        with self.assertRaises(NotImplementedError):
            kernel(x1, x2, diag=True, last_dim_is_batch=True)
Example #3
    def test_compute_linear_truncated_kernel_no_batch(self):
        x1 = torch.tensor([1, 0.1, 0.2, 2, 0.3, 0.4],
                          dtype=torch.float).view(2, 3)
        x2 = torch.tensor([3, 0.5, 0.6, 4, 0.7, 0.8],
                          dtype=torch.float).view(2, 3)
        t_1 = torch.tensor([0.3584, 0.1856, 0.2976, 0.1584],
                           dtype=torch.float).view(2, 2)
        for nu in {0.5, 1.5, 2.5}:
            for train_data_fidelity in {False, True}:
                kernel = LinearTruncatedFidelityKernel(
                    nu=nu,
                    dimension=3,
                    train_data_fidelity=train_data_fidelity)
                kernel.power = 1
                if train_data_fidelity:
                    active_dimsM = [0]
                    t_2 = torch.tensor([0.4725, 0.2889, 0.4025, 0.2541],
                                       dtype=torch.float).view(2, 2)
                    t_3 = torch.tensor([0.1685, 0.0531, 0.1168, 0.0386],
                                       dtype=torch.float).view(2, 2)
                    t = 1 + t_1 + t_2 + t_3
                else:
                    active_dimsM = [0, 1]
                    t = 1 + t_1

                matern_ker = MaternKernel(nu=nu, active_dims=active_dimsM)
                matern_term = matern_ker(x1, x2).evaluate()
                actual = t * matern_term
                res = kernel(x1, x2).evaluate()
                self.assertLess(torch.norm(res - actual), 1e-4)
Example #4
    def __init__(
        self, decomposition: Dict[str, List[int]], batch_shape: torch.Size
    ) -> None:
        super().__init__(batch_shape=batch_shape)
        self.decomposition = decomposition

        num_param = len(next(iter(decomposition.values())))
        for active_parameters in decomposition.values():
            # check that the number of parameters is the same in each decomposition
            if len(active_parameters) != num_param:
                raise ValueError(
                    "Number of parameters must be the same across all contexts."
                )

        self._indexers = {
            context: torch.tensor(active_params)
            for context, active_params in self.decomposition.items()
        }

        self.base_kernel = MaternKernel(
            nu=2.5,
            ard_num_dims=num_param,
            batch_shape=batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        )

        self.kernel_dict = {}  # scaled kernel for each parameter space partition
        for context in list(decomposition.keys()):
            self.kernel_dict[context] = ScaleKernel(
                base_kernel=self.base_kernel, outputscale_prior=GammaPrior(2.0, 15.0)
            )
        self.kernel_dict = ModuleDict(self.kernel_dict)
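
A self-contained sketch of the pattern used in the `__init__` above, assuming only gpytorch: several per-context `ScaleKernel`s sharing a single Matern base kernel, registered in a `ModuleDict`, with per-context column indexers. The decomposition dict and context names are made up for illustration.

import torch
from gpytorch.kernels import MaternKernel, ScaleKernel
from gpytorch.priors import GammaPrior
from torch.nn import ModuleDict

# Hypothetical decomposition: two contexts, two parameters each (4-d space).
decomposition = {"context_a": [0, 1], "context_b": [2, 3]}
num_param = len(next(iter(decomposition.values())))

base_kernel = MaternKernel(
    nu=2.5, ard_num_dims=num_param, lengthscale_prior=GammaPrior(3.0, 6.0)
)
# One scaled copy of the shared base kernel per parameter space partition.
kernel_dict = ModuleDict({
    context: ScaleKernel(
        base_kernel=base_kernel, outputscale_prior=GammaPrior(2.0, 15.0)
    )
    for context in decomposition
})

x = torch.rand(5, 4)
indexers = {c: torch.tensor(p) for c, p in decomposition.items()}
x_a = x.index_select(-1, indexers["context_a"])  # columns for context_a
print(kernel_dict["context_a"](x_a).evaluate().shape)  # torch.Size([5, 5])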
Example #5
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        train_Yvar: Tensor,
        outcome_transform: Optional[OutcomeTransform] = None,
    ) -> None:
        r"""A single-task exact GP model using fixed noise levels.

        Args:
            train_X: A `batch_shape x n x d` tensor of training features.
            train_Y: A `batch_shape x n x m` tensor of training observations.
            train_Yvar: A `batch_shape x n x m` tensor of observed measurement
                noise.
            outcome_transform: An outcome transform that is applied to the
                training data during instantiation and to the posterior during
                inference (that is, the `Posterior` obtained by calling
                `.posterior` on the model will be on the original scale).

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
            >>> train_Yvar = torch.full_like(train_Y, 0.2)
            >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar)
        """
        if outcome_transform is not None:
            train_Y, train_Yvar = outcome_transform(train_Y, train_Yvar)
        validate_input_scaling(train_X=train_X,
                               train_Y=train_Y,
                               train_Yvar=train_Yvar)
        self._validate_tensor_args(X=train_X, Y=train_Y, Yvar=train_Yvar)
        self._set_dimensions(train_X=train_X, train_Y=train_Y)
        train_X, train_Y, train_Yvar = self._transform_tensor_args(
            X=train_X, Y=train_Y, Yvar=train_Yvar)
        likelihood = FixedNoiseGaussianLikelihood(
            noise=train_Yvar, batch_shape=self._aug_batch_shape)
        ExactGP.__init__(self,
                         train_inputs=train_X,
                         train_targets=train_Y,
                         likelihood=likelihood)
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        self.covar_module = ScaleKernel(
            base_kernel=MaternKernel(
                nu=2.5,
                ard_num_dims=train_X.shape[-1],
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        if outcome_transform is not None:
            self.outcome_transform = outcome_transform
        self._subset_batch_dict = {
            "mean_module.constant": -2,
            "covar_module.raw_outputscale": -1,
            "covar_module.base_kernel.raw_lengthscale": -3,
        }
        self.to(train_X)
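
A minimal end-to-end sketch of how a model like this is typically used, grounded in the docstring example above and the fitting pattern that appears in the `sample_arch` examples later in this section (`fit_gpytorch_model` and `ExactMarginalLogLikelihood` are assumed importable from botorch/gpytorch):

import torch
from botorch.fit import fit_gpytorch_model
from botorch.models import FixedNoiseGP
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(20, 2)
train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
train_Yvar = torch.full_like(train_Y, 0.2)

# Construct the model, then fit hyperparameters by maximizing the exact
# marginal log likelihood.
model = FixedNoiseGP(train_X, train_Y, train_Yvar)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_model(mll)

# Posterior at new points (on the original outcome scale if an
# outcome_transform was supplied).
posterior = model.posterior(torch.rand(5, 2))
print(posterior.mean.shape)  # torch.Size([5, 1])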
Example #6
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        likelihood: Optional[Likelihood] = None,
        covar_module: Optional[Module] = None,
    ) -> None:
        r"""A single-task exact GP model.

        Args:
            train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training
                features.
            train_Y: A `n x m` or `batch_shape x n x m` (batch mode) tensor of
                training observations.
            likelihood: A likelihood. If omitted, use a standard
                GaussianLikelihood with inferred noise level.
            covar_module: The module computing the covariance (kernel) matrix.
                If omitted, use a `MaternKernel`.

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
            >>> model = SingleTaskGP(train_X, train_Y)
        """
        validate_input_scaling(train_X=train_X, train_Y=train_Y)
        self._validate_tensor_args(X=train_X, Y=train_Y)
        self._set_dimensions(train_X=train_X, train_Y=train_Y)
        train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y)
        if likelihood is None:
            noise_prior = GammaPrior(1.1, 0.05)
            noise_prior_mode = (noise_prior.concentration -
                                1) / noise_prior.rate
            likelihood = GaussianLikelihood(
                noise_prior=noise_prior,
                batch_shape=self._aug_batch_shape,
                noise_constraint=GreaterThan(
                    MIN_INFERRED_NOISE_LEVEL,
                    transform=None,
                    initial_value=noise_prior_mode,
                ),
            )
        else:
            self._is_custom_likelihood = True
        ExactGP.__init__(self, train_X, train_Y, likelihood)
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        if covar_module is None:
            self.covar_module = ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    ard_num_dims=train_X.shape[-1],
                    batch_shape=self._aug_batch_shape,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                ),
                batch_shape=self._aug_batch_shape,
                outputscale_prior=GammaPrior(2.0, 0.15),
            )
        else:
            self.covar_module = covar_module
        self.to(train_X)
Example #7
def cont_kernel_factory(
    batch_shape: torch.Size, ard_num_dims: int, active_dims: List[int]
) -> MaternKernel:
    return MaternKernel(
        nu=2.5,
        batch_shape=batch_shape,
        ard_num_dims=ard_num_dims,
        active_dims=active_dims,
    )
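
A usage sketch for the factory above: factories with this `(batch_shape, ard_num_dims, active_dims)` signature are the sort of hook passed to models that build a separate kernel over the continuous dimensions of a mixed search space (e.g., as a `cont_kernel_factory` argument); here we simply call it directly.

import torch

# Build a Matern kernel over dims 0 and 1 of a 3-d input via the factory.
kernel = cont_kernel_factory(
    batch_shape=torch.Size([]), ard_num_dims=2, active_dims=[0, 1]
)
x = torch.rand(4, 3)  # the kernel only looks at the active dims
print(kernel(x).evaluate().shape)  # torch.Size([4, 4])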
Example #8
    def test_compute_linear_truncated_kernel_with_batch(self):
        x1 = torch.tensor([1, 0.1, 0.2, 3, 0.3, 0.4, 5, 0.5, 0.6, 7, 0.7, 0.8],
                          dtype=torch.float).view(2, 2, 3)
        x2 = torch.tensor([2, 0.8, 0.7, 4, 0.6, 0.5, 6, 0.4, 0.3, 8, 0.2, 0.1],
                          dtype=torch.float).view(2, 2, 3)
        t_1 = torch.tensor(
            [0.2736, 0.44, 0.2304, 0.36, 0.3304, 0.3816, 0.1736, 0.1944],
            dtype=torch.float,
        ).view(2, 2, 2)
        batch_shape = torch.Size([2])
        for nu in {0.5, 1.5, 2.5}:
            for fidelity_dims in ([2], [1, 2]):
                kernel = LinearTruncatedFidelityKernel(
                    fidelity_dims=fidelity_dims,
                    dimension=3,
                    nu=nu,
                    batch_shape=batch_shape,
                )
                kernel.power = 1
                if len(fidelity_dims) > 1:
                    active_dimsM = [0]
                    t_2 = torch.tensor(
                        [
                            0.0527, 0.167, 0.0383, 0.1159, 0.1159, 0.167,
                            0.0383, 0.0527
                        ],
                        dtype=torch.float,
                    ).view(2, 2, 2)
                    t_3 = torch.tensor(
                        [
                            0.1944, 0.3816, 0.1736, 0.3304, 0.36, 0.44, 0.2304,
                            0.2736
                        ],
                        dtype=torch.float,
                    ).view(2, 2, 2)
                    t = 1 + t_1 + t_2 + t_3
                else:
                    active_dimsM = [0, 1]
                    t = 1 + t_1

                matern_ker = MaternKernel(nu=nu,
                                          active_dims=active_dimsM,
                                          batch_shape=batch_shape)
                matern_term = matern_ker(x1, x2).evaluate()
                actual = t * matern_term
                res = kernel(x1, x2).evaluate()
                self.assertLess(torch.norm(res - actual), 1e-4)
                # test diagonal mode
                res_diag = kernel(x1, x2, diag=True)
                self.assertLess(
                    torch.norm(res_diag -
                               torch.diagonal(actual, dim1=-1, dim2=-2)),
                    1e-4,
                )
        # make sure that we error out if last_dim_is_batch=True
        with self.assertRaises(NotImplementedError):
            kernel(x1, x2, diag=True, last_dim_is_batch=True)
Example #9
    def test_sample_all_priors(self, cuda=False):
        device = torch.device("cuda" if cuda else "cpu")
        for dtype in (torch.float, torch.double):
            train_X = torch.rand(3, 5, device=device, dtype=dtype)
            train_Y = torch.rand(3, 1, device=device, dtype=dtype)
            model = SingleTaskGP(train_X=train_X, train_Y=train_Y)
            mll = ExactMarginalLogLikelihood(model.likelihood, model)
            mll.to(device=device, dtype=dtype)
            original_state_dict = dict(deepcopy(mll.model.state_dict()))
            sample_all_priors(model)

            # make sure one of the hyperparameters changed
            self.assertTrue(
                dict(model.state_dict())["likelihood.noise_covar.raw_noise"] !=
                original_state_dict["likelihood.noise_covar.raw_noise"])
            # check that lengthscales are all different
            ls = model.covar_module.base_kernel.raw_lengthscale.view(
                -1).tolist()
            self.assertTrue(all(ls[0] != ls[i] for i in range(1, len(ls))))

            # change one of the priors to SmoothedBoxPrior
            model.covar_module = ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    ard_num_dims=model.train_inputs[0].shape[-1],
                    batch_shape=model._aug_batch_shape,
                    lengthscale_prior=SmoothedBoxPrior(3.0, 6.0),
                ),
                batch_shape=model._aug_batch_shape,
                outputscale_prior=GammaPrior(2.0, 0.15),
            )
            original_state_dict = dict(deepcopy(mll.model.state_dict()))
            with warnings.catch_warnings(
                    record=True) as ws, settings.debug(True):
                sample_all_priors(model)
                self.assertEqual(len(ws), 1)
                self.assertTrue("rsample" in str(ws[0].message))

            # the lengthscale should not have changed because sampling is
            # not implemented for SmoothedBoxPrior
            self.assertTrue(
                torch.equal(
                    dict(model.state_dict())
                    ["covar_module.base_kernel.raw_lengthscale"],
                    original_state_dict[
                        "covar_module.base_kernel.raw_lengthscale"],
                ))

            # set setting_closure to None and make sure RuntimeError is raised
            prior_tuple = model.likelihood.noise_covar._priors["noise_prior"]
            model.likelihood.noise_covar._priors["noise_prior"] = (
                prior_tuple[0],
                prior_tuple[1],
                None,
            )
            with self.assertRaises(RuntimeError):
                sample_all_priors(model)
Example #10
def cont_kernel_factory(
    batch_shape: torch.Size,
    ard_num_dims: int,
    active_dims: List[int],
) -> MaternKernel:
    return MaternKernel(
        nu=2.5,
        batch_shape=batch_shape,
        ard_num_dims=ard_num_dims,
        active_dims=active_dims,
        lengthscale_constraint=GreaterThan(1e-04),
    )
Example #11
    def __init__(self,
                 train_X: Tensor,
                 train_Y: Tensor,
                 likelihood: Optional[Likelihood] = None) -> None:
        r"""A single-task exact GP model.

        Args:
            train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training
                features.
            train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of
                training observations.
            likelihood: A likelihood. If omitted, use a standard
                GaussianLikelihood with inferred noise level.

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1])
            >>> model = SingleTaskGP(train_X, train_Y)
        """
        ard_num_dims = train_X.shape[-1]
        train_X, train_Y, _ = self._set_dimensions(train_X=train_X,
                                                   train_Y=train_Y)
        train_X, train_Y, _ = multioutput_to_batch_mode_transform(
            train_X=train_X, train_Y=train_Y, num_outputs=self._num_outputs)
        if likelihood is None:
            noise_prior = GammaPrior(1.1, 0.05)
            noise_prior_mode = (noise_prior.concentration -
                                1) / noise_prior.rate
            likelihood = GaussianLikelihood(
                noise_prior=noise_prior,
                batch_shape=self._aug_batch_shape,
                noise_constraint=GreaterThan(
                    MIN_INFERRED_NOISE_LEVEL,
                    transform=None,
                    initial_value=noise_prior_mode,
                ),
            )
        else:
            self._likelihood_state_dict = deepcopy(likelihood.state_dict())
        ExactGP.__init__(self, train_X, train_Y, likelihood)
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        self.covar_module = ScaleKernel(
            MaternKernel(
                nu=2.5,
                ard_num_dims=ard_num_dims,
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        self.to(train_X)
Example #12
    def test_compute_linear_truncated_kernel_with_batch(self):
        x1 = torch.tensor(
            [[[1.0, 0.1, 0.2], [3.0, 0.3, 0.4]], [[5.0, 0.5, 0.6], [7.0, 0.7, 0.8]]]
        )
        x2 = torch.tensor(
            [[[2.0, 0.8, 0.7], [4.0, 0.6, 0.5]], [[6.0, 0.4, 0.3], [8.0, 0.2, 0.1]]]
        )
        t_1 = torch.tensor(
            [[[0.2736, 0.4400], [0.2304, 0.3600]], [[0.3304, 0.3816], [0.1736, 0.1944]]]
        )
        batch_shape = torch.Size([2])
        for nu, fidelity_dims in itertools.product({0.5, 1.5, 2.5}, ([2], [1, 2])):
            kernel = LinearTruncatedFidelityKernel(
                fidelity_dims=fidelity_dims, dimension=3, nu=nu, batch_shape=batch_shape
            )
            kernel.power = 1
            if len(fidelity_dims) > 1:
                active_dimsM = [0]
                t_2 = torch.tensor(
                    [
                        [[0.0527, 0.1670], [0.0383, 0.1159]],
                        [[0.1159, 0.1670], [0.0383, 0.0527]],
                    ]
                )
                t_3 = torch.tensor(
                    [
                        [[0.1944, 0.3816], [0.1736, 0.3304]],
                        [[0.3600, 0.4400], [0.2304, 0.2736]],
                    ]
                )
                t = 1 + t_1 + t_2 + t_3
            else:
                active_dimsM = [0, 1]
                t = 1 + t_1

            matern_ker = MaternKernel(
                nu=nu, active_dims=active_dimsM, batch_shape=batch_shape
            )
            matern_term = matern_ker(x1, x2).evaluate()
            actual = t * matern_term
            res = kernel(x1, x2).evaluate()
            self.assertLess(torch.norm(res - actual), 1e-4)
            # test diagonal mode
            res_diag = kernel(x1, x2, diag=True)
            self.assertLess(
                torch.norm(res_diag - torch.diagonal(actual, dim1=-1, dim2=-2)), 1e-4
            )
        # make sure that we error out if last_dim_is_batch=True
        with self.assertRaises(NotImplementedError):
            kernel(x1, x2, diag=True, last_dim_is_batch=True)
Example #13
    def test_compute_linear_truncated_kernel_with_batch(self):
        x1 = torch.tensor([1, 0.1, 0.2, 3, 0.3, 0.4, 5, 0.5, 0.6, 7, 0.7, 0.8],
                          dtype=torch.float).view(2, 2, 3)
        x2 = torch.tensor([2, 0.8, 0.7, 4, 0.6, 0.5, 6, 0.4, 0.3, 8, 0.2, 0.1],
                          dtype=torch.float).view(2, 2, 3)
        t_1 = torch.tensor(
            [0.2736, 0.44, 0.2304, 0.36, 0.3304, 0.3816, 0.1736, 0.1944],
            dtype=torch.float,
        ).view(2, 2, 2)
        batch_shape = torch.Size([2])
        dimension = 3
        for nu in {0.5, 1.5, 2.5}:
            for train_data_fidelity in {False, True}:
                kernel = LinearTruncatedFidelityKernel(
                    nu=nu,
                    dimension=dimension,
                    train_data_fidelity=train_data_fidelity,
                    batch_shape=batch_shape,
                )
                kernel.power = 1
                if train_data_fidelity:
                    active_dimsM = [0]
                    t_2 = torch.tensor(
                        [
                            0.0527, 0.167, 0.0383, 0.1159, 0.1159, 0.167,
                            0.0383, 0.0527
                        ],
                        dtype=torch.float,
                    ).view(2, 2, 2)
                    t_3 = torch.tensor(
                        [
                            0.1944, 0.3816, 0.1736, 0.3304, 0.36, 0.44, 0.2304,
                            0.2736
                        ],
                        dtype=torch.float,
                    ).view(2, 2, 2)
                    t = 1 + t_1 + t_2 + t_3
                else:
                    active_dimsM = [0, 1]
                    t = 1 + t_1

                matern_ker = MaternKernel(nu=nu,
                                          active_dims=active_dimsM,
                                          batch_shape=batch_shape)
                matern_term = matern_ker(x1, x2).evaluate()
                actual = t * matern_term
                res = kernel(x1, x2).evaluate()
                self.assertLess(torch.norm(res - actual), 1e-4)
Example #14
    def __init__(self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor) -> None:
        r"""A single-task exact GP model using fixed noise levels.

        Args:
            train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training
                features.
            train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of
                training observations.
            train_Yvar: A `n x (o)` or `batch_shape x n x (o)` (batch mode)
                tensor of observed measurement noise.

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1])
            >>> train_Yvar = torch.full_like(train_Y, 0.2)
            >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar)
        """
        ard_num_dims = train_X.shape[-1]
        train_X, train_Y, train_Yvar = self._set_dimensions(
            train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar
        )
        train_X, train_Y, train_Yvar = multioutput_to_batch_mode_transform(
            train_X=train_X,
            train_Y=train_Y,
            num_outputs=self._num_outputs,
            train_Yvar=train_Yvar,
        )
        likelihood = FixedNoiseGaussianLikelihood(
            noise=train_Yvar, batch_shape=self._aug_batch_shape
        )
        ExactGP.__init__(
            self, train_inputs=train_X, train_targets=train_Y, likelihood=likelihood
        )
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        self.covar_module = ScaleKernel(
            base_kernel=MaternKernel(
                nu=2.5,
                ard_num_dims=ard_num_dims,
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        self.to(train_X)
Example #15
    def __init__(
        self,
        decomposition: Dict[str, List[int]],
        batch_shape: torch.Size,
        train_embedding: bool = True,
        cat_feature_dict: Optional[Dict] = None,
        embs_feature_dict: Optional[Dict] = None,
        embs_dim_list: Optional[List[int]] = None,
        context_weight_dict: Optional[Dict] = None,
        device: Optional[torch.device] = None,
    ) -> None:

        super().__init__(batch_shape=batch_shape)
        self.decomposition = decomposition
        self.batch_shape = batch_shape
        self.train_embedding = train_embedding
        self.device = device

        num_param = len(next(iter(decomposition.values())))
        self.context_list = list(decomposition.keys())
        self.num_contexts = len(self.context_list)

        # get parameter space decomposition
        for active_parameters in decomposition.values():
            # check that the number of parameters is the same in each decomposition
            if len(active_parameters) != num_param:
                raise ValueError(
                    "Number of parameters must be the same across all contexts.")
        self._indexers = {
            context: torch.tensor(active_params, device=self.device)
            for context, active_params in self.decomposition.items()
        }
        # get context features and set emb dim
        self.context_cat_feature = None
        self.context_emb_feature = None
        self.n_embs = 0
        self.emb_weight_matrix_list = None
        self.emb_dims = None
        self._set_context_features(
            cat_feature_dict=cat_feature_dict,
            embs_feature_dict=embs_feature_dict,
            embs_dim_list=embs_dim_list,
        )
        # construct embedding layer
        if train_embedding:
            self._set_emb_layers()
        # task covariance matrix
        self.task_covar_module = MaternKernel(
            nu=2.5,
            ard_num_dims=self.n_embs,
            batch_shape=batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        )
        # base kernel
        self.base_kernel = MaternKernel(
            nu=2.5,
            ard_num_dims=num_param,
            batch_shape=batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        )
        # outputscales for each context (note: these act like the square root of an outputscale)
        self.context_weight = None
        if context_weight_dict is None:
            outputscale_list = torch.zeros(*batch_shape,
                                           self.num_contexts,
                                           device=self.device)
        else:
            outputscale_list = torch.zeros(*batch_shape, 1, device=self.device)
            self.context_weight = torch.tensor(
                [context_weight_dict[c] for c in self.context_list],
                device=self.device)
        self.register_parameter(name="raw_outputscale_list",
                                parameter=torch.nn.Parameter(outputscale_list))
        self.register_prior(
            "outputscale_list_prior",
            GammaPrior(2.0, 15.0),
            lambda m: m.outputscale_list,
            lambda m, v: m._set_outputscale_list(v),
        )
        self.register_constraint("raw_outputscale_list", Positive())
Example #16
    def __init__(  # noqa C901
        self,
        fidelity_dims: List[int],
        dimension: Optional[int] = None,
        power_prior: Optional[Prior] = None,
        power_constraint: Optional[Interval] = None,
        nu: float = 2.5,
        lengthscale_prior_unbiased: Optional[Prior] = None,
        lengthscale_prior_biased: Optional[Prior] = None,
        lengthscale_constraint_unbiased: Optional[Interval] = None,
        lengthscale_constraint_biased: Optional[Interval] = None,
        covar_module_unbiased: Optional[Kernel] = None,
        covar_module_biased: Optional[Kernel] = None,
        **kwargs: Any,
    ) -> None:
        if dimension is None and kwargs.get("active_dims") is None:
            raise UnsupportedError(
                "Must specify dimension when not specifying active_dims.")
        n_fidelity = len(fidelity_dims)
        if len(set(fidelity_dims)) != n_fidelity:
            raise ValueError("fidelity_dims must not have repeated elements")
        if n_fidelity not in {1, 2}:
            raise UnsupportedError(
                "LinearTruncatedFidelityKernel accepts either one or two "
                "fidelity parameters.")
        if nu not in {0.5, 1.5, 2.5}:
            raise ValueError("nu must be one of 0.5, 1.5, or 2.5")

        super().__init__(**kwargs)
        self.fidelity_dims = fidelity_dims
        if power_constraint is None:
            power_constraint = Positive()

        if lengthscale_prior_unbiased is None:
            lengthscale_prior_unbiased = GammaPrior(3, 6)

        if lengthscale_prior_biased is None:
            lengthscale_prior_biased = GammaPrior(6, 2)

        if lengthscale_constraint_unbiased is None:
            lengthscale_constraint_unbiased = Positive()

        if lengthscale_constraint_biased is None:
            lengthscale_constraint_biased = Positive()

        self.register_parameter(
            name="raw_power",
            parameter=torch.nn.Parameter(torch.zeros(*self.batch_shape, 1)),
        )
        self.register_constraint("raw_power", power_constraint)

        if power_prior is not None:
            self.register_prior(
                "power_prior",
                power_prior,
                lambda: self.power,
                lambda v: self._set_power(v),
            )

        if self.active_dims is not None:
            dimension = len(self.active_dims)

        if covar_module_unbiased is None:
            covar_module_unbiased = MaternKernel(
                nu=nu,
                batch_shape=self.batch_shape,
                lengthscale_prior=lengthscale_prior_unbiased,
                ard_num_dims=dimension - n_fidelity,
                lengthscale_constraint=lengthscale_constraint_unbiased,
            )

        if covar_module_biased is None:
            covar_module_biased = MaternKernel(
                nu=nu,
                batch_shape=self.batch_shape,
                lengthscale_prior=lengthscale_prior_biased,
                ard_num_dims=dimension - n_fidelity,
                lengthscale_constraint=lengthscale_constraint_biased,
            )

        self.covar_module_unbiased = covar_module_unbiased
        self.covar_module_biased = covar_module_biased
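
A minimal sketch tying this constructor back to the tests at the top of this section: build a kernel with a single fidelity column (index 2 of a 3-d input) and evaluate it on toy data. The import path below is the usual BoTorch location and is assumed here.

import torch
from botorch.models.kernels import LinearTruncatedFidelityKernel

kernel = LinearTruncatedFidelityKernel(fidelity_dims=[2], dimension=3, nu=2.5)
x1 = torch.tensor([[1.0, 0.1, 0.2], [2.0, 0.3, 0.4]])
x2 = torch.tensor([[3.0, 0.5, 0.6], [4.0, 0.7, 0.8]])
# Full covariance and its diagonal, as exercised by the tests above.
print(kernel(x1, x2).evaluate().shape)  # torch.Size([2, 2])
print(kernel(x1, x2, diag=True).shape)  # torch.Size([2])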
Example #17
    def sample_arch(self, START_BO, g, hyperparams, og_flops, empty_val_loss, full_val_loss, target_flops=0):
        if g < START_BO:
            if target_flops == 0:
                f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
            else:
                f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        elif g == START_BO:
            if target_flops == 0:
                parameterization = np.ones(hyperparams.get_dim())
            else:
                f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        else:
            rand = torch.rand(1).cuda()

            train_X = torch.FloatTensor(self.X).cuda()
            train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
            train_Y_loss = standardize(train_Y_loss)

            train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
            train_Y_cost = standardize(train_Y_cost)

            covar_module = None
            if args.ski and g > 128:
                if args.additive:
                    covar_module = AdditiveStructureKernel(
                        ScaleKernel(
                            GridInterpolationKernel(
                                MaternKernel(
                                    nu=2.5,
                                    lengthscale_prior=GammaPrior(3.0, 6.0),
                                ),
                                grid_size=128, num_dims=1, grid_bounds=[(0, 1)]
                            ),
                            outputscale_prior=GammaPrior(2.0, 0.15),
                        ), 
                        num_dims=train_X.shape[1]
                    )
                else:
                    covar_module = ScaleKernel(
                        GridInterpolationKernel(
                            MaternKernel(
                                nu=2.5,
                                lengthscale_prior=GammaPrior(3.0, 6.0),
                            ),
                            grid_size=128, num_dims=train_X.shape[1], grid_bounds=[(0, 1) for _ in range(train_X.shape[1])]
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    )
            else:
                if args.additive:
                    covar_module = AdditiveStructureKernel(
                        ScaleKernel(
                            MaternKernel(
                                nu=2.5,
                                lengthscale_prior=GammaPrior(3.0, 6.0),
                                num_dims=1
                            ),
                            outputscale_prior=GammaPrior(2.0, 0.15),
                        ),
                        num_dims=train_X.shape[1]
                    )
                else:
                    covar_module = ScaleKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                            num_dims=train_X.shape[1]
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    )

            new_train_X = train_X
            gp_loss = SingleTaskGP(new_train_X, train_Y_loss, covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)


            # Use an additive GP for the cost objective
            covar_module = AdditiveStructureKernel(
                ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        num_dims=1
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                ),
                num_dims=train_X.shape[1]
            )
            gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            UCB_loss = UpperConfidenceBound(gp_loss, beta=args.beta).cuda()
            UCB_cost = UpperConfidenceBound(gp_cost, beta=args.beta).cuda()
            self.mobo_obj = RandAcquisition(UCB_loss).cuda()
            self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

            lower = torch.ones(new_train_X.shape[1])*args.lower_channel
            upper = torch.ones(new_train_X.shape[1])*args.upper_channel
            self.mobo_bounds = torch.stack([lower, upper]).cuda()

            if args.pas:
                val = np.linspace(args.lower_flops, 1, 50)
                chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
                
                lower_bnd, upper_bnd = 0, 1
                lmda = 0.5
                for i in range(10):
                    self.mobo_obj.rand = lmda

                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                    )

                    parameterization = parameterization[0].cpu().numpy()
                    layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                    sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget, self.use_mem)
                    ratio = sim_flops/og_flops

                    if np.abs(ratio - chosen_target_flops) <= 0.02:
                        break
                    if args.baseline > 0:
                        if ratio < chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                        elif ratio > chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                    else:
                        if ratio < chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                        elif ratio > chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
                writer.add_scalar('Binary search trials', i, g)

            else:
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()

            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
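
The `lmda` loop above is a plain bisection on the scalarization weight until the simulated FLOPs ratio hits the sampled target. A standalone sketch of that search, assuming a `ratio_for(lmda)` callable that is monotonically decreasing in `lmda` (the non-baseline branch above):

def bisect_lmda(ratio_for, target, tol=0.02, iters=10):
    # Bisect the scalarization weight until the FLOPs ratio is within tol.
    lower_bnd, upper_bnd, lmda = 0.0, 1.0, 0.5
    for _ in range(iters):
        ratio = ratio_for(lmda)
        if abs(ratio - target) <= tol:
            break
        if ratio < target:
            upper_bnd = lmda
            lmda = (lmda + lower_bnd) / 2
        else:
            lower_bnd = lmda
            lmda = (lmda + upper_bnd) / 2
    return lmda

# Toy usage with a monotone stand-in for the simulated FLOPs ratio:
print(bisect_lmda(lambda l: 1.0 - l, target=0.3))  # ~0.69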
Example #18
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        likelihood: Optional[Likelihood] = None,
        covar_module: Optional[Module] = None,
        outcome_transform: Optional[OutcomeTransform] = None,
    ) -> None:
        r"""A single-task exact GP model.

        Args:
            train_X: A `batch_shape x n x d` tensor of training features.
            train_Y: A `batch_shape x n x m` tensor of training observations.
            likelihood: A likelihood. If omitted, use a standard
                GaussianLikelihood with inferred noise level.
            covar_module: The module computing the covariance (Kernel) matrix.
                If omitted, use a `MaternKernel`.
            outcome_transform: An outcome transform that is applied to the
                training data during instantiation and to the posterior during
                inference (that is, the `Posterior` obtained by calling
                `.posterior` on the model will be on the original scale).

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
            >>> model = SingleTaskGP(train_X, train_Y)
        """
        if outcome_transform is not None:
            train_Y, _ = outcome_transform(train_Y)
        validate_input_scaling(train_X=train_X, train_Y=train_Y)
        self._validate_tensor_args(X=train_X, Y=train_Y)
        self._set_dimensions(train_X=train_X, train_Y=train_Y)
        train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y)
        if likelihood is None:
            noise_prior = GammaPrior(1.1, 0.05)
            noise_prior_mode = (noise_prior.concentration -
                                1) / noise_prior.rate
            likelihood = GaussianLikelihood(
                noise_prior=noise_prior,
                batch_shape=self._aug_batch_shape,
                noise_constraint=GreaterThan(
                    MIN_INFERRED_NOISE_LEVEL,
                    transform=None,
                    initial_value=noise_prior_mode,
                ),
            )
        else:
            self._is_custom_likelihood = True
        ExactGP.__init__(self, train_X, train_Y, likelihood)
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        if covar_module is None:
            self.covar_module = ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    ard_num_dims=train_X.shape[-1],
                    batch_shape=self._aug_batch_shape,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                ),
                batch_shape=self._aug_batch_shape,
                outputscale_prior=GammaPrior(2.0, 0.15),
            )
            self._subset_batch_dict = {
                "likelihood.noise_covar.raw_noise": -2,
                "mean_module.constant": -2,
                "covar_module.raw_outputscale": -1,
                "covar_module.base_kernel.raw_lengthscale": -3,
            }
        else:
            self.covar_module = covar_module
        # TODO: Allow subsetting of other covar modules
        if outcome_transform is not None:
            self.outcome_transform = outcome_transform
        self.to(train_X)
Example #19
    def __init__(
        self,
        dimension: int = 3,
        nu: float = 2.5,
        train_iteration_fidelity: bool = True,
        train_data_fidelity: bool = True,
        lengthscale_prior: Optional[Prior] = None,
        power_prior: Optional[Prior] = None,
        power_constraint: Optional[Interval] = None,
        lengthscale_2_prior: Optional[Prior] = None,
        lengthscale_2_constraint: Optional[Interval] = None,
        lengthscale_constraint: Optional[Interval] = None,
        covar_module_1: Optional[Kernel] = None,
        covar_module_2: Optional[Kernel] = None,
        **kwargs: Any,
    ):
        if not train_iteration_fidelity and not train_data_fidelity:
            raise UnsupportedError(
                "You should have at least one fidelity parameter.")
        if nu not in {0.5, 1.5, 2.5}:
            raise ValueError("nu expected to be 0.5, 1.5, or 2.5")
        super().__init__(**kwargs)
        self.train_iteration_fidelity = train_iteration_fidelity
        self.train_data_fidelity = train_data_fidelity
        if power_constraint is None:
            power_constraint = Positive()

        if lengthscale_prior is None:
            lengthscale_prior = GammaPrior(3, 6)

        if lengthscale_2_prior is None:
            lengthscale_2_prior = GammaPrior(6, 2)

        if lengthscale_constraint is None:
            lengthscale_constraint = Positive()

        if lengthscale_2_constraint is None:
            lengthscale_2_constraint = Positive()

        self.register_parameter(
            name="raw_power",
            parameter=torch.nn.Parameter(torch.zeros(*self.batch_shape, 1)),
        )

        if power_prior is not None:
            self.register_prior(
                "power_prior",
                power_prior,
                lambda: self.power,
                lambda v: self._set_power(v),
            )
        self.register_constraint("raw_power", power_constraint)

        m = self.train_iteration_fidelity + self.train_data_fidelity

        if self.active_dims is not None:
            dimension = len(self.active_dims)

        if covar_module_1 is None:
            self.covar_module_1 = MaternKernel(
                nu=nu,
                batch_shape=self.batch_shape,
                lengthscale_prior=lengthscale_prior,
                ard_num_dims=dimension - m,
                lengthscale_constraint=lengthscale_constraint,
            )
        else:
            self.covar_module_1 = covar_module_1

        if covar_module_2 is None:
            self.covar_module_2 = MaternKernel(
                nu=nu,
                batch_shape=self.batch_shape,
                lengthscale_prior=lengthscale_2_prior,
                ard_num_dims=dimension - m,
                lengthscale_constraint=lengthscale_2_constraint,
            )
        else:
            self.covar_module_2 = covar_module_2
Example #20
    def sample_arch(self, START_BO, g, steps, hyperparams, og_flops, full_val_loss, target_flops=0):
        if args.slim:
            if target_flops == 0:
                parameterization = hyperparams.random_sample()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
            else:
                parameterization = np.ones(hyperparams.get_dim()) * args.lower_channel
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        else:
            # random samples to warm up the history for MOBO
            if g < START_BO:
                if target_flops == 0:
                    f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
                else:
                    f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
            # put the largest model into the history
            elif g == START_BO:
                if target_flops == 0:
                    parameterization = np.ones(hyperparams.get_dim())
                else:
                    f = args.lower_channel
                    parameterization = np.ones(hyperparams.get_dim()) * f
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
            # MOBO
            else:
                # this is the scalarization (lambda_{FLOPs})
                rand = torch.rand(1).cuda()

                # standardize data for building Gaussian Processes
                train_X = torch.FloatTensor(self.X).cuda()
                train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
                train_Y_loss = standardize(train_Y_loss)

                train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
                train_Y_cost = standardize(train_Y_cost)

                new_train_X = train_X
                # GP for the cross entropy loss
                gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
                mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
                mll = mll.to('cuda')
                fit_gpytorch_model(mll)


                # GP for FLOPs
                # we use an additive GP since FLOPs have (approximately) additive structure
                # the ScaleKernel and MaternKernel parameters simply follow the defaults
                covar_module = AdditiveStructureKernel(
                    ScaleKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                            num_dims=1
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    ),
                    num_dims=train_X.shape[1]
                )
                gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
                mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
                mll = mll.to('cuda')
                fit_gpytorch_model(mll)

                # Build acquisition functions
                UCB_loss = UpperConfidenceBound(gp_loss, beta=0.1).cuda()
                UCB_cost = UpperConfidenceBound(gp_cost, beta=0.1).cuda()

                # Combine them via augmented Tchebyshev scalarization
                self.mobo_obj = RandAcquisition(UCB_loss).cuda()
                self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

                # Bounds for the optimization variable (alpha)
                lower = torch.ones(new_train_X.shape[1])*args.lower_channel
                upper = torch.ones(new_train_X.shape[1])*args.upper_channel
                self.mobo_bounds = torch.stack([lower, upper]).cuda()

                # Pareto-aware sampling
                if args.pas:
                    # Generate approximate Pareto front first
                    costs = []
                    for i in range(len(self.population_data)):
                        costs.append([self.population_data[i]['loss'], self.population_data[i]['ratio']])
                    costs = np.array(costs)
                    efficient_mask = is_pareto_efficient(costs)
                    costs = costs[efficient_mask]
                    loss = costs[:, 0]
                    flops = costs[:, 1]
                    sorted_idx = np.argsort(flops)
                    loss = loss[sorted_idx]
                    flops = flops[sorted_idx]
                    if flops[0] > args.lower_flops:
                        flops = np.concatenate([[args.lower_flops], flops.reshape(-1)])
                        loss = np.concatenate([[8], loss.reshape(-1)])
                    else:
                        flops = flops.reshape(-1)
                        loss = loss.reshape(-1)

                    if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                        flops = np.concatenate([flops.reshape(-1), [args.upper_flops]])
                        loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
                    else:
                        flops = flops.reshape(-1)
                        loss = loss.reshape(-1)

                    # Equation (4) in paper
                    areas = (flops[1:]-flops[:-1])*(loss[:-1]-loss[1:])

                    # Quantize into 50 bins to sample from multinomial
                    self.sampling_weights = np.zeros(50)
                    k = 0
                    while k < len(flops) and flops[k] < args.lower_flops:
                        k+=1
                    for i in range(50):
                        lower = i/50.
                        upper = (i+1)/50.
                        if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                            continue
                        cnt = 1
                        while ((k+1) < len(flops)) and upper > flops[k+1]:
                            self.sampling_weights[i] += areas[k]
                            cnt += 1
                            k += 1
                        if k < len(areas):
                            self.sampling_weights[i] += areas[k]
                        self.sampling_weights[i] /= cnt
                    if np.sum(self.sampling_weights) == 0:
                        self.sampling_weights = np.ones(50)
                        
                    if target_flops == 0:
                        val = np.arange(0.01, 1, 0.02)
                        chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
                    else:
                        chosen_target_flops = target_flops
                    
                    # Binary search is here
                    lower_bnd, upper_bnd = 0, 1
                    lmda = 0.5
                    for i in range(10):
                        self.mobo_obj.rand = lmda

                        parameterization, acq_value = optimize_acqf(
                            self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                        )

                        parameterization = parameterization[0].cpu().numpy()
                        layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                        sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget)
                        ratio = sim_flops/og_flops

                        if np.abs(ratio - chosen_target_flops) <= 0.02:
                            break
                        if args.baseline > 0:
                            if ratio < chosen_target_flops:
                                lower_bnd = lmda
                                lmda = (lmda + upper_bnd) / 2
                            elif ratio > chosen_target_flops:
                                upper_bnd = lmda
                                lmda = (lmda + lower_bnd) / 2
                        else:
                            if ratio < chosen_target_flops:
                                upper_bnd = lmda
                                lmda = (lmda + lower_bnd) / 2
                            elif ratio > chosen_target_flops:
                                lower_bnd = lmda
                                lmda = (lmda + upper_bnd) / 2
                    rand[0] = lmda
                    writer.add_scalar('Binary search trials', i, steps)

                else:
                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                    )
                    parameterization = parameterization[0].cpu().numpy()

                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
Example #21
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        likelihood: Optional[MultitaskGaussianLikelihood] = None,
        data_covar_module: Optional[Module] = None,
        task_covar_prior: Optional[Prior] = None,
        rank: Optional[int] = None,
        input_transform: Optional[InputTransform] = None,
        outcome_transform: Optional[OutcomeTransform] = None,
        **kwargs: Any,
    ) -> None:
        r"""Multi-task GP with Kronecker structure, using a simple ICM kernel.

        Args:
            train_X: A `batch_shape x n x d` tensor of training features.
            train_Y: A `batch_shape x n x m` tensor of training observations.
            likelihood: A `MultitaskGaussianLikelihood`. If omitted, uses a
                `MultitaskGaussianLikelihood` with a `GammaPrior(1.1, 0.05)`
                noise prior.
            data_covar_module: The module computing the covariance (Kernel) matrix
                in data space. If omitted, use a `MaternKernel`.
            task_covar_prior: A Prior on the task covariance matrix. Must operate
                on p.s.d. matrices. A common prior for this is the `LKJ` prior. If
                omitted, uses `LKJCovariancePrior` with `eta` parameter as specified
                in the keyword arguments (if not specified, use `eta=1.5`).
            rank: The rank of the ICM kernel. If omitted, use a full rank kernel.
            kwargs: Additional arguments to override default settings of priors,
                including:
                - eta: The eta parameter on the default LKJ task_covar_prior.
                A value of 1.0 is uninformative, values <1.0 favor stronger
                correlations (in magnitude), correlations vanish as eta -> inf.
                - sd_prior: A scalar prior over nonnegative numbers, which is used
                for the default LKJCovariancePrior task_covar_prior.
                - likelihood_rank: The rank of the task covariance matrix to fit.
                Defaults to 0 (which corresponds to a diagonal covariance matrix).

        Example:
            >>> train_X = torch.rand(10, 2)
            >>> train_Y = torch.cat([f_1(X), f_2(X)], dim=-1)
            >>> model = KroneckerMultiTaskGP(train_X, train_Y)
        """
        with torch.no_grad():
            transformed_X = self.transform_inputs(
                X=train_X, input_transform=input_transform)
        if outcome_transform is not None:
            train_Y, _ = outcome_transform(train_Y)

        self._validate_tensor_args(X=transformed_X, Y=train_Y)
        self._num_outputs = train_Y.shape[-1]
        batch_shape, ard_num_dims = train_X.shape[:-2], train_X.shape[-1]
        num_tasks = train_Y.shape[-1]

        if rank is None:
            rank = num_tasks
        if likelihood is None:
            noise_prior = GammaPrior(1.1, 0.05)
            noise_prior_mode = (noise_prior.concentration -
                                1) / noise_prior.rate
            likelihood = MultitaskGaussianLikelihood(
                num_tasks=num_tasks,
                batch_shape=batch_shape,
                noise_prior=noise_prior,
                noise_constraint=GreaterThan(
                    MIN_INFERRED_NOISE_LEVEL,
                    transform=None,
                    initial_value=noise_prior_mode,
                ),
                rank=kwargs.get("likelihood_rank", 0),
            )
        if task_covar_prior is None:
            task_covar_prior = LKJCovariancePrior(
                n=num_tasks,
                eta=torch.tensor(kwargs.get("eta", 1.5)).to(train_X),
                sd_prior=kwargs.get(
                    "sd_prior",
                    SmoothedBoxPrior(math.exp(-6), math.exp(1.25), 0.05),
                ),
            )
        super().__init__(train_X, train_Y, likelihood)
        self.mean_module = MultitaskMean(
            base_means=ConstantMean(batch_shape=batch_shape),
            num_tasks=num_tasks)
        if data_covar_module is None:
            data_covar_module = MaternKernel(
                nu=2.5,
                ard_num_dims=ard_num_dims,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                batch_shape=batch_shape,
            )

        self.covar_module = MultitaskKernel(
            data_covar_module=data_covar_module,
            num_tasks=num_tasks,
            rank=rank,
            batch_shape=batch_shape,
            task_covar_prior=task_covar_prior,
        )

        if outcome_transform is not None:
            self.outcome_transform = outcome_transform
        if input_transform is not None:
            self.input_transform = input_transform
        self.to(train_X)
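A short usage sketch (not part of the snippet above): fitting the model and querying its joint posterior, under standard BoTorch imports; the two columns of `train_Y` stand in for arbitrary correlated outcomes.

    import torch
    from botorch.fit import fit_gpytorch_model
    from botorch.models import KroneckerMultiTaskGP
    from gpytorch.mlls import ExactMarginalLogLikelihood

    train_X = torch.rand(10, 2)
    train_Y = torch.stack([train_X.sum(-1), train_X.prod(-1)], dim=-1)  # 10 x 2
    model = KroneckerMultiTaskGP(train_X, train_Y)
    mll = ExactMarginalLogLikelihood(model.likelihood, model)
    fit_gpytorch_model(mll)
    posterior = model.posterior(torch.rand(5, 2))  # posterior.mean: 5 x 2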
Example #22
    def sample_arch(self,
                    START_BO,
                    g,
                    hyperparams,
                    og_flops,
                    empty_val_loss,
                    full_val_loss,
                    target_flops=0):
        # Warming up the history with a single width-multiplier
        if g < START_BO:
            if target_flops == 0:
                f = np.random.rand(1) * (
                    args.upper_channel - args.lower_channel
                ) + args.lower_channel
            else:
                f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
        # Put largest model into the history
        elif g == START_BO:
            if target_flops == 0:
                parameterization = np.ones(hyperparams.get_dim())
            else:
                f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
        # MOBO-RS
        else:
            rand = torch.rand(1).cuda()

            train_X = torch.FloatTensor(self.X).cuda()
            train_Y_loss = torch.FloatTensor(
                np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
            train_Y_loss = standardize(train_Y_loss)

            train_Y_cost = torch.FloatTensor(
                np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
            train_Y_cost = standardize(train_Y_cost)

            new_train_X = train_X
            gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
            mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            # Use add-gp for cost
            covar_module = AdditiveStructureKernel(
                ScaleKernel(
                    MaternKernel(nu=2.5,
                                 lengthscale_prior=GammaPrior(3.0, 6.0),
                                 num_dims=1),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                ),
                num_dims=train_X.shape[1],
            )
            gp_cost = SingleTaskGP(new_train_X,
                                   train_Y_cost,
                                   covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            UCB_loss = UpperConfidenceBound(gp_loss).cuda()
            UCB_cost = UpperConfidenceBound(gp_cost).cuda()
            self.mobo_obj = RandAcquisition(UCB_loss).cuda()
            self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

            lower = torch.ones(new_train_X.shape[1]) * args.lower_channel
            upper = torch.ones(new_train_X.shape[1]) * args.upper_channel
            self.mobo_bounds = torch.stack([lower, upper]).cuda()

            if args.pas:
                costs = []
                for i in range(len(self.population_data)):
                    costs.append([
                        self.population_data[i]['loss'],
                        self.population_data[i]['ratio']
                    ])
                costs = np.array(costs)
                efficient_mask = is_pareto_efficient(costs)
                costs = costs[efficient_mask]
                loss = costs[:, 0]
                flops = costs[:, 1]
                sorted_idx = np.argsort(flops)
                loss = loss[sorted_idx]
                flops = flops[sorted_idx]
                if flops[0] > args.lower_flops:
                    flops = np.concatenate([[args.lower_flops],
                                            flops.reshape(-1)])
                    loss = np.concatenate([[empty_val_loss], loss.reshape(-1)])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                    flops = np.concatenate(
                        [flops.reshape(-1), [args.upper_flops]])
                    loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                areas = (flops[1:] - flops[:-1]) * (loss[:-1] - loss[1:])

                self.sampling_weights = np.zeros(50)
                k = 0
                while k < len(flops) and flops[k] < args.lower_flops:
                    k += 1
                for i in range(50):
                    lower = i / 50.
                    upper = (i + 1) / 50.
                    if (upper < args.lower_flops or lower > args.upper_flops
                            or lower < args.lower_flops):
                        continue
                    cnt = 1
                    while ((k + 1) < len(flops)) and upper > flops[k + 1]:
                        self.sampling_weights[i] += areas[k]
                        cnt += 1
                        k += 1
                    if k < len(areas):
                        self.sampling_weights[i] += areas[k]
                    self.sampling_weights[i] /= cnt
                if np.sum(self.sampling_weights) == 0:
                    self.sampling_weights = np.ones(50)

                if target_flops == 0:
                    val = np.arange(0.01, 1, 0.02)
                    chosen_target_flops = np.random.choice(
                        val,
                        p=(self.sampling_weights /
                           np.sum(self.sampling_weights)))
                else:
                    chosen_target_flops = target_flops

                lower_bnd, upper_bnd = 0, 1
                lmda = 0.5
                for i in range(10):
                    self.mobo_obj.rand = lmda

                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj,
                        bounds=self.mobo_bounds,
                        q=1,
                        num_restarts=5,
                        raw_samples=1000,
                    )

                    parameterization = parameterization[0].cpu().numpy()
                    layer_budget = hyperparams.get_layer_budget_from_parameterization(
                        parameterization, self.mask_pruner)
                    sim_flops = self.mask_pruner.simulate_and_count_flops(
                        layer_budget)
                    ratio = sim_flops / og_flops

                    if np.abs(ratio - chosen_target_flops) <= 0.02:
                        break
                    if args.baseline > 0:
                        if ratio < chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                        elif ratio > chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                    else:
                        if ratio < chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                        elif ratio > chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
                writer.add_scalar('Binary search trials', i, g)

            else:
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj,
                    bounds=self.mobo_bounds,
                    q=1,
                    num_restarts=5,
                    raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()

            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
        return layer_budget, parameterization, self.sampling_weights / np.sum(
            self.sampling_weights)
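The `is_pareto_efficient` helper used above is not shown in the snippet; a common NumPy implementation (a sketch, assuming both columns of `costs` are to be minimized) is:

    import numpy as np

    def is_pareto_efficient(costs):
        # Boolean mask of rows not dominated by any other row.
        is_efficient = np.ones(costs.shape[0], dtype=bool)
        for i, c in enumerate(costs):
            if is_efficient[i]:
                # Keep only points strictly better than c in some objective.
                is_efficient[is_efficient] = np.any(
                    costs[is_efficient] < c, axis=1)
                is_efficient[i] = True  # keep c itself
        return is_efficient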
Example #23
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        task_feature: int,
        covar_module: Optional[Module] = None,
        task_covar_prior: Optional[Prior] = None,
        output_tasks: Optional[List[int]] = None,
        rank: Optional[int] = None,
        input_transform: Optional[InputTransform] = None,
        outcome_transform: Optional[OutcomeTransform] = None,
    ) -> None:
        r"""Multi-Task GP model using an ICM kernel, inferring observation noise.

        Args:
            train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor
                of training data. One of the columns should contain the task
                features (see `task_feature` argument).
            train_Y: A `n x 1` or `b x n x 1` (batch mode) tensor of training
                observations.
            task_feature: The index of the task feature (`-d <= task_feature <= d`).
            output_tasks: A list of task indices for which to compute model
                outputs. If omitted, return outputs for all task indices.
            rank: The rank to be used for the index kernel. If omitted, use a
                full rank (i.e. number of tasks) kernel.
            task_covar_prior: A Prior on the task covariance matrix. Must operate
                on p.s.d. matrices. A common prior for this is the `LKJ` prior.
            input_transform: An input transform that is applied in the model's
                forward pass.
            outcome_transform: An outcome transform that is applied to the
                training data during instantiation and to the posterior during
                inference.

        Example:
            >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
            >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
            >>> train_X = torch.cat([
            >>>     torch.cat([X1, i1], -1), torch.cat([X2, i2], -1),
            >>> ])
            >>> train_Y = torch.cat([f1(X1), f2(X2)]).unsqueeze(-1)
            >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1)
        """
        with torch.no_grad():
            transformed_X = self.transform_inputs(
                X=train_X, input_transform=input_transform)
        self._validate_tensor_args(X=transformed_X, Y=train_Y)
        all_tasks, task_feature, d = self.get_all_tasks(
            transformed_X, task_feature, output_tasks)
        if outcome_transform is not None:
            train_Y, _ = outcome_transform(train_Y)

        # squeeze output dim
        train_Y = train_Y.squeeze(-1)
        if output_tasks is None:
            output_tasks = all_tasks
        else:
            if set(output_tasks) - set(all_tasks):
                raise RuntimeError(
                    "All output tasks must be present in input data.")
        self._output_tasks = output_tasks
        self._num_outputs = len(output_tasks)

        # TODO (T41270962): Support task-specific noise levels in likelihood
        likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05))

        # construct indexer to be used in forward
        self._task_feature = task_feature
        self._base_idxr = torch.arange(d)
        self._base_idxr[task_feature:] += 1  # exclude task feature

        super().__init__(train_inputs=train_X,
                         train_targets=train_Y,
                         likelihood=likelihood)
        self.mean_module = ConstantMean()
        if covar_module is None:
            self.covar_module = ScaleKernel(
                base_kernel=MaternKernel(nu=2.5,
                                         ard_num_dims=d,
                                         lengthscale_prior=GammaPrior(
                                             3.0, 6.0)),
                outputscale_prior=GammaPrior(2.0, 0.15),
            )
        else:
            self.covar_module = covar_module

        num_tasks = len(all_tasks)
        self._rank = rank if rank is not None else num_tasks

        self.task_covar_module = IndexKernel(num_tasks=num_tasks,
                                             rank=self._rank,
                                             prior=task_covar_prior)
        if input_transform is not None:
            self.input_transform = input_transform
        if outcome_transform is not None:
            self.outcome_transform = outcome_transform
        self.to(train_X)
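A small check (illustrative only) of the `_base_idxr` construction above: with d = 3 data columns and the task column at index 1, the indexer selects every column except the task feature.

    import torch

    d, task_feature = 3, 1
    base_idxr = torch.arange(d)    # tensor([0, 1, 2])
    base_idxr[task_feature:] += 1  # tensor([0, 2, 3]): column 1 (task) is skipped
    X = torch.rand(5, d + 1)
    data_x = X[..., base_idxr]        # the d non-task columns
    task_idcs = X[..., task_feature]  # the task indices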
Example #24
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        task_feature: int,
        output_tasks: Optional[List[int]] = None,
        rank: Optional[int] = None,
    ) -> None:
        r"""Multi-Task GP model using an ICM kernel, inferring observation noise.

        Args:
            train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor
                of training data. One of the columns should contain the task
                features (see `task_feature` argument).
            train_Y: A `n` or `b x n` (batch mode) tensor of training
                observations.
            task_feature: The index of the task feature
                (`-d <= task_feature <= d`).
            output_tasks: A list of task indices for which to compute model
                outputs. If omitted, return outputs for all task indices.
            rank: The rank to be used for the index kernel. If omitted, use a
                full rank (i.e. number of tasks) kernel.

        Example:
            >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
            >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
            >>> train_X = torch.cat([
            >>>     torch.cat([X1, i1], -1), torch.cat([X2, i2], -1),
            >>> ])
            >>> train_Y = torch.cat([f1(X1), f2(X2)])
            >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1)
        """
        if train_X.ndimension() != 2:
            # Currently, batch mode MTGPs are blocked upstream in GPyTorch
            raise ValueError(f"Unsupported shape {train_X.shape} for train_X.")
        d = train_X.shape[-1] - 1
        if not (-d <= task_feature <= d):
            raise ValueError(f"Must have that -{d} <= task_feature <= {d}")
        all_tasks = train_X[:, task_feature].unique().to(
            dtype=torch.long).tolist()
        if output_tasks is None:
            output_tasks = all_tasks
        else:
            if any(t not in all_tasks for t in output_tasks):
                raise RuntimeError(
                    "All output tasks must be present in input data.")
        self._output_tasks = output_tasks

        # TODO (T41270962): Support task-specific noise levels in likelihood
        likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05))

        # construct indexer to be used in forward
        self._task_feature = task_feature
        self._base_idxr = torch.arange(d)
        self._base_idxr[task_feature:] += 1  # exclude task feature

        super().__init__(train_inputs=train_X,
                         train_targets=train_Y,
                         likelihood=likelihood)
        self.mean_module = ConstantMean()
        self.covar_module = ScaleKernel(
            base_kernel=MaternKernel(nu=2.5,
                                     ard_num_dims=d,
                                     lengthscale_prior=GammaPrior(3.0, 6.0)),
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        num_tasks = len(all_tasks)
        self._rank = rank if rank is not None else num_tasks
        # TODO: Add LKJ prior for the index kernel
        self.task_covar_module = IndexKernel(num_tasks=num_tasks,
                                             rank=self._rank)
        self.to(train_X)
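A runnable version of the docstring example above, with concrete stand-ins for f1 and f2 so the expected shapes can be checked (the final line assumes the class above is in scope as MultiTaskGP):

    import torch

    X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
    i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
    train_X = torch.cat([torch.cat([X1, i1], -1),
                         torch.cat([X2, i2], -1)])  # 30 x 3, task feature last
    train_Y = torch.cat([X1.sum(-1), X2.prod(-1)])  # shape (30,)
    model = MultiTaskGP(train_X, train_Y, task_feature=-1)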