Example #1
    def __init__(
        self, decomposition: Dict[str, List[int]], batch_shape: torch.Size
    ) -> None:
        super().__init__(batch_shape=batch_shape)
        self.decomposition = decomposition

        num_param = len(next(iter(decomposition.values())))
        for active_parameters in decomposition.values():
            # check that each context uses the same number of active parameters
            if len(active_parameters) != num_param:
                raise ValueError(
                    "num of parameters needs to be same across all contexts"
                )

        self._indexers = {
            context: torch.tensor(active_params)
            for context, active_params in self.decomposition.items()
        }

        self.base_kernel = MaternKernel(
            nu=2.5,
            ard_num_dims=num_param,
            batch_shape=batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
        )

        self.kernel_dict = {}  # scaled kernel for each parameter space partition
        for context in list(decomposition.keys()):
            self.kernel_dict[context] = ScaleKernel(
                base_kernel=self.base_kernel, outputscale_prior=GammaPrior(2.0, 15.0)
            )
        self.kernel_dict = ModuleDict(self.kernel_dict)
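A minimal sketch of the decomposition format this constructor expects (the dictionary and variable names below are illustrative only): every context maps to the same number of active parameter indices, and each index list is turned into an index tensor, mirroring the `_indexers` construction above.

    import torch

    # Each context is assigned the same number of active parameter indices.
    decomposition = {"context_a": [0, 1], "context_b": [2, 3]}
    indexers = {
        context: torch.tensor(active_params)
        for context, active_params in decomposition.items()
    }
    # indexers["context_a"] can then be used to select columns 0 and 1 of the input.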
Example #2
 def __init__(
     self,
     train_X: Tensor,
     train_Y: Tensor,
     nu: float = 2.5,
     train_iteration_fidelity: bool = True,
     train_data_fidelity: bool = True,
     likelihood: Optional[Likelihood] = None,
 ) -> None:
     if not train_iteration_fidelity and not train_data_fidelity:
         raise UnsupportedError(
             "You should have at least one fidelity parameter.")
     self._set_dimensions(train_X=train_X, train_Y=train_Y)
     kernel = LinearTruncatedFidelityKernel(
         nu=nu,
         dimension=train_X.shape[-1],
         train_iteration_fidelity=train_iteration_fidelity,
         train_data_fidelity=train_data_fidelity,
         batch_shape=self._aug_batch_shape,
         power_prior=GammaPrior(3.0, 3.0),
     )
     covar_module = ScaleKernel(
         kernel,
         batch_shape=self._aug_batch_shape,
         outputscale_prior=GammaPrior(2.0, 0.15),
     )
     super().__init__(train_X=train_X,
                      train_Y=train_Y,
                      covar_module=covar_module)
     self.to(train_X)
Example #3
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        likelihood: Optional[Likelihood] = None,
        covar_module: Optional[Module] = None,
    ) -> None:
        r"""A single-task exact GP model.

        Args:
            train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training
                features.
            train_Y: A `n x m` or `batch_shape x n x m` (batch mode) tensor of
                training observations.
            likelihood: A likelihood. If omitted, use a standard
                GaussianLikelihood with inferred noise level.
            covar_module: The module computing the covariance (kernel) matrix.
                If omitted, use a `MaternKernel`.

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
            >>> model = SingleTaskGP(train_X, train_Y)
        """
        validate_input_scaling(train_X=train_X, train_Y=train_Y)
        self._validate_tensor_args(X=train_X, Y=train_Y)
        self._set_dimensions(train_X=train_X, train_Y=train_Y)
        train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y)
        if likelihood is None:
            noise_prior = GammaPrior(1.1, 0.05)
            noise_prior_mode = (noise_prior.concentration -
                                1) / noise_prior.rate
            likelihood = GaussianLikelihood(
                noise_prior=noise_prior,
                batch_shape=self._aug_batch_shape,
                noise_constraint=GreaterThan(
                    MIN_INFERRED_NOISE_LEVEL,
                    transform=None,
                    initial_value=noise_prior_mode,
                ),
            )
        else:
            self._is_custom_likelihood = True
        ExactGP.__init__(self, train_X, train_Y, likelihood)
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        if covar_module is None:
            self.covar_module = ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    ard_num_dims=train_X.shape[-1],
                    batch_shape=self._aug_batch_shape,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                ),
                batch_shape=self._aug_batch_shape,
                outputscale_prior=GammaPrior(2.0, 0.15),
            )
        else:
            self.covar_module = covar_module
        self.to(train_X)
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        train_Yvar: Tensor,
        outcome_transform: Optional[OutcomeTransform] = None,
    ) -> None:
        r"""A single-task exact GP model using fixed noise levels.

        Args:
            train_X: A `batch_shape x n x d` tensor of training features.
            train_Y: A `batch_shape x n x m` tensor of training observations.
            train_Yvar: A `batch_shape x n x m` tensor of observed measurement
                noise.
            outcome_transform: An outcome transform that is applied to the
                training data during instantiation and to the posterior during
                inference (that is, the `Posterior` obtained by calling
                `.posterior` on the model will be on the original scale).

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
            >>> train_Yvar = torch.full_like(train_Y, 0.2)
            >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar)
        """
        if outcome_transform is not None:
            train_Y, train_Yvar = outcome_transform(train_Y, train_Yvar)
        validate_input_scaling(train_X=train_X,
                               train_Y=train_Y,
                               train_Yvar=train_Yvar)
        self._validate_tensor_args(X=train_X, Y=train_Y, Yvar=train_Yvar)
        self._set_dimensions(train_X=train_X, train_Y=train_Y)
        train_X, train_Y, train_Yvar = self._transform_tensor_args(
            X=train_X, Y=train_Y, Yvar=train_Yvar)
        likelihood = FixedNoiseGaussianLikelihood(
            noise=train_Yvar, batch_shape=self._aug_batch_shape)
        ExactGP.__init__(self,
                         train_inputs=train_X,
                         train_targets=train_Y,
                         likelihood=likelihood)
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        self.covar_module = ScaleKernel(
            base_kernel=MaternKernel(
                nu=2.5,
                ard_num_dims=train_X.shape[-1],
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        if outcome_transform is not None:
            self.outcome_transform = outcome_transform
        self._subset_batch_dict = {
            "mean_module.constant": -2,
            "covar_module.raw_outputscale": -1,
            "covar_module.base_kernel.raw_lengthscale": -3,
        }
        self.to(train_X)
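Once constructed, the model can be queried for a posterior on new points; a short hedged sketch using the inputs from the docstring example above (the test points and import path are assumptions, following the usual BoTorch layout):

    import torch
    from botorch.models import FixedNoiseGP

    train_X = torch.rand(20, 2)
    train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
    train_Yvar = torch.full_like(train_Y, 0.2)
    model = FixedNoiseGP(train_X, train_Y, train_Yvar)
    with torch.no_grad():
        posterior = model.posterior(torch.rand(5, 2))
        mean, variance = posterior.mean, posterior.variance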
Example #5
    def test_sample_all_priors(self, cuda=False):
        device = torch.device("cuda" if cuda else "cpu")
        for dtype in (torch.float, torch.double):
            train_X = torch.rand(3, 5, device=device, dtype=dtype)
            train_Y = torch.rand(3, 1, device=device, dtype=dtype)
            model = SingleTaskGP(train_X=train_X, train_Y=train_Y)
            mll = ExactMarginalLogLikelihood(model.likelihood, model)
            mll.to(device=device, dtype=dtype)
            original_state_dict = dict(deepcopy(mll.model.state_dict()))
            sample_all_priors(model)

            # make sure one of the hyperparameters changed
            self.assertTrue(
                dict(model.state_dict())["likelihood.noise_covar.raw_noise"] !=
                original_state_dict["likelihood.noise_covar.raw_noise"])
            # check that lengthscales are all different
            ls = model.covar_module.base_kernel.raw_lengthscale.view(
                -1).tolist()
            self.assertTrue(all(ls[0] != ls[i] for i in range(1, len(ls))))

            # change one of the priors to SmoothedBoxPrior
            model.covar_module = ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    ard_num_dims=model.train_inputs[0].shape[-1],
                    batch_shape=model._aug_batch_shape,
                    lengthscale_prior=SmoothedBoxPrior(3.0, 6.0),
                ),
                batch_shape=model._aug_batch_shape,
                outputscale_prior=GammaPrior(2.0, 0.15),
            )
            original_state_dict = dict(deepcopy(mll.model.state_dict()))
            with warnings.catch_warnings(
                    record=True) as ws, settings.debug(True):
                sample_all_priors(model)
                self.assertEqual(len(ws), 1)
                self.assertTrue("rsample" in str(ws[0].message))

            # the lengthscale should not have changed because sampling is
            # not implemented for SmoothedBoxPrior
            self.assertTrue(
                torch.equal(
                    dict(model.state_dict())
                    ["covar_module.base_kernel.raw_lengthscale"],
                    original_state_dict[
                        "covar_module.base_kernel.raw_lengthscale"],
                ))

            # set setting_closure to None and make sure RuntimeError is raised
            prior_tuple = model.likelihood.noise_covar._priors["noise_prior"]
            model.likelihood.noise_covar._priors["noise_prior"] = (
                prior_tuple[0],
                prior_tuple[1],
                None,
            )
            with self.assertRaises(RuntimeError):
                sample_all_priors(model)
Example #6
 def __init__(
     self, B: Tensor, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor
 ) -> None:
     super().__init__(train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar)
     self.covar_module = ScaleKernel(
         base_kernel=ALEBOKernel(B=B, batch_shape=self._aug_batch_shape),
         batch_shape=self._aug_batch_shape,
     )
     self.to(train_X)
Example #7
 def __init__(
     self,
     train_X: Tensor,
     train_Y: Tensor,
     train_iteration_fidelity: bool = True,
     train_data_fidelity: bool = True,
     likelihood: Optional[Likelihood] = None,
 ) -> None:
     train_X, train_Y, _ = self._set_dimensions(train_X=train_X,
                                                train_Y=train_Y)
     num_fidelity = train_iteration_fidelity + train_data_fidelity
     ard_num_dims = train_X.shape[-1] - num_fidelity
     active_dimsX = list(range(train_X.shape[-1] - num_fidelity))
     rbf_kernel = RBFKernel(
         ard_num_dims=ard_num_dims,
         batch_shape=self._aug_batch_shape,
         lengthscale_prior=GammaPrior(3.0, 6.0),
         active_dims=active_dimsX,
     )
     exp_kernel = ExpDecayKernel(
         batch_shape=self._aug_batch_shape,
         lengthscale_prior=GammaPrior(3.0, 6.0),
         offset_prior=GammaPrior(3.0, 6.0),
         power_prior=GammaPrior(3.0, 6.0),
     )
     ds_kernel = DownsamplingKernel(
         batch_shape=self._aug_batch_shape,
         offset_prior=GammaPrior(3.0, 6.0),
         power_prior=GammaPrior(3.0, 6.0),
     )
     if train_iteration_fidelity and train_data_fidelity:
         active_dimsS1 = [train_X.shape[-1] - 1]
         active_dimsS2 = [train_X.shape[-1] - 2]
         exp_kernel.active_dims = torch.tensor(active_dimsS1)
         ds_kernel.active_dims = torch.tensor(active_dimsS2)
         kernel = rbf_kernel * exp_kernel * ds_kernel
     elif train_iteration_fidelity or train_data_fidelity:
         active_dimsS = [train_X.shape[-1] - 1]
         if train_iteration_fidelity:
             exp_kernel.active_dims = torch.tensor(active_dimsS)
             kernel = rbf_kernel * exp_kernel
         else:
             ds_kernel.active_dims = torch.tensor(active_dimsS)
             kernel = rbf_kernel * ds_kernel
     else:
         raise UnsupportedError(
             "You should have at least one fidelity parameter.")
     covar_module = ScaleKernel(
         kernel,
         batch_shape=self._aug_batch_shape,
         outputscale_prior=GammaPrior(2.0, 0.15),
     )
     super().__init__(train_X=train_X,
                      train_Y=train_Y,
                      covar_module=covar_module)
     self.to(train_X)
Example #8
    def __init__(self,
                 train_X: Tensor,
                 train_Y: Tensor,
                 likelihood: Optional[Likelihood] = None) -> None:
        r"""A single-task exact GP model.

        Args:
            train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training
                features.
            train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of
                training observations.
            likelihood: A likelihood. If omitted, use a standard
                GaussianLikelihood with inferred noise level.

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1])
            >>> model = SingleTaskGP(train_X, train_Y)
        """
        ard_num_dims = train_X.shape[-1]
        train_X, train_Y, _ = self._set_dimensions(train_X=train_X,
                                                   train_Y=train_Y)
        train_X, train_Y, _ = multioutput_to_batch_mode_transform(
            train_X=train_X, train_Y=train_Y, num_outputs=self._num_outputs)
        if likelihood is None:
            noise_prior = GammaPrior(1.1, 0.05)
            noise_prior_mode = (noise_prior.concentration -
                                1) / noise_prior.rate
            likelihood = GaussianLikelihood(
                noise_prior=noise_prior,
                batch_shape=self._aug_batch_shape,
                noise_constraint=GreaterThan(
                    MIN_INFERRED_NOISE_LEVEL,
                    transform=None,
                    initial_value=noise_prior_mode,
                ),
            )
        else:
            self._likelihood_state_dict = deepcopy(likelihood.state_dict())
        ExactGP.__init__(self, train_X, train_Y, likelihood)
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        self.covar_module = ScaleKernel(
            MaternKernel(
                nu=2.5,
                ard_num_dims=ard_num_dims,
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        self.to(train_X)
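A typical follow-up to the constructor above is fitting the hyperparameters by maximizing the exact marginal log likelihood; a hedged sketch that mirrors the docstring example (shapes and import paths follow the usual BoTorch/GPyTorch layout and are assumptions here):

    import torch
    from botorch.fit import fit_gpytorch_model
    from botorch.models import SingleTaskGP
    from gpytorch.mlls import ExactMarginalLogLikelihood

    train_X = torch.rand(20, 2)
    train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1])
    model = SingleTaskGP(train_X, train_Y)
    mll = ExactMarginalLogLikelihood(model.likelihood, model)
    fit_gpytorch_model(mll)  # optimizes kernel and likelihood hyperparameters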
Example #9
    def __init__(
        self,
        train_x: torch.Tensor,
        train_y: torch.Tensor,
        inducing_points: torch.Tensor,
        scales: Union[torch.Tensor, float] = 1.0,
        mean_module: Optional[Mean] = None,
        covar_module: Optional[Kernel] = None,
        fixed_prior_mean: Optional[float] = None,
    ) -> None:
        variational_distribution = CholeskyVariationalDistribution(
            inducing_points.size(0))
        variational_distribution.to(train_x)
        variational_strategy = VariationalStrategy(
            model=self,
            inducing_points=inducing_points,
            variational_distribution=variational_distribution,
            learn_inducing_locations=False,
        )
        super(MixedDerivativeVariationalGP,
              self).__init__(variational_strategy)

        # Set the mean module if specified; otherwise default to a constant mean
        if mean_module is None:
            self.mean_module = ConstantMeanPartialObsGrad()
        else:
            self.mean_module = mean_module

        if fixed_prior_mean is not None:
            self.mean_module.constant.requires_grad_(False)
            self.mean_module.constant.copy_(
                torch.tensor([fixed_prior_mean], dtype=train_x.dtype))

        if covar_module is None:
            self.base_kernel = RBFKernelPartialObsGrad(
                ard_num_dims=train_x.shape[-1] - 1,
                lengthscale_prior=GammaPrior(3.0, 6.0 / scales),
            )
            self.covar_module = ScaleKernel(self.base_kernel,
                                            outputscale_prior=GammaPrior(
                                                2.0, 0.15))
        else:
            self.covar_module = covar_module

        self._num_outputs = 1
        self.train_inputs = (train_x, )
        self.train_targets = train_y
        self(train_x)  # Necessary for CholeskyVariationalDistribution
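Variational models like the one above are typically trained against a variational ELBO rather than an exact marginal log likelihood. A rough training-loop sketch follows; `model`, `train_x`, and `train_y` stand for an instance of the class above and its training data, and the Bernoulli likelihood is only an assumption about how the latent GP is observed:

    import torch
    from gpytorch.likelihoods import BernoulliLikelihood
    from gpytorch.mlls import VariationalELBO

    likelihood = BernoulliLikelihood()
    mll = VariationalELBO(likelihood, model, num_data=train_y.numel())
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    for _ in range(200):
        optimizer.zero_grad()
        loss = -mll(model(train_x), train_y)
        loss.backward()
        optimizer.step()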
Example #10
    def __init__(self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor) -> None:
        r"""A single-task exact GP model using fixed noise levels.

        Args:
            train_X: A `n x d` or `batch_shape x n x d` (batch mode) tensor of training
                features.
            train_Y: A `n x (o)` or `batch_shape x n x (o)` (batch mode) tensor of
                training observations.
            train_Yvar: A `n x (o)` or `batch_shape x n x (o)`
                (batch mode) tensor of observed measurement noise.

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X[:, 0]) + torch.cos(train_X[:, 1])
            >>> train_Yvar = torch.full_like(train_Y, 0.2)
            >>> model = FixedNoiseGP(train_X, train_Y, train_Yvar)
        """
        ard_num_dims = train_X.shape[-1]
        train_X, train_Y, train_Yvar = self._set_dimensions(
            train_X=train_X, train_Y=train_Y, train_Yvar=train_Yvar
        )
        train_X, train_Y, train_Yvar = multioutput_to_batch_mode_transform(
            train_X=train_X,
            train_Y=train_Y,
            num_outputs=self._num_outputs,
            train_Yvar=train_Yvar,
        )
        likelihood = FixedNoiseGaussianLikelihood(
            noise=train_Yvar, batch_shape=self._aug_batch_shape
        )
        ExactGP.__init__(
            self, train_inputs=train_X, train_targets=train_Y, likelihood=likelihood
        )
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        self.covar_module = ScaleKernel(
            base_kernel=MaternKernel(
                nu=2.5,
                ard_num_dims=ard_num_dims,
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        self.to(train_X)
Example #11
 def __init__(self, train_X: Tensor, train_Y: Tensor,
              train_Yvar: Tensor) -> None:
     self._validate_tensor_args(X=train_X, Y=train_Y, Yvar=train_Yvar)
     self._set_dimensions(train_X=train_X, train_Y=train_Y)
     train_X, train_Y, train_Yvar = self._transform_tensor_args(
         X=train_X, Y=train_Y, Yvar=train_Yvar)
     likelihood = FixedNoiseGaussianLikelihood(
         noise=train_Yvar, batch_shape=self._aug_batch_shape)
     ExactGP.__init__(self,
                      train_inputs=train_X,
                      train_targets=train_Y,
                      likelihood=likelihood)
     self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
     self.covar_module = ScaleKernel(
         base_kernel=RBFKernel(
             ard_num_dims=train_X.shape[-1],
             batch_shape=self._aug_batch_shape,
         ),
         batch_shape=self._aug_batch_shape,
     )
     self.to(train_X)
Example #12
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        cat_dims: List[int],
        cont_kernel_factory: Optional[Callable[[torch.Size, int, List[int]], Kernel]] = None,
        likelihood: Optional[Likelihood] = None,
        outcome_transform: Optional[OutcomeTransform] = None,  # TODO
        input_transform: Optional[InputTransform] = None,  # TODO
    ) -> None:
        r"""A single-task exact GP model supporting categorical parameters.

        Args:
            train_X: A `batch_shape x n x d` tensor of training features.
            train_Y: A `batch_shape x n x m` tensor of training observations.
            cat_dims: A list of indices corresponding to the columns of
                the input `X` that should be considered categorical features.
            cont_kernel_factory: A method that accepts `batch_shape`, `ard_num_dims`,
                and `active_dims` arguments and returns an instantiated GPyTorch
                `Kernel` object to be used as the base kernel for the continuous
                dimensions. If omitted, this model uses a Matern-2.5 kernel as
                the kernel for the ordinal parameters.
            likelihood: A likelihood. If omitted, use a standard
                GaussianLikelihood with inferred noise level.
            # outcome_transform: An outcome transform that is applied to the
            #     training data during instantiation and to the posterior during
            #     inference (that is, the `Posterior` obtained by calling
            #     `.posterior` on the model will be on the original scale).
            # input_transform: An input transform that is applied in the model's
            #     forward pass.

        Example:
            >>> train_X = torch.cat(
                    [torch.rand(20, 2), torch.randint(3, (20, 1))], dim=-1
                )
            >>> train_Y = (
                    torch.sin(train_X[..., :-1]).sum(dim=1, keepdim=True)
                    + train_X[..., -1:]
                )
            >>> model = MixedSingleTaskGP(train_X, train_Y, cat_dims=[-1])
        """
        if outcome_transform is not None:
            raise UnsupportedError("outcome transforms not yet supported")
        if input_transform is not None:
            raise UnsupportedError("input transforms not yet supported")
        if len(cat_dims) == 0:
            raise ValueError(
                "Must specify categorical dimensions for MixedSingleTaskGP"
            )
        input_batch_shape, aug_batch_shape = self.get_batch_dimensions(
            train_X=train_X, train_Y=train_Y
        )

        if cont_kernel_factory is None:

            def cont_kernel_factory(
                batch_shape: torch.Size, ard_num_dims: int, active_dims: List[int]
            ) -> MaternKernel:
                return MaternKernel(
                    nu=2.5,
                    batch_shape=batch_shape,
                    ard_num_dims=ard_num_dims,
                    active_dims=active_dims,
                )

        if likelihood is None:
            # This Gamma prior is quite close to the Horseshoe prior
            min_noise = 1e-5 if train_X.dtype == torch.float else 1e-6
            likelihood = GaussianLikelihood(
                batch_shape=aug_batch_shape,
                noise_constraint=GreaterThan(
                    min_noise, transform=None, initial_value=1e-3
                ),
                noise_prior=GammaPrior(0.9, 10.0),
            )

        d = train_X.shape[-1]
        cat_dims = normalize_indices(indices=cat_dims, d=d)
        ord_dims = sorted(set(range(d)) - set(cat_dims))
        if len(ord_dims) == 0:
            covar_module = ScaleKernel(
                CategoricalKernel(
                    batch_shape=aug_batch_shape,
                    ard_num_dims=len(cat_dims),
                )
            )
        else:
            sum_kernel = ScaleKernel(
                cont_kernel_factory(
                    batch_shape=aug_batch_shape,
                    ard_num_dims=len(ord_dims),
                    active_dims=ord_dims,
                )
                + ScaleKernel(
                    CategoricalKernel(
                        batch_shape=aug_batch_shape,
                        ard_num_dims=len(cat_dims),
                        active_dims=cat_dims,
                    )
                )
            )
            prod_kernel = ScaleKernel(
                cont_kernel_factory(
                    batch_shape=aug_batch_shape,
                    ard_num_dims=len(ord_dims),
                    active_dims=ord_dims,
                )
                * CategoricalKernel(
                    batch_shape=aug_batch_shape,
                    ard_num_dims=len(cat_dims),
                    active_dims=cat_dims,
                )
            )
            covar_module = sum_kernel + prod_kernel
        super().__init__(
            train_X=train_X,
            train_Y=train_Y,
            likelihood=likelihood,
            covar_module=covar_module,
            outcome_transform=outcome_transform,
            input_transform=input_transform,
        )
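The `cont_kernel_factory` argument makes the kernel over the continuous (ordinal) dimensions pluggable. A hedged sketch of supplying a custom factory with the same call signature as the default factory defined above, here swapping in an RBF kernel; the data mirrors the docstring example, and the `botorch.models` import path is an assumption:

    import torch
    from botorch.models import MixedSingleTaskGP
    from gpytorch.kernels import Kernel, RBFKernel

    def rbf_factory(batch_shape, ard_num_dims, active_dims) -> Kernel:
        # Same signature as the default Matern factory above.
        return RBFKernel(
            batch_shape=batch_shape,
            ard_num_dims=ard_num_dims,
            active_dims=active_dims,
        )

    train_X = torch.cat(
        [torch.rand(20, 2), torch.randint(3, (20, 1)).to(torch.float)], dim=-1
    )
    train_Y = torch.sin(train_X[..., :-1]).sum(dim=1, keepdim=True) + train_X[..., -1:]
    model = MixedSingleTaskGP(
        train_X, train_Y, cat_dims=[-1], cont_kernel_factory=rbf_factory
    )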
Example #13
    def sample_arch(self, START_BO, g, hyperparams, og_flops, empty_val_loss, full_val_loss, target_flops=0):
        if g < START_BO:
            if target_flops == 0:
                f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
            else:
                f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        elif g == START_BO:
            if target_flops == 0:
                parameterization = np.ones(hyperparams.get_dim())
            else:
                f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        else:
            rand = torch.rand(1).cuda()

            train_X = torch.FloatTensor(self.X).cuda()
            train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
            train_Y_loss = standardize(train_Y_loss)

            train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
            train_Y_cost = standardize(train_Y_cost)

            covar_module = None
            if args.ski and g > 128:
                if args.additive:
                    covar_module = AdditiveStructureKernel(
                        ScaleKernel(
                            GridInterpolationKernel(
                                MaternKernel(
                                    nu=2.5,
                                    lengthscale_prior=GammaPrior(3.0, 6.0),
                                ),
                                grid_size=128, num_dims=1, grid_bounds=[(0, 1)]
                            ),
                            outputscale_prior=GammaPrior(2.0, 0.15),
                        ), 
                        num_dims=train_X.shape[1]
                    )
                else:
                    covar_module = ScaleKernel(
                        GridInterpolationKernel(
                            MaternKernel(
                                nu=2.5,
                                lengthscale_prior=GammaPrior(3.0, 6.0),
                            ),
                            grid_size=128, num_dims=train_X.shape[1], grid_bounds=[(0, 1) for _ in range(train_X.shape[1])]
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    )
            else:
                if args.additive:
                    covar_module = AdditiveStructureKernel(
                        ScaleKernel(
                            MaternKernel(
                                nu=2.5,
                                lengthscale_prior=GammaPrior(3.0, 6.0),
                                num_dims=1
                            ),
                            outputscale_prior=GammaPrior(2.0, 0.15),
                        ),
                        num_dims=train_X.shape[1]
                    )
                else:
                    covar_module = ScaleKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                            num_dims=train_X.shape[1]
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    )

            new_train_X = train_X
            gp_loss = SingleTaskGP(new_train_X, train_Y_loss, covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)


            # Use add-gp for cost
            covar_module = AdditiveStructureKernel(
                ScaleKernel(
                    MaternKernel(
                        nu=2.5,
                        lengthscale_prior=GammaPrior(3.0, 6.0),
                        num_dims=1
                    ),
                    outputscale_prior=GammaPrior(2.0, 0.15),
                ),
                num_dims=train_X.shape[1]
            )
            gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            UCB_loss = UpperConfidenceBound(gp_loss, beta=args.beta).cuda()
            UCB_cost = UpperConfidenceBound(gp_cost, beta=args.beta).cuda()
            self.mobo_obj = RandAcquisition(UCB_loss).cuda()
            self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

            lower = torch.ones(new_train_X.shape[1])*args.lower_channel
            upper = torch.ones(new_train_X.shape[1])*args.upper_channel
            self.mobo_bounds = torch.stack([lower, upper]).cuda()

            if args.pas:
                val = np.linspace(args.lower_flops, 1, 50)
                chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
                
                lower_bnd, upper_bnd = 0, 1
                lmda = 0.5
                for i in range(10):
                    self.mobo_obj.rand = lmda

                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                    )

                    parameterization = parameterization[0].cpu().numpy()
                    layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                    sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget, self.use_mem)
                    ratio = sim_flops/og_flops

                    if np.abs(ratio - chosen_target_flops) <= 0.02:
                        break
                    if args.baseline > 0:
                        if ratio < chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                        elif ratio > chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                    else:
                        if ratio < chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                        elif ratio > chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
                writer.add_scalar('Binary search trials', i, g)

            else:
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()

            layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        likelihood: Optional[Likelihood] = None,
        covar_module: Optional[Module] = None,
        outcome_transform: Optional[OutcomeTransform] = None,
    ) -> None:
        r"""A single-task exact GP model.

        Args:
            train_X: A `batch_shape x n x d` tensor of training features.
            train_Y: A `batch_shape x n x m` tensor of training observations.
            likelihood: A likelihood. If omitted, use a standard
                GaussianLikelihood with inferred noise level.
            covar_module: The module computing the covariance (Kernel) matrix.
                If omitted, use a `MaternKernel`.
            outcome_transform: An outcome transform that is applied to the
                training data during instantiation and to the posterior during
                inference (that is, the `Posterior` obtained by calling
                `.posterior` on the model will be on the original scale).

        Example:
            >>> train_X = torch.rand(20, 2)
            >>> train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
            >>> model = SingleTaskGP(train_X, train_Y)
        """
        if outcome_transform is not None:
            train_Y, _ = outcome_transform(train_Y)
        validate_input_scaling(train_X=train_X, train_Y=train_Y)
        self._validate_tensor_args(X=train_X, Y=train_Y)
        self._set_dimensions(train_X=train_X, train_Y=train_Y)
        train_X, train_Y, _ = self._transform_tensor_args(X=train_X, Y=train_Y)
        if likelihood is None:
            noise_prior = GammaPrior(1.1, 0.05)
            noise_prior_mode = (noise_prior.concentration -
                                1) / noise_prior.rate
            likelihood = GaussianLikelihood(
                noise_prior=noise_prior,
                batch_shape=self._aug_batch_shape,
                noise_constraint=GreaterThan(
                    MIN_INFERRED_NOISE_LEVEL,
                    transform=None,
                    initial_value=noise_prior_mode,
                ),
            )
        else:
            self._is_custom_likelihood = True
        ExactGP.__init__(self, train_X, train_Y, likelihood)
        self.mean_module = ConstantMean(batch_shape=self._aug_batch_shape)
        if covar_module is None:
            self.covar_module = ScaleKernel(
                MaternKernel(
                    nu=2.5,
                    ard_num_dims=train_X.shape[-1],
                    batch_shape=self._aug_batch_shape,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                ),
                batch_shape=self._aug_batch_shape,
                outputscale_prior=GammaPrior(2.0, 0.15),
            )
            self._subset_batch_dict = {
                "likelihood.noise_covar.raw_noise": -2,
                "mean_module.constant": -2,
                "covar_module.raw_outputscale": -1,
                "covar_module.base_kernel.raw_lengthscale": -3,
            }
        else:
            self.covar_module = covar_module
        # TODO: Allow subsetting of other covar modules
        if outcome_transform is not None:
            self.outcome_transform = outcome_transform
        self.to(train_X)
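Because this variant accepts an `outcome_transform`, a common pattern is to standardize the observations at construction time so the default priors are on a sensible scale; a short hedged sketch, assuming the standard BoTorch transform import:

    import torch
    from botorch.models import SingleTaskGP
    from botorch.models.transforms.outcome import Standardize

    train_X = torch.rand(20, 2)
    train_Y = torch.sin(train_X).sum(dim=1, keepdim=True)
    # The posterior returned by model.posterior(...) is back on the original scale.
    model = SingleTaskGP(train_X, train_Y, outcome_transform=Standardize(m=1))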
Example #15
    def sample_arch(self,
                    START_BO,
                    g,
                    hyperparams,
                    og_flops,
                    empty_val_loss,
                    full_val_loss,
                    target_flops=0):
        # Warming up the history with a single width-multiplier
        if g < START_BO:
            if target_flops == 0:
                f = np.random.rand(1) * (args.upper_channel - args.
                                         lower_channel) + args.lower_channel
            else:
                f = args.lower_channel
            parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
        # Put largest model into the history
        elif g == START_BO:
            if target_flops == 0:
                parameterization = np.ones(hyperparams.get_dim())
            else:
                f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
        # MOBO-RS
        else:
            rand = torch.rand(1).cuda()

            train_X = torch.FloatTensor(self.X).cuda()
            train_Y_loss = torch.FloatTensor(
                np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
            train_Y_loss = standardize(train_Y_loss)

            train_Y_cost = torch.FloatTensor(
                np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
            train_Y_cost = standardize(train_Y_cost)

            new_train_X = train_X
            gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
            mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            # Use add-gp for cost
            covar_module = AdditiveStructureKernel(ScaleKernel(
                MaternKernel(nu=2.5,
                             lengthscale_prior=GammaPrior(3.0, 6.0),
                             num_dims=1),
                outputscale_prior=GammaPrior(2.0, 0.15),
            ),
                                                   num_dims=train_X.shape[1])
            gp_cost = SingleTaskGP(new_train_X,
                                   train_Y_cost,
                                   covar_module=covar_module)
            mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
            mll = mll.to('cuda')
            fit_gpytorch_model(mll)

            UCB_loss = UpperConfidenceBound(gp_loss).cuda()
            UCB_cost = UpperConfidenceBound(gp_cost).cuda()
            self.mobo_obj = RandAcquisition(UCB_loss).cuda()
            self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

            lower = torch.ones(new_train_X.shape[1]) * args.lower_channel
            upper = torch.ones(new_train_X.shape[1]) * args.upper_channel
            self.mobo_bounds = torch.stack([lower, upper]).cuda()

            if args.pas:
                costs = []
                for i in range(len(self.population_data)):
                    costs.append([
                        self.population_data[i]['loss'],
                        self.population_data[i]['ratio']
                    ])
                costs = np.array(costs)
                efficient_mask = is_pareto_efficient(costs)
                costs = costs[efficient_mask]
                loss = costs[:, 0]
                flops = costs[:, 1]
                sorted_idx = np.argsort(flops)
                loss = loss[sorted_idx]
                flops = flops[sorted_idx]
                if flops[0] > args.lower_flops:
                    flops = np.concatenate([[args.lower_flops],
                                            flops.reshape(-1)])
                    loss = np.concatenate([[empty_val_loss], loss.reshape(-1)])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                    flops = np.concatenate(
                        [flops.reshape(-1), [args.upper_flops]])
                    loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
                else:
                    flops = flops.reshape(-1)
                    loss = loss.reshape(-1)

                areas = (flops[1:] - flops[:-1]) * (loss[:-1] - loss[1:])

                self.sampling_weights = np.zeros(50)
                k = 0
                while k < len(flops) and flops[k] < args.lower_flops:
                    k += 1
                for i in range(50):
                    lower = i / 50.
                    upper = (i + 1) / 50.
                    if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                        continue
                    cnt = 1
                    while ((k + 1) < len(flops)) and upper > flops[k + 1]:
                        self.sampling_weights[i] += areas[k]
                        cnt += 1
                        k += 1
                    if k < len(areas):
                        self.sampling_weights[i] += areas[k]
                    self.sampling_weights[i] /= cnt
                if np.sum(self.sampling_weights) == 0:
                    self.sampling_weights = np.ones(50)

                if target_flops == 0:
                    val = np.arange(0.01, 1, 0.02)
                    chosen_target_flops = np.random.choice(
                        val,
                        p=(self.sampling_weights /
                           np.sum(self.sampling_weights)))
                else:
                    chosen_target_flops = target_flops

                lower_bnd, upper_bnd = 0, 1
                lmda = 0.5
                for i in range(10):
                    self.mobo_obj.rand = lmda

                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj,
                        bounds=self.mobo_bounds,
                        q=1,
                        num_restarts=5,
                        raw_samples=1000,
                    )

                    parameterization = parameterization[0].cpu().numpy()
                    layer_budget = hyperparams.get_layer_budget_from_parameterization(
                        parameterization, self.mask_pruner)
                    sim_flops = self.mask_pruner.simulate_and_count_flops(
                        layer_budget)
                    ratio = sim_flops / og_flops

                    if np.abs(ratio - chosen_target_flops) <= 0.02:
                        break
                    if args.baseline > 0:
                        if ratio < chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                        elif ratio > chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                    else:
                        if ratio < chosen_target_flops:
                            upper_bnd = lmda
                            lmda = (lmda + lower_bnd) / 2
                        elif ratio > chosen_target_flops:
                            lower_bnd = lmda
                            lmda = (lmda + upper_bnd) / 2
                rand[0] = lmda
                writer.add_scalar('Binary search trials', i, g)

            else:
                parameterization, acq_value = optimize_acqf(
                    self.mobo_obj,
                    bounds=self.mobo_bounds,
                    q=1,
                    num_restarts=5,
                    raw_samples=1000,
                )
                parameterization = parameterization[0].cpu().numpy()

            layer_budget = hyperparams.get_layer_budget_from_parameterization(
                parameterization, self.mask_pruner)
        return layer_budget, parameterization, self.sampling_weights / np.sum(
            self.sampling_weights)
 def __init__(self, x, y, likelihood, kernel, mean):
     super().__init__(x, y, likelihood)
     self.mean_module = mean
     self.covar_module = ScaleKernel(kernel)
     self.float()
     likelihood.float()
    def __init__(
        self,
        train_x: torch.Tensor,
        train_y: torch.Tensor,
        inducing_points: torch.Tensor,
        scales: Union[torch.Tensor, float] = 1.0,
        mean_module: Optional[Mean] = None,
        covar_module: Optional[Kernel] = None,
        fixed_prior_mean: Optional[float] = None,
    ) -> None:
        """Initialize MixedDerivativeVariationalGP

        Args:
            train_x (torch.Tensor): Training x points. The last column of x is the derivative
                indicator: 0 if it is an observation of f(x), and i if it
                is an observation of df/dx_i.
            train_y (torch.Tensor): Training y points
            inducing_points (torch.Tensor): Inducing points to use
            scales (Union[torch.Tensor, float], optional): Typical scale of each dimension
                of input space (this is used to set the lengthscale prior).
                Defaults to 1.0.
            mean_module (Mean, optional): A mean class that supports derivative
                indexes as the final dim. Defaults to a constant mean.
            covar_module (Kernel, optional): A covariance kernel class that
                supports derivative indexes as the final dim. Defaults to RBF kernel.
            fixed_prior_mean (float, optional): A prior mean value to use with the
                constant mean. Often setting this to the target threshold speeds
                up experiments. Defaults to None, in which case the mean will be inferred.
        """
        variational_distribution = CholeskyVariationalDistribution(
            inducing_points.size(0))
        variational_distribution.to(train_x)
        variational_strategy = VariationalStrategy(
            model=self,
            inducing_points=inducing_points,
            variational_distribution=variational_distribution,
            learn_inducing_locations=False,
        )
        super(MixedDerivativeVariationalGP,
              self).__init__(variational_strategy)

        # Set the mean module if specified; otherwise default to a constant mean
        if mean_module is None:
            self.mean_module = ConstantMeanPartialObsGrad()
        else:
            self.mean_module = mean_module

        if fixed_prior_mean is not None:
            self.mean_module.constant.requires_grad_(False)
            self.mean_module.constant.copy_(
                torch.tensor([fixed_prior_mean], dtype=train_x.dtype))

        if covar_module is None:
            self.base_kernel = RBFKernelPartialObsGrad(
                ard_num_dims=train_x.shape[-1] - 1,
                lengthscale_prior=GammaPrior(3.0, 6.0 / scales),
            )
            self.covar_module = ScaleKernel(self.base_kernel,
                                            outputscale_prior=GammaPrior(
                                                2.0, 0.15))
        else:
            self.covar_module = covar_module

        self._num_outputs = 1
        self.train_inputs = (train_x, )
        self.train_targets = train_y
        self(train_x)  # Necessary for CholeskyVariationalDistribution
Example #18
    def __init__(
        self,
        datapoints: Tensor,
        comparisons: Tensor,
        covar_module: Optional[Module] = None,
        input_transform: Optional[InputTransform] = None,
        **kwargs,
    ) -> None:
        r"""A probit-likelihood GP with Laplace approximation model that learns via
            pairwise comparison data. By default it uses a scaled RBF kernel.

        Args:
            datapoints: A `batch_shape x n x d` tensor of training features.
            comparisons: A `batch_shape x m x 2` tensor of training comparisons;
                comparisons[i] is a noisy indicator suggesting that the utility value
                of the comparisons[i, 0]-th datapoint is greater than that of the
                comparisons[i, 1]-th.
            covar_module: Covariance module.
            input_transform: An input transform that is applied in the model's
                forward pass.
        """
        super().__init__()

        if input_transform is not None:
            input_transform.to(datapoints)
            # input transformation is applied in set_train_data
            self.input_transform = input_transform

        # Compatibility variables with fit_gpytorch_*: Dummy likelihood
        # Likelihood is tightly tied with this model and
        # it doesn't make much sense to keep it separate
        self.likelihood = None

        # TODO: remove these variables from `state_dict()` so that when calling
        #       `load_state_dict()`, only the hyperparameters are copied over
        self.register_buffer("datapoints", None)
        self.register_buffer("comparisons", None)
        self.register_buffer("D", None)
        self.register_buffer("DT", None)
        self.register_buffer("utility", None)
        self.register_buffer("covar_chol", None)
        self.register_buffer("likelihood_hess", None)
        self.register_buffer("hlcov_eye", None)
        self.register_buffer("covar", None)
        self.register_buffer("covar_inv", None)

        self.train_inputs = []
        self.train_targets = None

        self.pred_cov_fac_need_update = True
        self.dim = None

        # See set_train_data for additional compatibility variables.
        # Note that the datapoints here are not transformed even if input_transform
        # is not None, to avoid double transformation during model fitting.
        # self.transform_inputs is called in `forward`
        self.set_train_data(datapoints, comparisons, update_model=False)

        # Set optional parameters
        # jitter to add for numerical stability
        self._jitter = kwargs.get("jitter", 1e-6)
        # Clamping z lim for better numerical stability. See self._calc_z for detail
        # norm_cdf(z=3) ~= 0.999, i.e. the top 0.1 percent
        self._zlim = kwargs.get("zlim", 3)
        # Stopping criteria in scipy.optimize.fsolve used to find f_map in _update()
        # If None, set to 1e-6 by default in _update
        self._xtol = kwargs.get("xtol")
        # The maximum number of calls to the function in scipy.optimize.fsolve
        # If None, set to 100 by default in _update
        # If zero, then 100*(N+1) is used by default by fsolve;
        self._maxfev = kwargs.get("maxfev")

        # Set hyperparameters
        # Do not set the batch_shape explicitly so mean_module can operate in both modes.
        # Once fsolve used in _update can run in batch mode, we should explicitly set
        # the batch shape here.
        self.mean_module = ConstantMean()
        # Do not optimize constant mean prior
        for param in self.mean_module.parameters():
            param.requires_grad = False

        # set covariance module
        # the default outputscale here is only a rule of thumb, meant to keep
        # estimates away from scale values that would make Phi(f(x)) saturate
        # at 0 or 1
        if covar_module is None:
            ls_prior = GammaPrior(1.2, 0.5)
            ls_prior_mode = (ls_prior.concentration - 1) / ls_prior.rate
            covar_module = ScaleKernel(
                RBFKernel(
                    batch_shape=self.batch_shape,
                    ard_num_dims=self.dim,
                    lengthscale_prior=ls_prior,
                    lengthscale_constraint=Positive(
                        transform=None, initial_value=ls_prior_mode),
                ),
                outputscale_prior=SmoothedBoxPrior(a=1, b=4),
            )

        self.covar_module = covar_module

        self._x0 = None  # will store temporary results for warm-starting
        if self.datapoints is not None and self.comparisons is not None:
            self.to(dtype=self.datapoints.dtype, device=self.datapoints.device)
            # Find f_map for initial parameters with transformed datapoints
            transformed_dp = self.transform_inputs(datapoints)
            self._update(transformed_dp)

        self.to(self.datapoints)
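A hedged sketch of fitting the pairwise model above; the data is made up, and the marginal log likelihood constructor below follows the BoTorch releases in which this model keeps a dummy `likelihood = None` (newer releases also take a likelihood argument), so treat the exact signature as an assumption:

    import torch
    from botorch.fit import fit_gpytorch_model
    from botorch.models.pairwise_gp import (
        PairwiseGP,
        PairwiseLaplaceMarginalLogLikelihood,
    )

    datapoints = torch.rand(10, 2)
    # Each row states that the utility of item [i, 0] was judged higher than item [i, 1].
    comparisons = torch.tensor([[0, 1], [2, 3], [4, 5]])
    model = PairwiseGP(datapoints, comparisons)
    mll = PairwiseLaplaceMarginalLogLikelihood(model)
    fit_gpytorch_model(mll)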
Example #19
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        task_feature: int,
        output_tasks: Optional[List[int]] = None,
        rank: Optional[int] = None,
    ) -> None:
        r"""Multi-Task GP model using an ICM kernel, inferring observation noise.

        Args:
            train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor
                of training data. One of the columns should contain the task
                features (see `task_feature` argument).
            train_Y: A `n` or `b x n` (batch mode) tensor of training
                observations.
            task_feature: The index of the task feature
                (`-d <= task_feature <= d`).
            output_tasks: A list of task indices for which to compute model
                outputs. If omitted, return outputs for all task indices.
            rank: The rank to be used for the index kernel. If omitted, use a
                full rank (i.e. number of tasks) kernel.

        Example:
            >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
            >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
            >>> train_X = torch.cat([
            >>>     torch.cat([X1, i1], -1), torch.cat([X2, i2], -1),
            >>> ])
            >>> train_Y = torch.cat([f1(X1), f2(X2)])
            >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1)
        """
        if train_X.ndimension() != 2:
            # Currently, batch mode MTGPs are blocked upstream in GPyTorch
            raise ValueError(f"Unsupported shape {train_X.shape} for train_X.")
        d = train_X.shape[-1] - 1
        if not (-d <= task_feature <= d):
            raise ValueError(f"Must have that -{d} <= task_feature <= {d}")
        all_tasks = train_X[:, task_feature].unique().to(
            dtype=torch.long).tolist()
        if output_tasks is None:
            output_tasks = all_tasks
        else:
            if any(t not in all_tasks for t in output_tasks):
                raise RuntimeError(
                    "All output tasks must be present in input data.")
        self._output_tasks = output_tasks

        # TODO (T41270962): Support task-specific noise levels in likelihood
        likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05))

        # construct indexer to be used in forward
        self._task_feature = task_feature
        self._base_idxr = torch.arange(d)
        self._base_idxr[task_feature:] += 1  # exclude task feature

        super().__init__(train_inputs=train_X,
                         train_targets=train_Y,
                         likelihood=likelihood)
        self.mean_module = ConstantMean()
        self.covar_module = ScaleKernel(
            base_kernel=MaternKernel(nu=2.5,
                                     ard_num_dims=d,
                                     lengthscale_prior=GammaPrior(3.0, 6.0)),
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        num_tasks = len(all_tasks)
        self._rank = rank if rank is not None else num_tasks
        # TODO: Add LKJ prior for the index kernel
        self.task_covar_module = IndexKernel(num_tasks=num_tasks,
                                             rank=self._rank)
        self.to(train_X)
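A hedged end-to-end sketch for the multi-task model above, with made-up objectives standing in for `f1` and `f2` from the docstring example; shapes follow that example, and the import paths are the usual BoTorch/GPyTorch ones:

    import torch
    from botorch.fit import fit_gpytorch_model
    from botorch.models import MultiTaskGP
    from gpytorch.mlls import ExactMarginalLogLikelihood

    X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
    i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
    train_X = torch.cat([torch.cat([X1, i1], -1), torch.cat([X2, i2], -1)])
    train_Y = torch.cat([X1.sum(-1), X2.prod(-1)])  # placeholder task objectives
    model = MultiTaskGP(train_X, train_Y, task_feature=-1)
    mll = ExactMarginalLogLikelihood(model.likelihood, model)
    fit_gpytorch_model(mll)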
Example #20
    def sample_arch(self, START_BO, g, steps, hyperparams, og_flops, full_val_loss, target_flops=0):
        if args.slim:
            if target_flops == 0:
                parameterization = hyperparams.random_sample()
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
            else:
                parameterization = np.ones(hyperparams.get_dim()) * args.lower_channel
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        else:
            # random sample to warmup history for MOBO
            if g < START_BO:
                if target_flops == 0:
                    f = np.random.rand(1) * (args.upper_channel-args.lower_channel) + args.lower_channel
                else:
                    f = args.lower_channel
                parameterization = np.ones(hyperparams.get_dim()) * f
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
            # put the largest model into the history
            elif g == START_BO:
                if target_flops == 0:
                    parameterization = np.ones(hyperparams.get_dim())
                else:
                    f = args.lower_channel
                    parameterization = np.ones(hyperparams.get_dim()) * f
                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
            # MOBO
            else:
                # this is the scalarization (lambda_{FLOPs})
                rand = torch.rand(1).cuda()

                # standardize data for building Gaussian Processes
                train_X = torch.FloatTensor(self.X).cuda()
                train_Y_loss = torch.FloatTensor(np.array(self.Y)[:, 0].reshape(-1, 1)).cuda()
                train_Y_loss = standardize(train_Y_loss)

                train_Y_cost = torch.FloatTensor(np.array(self.Y)[:, 1].reshape(-1, 1)).cuda()
                train_Y_cost = standardize(train_Y_cost)

                new_train_X = train_X
                # GP for the cross entropy loss
                gp_loss = SingleTaskGP(new_train_X, train_Y_loss)
                mll = ExactMarginalLogLikelihood(gp_loss.likelihood, gp_loss)
                mll = mll.to('cuda')
                fit_gpytorch_model(mll)

                # GP for FLOPs
                # we use an additive GP (AdditiveStructureKernel) since FLOPs has an approximately additive structure
                # the priors for ScaleKernel and MaternKernel simply follow the defaults
                covar_module = AdditiveStructureKernel(
                    ScaleKernel(
                        MaternKernel(
                            nu=2.5,
                            lengthscale_prior=GammaPrior(3.0, 6.0),
                            num_dims=1
                        ),
                        outputscale_prior=GammaPrior(2.0, 0.15),
                    ),
                    num_dims=train_X.shape[1]
                )
                gp_cost = SingleTaskGP(new_train_X, train_Y_cost, covar_module=covar_module)
                mll = ExactMarginalLogLikelihood(gp_cost.likelihood, gp_cost)
                mll = mll.to('cuda')
                fit_gpytorch_model(mll)

                # Build acquisition functions
                UCB_loss = UpperConfidenceBound(gp_loss, beta=0.1).cuda()
                UCB_cost = UpperConfidenceBound(gp_cost, beta=0.1).cuda()

                # Combine them via augmented Tchebyshev scalarization
                self.mobo_obj = RandAcquisition(UCB_loss).cuda()
                self.mobo_obj.setup(UCB_loss, UCB_cost, rand)

                # Bounds for the optimization variable (alpha)
                lower = torch.ones(new_train_X.shape[1])*args.lower_channel
                upper = torch.ones(new_train_X.shape[1])*args.upper_channel
                self.mobo_bounds = torch.stack([lower, upper]).cuda()

                # Pareto-aware sampling
                if args.pas:
                    # Generate approximate Pareto front first
                    costs = []
                    for i in range(len(self.population_data)):
                        costs.append([self.population_data[i]['loss'], self.population_data[i]['ratio']])
                    costs = np.array(costs)
                    efficient_mask = is_pareto_efficient(costs)
                    costs = costs[efficient_mask]
                    loss = costs[:, 0]
                    flops = costs[:, 1]
                    sorted_idx = np.argsort(flops)
                    loss = loss[sorted_idx]
                    flops = flops[sorted_idx]
                    if flops[0] > args.lower_flops:
                        flops = np.concatenate([[args.lower_flops], flops.reshape(-1)])
                        loss = np.concatenate([[8], loss.reshape(-1)])
                    else:
                        flops = flops.reshape(-1)
                        loss = loss.reshape(-1)

                    if flops[-1] < args.upper_flops and (loss[-1] > full_val_loss):
                        flops = np.concatenate([flops.reshape(-1), [args.upper_flops]])
                        loss = np.concatenate([loss.reshape(-1), [full_val_loss]])
                    else:
                        flops = flops.reshape(-1)
                        loss = loss.reshape(-1)

                    # Equation (4) in the paper
                    areas = (flops[1:]-flops[:-1])*(loss[:-1]-loss[1:])

                    # Quantize into 50 bins to sample from multinomial
                    self.sampling_weights = np.zeros(50)
                    k = 0
                    while k < len(flops) and flops[k] < args.lower_flops:
                        k+=1
                    for i in range(50):
                        lower = i/50.
                        upper = (i+1)/50.
                        if upper < args.lower_flops or lower > args.upper_flops or lower < args.lower_flops:
                            continue
                        cnt = 1
                        while ((k+1) < len(flops)) and upper > flops[k+1]:
                            self.sampling_weights[i] += areas[k]
                            cnt += 1
                            k += 1
                        if k < len(areas):
                            self.sampling_weights[i] += areas[k]
                        self.sampling_weights[i] /= cnt
                    if np.sum(self.sampling_weights) == 0:
                        self.sampling_weights = np.ones(50)
                        
                    if target_flops == 0:
                        val = np.arange(0.01, 1, 0.02)
                        chosen_target_flops = np.random.choice(val, p=(self.sampling_weights/np.sum(self.sampling_weights)))
                    else:
                        chosen_target_flops = target_flops
                    
                    # Binary search over lambda (the scalarization weight) to hit the chosen FLOPs target
                    lower_bnd, upper_bnd = 0, 1
                    lmda = 0.5
                    for i in range(10):
                        self.mobo_obj.rand = lmda

                        parameterization, acq_value = optimize_acqf(
                            self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                        )

                        parameterization = parameterization[0].cpu().numpy()
                        layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
                        sim_flops = self.mask_pruner.simulate_and_count_flops(layer_budget)
                        ratio = sim_flops/og_flops

                        if np.abs(ratio - chosen_target_flops) <= 0.02:
                            break
                        if args.baseline > 0:
                            if ratio < chosen_target_flops:
                                lower_bnd = lmda
                                lmda = (lmda + upper_bnd) / 2
                            elif ratio > chosen_target_flops:
                                upper_bnd = lmda
                                lmda = (lmda + lower_bnd) / 2
                        else:
                            if ratio < chosen_target_flops:
                                upper_bnd = lmda
                                lmda = (lmda + lower_bnd) / 2
                            elif ratio > chosen_target_flops:
                                lower_bnd = lmda
                                lmda = (lmda + upper_bnd) / 2
                    rand[0] = lmda
                    writer.add_scalar('Binary search trials', i, steps)

                else:
                    parameterization, acq_value = optimize_acqf(
                        self.mobo_obj, bounds=self.mobo_bounds, q=1, num_restarts=5, raw_samples=1000,
                    )
                    parameterization = parameterization[0].cpu().numpy()

                layer_budget = hyperparams.get_layer_budget_from_parameterization(parameterization, self.mask_pruner)
        return layer_budget, parameterization, self.sampling_weights/np.sum(self.sampling_weights)
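`is_pareto_efficient` is used above but not shown. A common NumPy sketch of such a helper, assuming both columns of `costs` (validation loss and FLOPs ratio) are to be minimized; this is an illustration, not necessarily the author's implementation:

import numpy as np

def is_pareto_efficient(costs):
    """Boolean mask of the rows of `costs` that are Pareto-efficient (all objectives minimized)."""
    n = costs.shape[0]
    efficient = np.ones(n, dtype=bool)
    for i in range(n):
        if not efficient[i]:
            continue
        # Point i is dominated if some other point is <= in every objective and < in at least one.
        dominated = np.all(costs <= costs[i], axis=1) & np.any(costs < costs[i], axis=1)
        if dominated.any():
            efficient[i] = False
    return efficient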
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        iteration_fidelity: Optional[int] = None,
        data_fidelity: Optional[int] = None,
        linear_truncated: bool = True,
        nu: float = 2.5,
        likelihood: Optional[Likelihood] = None,
    ) -> None:
        if iteration_fidelity is None and data_fidelity is None:
            raise UnsupportedError(
                "SingleTaskMultiFidelityGP requires at least one fidelity parameter."
            )
        if iteration_fidelity is not None and iteration_fidelity < 0:
            iteration_fidelity = train_X.size(-1) + iteration_fidelity
        if data_fidelity is not None and data_fidelity < 0:
            data_fidelity = train_X.size(-1) + data_fidelity
        self._set_dimensions(train_X=train_X, train_Y=train_Y)
        if linear_truncated:
            fidelity_dims = [
                i for i in (iteration_fidelity, data_fidelity) if i is not None
            ]
            kernel = LinearTruncatedFidelityKernel(
                fidelity_dims=fidelity_dims,
                dimension=train_X.size(-1),
                nu=nu,
                batch_shape=self._aug_batch_shape,
                power_prior=GammaPrior(3.0, 3.0),
            )
        else:
            active_dimsX = [
                i for i in range(train_X.size(-1))
                if i not in {iteration_fidelity, data_fidelity}
            ]
            kernel = RBFKernel(
                ard_num_dims=len(active_dimsX),
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                active_dims=active_dimsX,
            )
            additional_kernels = []
            if iteration_fidelity is not None:
                exp_kernel = ExponentialDecayKernel(
                    batch_shape=self._aug_batch_shape,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                    offset_prior=GammaPrior(3.0, 6.0),
                    power_prior=GammaPrior(3.0, 6.0),
                    active_dims=[iteration_fidelity],
                )
                additional_kernels.append(exp_kernel)
            if data_fidelity is not None:
                ds_kernel = DownsamplingKernel(
                    batch_shape=self._aug_batch_shape,
                    offset_prior=GammaPrior(3.0, 6.0),
                    power_prior=GammaPrior(3.0, 6.0),
                    active_dims=[data_fidelity],
                )
                additional_kernels.append(ds_kernel)
            kernel = ProductKernel(kernel, *additional_kernels)

        covar_module = ScaleKernel(
            kernel,
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        super().__init__(train_X=train_X,
                         train_Y=train_Y,
                         covar_module=covar_module)
        self.to(train_X)
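A hedged usage sketch for the constructor above, assuming it is exposed as `botorch.models.SingleTaskMultiFidelityGP` and that the last input column holds a downsampling fidelity in [0, 1]:

import torch
from botorch.models import SingleTaskMultiFidelityGP

# 20 points: 3 design dimensions plus a data-fidelity column at index 3.
train_X = torch.rand(20, 4)
train_Y = torch.sin(train_X[:, :3]).sum(dim=-1, keepdim=True)

# data_fidelity points at the fidelity column; a negative index (e.g. -1)
# is normalized by the constructor as shown above.
model = SingleTaskMultiFidelityGP(train_X, train_Y, data_fidelity=3)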
Example #22
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        task_feature: int,
        covar_module: Optional[Module] = None,
        task_covar_prior: Optional[Prior] = None,
        output_tasks: Optional[List[int]] = None,
        rank: Optional[int] = None,
        input_transform: Optional[InputTransform] = None,
        outcome_transform: Optional[OutcomeTransform] = None,
    ) -> None:
        r"""Multi-Task GP model using an ICM kernel, inferring observation noise.

        Args:
            train_X: A `n x (d + 1)` or `b x n x (d + 1)` (batch mode) tensor
                of training data. One of the columns should contain the task
                features (see `task_feature` argument).
            train_Y: A `n x 1` or `b x n x 1` (batch mode) tensor of training
                observations.
            task_feature: The index of the task feature (`-d <= task_feature <= d`).
            output_tasks: A list of task indices for which to compute model
                outputs. If omitted, return outputs for all task indices.
            rank: The rank to be used for the index kernel. If omitted, use a
                full rank (i.e. number of tasks) kernel.
            task_covar_prior: A Prior on the task covariance matrix. Must operate
                on p.s.d. matrices. A common prior for this is the `LKJ` prior.
            input_transform: An input transform that is applied in the model's
                forward pass.

        Example:
            >>> X1, X2 = torch.rand(10, 2), torch.rand(20, 2)
            >>> i1, i2 = torch.zeros(10, 1), torch.ones(20, 1)
            >>> train_X = torch.cat([
            >>>     torch.cat([X1, i1], -1), torch.cat([X2, i2], -1),
            >>> ])
            >>> train_Y = torch.cat([f1(X1), f2(X2)]).unsqueeze(-1)
            >>> model = MultiTaskGP(train_X, train_Y, task_feature=-1)
        """
        with torch.no_grad():
            transformed_X = self.transform_inputs(
                X=train_X, input_transform=input_transform)
        self._validate_tensor_args(X=transformed_X, Y=train_Y)
        all_tasks, task_feature, d = self.get_all_tasks(
            transformed_X, task_feature, output_tasks)
        if outcome_transform is not None:
            train_Y, _ = outcome_transform(train_Y)

        # squeeze output dim
        train_Y = train_Y.squeeze(-1)
        if output_tasks is None:
            output_tasks = all_tasks
        else:
            if set(output_tasks) - set(all_tasks):
                raise RuntimeError(
                    "All output tasks must be present in input data.")
        self._output_tasks = output_tasks
        self._num_outputs = len(output_tasks)

        # TODO (T41270962): Support task-specific noise levels in likelihood
        likelihood = GaussianLikelihood(noise_prior=GammaPrior(1.1, 0.05))

        # construct indexer to be used in forward
        self._task_feature = task_feature
        self._base_idxr = torch.arange(d)
        self._base_idxr[task_feature:] += 1  # exclude task feature

        super().__init__(train_inputs=train_X,
                         train_targets=train_Y,
                         likelihood=likelihood)
        self.mean_module = ConstantMean()
        if covar_module is None:
            self.covar_module = ScaleKernel(
                base_kernel=MaternKernel(nu=2.5,
                                         ard_num_dims=d,
                                         lengthscale_prior=GammaPrior(
                                             3.0, 6.0)),
                outputscale_prior=GammaPrior(2.0, 0.15),
            )
        else:
            self.covar_module = covar_module

        num_tasks = len(all_tasks)
        self._rank = rank if rank is not None else num_tasks

        self.task_covar_module = IndexKernel(num_tasks=num_tasks,
                                             rank=self._rank,
                                             prior=task_covar_prior)
        if input_transform is not None:
            self.input_transform = input_transform
        if outcome_transform is not None:
            self.outcome_transform = outcome_transform
        self.to(train_X)
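The forward pass is not part of this snippet, but the ICM structure set up here typically combines the two modules as an elementwise product: the data covariance on the non-task columns times the `IndexKernel` covariance on the task ids. A hedged sketch, written as a free function over a model carrying the attributes defined above:

import torch
from gpytorch.distributions import MultivariateNormal

def icm_forward(model, x):
    # Split the inputs exactly as the indexer above prescribes.
    x_basic = x[..., model._base_idxr]
    task_idcs = x[..., model._task_feature].unsqueeze(-1).to(torch.long)
    mean_x = model.mean_module(x_basic)
    covar_x = model.covar_module(x_basic)           # data covariance (Matern or custom)
    covar_t = model.task_covar_module(task_idcs)    # task covariance (IndexKernel)
    # ICM: the joint covariance is the elementwise (Hadamard) product of the two.
    return MultivariateNormal(mean_x, covar_x.mul(covar_t))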
def _setup_multifidelity_covar_module(
    dim: int,
    aug_batch_shape: torch.Size,
    iteration_fidelity: Optional[int],
    data_fidelity: Optional[int],
    linear_truncated: bool,
    nu: float,
) -> Tuple[ScaleKernel, Dict]:
    """Helper function to get the covariance module and associated subset_batch_dict
    for the multifidelity setting.

    Args:
        dim: The dimensionality of the training data.
        aug_batch_shape: The output-augmented batch shape as defined in
            `BatchedMultiOutputGPyTorchModel`.
        iteration_fidelity: The column index for the training iteration fidelity
            parameter (optional).
        data_fidelity: The column index for the downsampling fidelity parameter
            (optional).
        linear_truncated: If True, use a `LinearTruncatedFidelityKernel` instead
            of the default kernel.
        nu: The smoothness parameter for the Matern kernel: either 1/2, 3/2, or
            5/2. Only used when `linear_truncated=True`.

    Returns:
        The covariance module and subset_batch_dict.
    """

    if iteration_fidelity is not None and iteration_fidelity < 0:
        iteration_fidelity = dim + iteration_fidelity
    if data_fidelity is not None and data_fidelity < 0:
        data_fidelity = dim + data_fidelity

    if linear_truncated:
        fidelity_dims = [
            i for i in (iteration_fidelity, data_fidelity) if i is not None
        ]
        kernel = LinearTruncatedFidelityKernel(
            fidelity_dims=fidelity_dims,
            dimension=dim,
            nu=nu,
            batch_shape=aug_batch_shape,
            power_prior=GammaPrior(3.0, 3.0),
        )
    else:
        active_dimsX = [
            i for i in range(dim)
            if i not in {iteration_fidelity, data_fidelity}
        ]
        kernel = RBFKernel(
            ard_num_dims=len(active_dimsX),
            batch_shape=aug_batch_shape,
            lengthscale_prior=GammaPrior(3.0, 6.0),
            active_dims=active_dimsX,
        )
        additional_kernels = []
        if iteration_fidelity is not None:
            exp_kernel = ExponentialDecayKernel(
                batch_shape=aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                offset_prior=GammaPrior(3.0, 6.0),
                power_prior=GammaPrior(3.0, 6.0),
                active_dims=[iteration_fidelity],
            )
            additional_kernels.append(exp_kernel)
        if data_fidelity is not None:
            ds_kernel = DownsamplingKernel(
                batch_shape=aug_batch_shape,
                offset_prior=GammaPrior(3.0, 6.0),
                power_prior=GammaPrior(3.0, 6.0),
                active_dims=[data_fidelity],
            )
            additional_kernels.append(ds_kernel)
        kernel = ProductKernel(kernel, *additional_kernels)

    covar_module = ScaleKernel(kernel,
                               batch_shape=aug_batch_shape,
                               outputscale_prior=GammaPrior(2.0, 0.15))

    if linear_truncated:
        subset_batch_dict = {
            "covar_module.base_kernel.raw_power": -2,
            "covar_module.base_kernel.covar_module_unbiased.raw_lengthscale":
            -3,
            "covar_module.base_kernel.covar_module_biased.raw_lengthscale": -3,
        }
    else:
        subset_batch_dict = {
            "covar_module.base_kernel.kernels.0.raw_lengthscale": -3,
            "covar_module.base_kernel.kernels.1.raw_power": -2,
            "covar_module.base_kernel.kernels.1.raw_offset": -2,
        }
        if iteration_fidelity is not None:
            subset_batch_dict = {
                "covar_module.base_kernel.kernels.1.raw_lengthscale": -3,
                **subset_batch_dict,
            }
            if data_fidelity is not None:
                subset_batch_dict = {
                    "covar_module.base_kernel.kernels.2.raw_power": -2,
                    "covar_module.base_kernel.kernels.2.raw_offset": -2,
                    **subset_batch_dict,
                }

    return covar_module, subset_batch_dict
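A small usage sketch for this helper; the shapes and column index are illustrative only:

import torch

# Hypothetical call: 5 input columns, the last one a data fidelity, no extra batch dims.
covar_module, subset_batch_dict = _setup_multifidelity_covar_module(
    dim=5,
    aug_batch_shape=torch.Size([]),
    iteration_fidelity=None,
    data_fidelity=4,
    linear_truncated=True,
    nu=2.5,
)
# Here covar_module is ScaleKernel(LinearTruncatedFidelityKernel(...)), and
# subset_batch_dict maps raw parameter names to the batch dimensions used for
# subsetting in BatchedMultiOutputGPyTorchModel.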
Example #24
    def __init__(
        self,
        train_X: Tensor,
        train_Y: Tensor,
        iteration_fidelity: Optional[int] = None,
        data_fidelity: Optional[int] = None,
        linear_truncated: bool = True,
        nu: float = 2.5,
        likelihood: Optional[Likelihood] = None,
        outcome_transform: Optional[OutcomeTransform] = None,
    ) -> None:
        self._init_args = {
            "iteration_fidelity": iteration_fidelity,
            "data_fidelity": data_fidelity,
            "linear_truncated": linear_truncated,
            "nu": nu,
            "outcome_transform": outcome_transform,
        }
        if iteration_fidelity is None and data_fidelity is None:
            raise UnsupportedError(
                "SingleTaskMultiFidelityGP requires at least one fidelity parameter."
            )
        if iteration_fidelity is not None and iteration_fidelity < 0:
            iteration_fidelity = train_X.size(-1) + iteration_fidelity
        if data_fidelity is not None and data_fidelity < 0:
            data_fidelity = train_X.size(-1) + data_fidelity
        self._set_dimensions(train_X=train_X, train_Y=train_Y)
        if linear_truncated:
            fidelity_dims = [
                i for i in (iteration_fidelity, data_fidelity) if i is not None
            ]
            kernel = LinearTruncatedFidelityKernel(
                fidelity_dims=fidelity_dims,
                dimension=train_X.size(-1),
                nu=nu,
                batch_shape=self._aug_batch_shape,
                power_prior=GammaPrior(3.0, 3.0),
            )
        else:
            active_dimsX = [
                i
                for i in range(train_X.size(-1))
                if i not in {iteration_fidelity, data_fidelity}
            ]
            kernel = RBFKernel(
                ard_num_dims=len(active_dimsX),
                batch_shape=self._aug_batch_shape,
                lengthscale_prior=GammaPrior(3.0, 6.0),
                active_dims=active_dimsX,
            )
            additional_kernels = []
            if iteration_fidelity is not None:
                exp_kernel = ExponentialDecayKernel(
                    batch_shape=self._aug_batch_shape,
                    lengthscale_prior=GammaPrior(3.0, 6.0),
                    offset_prior=GammaPrior(3.0, 6.0),
                    power_prior=GammaPrior(3.0, 6.0),
                    active_dims=[iteration_fidelity],
                )
                additional_kernels.append(exp_kernel)
            if data_fidelity is not None:
                ds_kernel = DownsamplingKernel(
                    batch_shape=self._aug_batch_shape,
                    offset_prior=GammaPrior(3.0, 6.0),
                    power_prior=GammaPrior(3.0, 6.0),
                    active_dims=[data_fidelity],
                )
                additional_kernels.append(ds_kernel)
            kernel = ProductKernel(kernel, *additional_kernels)

        covar_module = ScaleKernel(
            kernel,
            batch_shape=self._aug_batch_shape,
            outputscale_prior=GammaPrior(2.0, 0.15),
        )
        super().__init__(
            train_X=train_X,
            train_Y=train_Y,
            covar_module=covar_module,
            outcome_transform=outcome_transform,
        )
        if linear_truncated:
            subset_batch_dict = {
                "covar_module.base_kernel.raw_power": -2,
                "covar_module.base_kernel.covar_module_unbiased.raw_lengthscale": -3,
                "covar_module.base_kernel.covar_module_biased.raw_lengthscale": -3,
            }
        else:
            subset_batch_dict = {
                "covar_module.base_kernel.kernels.0.raw_lengthscale": -3,
                "covar_module.base_kernel.kernels.1.raw_power": -2,
                "covar_module.base_kernel.kernels.1.raw_offset": -2,
            }
            if iteration_fidelity is not None:
                subset_batch_dict = {
                    "covar_module.base_kernel.kernels.1.raw_lengthscale": -3,
                    **subset_batch_dict,
                }
                if data_fidelity is not None:
                    subset_batch_dict = {
                        "covar_module.base_kernel.kernels.2.raw_power": -2,
                        "covar_module.base_kernel.kernels.2.raw_offset": -2,
                        **subset_batch_dict,
                    }
        self._subset_batch_dict = {
            "likelihood.noise_covar.raw_noise": -2,
            "mean_module.constant": -2,
            "covar_module.raw_outputscale": -1,
            **subset_batch_dict,
        }

        self.to(train_X)
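To close, a hedged end-to-end sketch of using this constructor with an outcome transform and marginal-likelihood fitting; the import paths follow the usual BoTorch/GPyTorch layout and are assumptions, not taken from the snippet:

import torch
from botorch.fit import fit_gpytorch_model
from botorch.models import SingleTaskMultiFidelityGP
from botorch.models.transforms.outcome import Standardize
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.rand(20, 4)   # last column: data fidelity in [0, 1]
train_Y = torch.sin(train_X[:, :3]).sum(dim=-1, keepdim=True)

model = SingleTaskMultiFidelityGP(
    train_X, train_Y, data_fidelity=3, outcome_transform=Standardize(m=1)
)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_model(mll)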