def _train(self, X: np.ndarray, y: np.ndarray, do_optimize: bool = True) -> 'GaussianProcessMCMC':
        """
        Sample kernel hyperparameters with MCMC and fit one GP per sample on X and y.

        Parameters
        ----------
        X: np.ndarray (N, D)
            Input data points. The dimensionality of X is (N, D),
            with N as the number of points and D is the number of features.
        y: np.ndarray (N,)
            The corresponding target values.
        do_optimize: boolean
            If set to true we perform MCMC sampling otherwise we just use the
            hyperparameter specified in the kernel.

        Returns
        -------
        self

        Raises
        ------
        NotImplementedError
            If a hyperparameter dimension has no prior to initialise the emcee walkers from.
        ValueError
            If ``self.mcmc_sampler`` is neither ``'emcee'`` nor ``'nuts'``.
        """
        X = self._impute_inactive(X)
        if self.normalize_y:
            # A note on normalization for the Gaussian process with MCMC:
            # Scikit-learn uses a different "normalization" than we use in SMAC3. Scikit-learn normalizes the data to
            # have zero mean, while we normalize it to have zero mean unit variance. To make sure the scikit-learn GP
            # behaves the same when we use it directly or indirectly (through the gaussian_process.py file), we
            # normalize the data here. Then, after the individual GPs are fit, we inject the statistics into them so
            # they unnormalize the data at prediction time.
            y = self._normalize_y(y)

        self.gp = self._get_gp()

        def new_model(theta):
            # One untrained GP with its own kernel copy and its own seed drawn from self.rng.
            kernel = deepcopy(self.kernel)
            kernel.theta = theta
            return GaussianProcess(
                configspace=self.configspace,
                types=self.types,
                bounds=self.bounds,
                kernel=kernel,
                normalize_y=False,
                seed=self.rng.randint(low=0, high=10000),
            )

        if do_optimize:
            self.gp.fit(X, y)
            self._all_priors = self._get_all_priors(
                add_bound_priors=True,
                add_soft_bounds=self.mcmc_sampler == 'nuts',
            )

            if self.mcmc_sampler == 'emcee':
                sampler = emcee.EnsembleSampler(self.n_mcmc_walkers,
                                                len(self.kernel.theta),
                                                self._ll)
                sampler.random_state = self.rng.get_state()

                if not self.burned:
                    # Initialize the walkers by sampling every dimension from its (first) prior
                    dim_samples = []
                    for dim_priors in self._all_priors:
                        first_prior = dim_priors
                        if isinstance(dim_priors, list):
                            first_prior = dim_priors[0] if dim_priors else None
                        if first_prior is None:
                            raise NotImplementedError()
                        dim_samples.append(first_prior.sample_from_prior(self.n_mcmc_walkers).flatten())
                    self.p0 = np.vstack(dim_samples).transpose()

                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore', r'invalid value encountered in double_scalars.*')
                    # Do a burn-in in the first iteration only
                    if not self.burned:
                        self.p0, _, _ = sampler.run_mcmc(self.p0, self.burnin_steps)
                        self.burned = True
                    # Start sampling & save the current position, it will be the start point in the next iteration
                    self.p0, _, _ = sampler.run_mcmc(self.p0, self.chain_length)

                # Take the last sample from each walker. The emcee chain is laid out as
                # (n_steps, n_walkers, n_dims), so the last step is the first axis; indexing [:, -1] would
                # instead return every step (burn-in included) of the last walker only.
                # Copy so that the clipping below cannot write into the sampler's stored chain.
                self.hypers = np.array(sampler.get_chain()[-1])
            elif self.mcmc_sampler == 'nuts':
                # Originally published as:
                # http://www.stat.columbia.edu/~gelman/research/published/nuts.pdf
                # A good explanation of HMC:
                # https://theclevermachine.wordpress.com/2012/11/18/mcmc-hamiltonian-monte-carlo-a-k-a-hybrid-monte-carlo/
                # A good explanation of HMC and NUTS can be found in:
                # https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.12681

                # Do not require the installation of NUTS for SMAC
                # This requires NUTS from https://github.com/mfeurer/NUTS
                import nuts.nuts

                # The first call starts from the fitted kernel, later calls from the previous sample mean
                if not self.burned:
                    theta0 = self.gp.kernel.theta
                    self.burned = True
                else:
                    theta0 = self.p0
                samples, _, _ = nuts.nuts.nuts6(
                    f=self._ll_w_grad,
                    Madapt=self.burnin_steps,
                    M=self.chain_length,
                    theta0=theta0,
                    # Increasing this value results in longer running times
                    delta=0.5,
                    adapt_mass=False,
                    # Rather low max depth to keep the number of required gradient steps low
                    max_depth=10,
                    rng=self.rng,
                )
                # Keep 10 samples spread evenly over the chain
                indices = [int(np.rint(ind)) for ind in np.linspace(start=0, stop=len(samples) - 1, num=10)]
                self.hypers = samples[indices]
                self.p0 = self.hypers.mean(axis=0)
            else:
                raise ValueError(self.mcmc_sampler)

            if self.average_samples:
                self.hypers = [self.hypers.mean(axis=0)]

        else:
            self.hypers = [self.gp.kernel.theta]

        self.models = []
        for sample in self.hypers:
            # Keep the hyperparameters inside [-50, 50]; done in place, as before
            np.clip(sample, -50, 50, out=sample)

            # Instantiate a GP for each hyperparameter configuration
            model = new_model(sample)
            try:
                model._train(X, y, do_optimize=False)
            except np.linalg.LinAlgError:
                # Best effort: a sample the GP cannot be fitted with is dropped
                continue
            self.models.append(model)

        if not self.models:
            # Every sample failed: fall back to a single GP at the stored sampler position.
            # For emcee, self.p0 holds one row per walker, which is not a valid theta vector,
            # so collapse it to the mean over walkers (NUTS already stores a mean vector).
            fallback_theta = np.asarray(self.p0)
            if fallback_theta.ndim > 1:
                fallback_theta = fallback_theta.mean(axis=0)
            model = new_model(fallback_theta)
            model._train(X, y, do_optimize=False)
            self.models.append(model)

        if self.normalize_y:
            # Inject the normalization statistics into the individual models. Setting normalize_y to True makes the
            # individual GPs unnormalize the data at predict time.
            for model in self.models:
                model.normalize_y = True
                model.mean_y_ = self.mean_y_
                model.std_y_ = self.std_y_

        self.is_trained = True
        return self
Beispiel #2
0
    def _train(self, X: np.ndarray, y: np.ndarray):
        """Fit a random forest on X and y, then train one GP per leaf of every tree.

        Each leaf's GP data set consists of the samples the forest routes to that
        leaf, extended per feature by the nearest sample on or beyond the leaf's
        upper bound and the nearest sample on or beyond its lower bound.

        Parameters
        ----------
        X : np.ndarray [n_samples, n_features (config + instance features)]
            Input data points.
        y : np.ndarray [n_samples, ]
            The corresponding target values.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``self.types`` yields neither continuous nor categorical dimensions.
        """

        self.X = X
        self.y = y.flatten()

        from smac.epm.gp_kernels import ConstantKernel, Matern, WhiteKernel, HammingKernel
        from smac.epm.gp_base_prior import HorseshoePrior, LognormalPrior

        self.rf = sklearn.ensemble.RandomForestRegressor(
            max_features=0.5,
            bootstrap=True,
            max_depth=3,
            min_samples_leaf=10,
            n_estimators=N_EST,
        )
        self.rf.fit(X, y.ravel())
        # One column per tree: the id of the leaf each sample ends up in
        leaf_ids = np.array(self.rf.apply(X))
        n_features = X.shape[1]

        # Group the training data by leaf, separately for every tree
        all_datasets = []
        all_targets = []
        all_mappings = []
        for est in range(N_EST):
            tree_leaf_ids = leaf_ids[:, est]
            # leaf node id -> position of that leaf's data set in the lists below
            mapping = {leaf: pos for pos, leaf in enumerate(np.unique(tree_leaf_ids))}
            datasets = [[] for _ in mapping]
            targets = [[] for _ in mapping]
            for leaf, x, y_ in zip(tree_leaf_ids, X, y):
                pos = mapping[leaf]
                datasets[pos].append(x)
                targets[pos].append(y_)
            all_mappings.append(mapping)
            all_datasets.append(datasets)
            all_targets.append(targets)

        # Extend every leaf's data set with its nearest outside neighbours
        for est in range(N_EST):
            tree = self.rf.estimators_[est].tree_
            children_left = tree.children_left
            children_right = tree.children_right
            feature = tree.feature
            threshold = tree.threshold

            # Walk the tree from the root and track, per feature, the tightest
            # lower/upper split threshold on the path to each leaf.
            leaf_bounds = {}
            stack = [(0, np.full(n_features, -np.inf), np.full(n_features, np.inf))]
            while stack:
                node_id, lower, upper = stack.pop()
                left, right = children_left[node_id], children_right[node_id]
                if left == right:
                    # Both child ids are equal only for leaves
                    leaf_bounds[node_id] = (lower, upper)
                    continue
                split_feature = feature[node_id]
                split_value = threshold[node_id]
                # Left branch: feature <= threshold, so the upper bound may tighten
                left_upper = upper.copy()
                left_upper[split_feature] = min(left_upper[split_feature], split_value)
                # Right branch: feature > threshold, so the lower bound may tighten
                right_lower = lower.copy()
                right_lower[split_feature] = max(right_lower[split_feature], split_value)
                stack.append((right, right_lower, upper))
                stack.append((left, lower, left_upper))

            for leaf, (lower, upper) in leaf_bounds.items():
                pos = all_mappings[est][leaf]
                leaf_dataset = all_datasets[est][pos]
                leaf_targets = all_targets[est][pos]
                for feature_idx in range(n_features):
                    column = X[:, feature_idx]

                    # Nearest sample on or above the upper bound. The sample's row index is
                    # used for the lookup (previously the feature index was used by mistake).
                    above = np.flatnonzero(
                        (column >= upper[feature_idx]) & (column < np.inf))
                    if above.size > 0:
                        sample_idx = above[np.argmin(column[above])]
                        leaf_dataset.append(X[sample_idx])
                        leaf_targets.append(y[sample_idx])

                    # Nearest sample on or below the lower bound
                    below = np.flatnonzero(
                        (column <= lower[feature_idx]) & (column > -np.inf))
                    if below.size > 0:
                        sample_idx = below[np.argmax(column[below])]
                        leaf_dataset.append(X[sample_idx])
                        leaf_targets.append(y[sample_idx])

        # The split into continuous / categorical dimensions is the same for every GP
        cont_dims = np.nonzero(self.types == 0)[0]
        cat_dims = np.nonzero(self.types != 0)[0]

        self.all_mappings = all_mappings
        self.models = []
        for est in range(N_EST):
            models = []
            for dataset, targets_ in zip(all_datasets[est], all_targets[est]):

                cov_amp = ConstantKernel(
                    2.0,
                    constant_value_bounds=(np.exp(-10), np.exp(2)),
                    prior=LognormalPrior(mean=0.0, sigma=1.0, rng=self.rng),
                )

                exp_kernel = None
                if len(cont_dims) > 0:
                    exp_kernel = Matern(
                        np.ones([len(cont_dims)]),
                        [(np.exp(-10), np.exp(2))
                         for _ in range(len(cont_dims))],
                        nu=2.5,
                        operate_on=cont_dims,
                    )

                ham_kernel = None
                if len(cat_dims) > 0:
                    ham_kernel = HammingKernel(
                        np.ones([len(cat_dims)]),
                        [(np.exp(-10), np.exp(2))
                         for _ in range(len(cat_dims))],
                        operate_on=cat_dims,
                    )

                noise_kernel = WhiteKernel(
                    noise_level=1e-8,
                    noise_level_bounds=(np.exp(-25), np.exp(2)),
                    prior=HorseshoePrior(scale=0.1, rng=self.rng),
                )

                if exp_kernel is not None and ham_kernel is not None:
                    # both continuous and categorical dimensions
                    kernel = cov_amp * (exp_kernel * ham_kernel) + noise_kernel
                elif exp_kernel is not None:
                    # only continuous dimensions
                    kernel = cov_amp * exp_kernel + noise_kernel
                elif ham_kernel is not None:
                    # only categorical dimensions
                    kernel = cov_amp * ham_kernel + noise_kernel
                else:
                    raise ValueError()

                gp = GaussianProcess(
                    configspace=self.configspace,
                    types=self.types,
                    bounds=self.bounds,
                    kernel=kernel,
                    normalize_y=True,
                    seed=self.rng.randint(low=0, high=10000),
                )
                gp.train(np.array(dataset), np.array(targets_))
                # NOTE(review): this second fit uses the complete X and y rather than the
                # leaf's data set; presumably intentional, confirm against the GP's _train.
                gp._train(X, y, do_optimize=False)
                models.append(gp)
            self.models.append(models)
        return self
Beispiel #3
0
    def _train(self, X: np.ndarray, y: np.ndarray, do_optimize: bool = True):
        """
        Draw hyperparameter samples via MCMC and fit a separate GP on X and y
        for every sample.

        Parameters
        ----------
        X: np.ndarray (N, D)
            Input data points. The dimensionality of X is (N, D),
            with N as the number of points and D is the number of features.
        y: np.ndarray (N,)
            The corresponding target values.
        do_optimize: boolean
            If true, hyperparameters are sampled with MCMC; if false, the
            current kernel parameters together with ``self.noise`` are used
            as the only hyperparameter configuration.
        """

        if not self.normalize_input:
            self.X = X
        else:
            # Rescale the inputs to [0, 1]
            self.X, self.lower, self.upper = normalization.zero_one_normalization(
                X, self.lower, self.upper)

        if y.ndim > 1:
            y = y.flatten()
            if len(y) != len(X):
                raise ValueError('Shape mismatch: %s vs %s' %
                                 (y.shape, X.shape))

        if not self.normalize_output:
            self.y = y
        else:
            # Rescale the targets to zero mean and unit standard deviation
            self.y, self.y_mean, self.y_std = normalization.zero_mean_unit_var_normalization(
                y)
            if self.y_std == 0:
                raise ValueError(
                    "Cannot normalize output. All targets have the same value")

        # The GP mean is the mean of the (possibly normalized) targets
        self.mean = np.mean(self.y, axis=0)
        self.gp = george.GP(self.kernel, mean=self.mean)

        if not do_optimize:
            # Single configuration: current kernel parameters followed by the noise
            fixed_hypers = self.gp.kernel.get_parameter_vector().tolist()
            fixed_hypers.append(self.noise)
            self.hypers = [fixed_hypers]
        else:
            # One walker per hyperparameter configuration; one extra dimension for the noise
            n_dims = len(self.kernel) + 1
            sampler = emcee.EnsembleSampler(self.n_hypers,
                                            n_dims,
                                            self._loglikelihood)
            sampler.random_state = self.rng.get_state()

            if not self.burned:
                # First call: draw the walker start positions, then burn in
                if self.prior is not None:
                    self.p0 = self.prior.sample_from_prior(self.n_hypers)
                else:
                    self.p0 = self.rng.rand(self.n_hypers, n_dims)
                self.p0, _, _ = sampler.run_mcmc(self.p0,
                                                 self.burnin_steps,
                                                 rstate0=self.rng)
                self.burned = True

            # Sample; the final position is kept as the start point of the next call
            self.p0, _, _ = sampler.run_mcmc(self.p0,
                                             self.chain_length,
                                             rstate0=self.rng)

            # Last sample of every walker
            self.hypers = sampler.chain[:, -1]

        self.models = []
        for hyper_sample in self.hypers:
            # All entries but the last parameterize the kernel; the last one is
            # exponentiated to obtain the noise.
            sample_kernel = deepcopy(self.kernel)
            sample_kernel.set_parameter_vector(hyper_sample[:-1])
            sample_noise = np.exp(hyper_sample[-1])
            sample_model = GaussianProcess(
                types=self.types,
                bounds=self.bounds,
                kernel=sample_kernel,
                normalize_output=self.normalize_output,
                normalize_input=self.normalize_input,
                noise=sample_noise,
                rng=self.rng,
            )
            sample_model._train(X, y, do_optimize=False)
            self.models.append(sample_model)

        self.is_trained = True