def _train(self, X: np.ndarray, y: np.ndarray, do_optimize: bool = True) -> 'GaussianProcessMCMC':
    """
    Performs MCMC sampling to sample hyperparameter configurations from the likelihood and trains
    a GP on X and y for each sampled configuration.

    Parameters
    ----------
    X: np.ndarray (N, D)
        Input data points. The dimensionality of X is (N, D),
        with N as the number of points and D as the number of features.
    y: np.ndarray (N,)
        The corresponding target values.
    do_optimize: boolean
        If set to True, we perform MCMC sampling; otherwise we just use the
        hyperparameters specified in the kernel.
    """
    X = self._impute_inactive(X)
    if self.normalize_y:
        # A note on normalization for the Gaussian process with MCMC:
        # Scikit-learn uses a different "normalization" than we use in SMAC3. Scikit-learn normalizes the data to
        # have zero mean, while we normalize it to have zero mean and unit variance. To make sure the scikit-learn
        # GP behaves the same when we use it directly or indirectly (through the gaussian_process.py file), we
        # normalize the data here. Then, after the individual GPs are fit, we inject the statistics into them so
        # they unnormalize the data at prediction time.
        y = self._normalize_y(y)

    self.gp = self._get_gp()

    if do_optimize:
        self.gp.fit(X, y)
        self._all_priors = self._get_all_priors(
            add_bound_priors=True,
            add_soft_bounds=True if self.mcmc_sampler == 'nuts' else False,
        )

        if self.mcmc_sampler == 'emcee':
            sampler = emcee.EnsembleSampler(self.n_mcmc_walkers, len(self.kernel.theta), self._ll)
            sampler.random_state = self.rng.get_state()

            # Do a burn-in in the first iteration
            if not self.burned:
                # Initialize the walkers by sampling from the prior
                dim_samples = []

                prior = None  # type: typing.Optional[typing.Union[typing.List[Prior], Prior]]
                for dim, prior in enumerate(self._all_priors):
                    # Always sample from the first prior
                    if isinstance(prior, list):
                        if len(prior) == 0:
                            prior = None
                        else:
                            prior = prior[0]
                    prior = typing.cast(typing.Optional[Prior], prior)
                    if prior is None:
                        raise NotImplementedError()
                    else:
                        dim_samples.append(prior.sample_from_prior(self.n_mcmc_walkers).flatten())
                self.p0 = np.vstack(dim_samples).transpose()

                # Run MCMC sampling
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore', r'invalid value encountered in double_scalars.*')
                    self.p0, _, _ = sampler.run_mcmc(self.p0, self.burnin_steps)

                self.burned = True

            # Start sampling & save the current position, it will be the start point in the next iteration
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', r'invalid value encountered in double_scalars.*')
                self.p0, _, _ = sampler.run_mcmc(self.p0, self.chain_length)

            # Take the last samples from each walker
            self.hypers = sampler.get_chain()[:, -1]

        elif self.mcmc_sampler == 'nuts':
            # Originally published as:
            # http://www.stat.columbia.edu/~gelman/research/published/nuts.pdf
            # A good explanation of HMC:
            # https://theclevermachine.wordpress.com/2012/11/18/mcmc-hamiltonian-monte-carlo-a-k-a-hybrid-monte-carlo/
            # A good explanation of HMC and NUTS can be found in:
            # https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.12681

            # Do not require the installation of NUTS for SMAC
            # This requires NUTS from https://github.com/mfeurer/NUTS
            import nuts.nuts

            # Perform initial fit to the data to obtain theta0
            if not self.burned:
                theta0 = self.gp.kernel.theta
                self.burned = True
            else:
                theta0 = self.p0
            samples, _, _ = nuts.nuts.nuts6(
                f=self._ll_w_grad,
                Madapt=self.burnin_steps,
                M=self.chain_length,
                theta0=theta0,
                # Increasing this value results in longer running times
                delta=0.5,
                adapt_mass=False,
                # Rather low max depth to keep the number of required gradient steps low
                max_depth=10,
                rng=self.rng,
            )

            indices = [int(np.rint(ind)) for ind in np.linspace(start=0, stop=len(samples) - 1, num=10)]
            self.hypers = samples[indices]
            self.p0 = self.hypers.mean(axis=0)

        else:
            raise ValueError(self.mcmc_sampler)

        if self.average_samples:
            self.hypers = [self.hypers.mean(axis=0)]

    else:
        self.hypers = self.gp.kernel.theta
        self.hypers = [self.hypers]

    self.models = []
    for sample in self.hypers:

        if (sample < -50).any():
            sample[sample < -50] = -50
        if (sample > 50).any():
            sample[sample > 50] = 50

        # Instantiate a GP for each hyperparameter configuration
        kernel = deepcopy(self.kernel)
        kernel.theta = sample
        model = GaussianProcess(
            configspace=self.configspace,
            types=self.types,
            bounds=self.bounds,
            kernel=kernel,
            normalize_y=False,
            seed=self.rng.randint(low=0, high=10000),
        )
        try:
            model._train(X, y, do_optimize=False)
            self.models.append(model)
        except np.linalg.LinAlgError:
            pass

    if len(self.models) == 0:
        kernel = deepcopy(self.kernel)
        kernel.theta = self.p0
        model = GaussianProcess(
            configspace=self.configspace,
            types=self.types,
            bounds=self.bounds,
            kernel=kernel,
            normalize_y=False,
            seed=self.rng.randint(low=0, high=10000),
        )
        model._train(X, y, do_optimize=False)
        self.models.append(model)

    if self.normalize_y:
        # Inject the normalization statistics into the individual models. Setting normalize_y to True makes the
        # individual GPs unnormalize the data at predict time.
        for model in self.models:
            model.normalize_y = True
            model.mean_y_ = self.mean_y_
            model.std_y_ = self.std_y_

    self.is_trained = True
    return self
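

# The `_train` above leaves `self.models` holding one GaussianProcess per retained MCMC
# hyperparameter sample (or a single GP when `average_samples` is set or `do_optimize` is False).
# Below is a minimal, self-contained sketch of how such an ensemble is typically marginalized at
# prediction time via the law of total variance. The helper name `marginalize_over_samples` and
# the toy arrays are illustrative assumptions, not part of the SMAC3 API shown in this file.
import numpy as np


def marginalize_over_samples(means: np.ndarray, variances: np.ndarray):
    """Combine per-sample predictions of shape (n_samples, n_points) into one mean/variance."""
    mean = means.mean(axis=0)
    # Total variance = expected predictive variance + variance of the predictive means
    var = variances.mean(axis=0) + means.var(axis=0)
    return mean, var


# Fabricated predictions from 3 hyperparameter samples at 2 test points
_means = np.array([[0.9, 2.1], [1.1, 1.9], [1.0, 2.0]])
_vars = np.array([[0.10, 0.20], [0.12, 0.18], [0.11, 0.19]])
print(marginalize_over_samples(_means, _vars))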
def _train(self, X: np.ndarray, y: np.ndarray):
    """Trains the random forest on X and y.

    Parameters
    ----------
    X : np.ndarray [n_samples, n_features (config + instance features)]
        Input data points.
    y : np.ndarray [n_samples, ]
        The corresponding target values.

    Returns
    -------
    self
    """
    self.X = X
    self.y = y.flatten()

    from smac.epm.gp_kernels import ConstantKernel, Matern, WhiteKernel, HammingKernel
    from smac.epm.gp_base_prior import HorseshoePrior, LognormalPrior

    self.rf = sklearn.ensemble.RandomForestRegressor(
        max_features=0.5,
        bootstrap=True,
        max_depth=3,
        min_samples_leaf=10,
        n_estimators=N_EST,
    )
    # self.rf.fit(X, np.log(y - np.min(y) + 1e-7).ravel())
    self.rf.fit(X, y.ravel())
    indicators = np.array(self.rf.apply(X))

    # Group the training data by the leaf it falls into, separately for each tree
    all_datasets = []
    all_targets = []
    all_mappings = []
    for est in range(N_EST):
        unique = np.unique(indicators[:, est])
        mapping = {j: i for i, j in enumerate(unique)}
        datasets = [[] for _ in unique]
        targets = [[] for _ in unique]
        for indicator, x, y_ in zip(indicators[:, est], X, y):
            index = mapping[indicator]
            datasets[index].append(x)
            targets[index].append(y_)
        all_mappings.append(mapping)
        all_datasets.append(datasets)
        all_targets.append(targets)

    # print('Before')
    # for est in range(N_EST):
    #     for dataset in all_datasets[est]:
    #         print(len(dataset))

    for est in range(N_EST):
        n_nodes = self.rf.estimators_[est].tree_.node_count
        children_left = self.rf.estimators_[est].tree_.children_left
        children_right = self.rf.estimators_[est].tree_.children_right
        feature = self.rf.estimators_[est].tree_.feature
        threshold = self.rf.estimators_[est].tree_.threshold

        # The tree structure can be traversed to compute various properties such
        # as the depth of each node and whether or not it is a leaf.
        node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
        is_leaves = np.zeros(shape=n_nodes, dtype=bool)
        stack = [(0, -1)]  # seed is the root node id and its parent depth
        while len(stack) > 0:
            node_id, parent_depth = stack.pop()
            node_depth[node_id] = parent_depth + 1

            # If we have a test node
            if children_left[node_id] != children_right[node_id]:
                stack.append((children_left[node_id], parent_depth + 1))
                stack.append((children_right[node_id], parent_depth + 1))
            else:
                is_leaves[node_id] = True

        # Collect, for each leaf, the chain of split rules (threshold, comparison, feature) leading to it
        rules = {}
        import copy

        def extend(rule, idx):
            if is_leaves[idx]:
                rules[idx] = rule
            else:
                rule_left = copy.deepcopy(rule)
                rule_left.append((threshold[idx], '<=', feature[idx]))
                extend(rule_left, children_left[idx])
                rule_right = copy.deepcopy(rule)
                rule_right.append((threshold[idx], '>', feature[idx]))
                extend(rule_right, children_right[idx])

        extend([], 0)
        # print(rules)

        for key, rule in rules.items():
            # Turn the rule chain into per-feature lower/upper bounds of the leaf's box
            lower = -np.ones((X.shape[1], )) * np.inf
            upper = np.ones((X.shape[1], )) * np.inf
            for element in rule:
                if element[1] == '<=':
                    if element[0] < upper[element[2]]:
                        upper[element[2]] = element[0]
                else:
                    if element[0] > lower[element[2]]:
                        lower[element[2]] = element[0]

            # For each feature, add the closest data points just outside the leaf's box
            for feature_idx in range(X.shape[1]):
                closest_lower = -np.inf
                closest_lower_idx = None
                closest_upper = np.inf
                closest_upper_idx = None
                for sample_idx, x in enumerate(X):
                    if lower[feature_idx] < x[feature_idx] < upper[feature_idx]:
                        continue
                    if x[feature_idx] <= lower[feature_idx]:
                        if x[feature_idx] > closest_lower:
                            closest_lower = x[feature_idx]
                            closest_lower_idx = sample_idx
                    if x[feature_idx] >= upper[feature_idx]:
                        if x[feature_idx] < closest_upper:
                            closest_upper = x[feature_idx]
                            closest_upper_idx = sample_idx

                if closest_upper_idx is not None:
                    all_datasets[est][all_mappings[est][key]].append(X[closest_upper_idx])
                    all_targets[est][all_mappings[est][key]].append(y[closest_upper_idx])
                if closest_lower_idx is not None:
                    all_datasets[est][all_mappings[est][key]].append(X[closest_lower_idx])
                    all_targets[est][all_mappings[est][key]].append(y[closest_lower_idx])

    # print('After')
    # for est in range(N_EST):
    #     for dataset in all_datasets[est]:
    #         print(len(dataset))

    self.all_mappings = all_mappings
    self.models = []
    for est in range(N_EST):
        models = []
        for dataset, targets_ in zip(all_datasets[est], all_targets[est]):
            cov_amp = ConstantKernel(
                2.0,
                constant_value_bounds=(np.exp(-10), np.exp(2)),
                prior=LognormalPrior(mean=0.0, sigma=1.0, rng=self.rng),
            )
            cont_dims = np.nonzero(self.types == 0)[0]
            cat_dims = np.nonzero(self.types != 0)[0]

            if len(cont_dims) > 0:
                exp_kernel = Matern(
                    np.ones([len(cont_dims)]),
                    [(np.exp(-10), np.exp(2)) for _ in range(len(cont_dims))],
                    nu=2.5,
                    operate_on=cont_dims,
                )
            if len(cat_dims) > 0:
                ham_kernel = HammingKernel(
                    np.ones([len(cat_dims)]),
                    [(np.exp(-10), np.exp(2)) for _ in range(len(cat_dims))],
                    operate_on=cat_dims,
                )
            noise_kernel = WhiteKernel(
                noise_level=1e-8,
                noise_level_bounds=(np.exp(-25), np.exp(2)),
                prior=HorseshoePrior(scale=0.1, rng=self.rng),
            )

            if len(cont_dims) > 0 and len(cat_dims) > 0:
                # both continuous and categorical dimensions
                kernel = cov_amp * (exp_kernel * ham_kernel) + noise_kernel
            elif len(cont_dims) > 0 and len(cat_dims) == 0:
                # only continuous dimensions
                kernel = cov_amp * exp_kernel + noise_kernel
            elif len(cont_dims) == 0 and len(cat_dims) > 0:
                # only categorical dimensions
                kernel = cov_amp * ham_kernel + noise_kernel
            else:
                raise ValueError()

            gp = GaussianProcess(
                configspace=self.configspace,
                types=self.types,
                bounds=self.bounds,
                kernel=kernel,
                normalize_y=True,
                seed=self.rng.randint(low=0, high=10000),
            )
            # Optimize the kernel hyperparameters on this leaf's data, then refit the GP on all
            # data with those hyperparameters (no re-optimization)
            gp.train(np.array(dataset), np.array(targets_))
            gp._train(X, y, do_optimize=False)
            models.append(gp)
        self.models.append(models)

    return self
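

# `_train` above stores one GP per leaf of every tree in `self.models[est]`, indexed through
# `self.all_mappings[est]` (leaf id -> position in the per-tree model list). The class's actual
# predict path is not shown here; the following is a minimal, self-contained sketch of the routing
# idea under that assumption, using simple per-leaf training means in place of the per-leaf GPs.
# The helper name `predict_leaf_ensemble` and the toy data are illustrative only.
import numpy as np
from sklearn.ensemble import RandomForestRegressor


def predict_leaf_ensemble(rf, mappings, leaf_models, X_test):
    """Route each test point to its leaf in every tree and average the per-leaf models."""
    leaves = rf.apply(X_test)  # shape: (n_points, n_estimators)
    preds = np.zeros((X_test.shape[0], rf.n_estimators))
    for est in range(rf.n_estimators):
        for i, leaf_id in enumerate(leaves[:, est]):
            model = leaf_models[est][mappings[est][leaf_id]]
            preds[i, est] = model(X_test[i])  # per-leaf model is a simple callable here
    return preds.mean(axis=1)


# Toy data: fit a tiny forest and use each leaf's training-target mean as its "model"
toy_rng = np.random.RandomState(1)
X_toy = toy_rng.rand(50, 2)
y_toy = X_toy.sum(axis=1)
rf_toy = RandomForestRegressor(n_estimators=3, max_depth=3, random_state=1).fit(X_toy, y_toy)

train_leaves = rf_toy.apply(X_toy)
toy_mappings, toy_leaf_models = [], []
for est in range(rf_toy.n_estimators):
    unique_leaves = np.unique(train_leaves[:, est])
    toy_mappings.append({leaf: i for i, leaf in enumerate(unique_leaves)})
    leaf_means = [y_toy[train_leaves[:, est] == leaf].mean() for leaf in unique_leaves]
    toy_leaf_models.append([(lambda m: (lambda x: m))(m) for m in leaf_means])

print(predict_leaf_ensemble(rf_toy, toy_mappings, toy_leaf_models, X_toy[:5]))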
def _train(self, X: np.ndarray, y: np.ndarray, do_optimize: bool = True):
    """
    Performs MCMC sampling to sample hyperparameter configurations from the likelihood and trains
    a GP on X and y for each sampled configuration.

    Parameters
    ----------
    X: np.ndarray (N, D)
        Input data points. The dimensionality of X is (N, D),
        with N as the number of points and D as the number of features.
    y: np.ndarray (N,)
        The corresponding target values.
    do_optimize: boolean
        If set to True, we perform MCMC sampling; otherwise we just use the
        hyperparameters specified in the kernel.
    """
    if self.normalize_input:
        # Normalize input to be in [0, 1]
        self.X, self.lower, self.upper = normalization.zero_one_normalization(X, self.lower, self.upper)
    else:
        self.X = X

    if len(y.shape) > 1:
        y = y.flatten()
    if len(y) != len(X):
        raise ValueError('Shape mismatch: %s vs %s' % (y.shape, X.shape))

    if self.normalize_output:
        # Normalize output to have zero mean and unit standard deviation
        self.y, self.y_mean, self.y_std = normalization.zero_mean_unit_var_normalization(y)
        if self.y_std == 0:
            raise ValueError("Cannot normalize output. All targets have the same value")
    else:
        self.y = y

    # Use the mean of the data as mean for the GP
    self.mean = np.mean(self.y, axis=0)
    self.gp = george.GP(self.kernel, mean=self.mean)

    if do_optimize:
        # We have one walker for each hyperparameter configuration
        sampler = emcee.EnsembleSampler(self.n_hypers, len(self.kernel) + 1, self._loglikelihood)
        sampler.random_state = self.rng.get_state()

        # Do a burn-in in the first iteration
        if not self.burned:
            # Initialize the walkers by sampling from the prior
            if self.prior is None:
                self.p0 = self.rng.rand(self.n_hypers, len(self.kernel) + 1)
            else:
                self.p0 = self.prior.sample_from_prior(self.n_hypers)

            # Run MCMC sampling
            self.p0, _, _ = sampler.run_mcmc(self.p0, self.burnin_steps, rstate0=self.rng)

            self.burned = True

        # Start sampling
        pos, _, _ = sampler.run_mcmc(self.p0, self.chain_length, rstate0=self.rng)

        # Save the current position, it will be the start point in the next iteration
        self.p0 = pos

        # Take the last samples from each walker
        self.hypers = sampler.chain[:, -1]
    else:
        self.hypers = self.gp.kernel.get_parameter_vector().tolist()
        self.hypers.append(self.noise)
        self.hypers = [self.hypers]

    self.models = []
    for sample in self.hypers:
        # Instantiate a GP for each hyperparameter configuration
        kernel = deepcopy(self.kernel)
        kernel.set_parameter_vector(sample[:-1])
        noise = np.exp(sample[-1])
        model = GaussianProcess(
            types=self.types,
            bounds=self.bounds,
            kernel=kernel,
            normalize_output=self.normalize_output,
            normalize_input=self.normalize_input,
            noise=noise,
            rng=self.rng,
        )
        model._train(X, y, do_optimize=False)
        self.models.append(model)

    self.is_trained = True
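

# The two `normalization` helpers used at the top of `_train` are not defined in this file; based
# on how their return values are unpacked there, the numpy sketch below shows what they are assumed
# to compute. These are illustrative stand-ins, not the module's actual implementations.
import numpy as np


def zero_one_normalization(X, lower=None, upper=None):
    """Scale each column of X into [0, 1] using per-dimension bounds (computed if not given)."""
    lower = X.min(axis=0) if lower is None else lower
    upper = X.max(axis=0) if upper is None else upper
    return (X - lower) / (upper - lower), lower, upper


def zero_mean_unit_var_normalization(y):
    """Standardize targets to zero mean and unit standard deviation."""
    mean, std = y.mean(), y.std()
    return (y - mean) / std, mean, std


X_demo = np.array([[0.0, 10.0], [5.0, 20.0], [10.0, 30.0]])
y_demo = np.array([1.0, 2.0, 3.0])
print(zero_one_normalization(X_demo)[0])
print(zero_mean_unit_var_normalization(y_demo)[0])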