def setUp(self):
    data_set_prefix = '${CMAKE_SOURCE_DIR}/test_data_sets/'
    self.data = reg.default_data_container(64)
    self.data.import_csv_files(data_set_prefix + 'features13.csv',
                               data_set_prefix + 'responses13.csv')

    self.rng = reg.default_random_engine(1)
    self.forest_constructor = reg.qr_forest
def __init__(self, num_trees=30, do_bootstrapping=True,
             n_points_per_tree=0, rng=None):
    """
    Interface for the random_forest_run library to model the
    objective function with a random forest.

    Parameters
    ----------
    num_trees: int
        The number of trees in the random forest.
    do_bootstrapping: bool
        Turns on / off bootstrapping in the random forest.
    n_points_per_tree: int
        Number of data points per tree. If set to 0, all data points
        are used in each tree.
    rng: np.random.RandomState
        Random number generator
    """
    if rng is None:
        self.rng = np.random.RandomState()
    else:
        self.rng = rng

    self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

    self.n_points_per_tree = n_points_per_tree

    self.rf = reg.binary_rss_forest()
    self.rf.options.num_trees = num_trees
    self.rf.options.do_bootstrapping = do_bootstrapping
    self.rf.options.num_data_points_per_tree = n_points_per_tree
def predict(self, X_test, **kwargs):
    """Seeds the RNG of the random forest before calling the parent's predict()."""
    # NOTE: We cannot save the `reg_rng` state, so instead we control it
    # with random integers sampled from `rng` and keep track of `rng` state.
    self.reg_rng = reg.default_random_engine(int(self.rng.randint(10e8)))
    return super(OrionRandomForestWrapper, self).predict(X_test, **kwargs)
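# A minimal sketch (not from the original sources) of why the reseeding trick
# above works: pyrfr's C++ engine cannot be checkpointed from Python, but an
# np.random.RandomState can, so deriving every engine seed from the tracked
# RandomState makes forest behavior reproducible across save/restore.
import numpy as np
import pyrfr.regression as reg

rng = np.random.RandomState(42)       # picklable, state can be saved
state = rng.get_state()               # checkpoint before predicting

engine_a = reg.default_random_engine(int(rng.randint(10e8)))

rng.set_state(state)                  # restore the checkpoint
engine_b = reg.default_random_engine(int(rng.randint(10e8)))
# engine_a and engine_b were seeded identically, so a forest queried with
# either engine behaves the same.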
def setUp(self):
    data_set_prefix = '${CMAKE_SOURCE_DIR}/test_data_sets/'
    self.data = reg.default_data_container(3)
    self.data.import_csv_files(
        data_set_prefix + 'online_lda_features.csv',
        data_set_prefix + 'online_lda_responses.csv')

    self.rng = reg.default_random_engine(1)
    self.forest_constructor = reg.fanova_forest
def __init__(
    self,
    configspace: ConfigurationSpace,
    types: typing.List[int],
    bounds: typing.List[typing.Tuple[float, float]],
    seed: int,
    log_y: bool = False,
    num_trees: int = N_TREES,
    do_bootstrapping: bool = True,
    n_points_per_tree: int = -1,
    ratio_features: float = 5. / 6.,
    min_samples_split: int = 3,
    min_samples_leaf: int = 3,
    max_depth: int = 2**20,
    eps_purity: float = 1e-8,
    max_num_nodes: int = 2**20,
    instance_features: typing.Optional[np.ndarray] = None,
    pca_components: typing.Optional[int] = None,
) -> None:
    super().__init__(
        configspace=configspace,
        types=types,
        bounds=bounds,
        seed=seed,
        instance_features=instance_features,
        pca_components=pca_components,
    )

    self.log_y = log_y
    self.rng = regression.default_random_engine(seed)

    self.rf_opts = regression.forest_opts()
    self.rf_opts.num_trees = num_trees
    self.rf_opts.do_bootstrapping = do_bootstrapping
    max_features = 0 if ratio_features > 1.0 else \
        max(1, int(len(types) * ratio_features))
    self.rf_opts.tree_opts.max_features = max_features
    self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
    self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
    self.rf_opts.tree_opts.max_depth = max_depth
    self.rf_opts.tree_opts.epsilon_purity = eps_purity
    self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
    self.rf_opts.compute_law_of_total_variance = False

    self.n_points_per_tree = n_points_per_tree
    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    self.hypers = [
        num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
        ratio_features, min_samples_split, min_samples_leaf, max_depth,
        eps_purity, self.seed
    ]
def __init__(self, X_init: np.ndarray, Y_init: np.ndarray,
             num_trees: int = 30,
             do_bootstrapping: bool = True,
             n_points_per_tree: int = 0,
             seed: int = None) -> None:
    """
    Interface to random forests for Bayesian optimization based on the pyrfr
    package, which, due to its random splitting, gives better uncertainty
    estimates than the sklearn random forest.

    Dependencies:
        AutoML rfr (https://github.com/automl/random_forest_run)

    :param X_init: Initial input data points to train the model
    :param Y_init: Initial target values
    :param num_trees: Specifies the number of trees to build the random forest
    :param do_bootstrapping: Defines if we use bootstrapping for the individual trees or not
    :param n_points_per_tree: Specifies the number of points for each individual tree
                              (0 means no restriction)
    :param seed: Used to seed the random number generator for the random forest
                 (None means random seed)
    """
    super().__init__()

    # Set random number generator for the random forest
    if seed is None:
        seed = np.random.randint(10000)
    self.reg_rng = reg.default_random_engine(seed)

    self.n_points_per_tree = n_points_per_tree

    self.rf = reg.binary_rss_forest()
    self.rf.options.num_trees = num_trees
    self.rf.options.do_bootstrapping = do_bootstrapping
    self.rf.options.num_data_points_per_tree = n_points_per_tree

    self._X = X_init
    self._Y = Y_init

    if self.n_points_per_tree == 0:
        self.rf.options.num_data_points_per_tree = X_init.shape[0]

    data = reg.default_data_container(self._X.shape[1])
    for row_X, row_y in zip(X_init, Y_init):
        data.add_data_point(row_X, row_y)

    self.rf.fit(data, self.reg_rng)
def setUp(self):
    data_set_prefix = '${CMAKE_SOURCE_DIR}/test_data_sets/'
    self.data = reg.default_data_container(64)
    self.data.import_csv_files(data_set_prefix + 'features13.csv',
                               data_set_prefix + 'responses13.csv')

    self.forest = reg.binary_rss_forest()
    self.forest.options.num_trees = 64
    self.forest.options.do_bootstrapping = True
    self.forest.options.num_data_points_per_tree = 200

    self.assertEqual(self.forest.options.num_trees, 64)
    self.assertTrue(self.forest.options.do_bootstrapping)
    self.assertEqual(self.forest.options.num_data_points_per_tree, 200)

    self.rng = reg.default_random_engine(1)
def __init__(self, types, bounds,
             num_trees=10,
             do_bootstrapping=True,
             n_points_per_tree=-1,
             ratio_features=5. / 6.,
             min_samples_split=3,
             min_samples_leaf=3,
             max_depth=20,
             eps_purity=1e-8,
             max_num_nodes=1000,
             seed=42,
             **kwargs):
    super().__init__(**kwargs)

    self.types = types
    self.bounds = bounds
    self.rng = regression.default_random_engine(seed)

    self.rf_opts = regression.forest_opts()
    self.rf_opts.num_trees = num_trees
    self.rf_opts.seed = seed
    self.rf_opts.do_bootstrapping = do_bootstrapping
    max_features = 0 if ratio_features >= 1.0 else \
        max(1, int(types.shape[0] * ratio_features))
    self.rf_opts.max_features = max_features
    self.rf_opts.min_samples_to_split = min_samples_split
    self.rf_opts.min_samples_in_leaf = min_samples_leaf
    self.rf_opts.max_depth = max_depth
    self.rf_opts.epsilon_purity = eps_purity
    self.rf_opts.max_num_nodes = max_num_nodes

    self.n_points_per_tree = n_points_per_tree
    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    self.hypers = [
        num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
        ratio_features, min_samples_split, min_samples_leaf, max_depth,
        eps_purity, seed
    ]
    self.seed = seed

    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
def _eval_rf(
    self,
    c: Configuration,
    X: np.ndarray,
    y: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
) -> float:
    """Evaluate random forest configuration on train/test data.

    Parameters
    ----------
    c : Configuration
        Random forest configuration to evaluate on the train/test data
    X : np.ndarray [n_samples, n_features (config + instance features)]
        Training features
    y : np.ndarray [n_samples, ]
        Training targets
    X_test : np.ndarray [n_samples, n_features (config + instance features)]
        Validation features
    y_test : np.ndarray [n_samples, ]
        Validation targets

    Returns
    -------
    float
    """
    opts = self._set_conf(c, n_features=X.shape[1], num_data_points=X.shape[0])
    rng = regression.default_random_engine(1)
    rf = regression.binary_rss_forest()
    rf.options = opts
    data = self._init_data_container(X, y)
    rf.fit(data, rng=rng)

    loss = 0
    for row, lab in zip(X_test, y_test):
        m, v = rf.predict_mean_var(row)
        std = max(1e-8, np.sqrt(v))
        nllh = -scst.norm(loc=m, scale=std).logpdf(lab)
        loss += nllh

    return loss
def __init__(self, num_trees=30,
             do_bootstrapping=True,
             n_points_per_tree=0,
             compute_oob_error=False,
             return_total_variance=True,
             rng=None):
    """
    Interface for the random_forest_run library to model the
    objective function with a random forest.

    Parameters
    ----------
    num_trees: int
        The number of trees in the random forest.
    do_bootstrapping: bool
        Turns on / off bootstrapping in the random forest.
    n_points_per_tree: int
        Number of data points per tree. If set to 0, all data points
        are used in each tree.
    compute_oob_error: bool
        Turns on / off calculation of out-of-bag error. Default: False
    return_total_variance: bool
        Return law of total variance (mean of variances + variance of
        means, if True) or explained variance (variance of means, if
        False). Default: True
    rng: np.random.RandomState
        Random number generator
    """
    if rng is None:
        self.rng = np.random.RandomState()
    else:
        self.rng = rng

    self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

    self.n_points_per_tree = n_points_per_tree

    self.rf = reg.binary_rss_forest()
    self.rf.options.num_trees = num_trees
    self.rf.options.do_bootstrapping = do_bootstrapping
    self.rf.options.num_data_points_per_tree = n_points_per_tree
    self.rf.options.compute_oob_error = compute_oob_error
    self.rf.options.compute_law_of_total_variance = return_total_variance
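# A minimal sketch (assumptions: `reg` is pyrfr.regression and `data` is an
# already-filled default_data_container) showing what the two extra options
# enable: with compute_oob_error turned on, the fitted forest can report its
# out-of-bag error.
rf = reg.binary_rss_forest()
rf.options.num_trees = 30
rf.options.do_bootstrapping = True       # OOB estimates need bootstrapping
rf.options.compute_oob_error = True
rf.fit(data, reg.default_random_engine(0))
print(rf.out_of_bag_error())             # available because compute_oob_error was set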
def setUp(self):
    self.X = [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.],
              [0., 0., 1.], [0., 0., 1.], [0., 0., 1.],
              [0., 1., 0.], [0., 1., 0.], [0., 1., 0.],
              [0., 1., 1.], [0., 1., 1.], [0., 1., 1.],
              [1., 0., 0.], [1., 0., 0.], [1., 0., 0.],
              [1., 0., 1.], [1., 0., 1.], [1., 0., 1.],
              [1., 1., 0.], [1., 1., 0.], [1., 1., 0.],
              [1., 1., 1.], [1., 1., 1.], [1., 1., 1.]]
    self.y = [[50], [50], [50], [.2], [.2], [.2],
              [9], [9], [9], [9.2], [9.2], [9.2],
              [500], [500], [500], [10.2], [10.2], [10.2],
              [109.], [109.], [109.], [100], [100], [100]]
    self.y_dual = list(map(lambda x: [math.log10(x[0]), x[0]], self.y))
    bounds = [(0, float('nan')), (0, float('nan')), (0, float('nan'))]

    def init_data(X, y, bounds):
        data = reg.default_data_container(len(X[0]))
        for i, (mn, mx) in enumerate(bounds):
            if math.isnan(mx):
                data.set_type_of_feature(i, mn)
            else:
                data.set_bounds_of_feature(i, mn, mx)
        for row_X, row_y in zip(X, y):
            data.add_data_point(row_X, row_y)
        return data

    self.data = init_data(self.X, self.y, bounds)
    self.data_dual = init_data(self.X, self.y_dual, bounds)

    self.forest = reg.binary_rss_forest()
    self.forest.options.num_trees = 64
    self.forest.options.do_bootstrapping = True
    self.forest.options.num_data_points_per_tree = 200
    self.forest.options.compute_law_of_total_variance = True

    self.assertEqual(self.forest.options.num_trees, 64)
    self.assertTrue(self.forest.options.do_bootstrapping)
    self.assertEqual(self.forest.options.num_data_points_per_tree, 200)
    self.assertTrue(self.forest.options.compute_law_of_total_variance)

    self.rng = reg.default_random_engine(1)
def __init__(
    self,
    configspace: ConfigurationSpace,
    types: typing.List[int],
    bounds: typing.List[typing.Tuple[float, float]],
    seed: int,
    log_y: bool = False,
    num_trees: int = N_TREES,
    do_bootstrapping: bool = True,
    n_points_per_tree: int = -1,
    ratio_features: float = 5. / 6.,
    min_samples_split: int = 3,
    min_samples_leaf: int = 3,
    max_depth: int = 2**20,
    eps_purity: float = 1e-8,
    max_num_nodes: int = 2**20,
    instance_features: typing.Optional[np.ndarray] = None,
    pca_components: typing.Optional[int] = None,
) -> None:
    """
    Parameters
    ----------
    types : List[int]
        Specifies the number of categorical values of an input dimension
        where the i-th entry corresponds to the i-th input dimension. Let's
        say we have 2 dimensions, where the first consists of 3 different
        categorical choices and the second is continuous; then we have to
        pass [3, 0]. Note that we count starting from 0.
    bounds : List[Tuple[float, float]]
        Bounds of input dimensions: (lower, upper) for continuous dims;
        (n_cat, np.nan) for categorical dims
    seed : int
        The seed that is passed to the random_forest_run library.
    log_y: bool
        y values (passed to this RF) are expected to be log(y) transformed;
        this will be considered during predicting
    num_trees : int
        The number of trees in the random forest.
    do_bootstrapping : bool
        Turns on / off bootstrapping in the random forest.
    n_points_per_tree : int
        Number of points per tree. If <= 0, X.shape[0] will be used
        in _train(X, y) instead.
    ratio_features : float
        The ratio of features that are considered for splitting.
    min_samples_split : int
        The minimum number of data points to perform a split.
    min_samples_leaf : int
        The minimum number of data points in a leaf.
    max_depth : int
        The maximum depth of a single tree.
    eps_purity : float
        The minimum difference between two target values to be considered
        different.
    max_num_nodes : int
        The maximum total number of nodes in a tree.
    instance_features : np.ndarray (I, K)
        Contains the K dimensional instance features of the I different
        instances.
    pca_components : float
        Number of components to keep when using PCA to reduce dimensionality
        of instance features. Requires n_feats (> pca_dims) to be set.
    """
    super().__init__(
        configspace=configspace,
        types=types,
        bounds=bounds,
        seed=seed,
        instance_features=instance_features,
        pca_components=pca_components,
    )

    self.log_y = log_y
    self.rng = regression.default_random_engine(seed)

    self.rf_opts = regression.forest_opts()
    self.rf_opts.num_trees = num_trees
    self.rf_opts.do_bootstrapping = do_bootstrapping
    max_features = 0 if ratio_features > 1.0 else \
        max(1, int(len(types) * ratio_features))
    self.rf_opts.tree_opts.max_features = max_features
    self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
    self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
    self.rf_opts.tree_opts.max_depth = max_depth
    self.rf_opts.tree_opts.epsilon_purity = eps_purity
    self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
    self.rf_opts.compute_law_of_total_variance = False

    self.n_points_per_tree = n_points_per_tree
    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    self.hypers = [num_trees, max_num_nodes, do_bootstrapping,
                   n_points_per_tree, ratio_features, min_samples_split,
                   min_samples_leaf, max_depth, eps_purity, self.seed]
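# A minimal sketch (hypothetical helper, not the library's actual
# implementation) of the training step these constructors prepare for: the
# forest itself is built lazily (self.rf starts as None), so a later
# _train(X, y) fills a pyrfr data container and fits. The real code would
# also set feature types and bounds on the container, omitted here.
def _train_sketch(self, X: np.ndarray, y: np.ndarray) -> None:
    # Use all points per tree when n_points_per_tree <= 0, as documented above.
    self.rf_opts.num_data_points_per_tree = (
        X.shape[0] if self.n_points_per_tree <= 0 else self.n_points_per_tree
    )
    self.rf = regression.binary_rss_forest()
    self.rf.options = self.rf_opts

    data = regression.default_data_container(X.shape[1])
    for row_X, row_y in zip(X, y):
        data.add_data_point(row_X, row_y)
    self.rf.fit(data, rng=self.rng)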
def __init__(self, X, Y, config_space=None,
             n_trees=16, seed=None, bootstrapping=True,
             points_per_tree=None, max_features=None,
             min_samples_split=0, min_samples_leaf=0,
             max_depth=64, cutoffs=(-np.inf, np.inf)):
    """
    Calculate and provide midpoints and sizes from the forest's split values
    in order to get the marginals

    Parameters
    ------------
    X: matrix with the features (numerically encoded)
    Y: vector with the response values (numerically encoded)
    config_space : ConfigSpace instantiation
    n_trees: number of trees in the forest to be fit
    seed: seed for the forest's randomness
    bootstrapping: whether or not to bootstrap the data for each tree
    points_per_tree: number of points used for each tree
                     (only subsampling if bootstrapping is false)
    max_features: number of features to be used at each split, default is 70%
    min_samples_split: minimum number of samples required to attempt to split
    min_samples_leaf: minimum number of samples required in a leaf
    max_depth: maximal depth of each tree in the forest
    cutoffs: tuple of (lower, upper); all values outside this range will be
             mapped to either the lower or the upper bound. (See:
             "Generalized Functional ANOVA Diagnostics for High Dimensional
             Functions of Dependent Variables" by Hooker.)
    """
    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(self.__module__ + '.' + self.__class__.__name__)

    pcs = [(np.nan, np.nan)] * X.shape[1]

    # if no ConfigSpace is specified, let's build one with all continuous variables
    if config_space is None:
        # if no info is given, use min and max values of each variable as bounds
        config_space = ConfigSpace.ConfigurationSpace()
        for i, (mn, mx) in enumerate(zip(np.min(X, axis=0), np.max(X, axis=0))):
            config_space.add_hyperparameter(UniformFloatHyperparameter("x_%03i" % i, mn, mx))

    self.percentiles = np.percentile(Y, range(0, 100))
    self.cs = config_space
    self.cs_params = self.cs.get_hyperparameters()
    self.n_dims = len(self.cs_params)
    self.n_trees = n_trees
    self._dict = False

    # at this point we have a valid ConfigSpace object
    # check if param number is correct etc:
    if X.shape[1] != len(self.cs_params):
        raise RuntimeError('Number of parameters in ConfigSpace object does not match input X')
    for i in range(len(self.cs_params)):
        if isinstance(self.cs_params[i], NumericalHyperparameter):
            if (np.max(X[:, i]) > self.cs_params[i].upper) or \
                    (np.min(X[:, i]) < self.cs_params[i].lower):
                raise RuntimeError('Some sample values from X are not in the given interval')
        elif isinstance(self.cs_params[i], CategoricalHyperparameter):
            unique_vals = set(X[:, i])
            if len(unique_vals) > len(self.cs_params[i].choices):
                raise RuntimeError('There are some categoricals missing in the ConfigSpace '
                                   'specification for hyperparameter %s:' % self.cs_params[i].name)
        elif isinstance(self.cs_params[i], Constant):
            # oddly, UnParametrizedHyperparameter and Constant are not supported
            # raise TypeError('Unsupported Hyperparameter: %s' % type(self.cs_params[i]))
            pass
            # unique_vals = set(X[:, i])
            # if len(unique_vals) > 1:
            #     raise RuntimeError('Got multiple values for Unparameterized (Constant) hyperparameter')
        else:
            raise TypeError('Unsupported Hyperparameter: %s' % type(self.cs_params[i]))

    if not np.issubdtype(X.dtype, np.float64):
        logging.warning('low level library expects X argument to be float')
    if not np.issubdtype(Y.dtype, np.float64):
        logging.warning('low level library expects Y argument to be float')

    # initialize all types as 0
    types = np.zeros(len(self.cs_params), dtype=np.uint)
    # retrieve the types and the bounds from the ConfigSpace
    # TODO: Test if that actually works
    for i, hp in enumerate(self.cs_params):
        if isinstance(hp, CategoricalHyperparameter):
            types[i] = len(hp.choices)
            pcs[i] = (len(hp.choices), np.nan)
        elif isinstance(self.cs_params[i], NumericalHyperparameter):
            pcs[i] = (hp.lower, hp.upper)
        elif isinstance(self.cs_params[i], Constant):
            types[i] = 1
            pcs[i] = (1, np.nan)
        else:
            raise TypeError('Unsupported Hyperparameter: %s' % type(hp))

    # set forest options
    forest = reg.fanova_forest()
    forest.options.num_trees = n_trees
    forest.options.do_bootstrapping = bootstrapping
    forest.options.num_data_points_per_tree = X.shape[0] if points_per_tree is None else points_per_tree
    forest.options.tree_opts.max_features = (X.shape[1] * 7) // 10 if max_features is None else max_features

    forest.options.tree_opts.min_samples_to_split = min_samples_split
    forest.options.tree_opts.min_samples_in_leaf = min_samples_leaf
    forest.options.tree_opts.max_depth = max_depth
    forest.options.tree_opts.epsilon_purity = 1e-8

    # create data container and provide all the necessary information
    if seed is None:
        rng = reg.default_random_engine(np.random.randint(2**31 - 1))
    else:
        rng = reg.default_random_engine(seed)
    data = reg.default_data_container(X.shape[1])

    for i, (mn, mx) in enumerate(pcs):
        if np.isnan(mx):
            data.set_type_of_feature(i, mn)
        else:
            data.set_bounds_of_feature(i, mn, mx)

    for i in range(len(Y)):
        try:
            data.add_data_point(X[i].tolist(), Y[i])
        except Exception:
            self.logger.warning("failed to process datapoint: %s", str(X[i].tolist()))
            raise

    forest.fit(data, rng)

    self.the_forest = forest

    # initialize a dictionary with parameter dims
    self.variance_dict = dict()

    # getting split values
    forest_split_values = self.the_forest.all_split_values()

    # all midpoints and interval sizes treewise for the whole forest
    self.all_midpoints = []
    self.all_sizes = []

    # compute midpoints and interval sizes for variables in each tree
    for tree_split_values in forest_split_values:
        sizes = []
        midpoints = []
        for i, split_vals in enumerate(tree_split_values):
            if np.isnan(pcs[i][1]):  # categorical parameter
                # check if the tree actually splits on this parameter
                if len(split_vals) > 0:
                    midpoints.append(split_vals)
                    sizes.append(np.ones(len(split_vals)))
                # if not, simply append 0 as the value with the number
                # of categories as the size; that way this parameter will
                # get 0 importance from this tree.
                else:
                    midpoints.append((0,))
                    sizes.append((pcs[i][0],))
            else:
                # add bounds to split values
                sv = np.array([pcs[i][0]] + list(split_vals) + [pcs[i][1]])
                # compute midpoints and sizes
                midpoints.append((1 / 2) * (sv[1:] + sv[:-1]))
                sizes.append(sv[1:] - sv[:-1])

        self.all_midpoints.append(midpoints)
        self.all_sizes.append(sizes)

    # capital V in the paper
    self.trees_total_variances = []
    # dict of lists where the keys are tuples of the dimensions
    # and the value list contains \hat{f}_U for the individual trees
    # reset all the variance fractions computed
    self.trees_variance_fractions = {}
    self.V_U_total = {}
    self.V_U_individual = {}

    self.cutoffs = cutoffs
    self.set_cutoffs(cutoffs)
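# A small worked example (illustrative, not from the original sources) of the
# midpoint/size computation in the loop above: a continuous parameter bounded
# on (0, 1) with split values [0.2, 0.6] partitions the axis into three
# intervals whose midpoints and widths are what the marginals integrate over.
import numpy as np

split_vals = [0.2, 0.6]
sv = np.array([0.0] + split_vals + [1.0])   # bounds added, as in the loop above
midpoints = (1 / 2) * (sv[1:] + sv[:-1])    # -> [0.1, 0.4, 0.8]
sizes = sv[1:] - sv[:-1]                    # -> [0.2, 0.4, 0.4]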
def __init__(self, configspace,
             types: np.ndarray,
             bounds: np.ndarray,
             seed: int,
             num_trees: int = 10,
             do_bootstrapping: bool = True,
             n_points_per_tree: int = -1,
             ratio_features: float = 5. / 6.,
             min_samples_split: int = 3,
             min_samples_leaf: int = 3,
             max_depth: int = 20,
             eps_purity: float = 1e-8,
             max_num_nodes: int = 2**20,
             logged_y: bool = True,
             **kwargs):
    """Constructor

    Parameters
    ----------
    configspace: ConfigurationSpace
        configspace to be passed to random forest (used to impute inactive
        parameter-values)
    types : np.ndarray (D)
        Specifies the number of categorical values of an input dimension
        where the i-th entry corresponds to the i-th input dimension. Let's
        say we have 2 dimensions, where the first consists of 3 different
        categorical choices and the second is continuous; then we have to
        pass np.array([2, 0]). Note that we count starting from 0.
    bounds : np.ndarray (D, 2)
        Specifies the bounds for continuous features.
    seed : int
        The seed that is passed to the random_forest_run library.
    num_trees : int
        The number of trees in the random forest.
    do_bootstrapping : bool
        Turns on / off bootstrapping in the random forest.
    n_points_per_tree : int
        Number of points per tree. If <= 0, X.shape[0] will be used
        in _train(X, y) instead.
    ratio_features : float
        The ratio of features that are considered for splitting.
    min_samples_split : int
        The minimum number of data points to perform a split.
    min_samples_leaf : int
        The minimum number of data points in a leaf.
    max_depth : int
        The maximum depth of a single tree.
    eps_purity : float
        The minimum difference between two target values to be considered
        different.
    max_num_nodes : int
        The maximum total number of nodes in a tree.
    logged_y: bool
        Indicates if the y data is transformed (i.e. put on logscale) or not.
    """
    super().__init__(configspace=configspace, types=types, bounds=bounds,
                     seed=seed, **kwargs)
    self.configspace = configspace
    self.types = types
    self.bounds = bounds
    self.rng = regression.default_random_engine(seed)

    self.rf_opts = regression.forest_opts()
    self.rf_opts.num_trees = num_trees
    self.rf_opts.do_bootstrapping = do_bootstrapping
    max_features = 0 if ratio_features > 1.0 else \
        max(1, int(types.shape[0] * ratio_features))
    self.rf_opts.tree_opts.max_features = max_features
    self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
    self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
    self.rf_opts.tree_opts.max_depth = max_depth
    self.rf_opts.tree_opts.epsilon_purity = eps_purity
    self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
    self.rf_opts.compute_law_of_total_variance = False  # Always off. No need for this in our base EPM

    self.n_points_per_tree = n_points_per_tree
    self.rf = None  # type: regression.binary_rss_forest
    self.logged_y = logged_y

    # This list will be read out by save_iteration() in the solver
    self.hypers = [
        num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
        ratio_features, min_samples_split, min_samples_leaf, max_depth,
        eps_purity, seed
    ]
    self.seed = seed
    self.impute_values = {}

    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
def __init__(
    self,
    types: np.ndarray,
    bounds: typing.List[typing.Tuple[float, float]],
    log_y: bool = False,
    bootstrap: bool = False,
    n_iters: int = 50,
    n_splits: int = 10,
    seed: int = 42,
):
    """Parameters
    ----------
    types : np.ndarray (D)
        Specifies the number of categorical values of an input dimension
        where the i-th entry corresponds to the i-th input dimension. Let's
        say we have 2 dimensions, where the first consists of 3 different
        categorical choices and the second is continuous; then we have to
        pass np.array([2, 0]). Note that we count starting from 0.
    bounds : np.ndarray (D, 2)
        Specifies the bounds for continuous features.
    log_y: bool
        y values (passed to this RF) are expected to be log(y) transformed;
        this will be considered during predicting
    bootstrap : bool
        Turns on / off bootstrapping in the random forest.
    n_iters : int
        Number of iterations for random search.
    n_splits : int
        Number of cross-validation splits.
    seed : int
        The seed that is passed to the random_forest_run library.
    """
    super().__init__(
        types,
        bounds,
        log_y,
        num_trees=N_TREES,
        do_bootstrapping=bootstrap,
        n_points_per_tree=N_POINTS_PER_TREE,
        ratio_features=5 / 6,
        min_samples_split=3,
        min_samples_leaf=3,
        max_depth=MAX_DEPTH,
        eps_purity=EPSILON_IMPURITY,
        max_num_nodes=MAX_NUM_NODES,
        seed=seed,
    )

    self.types = types
    self.bounds = bounds
    self.log_y = log_y
    self.n_iters = n_iters
    self.n_splits = n_splits
    self.rng = regression.default_random_engine(seed)
    self.rs = np.random.RandomState(seed)
    self.bootstrap = bootstrap

    self.rf_opts = regression.forest_opts()
    self.rf_opts.num_trees = N_TREES
    self.rf_opts.compute_oob_error = True
    self.rf_opts.do_bootstrapping = self.bootstrap
    self.rf_opts.tree_opts.max_features = int(types.shape[0])
    self.rf_opts.tree_opts.min_samples_to_split = 2
    self.rf_opts.tree_opts.min_samples_in_leaf = 1
    self.rf_opts.tree_opts.max_depth = MAX_DEPTH
    self.rf_opts.tree_opts.epsilon_purity = EPSILON_IMPURITY
    self.rf_opts.tree_opts.max_num_nodes = MAX_NUM_NODES
    self.rf_opts.compute_law_of_total_variance = False

    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    self._set_hypers(self._get_configuration_space().get_default_configuration())

    self.seed = seed
    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
def __setstate__(self, sdict):
    self.__dict__.update(sdict)
    self.reg_rng = reg.default_random_engine(sdict['rng'].randint(1000))
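# A plausible companion (hypothetical, not in the original source): since the
# C++ `reg_rng` engine cannot be pickled, a matching __getstate__ would drop
# it and let __setstate__ above rebuild it from the tracked numpy RandomState
# stored under 'rng'.
def __getstate__(self):
    sdict = self.__dict__.copy()
    sdict.pop('reg_rng', None)  # not picklable; recreated in __setstate__
    return sdict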
def __init__(self, types: np.ndarray,
             bounds: typing.List[typing.Tuple[float, float]],
             log_y: bool = False,
             num_trees: int = N_TREES,
             do_bootstrapping: bool = True,
             n_points_per_tree: int = -1,
             ratio_features: float = 5. / 6.,
             min_samples_split: int = 3,
             min_samples_leaf: int = 3,
             max_depth: int = 2**20,
             eps_purity: float = 1e-8,
             max_num_nodes: int = 2**20,
             seed: int = 42,
             **kwargs):
    """
    Parameters
    ----------
    types : np.ndarray (D)
        Specifies the number of categorical values of an input dimension
        where the i-th entry corresponds to the i-th input dimension. Let's
        say we have 2 dimensions, where the first consists of 3 different
        categorical choices and the second is continuous; then we have to
        pass np.array([2, 0]). Note that we count starting from 0.
    bounds : list
        Specifies the bounds for continuous features.
    log_y: bool
        y values (passed to this RF) are expected to be log(y) transformed;
        this will be considered during predicting
    num_trees : int
        The number of trees in the random forest.
    do_bootstrapping : bool
        Turns on / off bootstrapping in the random forest.
    n_points_per_tree : int
        Number of points per tree. If <= 0, X.shape[0] will be used
        in _train(X, y) instead.
    ratio_features : float
        The ratio of features that are considered for splitting.
    min_samples_split : int
        The minimum number of data points to perform a split.
    min_samples_leaf : int
        The minimum number of data points in a leaf.
    max_depth : int
        The maximum depth of a single tree.
    eps_purity : float
        The minimum difference between two target values to be considered
        different.
    max_num_nodes : int
        The maximum total number of nodes in a tree.
    seed : int
        The seed that is passed to the random_forest_run library.
    """
    super().__init__(types, bounds, **kwargs)

    self.log_y = log_y
    self.rng = regression.default_random_engine(seed)

    self.rf_opts = regression.forest_opts()
    self.rf_opts.num_trees = num_trees
    self.rf_opts.do_bootstrapping = do_bootstrapping
    max_features = 0 if ratio_features > 1.0 else \
        max(1, int(types.shape[0] * ratio_features))
    self.rf_opts.tree_opts.max_features = max_features
    self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
    self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
    self.rf_opts.tree_opts.max_depth = max_depth
    self.rf_opts.tree_opts.epsilon_purity = eps_purity
    self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
    self.rf_opts.compute_law_of_total_variance = False

    self.n_points_per_tree = n_points_per_tree
    self.rf = None  # type: regression.binary_rss_forest

    # This list will be read out by save_iteration() in the solver
    self.hypers = [
        num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
        ratio_features, min_samples_split, min_samples_leaf, max_depth,
        eps_purity, seed
    ]
    self.seed = seed

    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
import numpy as np
import matplotlib.pyplot as plt

import pyrfr.regression as reg

num_points = 8

features = np.array([np.linspace(-1, 1, num_points)]).transpose()
x2 = np.array([np.linspace(-1, 1, 100)]).transpose()
responses = np.exp(-np.power(features / 0.3, 2)).flatten() \
    + 0.05 * np.random.randn(features.shape[0])

data = reg.default_data_container(1)
for f, r in zip(features, responses):
    data.add_data_point(f, r)

rng = reg.default_random_engine()

# create an instance of a regression forest using binary splits and the RSS loss
the_forest = reg.binary_rss_forest()
the_forest.options.num_trees = 64
the_forest.options.num_data_points_per_tree = num_points
the_forest.options.tree_opts.min_samples_in_leaf = 1

the_forest.fit(data, rng)

fig, (ax1, ax2, ax3) = plt.subplots(3, sharex=True)

predictions = np.array([the_forest.predict_mean_var(x) for x in x2])
ax1.fill_between(x2[:, 0],
                 predictions[:, 0] - predictions[:, 1],
                 predictions[:, 0] + predictions[:, 1])
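# Quick check (illustrative addition): predict_mean_var returns a
# (mean, variance) pair for a single feature vector, so the band above is
# mean +/- variance; taking np.sqrt of the second component would give a
# one-standard-deviation band instead.
mean, var = the_forest.predict_mean_var(x2[0])
print(mean, var, np.sqrt(var))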
def __init__(self, X, Y, config_space=None,
             n_trees=16, seed=None, bootstrapping=True,
             points_per_tree=None, max_features=None,
             min_samples_split=0, min_samples_leaf=0,
             max_depth=64, cutoffs=(-np.inf, np.inf)):
    """
    Calculate and provide midpoints and sizes from the forest's split values
    in order to get the marginals

    Parameters
    ------------
    X: matrix with the features
    Y: vector with the response values
    config_space : ConfigSpace instantiation
    n_trees: number of trees in the forest to be fit
    seed: seed for the forest's randomness
    bootstrapping: whether or not to bootstrap the data for each tree
    points_per_tree: number of points used for each tree
                     (only subsampling if bootstrapping is false)
    max_features: number of features to be used at each split, default is 70%
    min_samples_split: minimum number of samples required to attempt to split
    min_samples_leaf: minimum number of samples required in a leaf
    max_depth: maximal depth of each tree in the forest
    """
    pcs = [(np.nan, np.nan)] * X.shape[1]

    # if no ConfigSpace is specified, let's build one with all continuous variables
    if config_space is None:
        # if no info is given, use min and max values of each variable as bounds
        config_space = ConfigSpace.ConfigurationSpace()
        for i, (mn, mx) in enumerate(zip(np.min(X, axis=0), np.max(X, axis=0))):
            config_space.add_hyperparameter(
                UniformFloatHyperparameter("x_%03i" % i, mn, mx))

    self.percentiles = np.percentile(Y, range(0, 100))
    self.cs = config_space
    self.cs_params = self.cs.get_hyperparameters()
    self.n_dims = len(self.cs_params)
    self.n_trees = n_trees
    self._dict = False

    # at this point we have a valid ConfigSpace object
    # check if param number is correct etc:
    if X.shape[1] != len(self.cs_params):
        raise RuntimeError(
            'Number of parameters in ConfigSpace object does not match input X')
    for i in range(len(self.cs_params)):
        if not isinstance(self.cs_params[i], CategoricalHyperparameter):
            if (np.max(X[:, i]) > self.cs_params[i].upper) or \
                    (np.min(X[:, i]) < self.cs_params[i].lower):
                raise RuntimeError(
                    'Some sample values from X are not in the given interval')
        else:
            unique_vals = set(X[:, i])
            if len(unique_vals) > len(self.cs_params[i].choices):
                raise RuntimeError(
                    'There are some categoricals missing in the ConfigSpace specification')

    # initialize all types as 0
    types = np.zeros(len(self.cs_params), dtype=np.uint)
    # retrieve the types and the bounds from the ConfigSpace
    # TODO: Test if that actually works
    for i, hp in enumerate(self.cs_params):
        if isinstance(hp, CategoricalHyperparameter):
            types[i] = len(hp.choices)
            pcs[i] = (len(hp.choices), np.nan)
        else:
            pcs[i] = (hp.lower, hp.upper)

    # set forest options
    forest = reg.fanova_forest()
    forest.options.num_trees = n_trees
    forest.options.do_bootstrapping = bootstrapping
    forest.options.num_data_points_per_tree = X.shape[0] if points_per_tree is None else points_per_tree
    forest.options.tree_opts.max_features = (X.shape[1] * 7) // 10 if max_features is None else max_features

    forest.options.tree_opts.min_samples_to_split = min_samples_split
    forest.options.tree_opts.min_samples_in_leaf = min_samples_leaf
    forest.options.tree_opts.max_depth = max_depth
    forest.options.tree_opts.epsilon_purity = 1e-8

    # create data container and provide all the necessary information
    if seed is None:
        rng = reg.default_random_engine(np.random.randint(2**31 - 1))
    else:
        rng = reg.default_random_engine(seed)
    data = reg.default_data_container(X.shape[1])

    for i, (mn, mx) in enumerate(pcs):
        if np.isnan(mx):
            data.set_type_of_feature(i, mn)
        else:
            data.set_bounds_of_feature(i, mn, mx)

    for i in range(len(Y)):
        try:
            data.add_data_point(X[i].tolist(), Y[i])
        except Exception:
            print("failed to process datapoint:", X[i].tolist())
            raise

    forest.fit(data, rng)

    self.the_forest = forest

    # initialize a dictionary with parameter dims
    self.variance_dict = dict()

    # getting split values
    forest_split_values = self.the_forest.all_split_values()

    # all midpoints and interval sizes treewise for the whole forest
    self.all_midpoints = []
    self.all_sizes = []

    # compute midpoints and interval sizes for variables in each tree
    for tree_split_values in forest_split_values:
        sizes = []
        midpoints = []
        for i, split_vals in enumerate(tree_split_values):
            if np.isnan(pcs[i][1]):  # categorical parameter
                # check if the tree actually splits on this parameter
                if len(split_vals) > 0:
                    midpoints.append(split_vals)
                    sizes.append(np.ones(len(split_vals)))
                # if not, simply append 0 as the value with the number
                # of categories as the size; that way this parameter will
                # get 0 importance from this tree.
                else:
                    midpoints.append((0,))
                    sizes.append((pcs[i][0],))
            else:
                # add bounds to split values
                sv = np.array([pcs[i][0]] + list(split_vals) + [pcs[i][1]])
                # compute midpoints and sizes
                midpoints.append((1 / 2) * (sv[1:] + sv[:-1]))
                sizes.append(sv[1:] - sv[:-1])

        self.all_midpoints.append(midpoints)
        self.all_sizes.append(sizes)

    # capital V in the paper
    self.trees_total_variances = []
    # dict of lists where the keys are tuples of the dimensions
    # and the value list contains \hat{f}_U for the individual trees
    # reset all the variance fractions computed
    self.trees_variance_fractions = {}
    self.V_U_total = {}
    self.V_U_individual = {}

    self.cutoffs = cutoffs
    self.set_cutoffs(cutoffs)
def __init__(self, X, Y, config_space=None,
             n_trees=16, seed=None, bootstrapping=True,
             points_per_tree=None, max_features=None,
             min_samples_split=0, min_samples_leaf=0,
             max_depth=64, cutoffs=(-np.inf, np.inf)):
    """
    Calculate and provide midpoints and sizes from the forest's split values
    in order to get the marginals

    Parameters
    ------------
    X: matrix with the features, either a np.array or a pd.DataFrame
       (numerically encoded)
    Y: vector with the response values (numerically encoded)
    config_space : ConfigSpace instantiation
    n_trees: number of trees in the forest to be fit
    seed: seed for the forest's randomness
    bootstrapping: whether or not to bootstrap the data for each tree
    points_per_tree: number of points used for each tree
                     (only subsampling if bootstrapping is false)
    max_features: number of features to be used at each split, default is 70%
    min_samples_split: minimum number of samples required to attempt to split
    min_samples_leaf: minimum number of samples required in a leaf
    max_depth: maximal depth of each tree in the forest
    cutoffs: tuple of (lower, upper); all values outside this range will be
             mapped to either the lower or the upper bound. (See:
             "Generalized Functional ANOVA Diagnostics for High Dimensional
             Functions of Dependent Variables" by Hooker.)
    """
    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(self.__module__ + '.' + self.__class__.__name__)

    pcs = [(np.nan, np.nan)] * X.shape[1]

    # Convert pd.DataFrame to np.array
    if isinstance(X, pd.DataFrame):
        self.logger.debug("Detected pandas dataframes, converting to floats...")
        if config_space is not None:
            # Check if column names match parameter names
            bad_input = set(X.columns) - set(config_space.get_hyperparameter_names())
            if len(bad_input) != 0:
                raise ValueError("Could not identify parameters %s from pandas dataframes" % str(bad_input))
            # Reorder dataframe if necessary
            X = X[config_space.get_hyperparameter_names()]
        X = X.to_numpy()
    elif config_space is not None:
        # There is a config_space but no way to check if the np.array'ed data in X is in the correct order...
        self.logger.warning("Note that fANOVA expects data to be ordered like the return of ConfigSpace's "
                            "'get_hyperparameters'-method. We recommend using labeled pandas dataframes "
                            "to avoid any problems.")

    # if no ConfigSpace is specified, let's build one with all continuous variables
    if config_space is None:
        # if no info is given, use min and max values of each variable as bounds
        config_space = ConfigSpace.ConfigurationSpace()
        for i, (mn, mx) in enumerate(zip(np.min(X, axis=0), np.max(X, axis=0))):
            config_space.add_hyperparameter(UniformFloatHyperparameter("x_%03i" % i, mn, mx))

    self.percentiles = np.percentile(Y, range(0, 100))
    self.cs = config_space
    self.cs_params = self.cs.get_hyperparameters()
    self.n_dims = len(self.cs_params)
    self.n_trees = n_trees
    self._dict = False

    # at this point we have a valid ConfigSpace object
    # check if param number is correct etc:
    if X.shape[1] != len(self.cs_params):
        raise RuntimeError('Number of parameters in ConfigSpace object does not match input X')
    for i in range(len(self.cs_params)):
        if isinstance(self.cs_params[i], NumericalHyperparameter):
            if (np.max(X[:, i]) > self.cs_params[i].upper) or \
                    (np.min(X[:, i]) < self.cs_params[i].lower):
                raise RuntimeError('Some sample values from X are not in the given interval')
        elif isinstance(self.cs_params[i], CategoricalHyperparameter):
            unique_vals = set(X[:, i])
            if len(unique_vals) > len(self.cs_params[i].choices):
                raise RuntimeError("There are some categoricals missing in the ConfigSpace specification for "
                                   "hyperparameter %s:" % self.cs_params[i].name)
        elif isinstance(self.cs_params[i], OrdinalHyperparameter):
            unique_vals = set(X[:, i])
            if len(unique_vals) > len(self.cs_params[i].sequence):
                raise RuntimeError("There are some sequence-options missing in the ConfigSpace specification for "
                                   "hyperparameter %s:" % self.cs_params[i].name)
        elif isinstance(self.cs_params[i], Constant):
            # oddly, UnParametrizedHyperparameter and Constant are not supported
            # raise TypeError('Unsupported Hyperparameter: %s' % type(self.cs_params[i]))
            pass
            # unique_vals = set(X[:, i])
            # if len(unique_vals) > 1:
            #     raise RuntimeError('Got multiple values for Unparameterized (Constant) hyperparameter')
        else:
            raise TypeError('Unsupported Hyperparameter: %s' % type(self.cs_params[i]))

    if not np.issubdtype(X.dtype, np.float64):
        logging.warning('low level library expects X argument to be float')
    if not np.issubdtype(Y.dtype, np.float64):
        logging.warning('low level library expects Y argument to be float')

    # initialize all types as 0
    types = np.zeros(len(self.cs_params), dtype=np.uint)
    # retrieve the types and the bounds from the ConfigSpace
    # TODO: Test if that actually works
    for i, hp in enumerate(self.cs_params):
        if isinstance(hp, CategoricalHyperparameter):
            types[i] = len(hp.choices)
            pcs[i] = (len(hp.choices), np.nan)
        elif isinstance(hp, OrdinalHyperparameter):
            types[i] = len(hp.sequence)
            pcs[i] = (len(hp.sequence), np.nan)
        elif isinstance(self.cs_params[i], NumericalHyperparameter):
            pcs[i] = (hp.lower, hp.upper)
        elif isinstance(self.cs_params[i], Constant):
            types[i] = 1
            pcs[i] = (1, np.nan)
        else:
            raise TypeError('Unsupported Hyperparameter: %s' % type(hp))

    # set forest options
    forest = reg.fanova_forest()
    forest.options.num_trees = n_trees
    forest.options.do_bootstrapping = bootstrapping
    forest.options.num_data_points_per_tree = X.shape[0] if points_per_tree is None else points_per_tree
    forest.options.tree_opts.max_features = (X.shape[1] * 7) // 10 if max_features is None else max_features

    forest.options.tree_opts.min_samples_to_split = min_samples_split
    forest.options.tree_opts.min_samples_in_leaf = min_samples_leaf
    forest.options.tree_opts.max_depth = max_depth
    forest.options.tree_opts.epsilon_purity = 1e-8

    # create data container and provide all the necessary information
    if seed is None:
        rng = reg.default_random_engine(np.random.randint(2**31 - 1))
    else:
        rng = reg.default_random_engine(seed)
    data = reg.default_data_container(X.shape[1])

    for i, (mn, mx) in enumerate(pcs):
        if np.isnan(mx):
            data.set_type_of_feature(i, mn)
        else:
            data.set_bounds_of_feature(i, mn, mx)

    for i in range(len(Y)):
        self.logger.debug("process datapoint: %s", str(X[i].tolist()))
        data.add_data_point(X[i].tolist(), Y[i])

    forest.fit(data, rng)

    self.the_forest = forest

    # initialize a dictionary with parameter dims
    self.variance_dict = dict()

    # getting split values
    forest_split_values = self.the_forest.all_split_values()

    # all midpoints and interval sizes treewise for the whole forest
    self.all_midpoints = []
    self.all_sizes = []

    # compute midpoints and interval sizes for variables in each tree
    for tree_split_values in forest_split_values:
        sizes = []
        midpoints = []
        for i, split_vals in enumerate(tree_split_values):
            if np.isnan(pcs[i][1]):  # categorical parameter
                # check if the tree actually splits on this parameter
                if len(split_vals) > 0:
                    midpoints.append(split_vals)
                    sizes.append(np.ones(len(split_vals)))
                # if not, simply append 0 as the value with the number of
                # categories as the size; that way this parameter will get
                # 0 importance from this tree.
                else:
                    midpoints.append((0,))
                    sizes.append((pcs[i][0],))
            else:
                # add bounds to split values
                sv = np.array([pcs[i][0]] + list(split_vals) + [pcs[i][1]])
                # compute midpoints and sizes
                midpoints.append((1 / 2) * (sv[1:] + sv[:-1]))
                sizes.append(sv[1:] - sv[:-1])

        self.all_midpoints.append(midpoints)
        self.all_sizes.append(sizes)

    # capital V in the paper
    self.trees_total_variances = []
    # dict of lists where the keys are tuples of the dimensions
    # and the value list contains \hat{f}_U for the individual trees
    # reset all the variance fractions computed
    self.trees_variance_fractions = {}
    self.V_U_total = {}
    self.V_U_individual = {}

    self.cutoffs = cutoffs
    self.set_cutoffs(cutoffs)
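# A brief usage sketch (assuming these constructors belong to the fANOVA
# class from the fanova package, as the structure suggests): the midpoints
# and sizes precomputed above are what later importance queries marginalize
# over. Data shown here is random and purely illustrative.
import numpy as np
from fanova import fANOVA

X = np.random.rand(100, 3)
Y = np.random.rand(100)
f = fANOVA(X, Y, n_trees=16, seed=7)
print(f.quantify_importance((0,)))  # variance fraction attributed to dim 0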