	def setUp(self):
		data_set_prefix = '${CMAKE_SOURCE_DIR}/test_data_sets/'
		self.data = reg.default_data_container(64)
		self.data.import_csv_files(data_set_prefix+'features13.csv', data_set_prefix+'responses13.csv')
		
		self.rng = reg.default_random_engine(1)
		self.forest_constructor = reg.qr_forest
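
A test body built on this fixture might look like the following sketch; it assumes `reg` is `pyrfr.regression` and that the quantile-regression forest exposes `predict_quantiles` and the data container `retrieve_data_point`, as in the pyrfr API.

	def test_quantile_prediction_sketch(self):
		the_forest = self.forest_constructor()
		the_forest.options.num_trees = 16
		the_forest.fit(self.data, self.rng)
		# query the quartiles and the median for the first training point
		x = self.data.retrieve_data_point(0)
		quantiles = the_forest.predict_quantiles(x, [0.25, 0.5, 0.75])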
Example #2
    def __init__(self, num_trees=30,
                 do_bootstrapping=True,
                 n_points_per_tree=0,
                 rng=None):
        """
        Interface for the random_forest_run library to model the
        objective function with a random forest.

        Parameters
        ----------
        num_trees: int
            The number of trees in the random forest.
        do_bootstrapping: bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree: int
            Number of data points per tree. If set to 0, all data points are used in each tree.
        rng: np.random.RandomState
            Random number generator
        """

        if rng is None:
            self.rng = np.random.RandomState()
        else:
            self.rng = rng

        self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

        self.n_points_per_tree = n_points_per_tree

        self.rf = reg.binary_rss_forest()
        self.rf.options.num_trees = num_trees

        self.rf.options.do_bootstrapping = do_bootstrapping
        self.rf.options.num_data_points_per_tree = n_points_per_tree
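
This constructor only stores options; fitting happens later. A sketch of a matching train() method, following the container-and-fit pattern of Example #7 below (with `np` and `reg` imported as in these examples):

    def train(self, X, y):
        # build a pyrfr data container from the (n, d) features and n targets
        data = reg.default_data_container(X.shape[1])
        for row_X, row_y in zip(X, y):
            data.add_data_point(row_X, row_y)
        # n_points_per_tree == 0 means "use all data points in each tree"
        if self.n_points_per_tree == 0:
            self.rf.options.num_data_points_per_tree = X.shape[0]
        self.rf.fit(data, self.reg_rng)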
Example #3
    def __init__(self,
                 num_trees=30,
                 do_bootstrapping=True,
                 n_points_per_tree=0,
                 rng=None):
        """
        Interface for the random_forest_run library to model the
        objective function with a random forest.

        Parameters
        ----------
        num_trees: int
            The number of trees in the random forest.
        do_bootstrapping: bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree: int
            Number of data points per tree. If set to 0, all data points are used in each tree.
        rng: np.random.RandomState
            Random number generator
        """

        if rng is None:
            self.rng = np.random.RandomState()
        else:
            self.rng = rng

        self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

        self.n_points_per_tree = n_points_per_tree

        self.rf = reg.binary_rss_forest()
        self.rf.options.num_trees = num_trees

        self.rf.options.do_bootstrapping = do_bootstrapping
        self.rf.options.num_data_points_per_tree = n_points_per_tree
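
A prediction method to pair with this constructor could use predict_mean_var per row, as Example #10 does; this is a sketch, not the class's actual predict():

    def predict(self, X_test):
        # collect per-row posterior mean and variance from the forest
        means, variances = [], []
        for row in X_test:
            m, v = self.rf.predict_mean_var(row)
            means.append(m)
            variances.append(v)
        return np.array(means), np.array(variances)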
Example #4
    def predict(self, X_test, **kwargs):
        """
        Seeds the RNG of Random Forest before calling parent's predict().
        """
        # NOTE: We cannot save `reg_rng` state, so instead we control it
        #       with random integers sampled from `rng` and keep track of `rng` state.
        self.reg_rng = reg.default_random_engine(int(self.rng.randint(10e8)))
        return super(OrionRandomForestWrapper, self).predict(X_test, **kwargs)
Example #5
    def setUp(self):
        data_set_prefix = '${CMAKE_SOURCE_DIR}/test_data_sets/'
        self.data = reg.default_data_container(3)
        self.data.import_csv_files(
            data_set_prefix + 'online_lda_features.csv',
            data_set_prefix + 'online_lda_responses.csv')

        self.rng = reg.default_random_engine(1)
        self.forest_constructor = reg.fanova_forest
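
A sketch of a test body for this fixture: fit the fANOVA forest on the imported data and read back the per-tree split values, as the fANOVA examples below do.

    def test_fit_sketch(self):
        the_forest = self.forest_constructor()
        the_forest.options.num_trees = 16
        the_forest.fit(self.data, self.rng)
        # per-tree, per-feature split values, the raw material for fANOVA
        split_values = the_forest.all_split_values()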
Example #6
    def __init__(
        self,
        configspace: ConfigurationSpace,
        types: typing.List[int],
        bounds: typing.List[typing.Tuple[float, float]],
        seed: int,
        log_y: bool = False,
        num_trees: int = N_TREES,
        do_bootstrapping: bool = True,
        n_points_per_tree: int = -1,
        ratio_features: float = 5. / 6.,
        min_samples_split: int = 3,
        min_samples_leaf: int = 3,
        max_depth: int = 2**20,
        eps_purity: float = 1e-8,
        max_num_nodes: int = 2**20,
        instance_features: typing.Optional[np.ndarray] = None,
        pca_components: typing.Optional[int] = None,
    ) -> None:
        super().__init__(
            configspace=configspace,
            types=types,
            bounds=bounds,
            seed=seed,
            instance_features=instance_features,
            pca_components=pca_components,
        )

        self.log_y = log_y
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features > 1.0 else \
            max(1, int(len(types) * ratio_features))
        self.rf_opts.tree_opts.max_features = max_features
        self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.tree_opts.max_depth = max_depth
        self.rf_opts.tree_opts.epsilon_purity = eps_purity
        self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
        self.rf_opts.compute_law_of_total_variance = False

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self.hypers = [
            num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
            ratio_features, min_samples_split, min_samples_leaf, max_depth,
            eps_purity, self.seed
        ]
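
The max_features expression above is easy to misread: a ratio above 1.0 yields 0, which disables the cap (all features are considered), and otherwise at least one feature is always kept. A standalone illustration of the arithmetic:

types = [3, 0, 0, 0, 0, 0, 0, 0, 0, 0]   # 10 input dimensions
ratio_features = 5. / 6.
max_features = 0 if ratio_features > 1.0 else max(1, int(len(types) * ratio_features))
assert max_features == 8                  # int(10 * 5 / 6) == 8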
Example #7
    def __init__(self,
                 X_init: np.ndarray,
                 Y_init: np.ndarray,
                 num_trees: int = 30,
                 do_bootstrapping: bool = True,
                 n_points_per_tree: int = 0,
                 seed: int = None) -> None:
        """
        Interface to random forests for Bayesian optimization, based on the pyrfr package,
        which, thanks to its randomized splitting, gives better uncertainty estimates than
        the sklearn random forest.

        Dependencies:
            AutoML rfr (https://github.com/automl/random_forest_run)

        :param X_init: Initial input data points to train the model
        :param Y_init: Initial target values
        :param num_trees: Specifies the number of trees to build the random forest
        :param do_bootstrapping: Defines whether we use bootstrapping for the individual trees or not
        :param n_points_per_tree: Specifies the number of points for each individual tree (0 means no restriction)
        :param seed: Used to seed the random number generator for the random forest (None means random seed)
        """
        super().__init__()

        # Set random number generator for the random forest
        if seed is None:
            seed = np.random.randint(10000)
        self.reg_rng = reg.default_random_engine(seed)

        self.n_points_per_tree = n_points_per_tree

        self.rf = reg.binary_rss_forest()
        self.rf.options.num_trees = num_trees

        self.rf.options.do_bootstrapping = do_bootstrapping

        self.rf.options.num_data_points_per_tree = n_points_per_tree

        self._X = X_init
        self._Y = Y_init

        if self.n_points_per_tree == 0:
            self.rf.options.num_data_points_per_tree = X_init.shape[0]

        data = reg.default_data_container(self._X.shape[1])

        for row_X, row_y in zip(X_init, Y_init):
            data.add_data_point(row_X, row_y)

        self.rf.fit(data, self.reg_rng)
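
Hypothetical usage of this wrapper on toy data; the class name RandomForest is assumed, since it is not visible in the snippet:

import numpy as np

X_init = np.random.rand(20, 2)
Y_init = np.sin(3 * X_init[:, 0]) + 0.1 * np.random.randn(20)
model = RandomForest(X_init, Y_init, num_trees=10, seed=0)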
Example #8
	def setUp(self):
		data_set_prefix = '${CMAKE_SOURCE_DIR}/test_data_sets/'
		self.data = reg.default_data_container(64)
		self.data.import_csv_files(data_set_prefix+'features13.csv', data_set_prefix+'responses13.csv')
		

		self.forest = reg.binary_rss_forest()
		self.forest.options.num_trees = 64
		self.forest.options.do_bootstrapping = True
		self.forest.options.num_data_points_per_tree = 200

		self.assertEqual(self.forest.options.num_trees, 64)
		self.assertTrue (self.forest.options.do_bootstrapping)
		self.assertEqual(self.forest.options.num_data_points_per_tree, 200)

		self.rng = reg.default_random_engine(1)
Example #9
    def __init__(self,
                 types,
                 bounds,
                 num_trees=10,
                 do_bootstrapping=True,
                 n_points_per_tree=-1,
                 ratio_features=5. / 6.,
                 min_samples_split=3,
                 min_samples_leaf=3,
                 max_depth=20,
                 eps_purity=1e-8,
                 max_num_nodes=1000,
                 seed=42,
                 **kwargs):

        super().__init__(**kwargs)

        self.types = types
        self.bounds = bounds
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.seed = seed
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features >= 1.0 else \
            max(1, int(types.shape[0] * ratio_features))
        self.rf_opts.max_features = max_features
        self.rf_opts.min_samples_to_split = min_samples_split
        self.rf_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.max_depth = max_depth
        self.rf_opts.epsilon_purity = eps_purity
        self.rf_opts.max_num_nodes = max_num_nodes

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self.hypers = [
            num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
            ratio_features, min_samples_split, min_samples_leaf, max_depth,
            eps_purity, seed
        ]
        self.seed = seed

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
Example #10
    def _eval_rf(
        self,
        c: Configuration,
        X: np.ndarray,
        y: np.ndarray,
        X_test: np.ndarray,
        y_test: np.ndarray,
    ) -> float:
        """Evaluate random forest configuration on train/test data.

        Parameters
        ----------
        c : Configuration
            Random forest configuration to evaluate on the train/test data
        X : np.ndarray [n_samples, n_features (config + instance features)]
            Training features
        y : np.ndarray [n_samples, ]
            Training targets
        X_test : np.ndarray [n_samples, n_features (config + instance features)]
            Validation features
        y_test : np.ndarray [n_samples, ]
            Validation targets

        Returns
        -------
        float
        """
        opts = self._set_conf(c,
                              n_features=X.shape[1],
                              num_data_points=X.shape[0])
        rng = regression.default_random_engine(1)
        rf = regression.binary_rss_forest()
        rf.options = opts
        data = self._init_data_container(X, y)
        rf.fit(data, rng=rng)

        loss = 0
        for row, lab in zip(X_test, y_test):
            m, v = rf.predict_mean_var(row)
            std = max(1e-8, np.sqrt(v))
            nllh = -scst.norm(loc=m, scale=std).logpdf(lab)
            loss += nllh

        return loss
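
The _init_data_container helper called above is not shown; a sketch following the container pattern used throughout these examples (feature types and bounds omitted for brevity):

    def _init_data_container(self, X, y):
        # wrap the training arrays in a pyrfr data container
        data = regression.default_data_container(X.shape[1])
        for row_X, row_y in zip(X, y):
            data.add_data_point(row_X, row_y)
        return data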
Example #11
    def __init__(self,
                 num_trees=30,
                 do_bootstrapping=True,
                 n_points_per_tree=0,
                 compute_oob_error=False,
                 return_total_variance=True,
                 rng=None):
        """
        Interface for the random_forest_run library to model the
        objective function with a random forest.

        Parameters
        ----------
        num_trees: int
            The number of trees in the random forest.
        do_bootstrapping: bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree: int
            Number of data points per tree. If set to 0, all data points are used in each tree.
        compute_oob_error: bool
            Turns on / off calculation of out-of-bag error. Default: False
        return_total_variance: bool
            Return law of total variance (mean of variances + variance of means, if True)
            or explained variance (variance of means, if False). Default: True
        rng: np.random.RandomState
            Random number generator
        """

        if rng is None:
            self.rng = np.random.RandomState()
        else:
            self.rng = rng

        self.reg_rng = reg.default_random_engine(self.rng.randint(1000))

        self.n_points_per_tree = n_points_per_tree

        self.rf = reg.binary_rss_forest()
        self.rf.options.num_trees = num_trees
        self.rf.options.do_bootstrapping = do_bootstrapping
        self.rf.options.num_data_points_per_tree = n_points_per_tree
        self.rf.options.compute_oob_error = compute_oob_error
        self.rf.options.compute_law_of_total_variance = return_total_variance
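
With compute_oob_error enabled, the forest tracks its out-of-bag error during fitting. A sketch of reading it back after a fit, assuming pyrfr exposes it as out_of_bag_error():

    def train_and_report_oob(self, data):
        # fit, then read the out-of-bag error estimate off the forest
        self.rf.fit(data, self.reg_rng)
        return self.rf.out_of_bag_error()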
Example #12
    def setUp(self):
        self.X = [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.], [0., 0., 1.],
                  [0., 0., 1.], [0., 0., 1.], [0., 1., 0.], [0., 1., 0.],
                  [0., 1., 0.], [0., 1., 1.], [0., 1., 1.], [0., 1., 1.],
                  [1., 0., 0.], [1., 0., 0.], [1., 0., 0.], [1., 0., 1.],
                  [1., 0., 1.], [1., 0., 1.], [1., 1., 0.], [1., 1., 0.],
                  [1., 1., 0.], [1., 1., 1.], [1., 1., 1.], [1., 1., 1.]]
        self.y = [[50], [50], [50], [.2], [.2], [.2], [9], [9], [9], [9.2],
                  [9.2], [9.2], [500], [500], [500], [10.2], [10.2], [10.2],
                  [109.], [109.], [109.], [100], [100], [100]]
        self.y_dual = list(map(lambda x: [math.log10(x[0]), x[0]], self.y))
        bounds = [(0, float('nan')), (0, float('nan')), (0, float('nan'))]

        def init_data(X, y, bounds):
            data = reg.default_data_container(len(X[0]))

            for i, (mn, mx) in enumerate(bounds):
                if math.isnan(mx):
                    data.set_type_of_feature(i, mn)
                else:
                    data.set_bounds_of_feature(i, mn, mx)

            for row_X, row_y in zip(X, y):
                data.add_data_point(row_X, row_y)
            return data

        self.data = init_data(self.X, self.y, bounds)
        self.data_dual = init_data(self.X, self.y_dual, bounds)

        self.forest = reg.binary_rss_forest()
        self.forest.options.num_trees = 64
        self.forest.options.do_bootstrapping = True
        self.forest.options.num_data_points_per_tree = 200
        self.forest.options.compute_law_of_total_variance = True

        self.assertEqual(self.forest.options.num_trees, 64)
        self.assertTrue(self.forest.options.do_bootstrapping)
        self.assertEqual(self.forest.options.num_data_points_per_tree, 200)
        self.assertTrue(self.forest.options.compute_law_of_total_variance)

        self.rng = reg.default_random_engine(1)
Example #13
    def __init__(
        self,
        configspace: ConfigurationSpace,
        types: typing.List[int],
        bounds: typing.List[typing.Tuple[float, float]],
        seed: int,
        log_y: bool = False,
        num_trees: int = N_TREES,
        do_bootstrapping: bool = True,
        n_points_per_tree: int = -1,
        ratio_features: float = 5. / 6.,
        min_samples_split: int = 3,
        min_samples_leaf: int = 3,
        max_depth: int = 2**20,
        eps_purity: float = 1e-8,
        max_num_nodes: int = 2**20,
        instance_features: typing.Optional[np.ndarray] = None,
        pca_components: typing.Optional[int] = None,
    ) -> None:
        """
        Parameters
        ----------
        types : List[int]
            Specifies the number of categorical values of an input dimension where
            the i-th entry corresponds to the i-th input dimension. Let's say we
            have 2 dimensions, where the first dimension consists of 3 different
            categorical choices and the second dimension is continuous, then we
            have to pass [3, 0]. Note that we count starting from 0.
        bounds : List[Tuple[float, float]]
            bounds of input dimensions: (lower, upper) for continuous dims; (n_cat, np.nan) for categorical dims
        seed : int
            The seed that is passed to the random_forest_run library.
        log_y: bool
            y values (passed to this RF) are expected to be log(y) transformed;
            this will be considered during predicting
        num_trees : int
            The number of trees in the random forest.
        do_bootstrapping : bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree : int
            Number of points per tree. If <= 0, X.shape[0] will be used
            in _train(X, y) instead
        ratio_features : float
            The ratio of features that are considered for splitting.
        min_samples_split : int
            The minimum number of data points to perform a split.
        min_samples_leaf : int
            The minimum number of data points in a leaf.
        max_depth : int
            The maximum depth of a single tree.
        eps_purity : float
            The minimum difference between two target values to be considered
            different
        max_num_nodes : int
            The maximum total number of nodes in a tree
        instance_features : np.ndarray (I, K)
            Contains the K dimensional instance features of the I different instances
        pca_components : int
            Number of components to keep when using PCA to reduce dimensionality of instance features.
            Requires n_feats (> pca_dims) to be set.
        """
        super().__init__(
            configspace=configspace,
            types=types,
            bounds=bounds,
            seed=seed,
            instance_features=instance_features,
            pca_components=pca_components,
        )

        self.log_y = log_y
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features > 1.0 else \
            max(1, int(len(types) * ratio_features))
        self.rf_opts.tree_opts.max_features = max_features
        self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.tree_opts.max_depth = max_depth
        self.rf_opts.tree_opts.epsilon_purity = eps_purity
        self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
        self.rf_opts.compute_law_of_total_variance = False

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self.hypers = [num_trees, max_num_nodes, do_bootstrapping,
                       n_points_per_tree, ratio_features, min_samples_split,
                       min_samples_leaf, max_depth, eps_purity, self.seed]
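
The docstring above refers to _train(X, y); a sketch of that method under the stated semantics (n_points_per_tree <= 0 falls back to X.shape[0]), with container setup reduced to the bare pattern:

    def _train(self, X, y):
        # resolve points per tree, rebuild the forest, and fit
        if self.n_points_per_tree <= 0:
            self.rf_opts.num_data_points_per_tree = X.shape[0]
        else:
            self.rf_opts.num_data_points_per_tree = self.n_points_per_tree
        self.rf = regression.binary_rss_forest()
        self.rf.options = self.rf_opts
        data = regression.default_data_container(X.shape[1])
        for row_X, row_y in zip(X, y.flatten()):
            data.add_data_point(row_X, row_y)
        self.rf.fit(data, rng=self.rng)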
Example #14
    def __init__(self, X, Y, config_space=None,
                 n_trees=16, seed=None, bootstrapping=True,
                 points_per_tree=None, max_features=None,
                 min_samples_split=0, min_samples_leaf=0,
                 max_depth=64, cutoffs=(-np.inf, np.inf)):

        """
        Calculate and provide midpoints and sizes from the forest's 
        split values in order to get the marginals
        
        Parameters
        ------------
        X: matrix with the features (numerically encoded)
        
        Y: vector with the response values (numerically encoded)
        
        config_space : ConfigSpace instantiation
        
        n_trees: number of trees in the forest to be fit
        
        seed: seed for the forest's randomness
        
        bootstrapping: whether or not to bootstrap the data for each tree
        
        points_per_tree: number of points used for each tree 
                        (only subsampling if bootstrapping is false)
        
        max_features: number of features to be used at each split, default is 70%
        
        min_samples_split: minimum number of samples required to attempt to split 
        
        min_samples_leaf: minimum number of samples required in a leaf
        
        max_depth: maximal depth of each tree in the forest
        
        cutoffs: tuple of (lower, upper), all values outside this range will be
                 mapped to either the lower or the upper bound. (See:
                 "Generalized Functional ANOVA Diagnostics for High Dimensional
                 Functions of Dependent Variables" by Hooker.)
        """
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(self.__module__ + '.' + self.__class__.__name__)

        pcs = [(np.nan, np.nan)]*X.shape[1]

        # if no ConfigSpace is specified, let's build one with all continuous variables
        if config_space is None:
            # if no info is given, use min and max values of each variable as bounds
            config_space = ConfigSpace.ConfigurationSpace()
            for i, (mn, mx) in enumerate(zip(np.min(X, axis=0), np.max(X, axis=0))):
                config_space.add_hyperparameter(UniformFloatHyperparameter("x_%03i" % i, mn, mx))
                
        self.percentiles = np.percentile(Y, range(0, 100))
        self.cs = config_space
        self.cs_params = self.cs.get_hyperparameters()
        self.n_dims = len(self.cs_params)
        self.n_trees = n_trees
        self._dict = False

        # at this point we have a valid ConfigSpace object
        # check if param number is correct etc:
        if X.shape[1] != len(self.cs_params):
            raise RuntimeError('Number of parameters in ConfigSpace object does not match input X')
        for i in range(len(self.cs_params)):
            if isinstance(self.cs_params[i], NumericalHyperparameter):
                if (np.max(X[:, i]) > self.cs_params[i].upper) or \
                        (np.min(X[:, i]) < self.cs_params[i].lower):
                    raise RuntimeError('Some sample values from X are not in the given interval')
            elif isinstance(self.cs_params[i], CategoricalHyperparameter):
                unique_vals = set(X[:, i])
                if len(unique_vals) > len(self.cs_params[i].choices):
                    raise RuntimeError('There are some categoricals missing in the ConfigSpace specification for hyperparameter %s:' % self.cs_params[i].name)
            elif isinstance(self.cs_params[i], (Constant)):
                # oddly, unparameterizedhyperparameter and constant are not supported. 
                # raise TypeError('Unsupported Hyperparameter: %s' % type(self.cs_params[i]))
                pass
                # unique_vals = set(X[:, i])
                # if len(unique_vals) > 1:
                #     raise RuntimeError('Got multiple values for Unparameterized (Constant) hyperparameter')
            else:
                raise TypeError('Unsupported Hyperparameter: %s' % type(self.cs_params[i]))
        
        if not np.issubdtype(X.dtype, np.float64):
            logging.warning('low level library expects X argument to be float')
        if not np.issubdtype(Y.dtype, np.float64):
            logging.warning('low level library expects Y argument to be float')

        # initialize all types as 0
        types = np.zeros(len(self.cs_params), dtype=np.uint)
        # retrieve the types and the bounds from the ConfigSpace 
        # TODO: Test if that actually works
        for i, hp in enumerate(self.cs_params):
            if isinstance(hp, CategoricalHyperparameter):
                types[i] = len(hp.choices)
                pcs[i] = (len(hp.choices), np.nan)
            elif isinstance(self.cs_params[i], NumericalHyperparameter):
                pcs[i] = (hp.lower, hp.upper)
            elif isinstance(self.cs_params[i], (Constant)):
                # raise TypeError('Unsupported Hyperparameter: %s' % type(hp))
                types[i] = 1
                pcs[i] = (1, np.nan)
            else:
                raise TypeError('Unsupported Hyperparameter: %s' % type(hp))

        # set forest options
        forest = reg.fanova_forest()
        forest.options.num_trees = n_trees
        forest.options.do_bootstrapping = bootstrapping
        forest.options.num_data_points_per_tree = X.shape[0] if points_per_tree is None else points_per_tree
        forest.options.tree_opts.max_features = (X.shape[1] * 7) // 10 if max_features is None else max_features

        forest.options.tree_opts.min_samples_to_split = min_samples_split
        forest.options.tree_opts.min_samples_in_leaf = min_samples_leaf
        forest.options.tree_opts.max_depth = max_depth
        forest.options.tree_opts.epsilon_purity = 1e-8

        # create data container and provide all the necessary information
        if seed is None:
            rng = reg.default_random_engine(np.random.randint(2**31 - 1))
        else:
            rng = reg.default_random_engine(seed)
        data = reg.default_data_container(X.shape[1])

        for i, (mn, mx) in enumerate(pcs):
            if np.isnan(mx):
                data.set_type_of_feature(i, mn)
            else:
                data.set_bounds_of_feature(i, mn, mx)

        for i in range(len(Y)):
            try:
                data.add_data_point(X[i].tolist(), Y[i])
            except:
                self.logger.warning("failed to process datapoint: %s", str(X[i].tolist()))
                raise
        
        forest.fit(data, rng)

        self.the_forest = forest

        # initialize a dictionary with parameter dims
        self.variance_dict = dict()

        # getting split values
        forest_split_values = self.the_forest.all_split_values()

        # all midpoints and interval sizes treewise for the whole forest
        self.all_midpoints = []
        self.all_sizes = []

        # compute midpoints and interval sizes for variables in each tree
        for tree_split_values in forest_split_values:
            sizes = []
            midpoints = []
            for i, split_vals in enumerate(tree_split_values):
                if np.isnan(pcs[i][1]):  # categorical parameter
                    # check if the tree actually splits on this parameter
                    if len(split_vals) > 0:
                        midpoints.append(split_vals)
                        sizes.append(np.ones(len(split_vals)))
                    # if not, simply append 0 as the value with the number
                    # of categories as the size, that way this parameter will
                    # get 0 importance from this tree.
                    else:
                        midpoints.append((0,))
                        sizes.append((pcs[i][0],))
                else:
                    # add bounds to split values
                    sv = np.array([pcs[i][0]] + list(split_vals) + [pcs[i][1]])
                    # compute midpoints and sizes
                    midpoints.append((1 / 2) * (sv[1:] + sv[:-1]))
                    sizes.append(sv[1:] - sv[:-1])

            self.all_midpoints.append(midpoints)
            self.all_sizes.append(sizes)

        # capital V in the paper
        self.trees_total_variances = []
        # dict of lists where the keys are tuples of the dimensions
        # and the value list contains \hat{f}_U for the individual trees
        # reset all the variance fractions computed
        self.trees_variance_fractions = {}
        self.V_U_total = {}
        self.V_U_individual = {}

        self.cutoffs = cutoffs
        self.set_cutoffs(cutoffs)
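
A numeric illustration of the midpoint/size computation above, for a continuous parameter bounded on [0, 1] whose tree splits at 0.3 and 0.7:

import numpy as np

sv = np.array([0.0, 0.3, 0.7, 1.0])        # lower bound + split values + upper bound
midpoints = (1 / 2) * (sv[1:] + sv[:-1])   # array([0.15, 0.5 , 0.85])
sizes = sv[1:] - sv[:-1]                   # array([0.3, 0.4, 0.3])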
Example #15
    def __init__(self,
                 configspace,
                 types: np.ndarray,
                 bounds: np.ndarray,
                 seed: int,
                 num_trees: int = 10,
                 do_bootstrapping: bool = True,
                 n_points_per_tree: int = -1,
                 ratio_features: float = 5. / 6.,
                 min_samples_split: int = 3,
                 min_samples_leaf: int = 3,
                 max_depth: int = 20,
                 eps_purity: int = 1e-8,
                 max_num_nodes: int = 2**20,
                 logged_y: bool = True,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        configspace: ConfigurationSpace
            configspace to be passed to random forest (used to impute inactive parameter-values)
        types : np.ndarray (D)
            Specifies the number of categorical values of an input dimension where
            the i-th entry corresponds to the i-th input dimension. Let's say we
            have 2 dimensions, where the first dimension consists of 3 different
            categorical choices and the second dimension is continuous, then we
            have to pass np.array([3, 0]). Note that we count starting from 0.
        bounds : np.ndarray (D, 2)
            Specifies the bounds for continuous features.
        seed : int
            The seed that is passed to the random_forest_run library.
        num_trees : int
            The number of trees in the random forest.
        do_bootstrapping : bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree : int
            Number of points per tree. If <= 0, X.shape[0] will be used
            in _train(X, y) instead
        ratio_features : float
            The ratio of features that are considered for splitting.
        min_samples_split : int
            The minimum number of data points to perform a split.
        min_samples_leaf : int
            The minimum number of data points in a leaf.
        max_depth : int
            The maximum depth of a single tree.
        eps_purity : float
            The minimum difference between two target values to be considered
            different
        max_num_nodes : int
            The maximum total number of nodes in a tree
        logged_y: bool
            Indicates if the y data is transformed (i.e. put on logscale) or not
        """
        super().__init__(configspace=configspace,
                         types=types,
                         bounds=bounds,
                         seed=seed,
                         **kwargs)

        self.configspace = configspace
        self.types = types
        self.bounds = bounds
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features > 1.0 else \
            max(1, int(types.shape[0] * ratio_features))
        self.rf_opts.tree_opts.max_features = max_features
        self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.tree_opts.max_depth = max_depth
        self.rf_opts.tree_opts.epsilon_purity = eps_purity
        self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
        self.rf_opts.compute_law_of_total_variance = False  # Always off. No need for this in our base EPM

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest
        self.logged_y = logged_y

        # This list will be read out by save_iteration() in the solver
        self.hypers = [
            num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
            ratio_features, min_samples_split, min_samples_leaf, max_depth,
            eps_purity, seed
        ]
        self.seed = seed

        self.impute_values = {}

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
Example #16
    def __init__(
        self,
        types: np.ndarray,
        bounds: typing.List[typing.Tuple[float, float]],
        log_y: bool = False,
        bootstrap: bool = False,
        n_iters: int = 50,
        n_splits: int = 10,
        seed: int = 42,
    ):
        """Parameters
        ----------
        types : np.ndarray (D)
            Specifies the number of categorical values of an input dimension where
            the i-th entry corresponds to the i-th input dimension. Let's say we
            have 2 dimensions, where the first dimension consists of 3 different
            categorical choices and the second dimension is continuous, then we
            have to pass np.array([3, 0]). Note that we count starting from 0.
        bounds : np.ndarray (D, 2)
            Specifies the bounds for continuous features.
        log_y: bool
            y values (passed to this RF) are expected to be log(y) transformed;
            this will be considered during predicting
        bootstrap : bool
            Turns on / off bootstrapping in the random forest.
        n_iters : int
            Number of iterations for random search.
        n_splits : int
            Number of cross-validation splits.
        seed : int
            The seed that is passed to the random_forest_run library.
        """
        super().__init__(
            types,
            bounds,
            log_y,
            num_trees=N_TREES,
            do_bootstrapping=bootstrap,
            n_points_per_tree=N_POINTS_PER_TREE,
            ratio_features=5/6,
            min_samples_split=3,
            min_samples_leaf=3,
            max_depth=MAX_DEPTH,
            eps_purity=EPSILON_IMPURITY,
            max_num_nodes=MAX_NUM_NODES,
            seed=seed,
        )

        self.types = types
        self.bounds = bounds
        self.log_y = log_y
        self.n_iters = n_iters
        self.n_splits = n_splits
        self.rng = regression.default_random_engine(seed)
        self.rs = np.random.RandomState(seed)
        self.bootstrap = bootstrap

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = N_TREES
        self.rf_opts.compute_oob_error = True
        self.rf_opts.do_bootstrapping = self.bootstrap
        self.rf_opts.tree_opts.max_features = int(types.shape[0])
        self.rf_opts.tree_opts.min_samples_to_split = 2
        self.rf_opts.tree_opts.min_samples_in_leaf = 1
        self.rf_opts.tree_opts.max_depth = MAX_DEPTH
        self.rf_opts.tree_opts.epsilon_purity = EPSILON_IMPURITY
        self.rf_opts.tree_opts.max_num_nodes = MAX_NUM_NODES
        self.rf_opts.compute_law_of_total_variance = False

        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self._set_hypers(self._get_configuration_space().get_default_configuration())
        self.seed = seed

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
Example #17
    def __setstate__(self, sdict):
        self.__dict__.update(sdict)
        self.reg_rng = reg.default_random_engine(sdict['rng'].randint(1000))
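
The matching __getstate__ is not shown; since the C++ random engine cannot be pickled, a plausible counterpart drops it from the state dict and lets __setstate__ above rebuild it from `rng`:

    def __getstate__(self):
        # exclude the unpicklable pyrfr engine; it is recreated on unpickling
        sdict = self.__dict__.copy()
        sdict.pop('reg_rng', None)
        return sdict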
Example #18
    def __init__(self,
                 types: np.ndarray,
                 bounds: typing.List[typing.Tuple[float, float]],
                 log_y: bool = False,
                 num_trees: int = N_TREES,
                 do_bootstrapping: bool = True,
                 n_points_per_tree: int = -1,
                 ratio_features: float = 5. / 6.,
                 min_samples_split: int = 3,
                 min_samples_leaf: int = 3,
                 max_depth: int = 2**20,
                 eps_purity: float = 1e-8,
                 max_num_nodes: int = 2**20,
                 seed: int = 42,
                 **kwargs):
        """
        Parameters
        ----------
        types : np.ndarray (D)
            Specifies the number of categorical values of an input dimension where
            the i-th entry corresponds to the i-th input dimension. Let's say we
            have 2 dimensions, where the first dimension consists of 3 different
            categorical choices and the second dimension is continuous, then we
            have to pass np.array([3, 0]). Note that we count starting from 0.
        bounds : list
            Specifies the bounds for continuous features.
        log_y: bool
            y values (passed to this RF) are expected to be log(y) transformed;
            this will be considered during predicting
        num_trees : int
            The number of trees in the random forest.
        do_bootstrapping : bool
            Turns on / off bootstrapping in the random forest.
        n_points_per_tree : int
            Number of points per tree. If <= 0, X.shape[0] will be used
            in _train(X, y) instead
        ratio_features : float
            The ratio of features that are considered for splitting.
        min_samples_split : int
            The minimum number of data points to perform a split.
        min_samples_leaf : int
            The minimum number of data points in a leaf.
        max_depth : int
            The maximum depth of a single tree.
        eps_purity : float
            The minimum difference between two target values to be considered
            different
        max_num_nodes : int
            The maximum total number of nodes in a tree
        seed : int
            The seed that is passed to the random_forest_run library.
        """
        super().__init__(types, bounds, **kwargs)

        self.log_y = log_y
        self.rng = regression.default_random_engine(seed)

        self.rf_opts = regression.forest_opts()
        self.rf_opts.num_trees = num_trees
        self.rf_opts.do_bootstrapping = do_bootstrapping
        max_features = 0 if ratio_features > 1.0 else \
            max(1, int(types.shape[0] * ratio_features))
        self.rf_opts.tree_opts.max_features = max_features
        self.rf_opts.tree_opts.min_samples_to_split = min_samples_split
        self.rf_opts.tree_opts.min_samples_in_leaf = min_samples_leaf
        self.rf_opts.tree_opts.max_depth = max_depth
        self.rf_opts.tree_opts.epsilon_purity = eps_purity
        self.rf_opts.tree_opts.max_num_nodes = max_num_nodes
        self.rf_opts.compute_law_of_total_variance = False

        self.n_points_per_tree = n_points_per_tree
        self.rf = None  # type: regression.binary_rss_forest

        # This list will be read out by save_iteration() in the solver
        self.hypers = [
            num_trees, max_num_nodes, do_bootstrapping, n_points_per_tree,
            ratio_features, min_samples_split, min_samples_leaf, max_depth,
            eps_purity, seed
        ]
        self.seed = seed

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
Example #19
import numpy as np
import matplotlib.pyplot as plt

import pyrfr.regression as reg

num_points = 8

features = np.array([np.linspace(-1, 1, num_points)]).transpose()
x2 = np.array([np.linspace(-1, 1, 100)]).transpose()
responses = np.exp(-np.power(features / 0.3, 2)).flatten(
) + 0.05 * np.random.randn(features.shape[0])

data = reg.default_data_container(1)

for f, r in zip(features, responses):
    data.add_data_point(f, r)

rng = reg.default_random_engine()

# create an instance of a regression forest using binary splits and the RSS loss
the_forest = reg.binary_rss_forest()
the_forest.options.num_trees = 64
the_forest.options.num_data_points_per_tree = num_points
the_forest.options.tree_opts.min_samples_in_leaf = 1

the_forest.fit(data, rng)

fig, (ax1, ax2, ax3) = plt.subplots(3, sharex=True)

predictions = np.array([the_forest.predict_mean_var(x) for x in x2])
ax1.fill_between(x2[:, 0],
                 predictions[:, 0] - predictions[:, 1],
                 predictions[:, 0] + predictions[:, 1])
Example #20
    def __init__(self,
                 X,
                 Y,
                 config_space=None,
                 n_trees=16,
                 seed=None,
                 bootstrapping=True,
                 points_per_tree=None,
                 max_features=None,
                 min_samples_split=0,
                 min_samples_leaf=0,
                 max_depth=64,
                 cutoffs=(-np.inf, np.inf)):
        """
        Calculate and provide midpoints and sizes from the forest's 
        split values in order to get the marginals
        
        Parameters
        ------------
        X: matrix with the features
        
        Y: vector with the response values
        
        config_space : ConfigSpace instantiation
        
        n_trees: number of trees in the forest to be fit
        
        seed: seed for the forest's randomness
        
        bootstrapping: whether or not to bootstrap the data for each tree
        
        points_per_tree: number of points used for each tree 
                        (only subsampling if bootstrapping is false)
        
        max_features: number of features to be used at each split, default is 70%
        
        min_samples_split: minimum number of samples required to attempt to split 
        
        min_samples_leaf: minimum number of samples required in a leaf
        
        max_depth: maximal depth of each tree in the forest
        
        cutoffs: tuple of (lower, upper), all values outside this range will be
                 mapped to either the lower or the upper bound. (See:
                 "Generalized Functional ANOVA Diagnostics for High Dimensional
                 Functions of Dependent Variables" by Hooker.)
        """

        pcs = [(np.nan, np.nan)] * X.shape[1]

        # if no ConfigSpace is specified, let's build one with all continuous variables
        if config_space is None:
            # if no info is given, use min and max values of each variable as bounds
            config_space = ConfigSpace.ConfigurationSpace()
            for i, (mn,
                    mx) in enumerate(zip(np.min(X, axis=0), np.max(X,
                                                                   axis=0))):
                config_space.add_hyperparameter(
                    UniformFloatHyperparameter("x_%03i" % i, mn, mx))

        self.percentiles = np.percentile(Y, range(0, 100))
        self.cs = config_space
        self.cs_params = self.cs.get_hyperparameters()
        self.n_dims = len(self.cs_params)
        self.n_trees = n_trees
        self._dict = False

        # at this point we have a valid ConfigSpace object
        # check if param number is correct etc:
        if X.shape[1] != len(self.cs_params):
            raise RuntimeError(
                'Number of parameters in ConfigSpace object does not match input X'
            )
        for i in range(len(self.cs_params)):
            if not isinstance(self.cs_params[i], (CategoricalHyperparameter)):
                if (np.max(X[:, i]) > self.cs_params[i].upper) or \
                        (np.min(X[:, i]) < self.cs_params[i].lower):
                    raise RuntimeError(
                        'Some sample values from X are not in the given interval'
                    )
            else:
                unique_vals = set(X[:, i])
                if len(unique_vals) > len(self.cs_params[i].choices):
                    raise RuntimeError(
                        'There are some categoricals missing in the ConfigSpace specification'
                    )

        # initialize all types as 0
        types = np.zeros(len(self.cs_params), dtype=np.uint)
        # retrieve the types and the bounds from the ConfigSpace
        # TODO: Test if that actually works
        for i, hp in enumerate(self.cs_params):
            if isinstance(hp, CategoricalHyperparameter):
                types[i] = len(hp.choices)
                pcs[i] = (len(hp.choices), np.nan)
            else:
                pcs[i] = (hp.lower, hp.upper)

        # set forest options
        forest = reg.fanova_forest()
        forest.options.num_trees = n_trees
        forest.options.do_bootstrapping = bootstrapping
        forest.options.num_data_points_per_tree = X.shape[
            0] if points_per_tree is None else points_per_tree
        forest.options.tree_opts.max_features = (
            X.shape[1] * 7) // 10 if max_features is None else max_features

        forest.options.tree_opts.min_samples_to_split = min_samples_split
        forest.options.tree_opts.min_samples_in_leaf = min_samples_leaf
        forest.options.tree_opts.max_depth = max_depth
        forest.options.tree_opts.epsilon_purity = 1e-8

        # create data container and provide all the necessary information
        if seed is None:
            rng = reg.default_random_engine(np.random.randint(2**31 - 1))
        else:
            rng = reg.default_random_engine(seed)
        data = reg.default_data_container(X.shape[1])

        for i, (mn, mx) in enumerate(pcs):
            if np.isnan(mx):
                data.set_type_of_feature(i, mn)
            else:
                data.set_bounds_of_feature(i, mn, mx)

        for i in range(len(Y)):
            try:
                data.add_data_point(X[i].tolist(), Y[i])
            except:
                print("failed to process datapoint:", X[i].tolist())
                raise

        forest.fit(data, rng)

        self.the_forest = forest

        # initialize a dictionary with parameter dims
        self.variance_dict = dict()

        # getting split values
        forest_split_values = self.the_forest.all_split_values()

        # all midpoints and interval sizes treewise for the whole forest
        self.all_midpoints = []
        self.all_sizes = []

        #compute midpoints and interval sizes for variables in each tree
        for tree_split_values in forest_split_values:
            sizes = []
            midpoints = []
            for i, split_vals in enumerate(tree_split_values):
                if np.isnan(pcs[i][1]):  # categorical parameter
                    # check if the tree actually splits on this parameter
                    if len(split_vals) > 0:
                        midpoints.append(split_vals)
                        sizes.append(np.ones(len(split_vals)))
                    # if not, simply append 0 as the value with the number
                    # of categories as the size, that way this parameter will
                    # get 0 importance from this tree.
                    else:
                        midpoints.append((0, ))
                        sizes.append((pcs[i][0], ))
                else:
                    # add bounds to split values
                    sv = np.array([pcs[i][0]] + list(split_vals) + [pcs[i][1]])
                    # compute midpoints and sizes
                    midpoints.append((1 / 2) * (sv[1:] + sv[:-1]))
                    sizes.append(sv[1:] - sv[:-1])

            self.all_midpoints.append(midpoints)
            self.all_sizes.append(sizes)

        # capital V in the paper
        self.trees_total_variances = []
        # dict of lists where the keys are tuples of the dimensions
        # and the value list contains \hat{f}_U for the individual trees
        # reset all the variance fractions computed
        self.trees_variance_fractions = {}
        self.V_U_total = {}
        self.V_U_individual = {}

        self.cutoffs = cutoffs
        self.set_cutoffs(cutoffs)
Example #21
    def __init__(self, X, Y, config_space=None,
                 n_trees=16, seed=None, bootstrapping=True,
                 points_per_tree=None, max_features=None,
                 min_samples_split=0, min_samples_leaf=0,
                 max_depth=64, cutoffs=(-np.inf, np.inf)):

        """
        Calculate and provide midpoints and sizes from the forest's 
        split values in order to get the marginals
        
        Parameters
        ------------
        X: matrix with the features, either a np.array or a pd.DataFrame (numerically encoded)
        
        Y: vector with the response values (numerically encoded)
        
        config_space : ConfigSpace instantiation
        
        n_trees: number of trees in the forest to be fit
        
        seed: seed for the forest's randomness
        
        bootstrapping: whether or not to bootstrap the data for each tree
        
        points_per_tree: number of points used for each tree 
                        (only subsampling if bootstrapping is false)
        
        max_features: number of features to be used at each split, default is 70%
        
        min_samples_split: minimum number of samples required to attempt to split 
        
        min_samples_leaf: minimum number of samples required in a leaf
        
        max_depth: maximal depth of each tree in the forest
        
        cutoffs: tuple of (lower, upper), all values outside this range will be
                 mapped to either the lower or the upper bound. (See:
                 "Generalized Functional ANOVA Diagnostics for High Dimensional
                 Functions of Dependent Variables" by Hooker.)
        """
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(self.__module__ + '.' + self.__class__.__name__)

        pcs = [(np.nan, np.nan)] * X.shape[1]

        # Convert pd.DataFrame to np.array
        if isinstance(X, pd.DataFrame):
            self.logger.debug("Detected pandas dataframes, converting to floats...")
            if config_space is not None:
                # Check if column names match parameter names
                bad_input = set(X.columns) - set(config_space.get_hyperparameter_names())
                if len(bad_input) != 0:
                    raise ValueError("Could not identify parameters %s from pandas dataframes" % str(bad_input))
                # Reorder dataframe if necessary
                X = X[config_space.get_hyperparameter_names()]
            X = X.to_numpy()
        elif config_space is not None:
            # There is a config_space but no way to check if the np.array'ed data in X is in the correct order...
            self.logger.warning("Note that fANOVA expects data to be ordered like the return of ConfigSpace's "
                                "'get_hyperparameters'-method. We recommend to use labeled pandas dataframes to "
                                "avoid any problems.")

        # if no ConfigSpace is specified, let's build one with all continuous variables
        if config_space is None:
            # if no info is given, use min and max values of each variable as bounds
            config_space = ConfigSpace.ConfigurationSpace()
            for i, (mn, mx) in enumerate(zip(np.min(X, axis=0), np.max(X, axis=0))):
                config_space.add_hyperparameter(UniformFloatHyperparameter("x_%03i" % i, mn, mx))

        self.percentiles = np.percentile(Y, range(0, 100))
        self.cs = config_space
        self.cs_params = self.cs.get_hyperparameters()
        self.n_dims = len(self.cs_params)
        self.n_trees = n_trees
        self._dict = False

        # at this point we have a valid ConfigSpace object
        # check if param number is correct etc:
        if X.shape[1] != len(self.cs_params):
            raise RuntimeError('Number of parameters in ConfigSpace object does not match input X')
        for i in range(len(self.cs_params)):
            if isinstance(self.cs_params[i], NumericalHyperparameter):
                if (np.max(X[:, i]) > self.cs_params[i].upper) or \
                        (np.min(X[:, i]) < self.cs_params[i].lower):
                    raise RuntimeError('Some sample values from X are not in the given interval')
            elif isinstance(self.cs_params[i], CategoricalHyperparameter):
                unique_vals = set(X[:, i])
                if len(unique_vals) > len(self.cs_params[i].choices):
                    raise RuntimeError("There are some categoricals missing in the ConfigSpace specification for "
                                       "hyperparameter %s:" % self.cs_params[i].name)
            elif isinstance(self.cs_params[i], OrdinalHyperparameter):
                unique_vals = set(X[:, i])
                if len(unique_vals) > len(self.cs_params[i].sequence):
                    raise RuntimeError("There are some sequence-options missing in the ConfigSpace specification for "
                                       "hyperparameter %s:" % self.cs_params[i].name)
            elif isinstance(self.cs_params[i], Constant):
                # oddly, unparameterizedhyperparameter and constant are not supported. 
                # raise TypeError('Unsupported Hyperparameter: %s' % type(self.cs_params[i]))
                pass
                # unique_vals = set(X[:, i])
                # if len(unique_vals) > 1:
                #     raise RuntimeError('Got multiple values for Unparameterized (Constant) hyperparameter')
            else:
                raise TypeError('Unsupported Hyperparameter: %s' % type(self.cs_params[i]))

        if not np.issubdtype(X.dtype, np.float64):
            logging.warning('low level library expects X argument to be float')
        if not np.issubdtype(Y.dtype, np.float64):
            logging.warning('low level library expects Y argument to be float')

        # initialize all types as 0
        types = np.zeros(len(self.cs_params), dtype=np.uint)
        # retrieve the types and the bounds from the ConfigSpace 
        # TODO: Test if that actually works
        for i, hp in enumerate(self.cs_params):
            if isinstance(hp, CategoricalHyperparameter):
                types[i] = len(hp.choices)
                pcs[i] = (len(hp.choices), np.nan)
            elif isinstance(hp, OrdinalHyperparameter):
                types[i] = len(hp.sequence)
                pcs[i] = (len(hp.sequence), np.nan)
            elif isinstance(self.cs_params[i], NumericalHyperparameter):
                pcs[i] = (hp.lower, hp.upper)
            elif isinstance(self.cs_params[i], Constant):
                types[i] = 1
                pcs[i] = (1, np.nan)
            else:
                raise TypeError('Unsupported Hyperparameter: %s' % type(hp))

        # set forest options
        forest = reg.fanova_forest()
        forest.options.num_trees = n_trees
        forest.options.do_bootstrapping = bootstrapping
        forest.options.num_data_points_per_tree = X.shape[0] if points_per_tree is None else points_per_tree
        forest.options.tree_opts.max_features = (X.shape[1] * 7) // 10 if max_features is None else max_features

        forest.options.tree_opts.min_samples_to_split = min_samples_split
        forest.options.tree_opts.min_samples_in_leaf = min_samples_leaf
        forest.options.tree_opts.max_depth = max_depth
        forest.options.tree_opts.epsilon_purity = 1e-8

        # create data container and provide all the necessary information
        if seed is None:
            rng = reg.default_random_engine(np.random.randint(2 ** 31 - 1))
        else:
            rng = reg.default_random_engine(seed)
        data = reg.default_data_container(X.shape[1])

        for i, (mn, mx) in enumerate(pcs):
            if np.isnan(mx):
                data.set_type_of_feature(i, mn)
            else:
                data.set_bounds_of_feature(i, mn, mx)

        for i in range(len(Y)):
            self.logger.debug("process datapoint: %s", str(X[i].tolist()))
            data.add_data_point(X[i].tolist(), Y[i])

        forest.fit(data, rng)

        self.the_forest = forest

        # initialize a dictionary with parameter dims
        self.variance_dict = dict()

        # getting split values
        forest_split_values = self.the_forest.all_split_values()

        # all midpoints and interval sizes treewise for the whole forest
        self.all_midpoints = []
        self.all_sizes = []

        # compute midpoints and interval sizes for variables in each tree
        for tree_split_values in forest_split_values:
            sizes = []
            midpoints = []
            for i, split_vals in enumerate(tree_split_values):
                if np.isnan(pcs[i][1]):  # categorical parameter
                    # check if the tree actually splits on this parameter
                    if len(split_vals) > 0:
                        midpoints.append(split_vals)
                        sizes.append(np.ones(len(split_vals)))
                    # if not, simply append 0 as the value with the number of categories as the size, that way this
                    # parameter will get 0 importance from this tree.
                    else:
                        midpoints.append((0,))
                        sizes.append((pcs[i][0],))
                else:
                    # add bounds to split values
                    sv = np.array([pcs[i][0]] + list(split_vals) + [pcs[i][1]])
                    # compute midpoints and sizes
                    midpoints.append((1 / 2) * (sv[1:] + sv[:-1]))
                    sizes.append(sv[1:] - sv[:-1])

            self.all_midpoints.append(midpoints)
            self.all_sizes.append(sizes)

        # capital V in the paper
        self.trees_total_variances = []
        # dict of lists where the keys are tuples of the dimensions
        # and the value list contains \hat{f}_U for the individual trees
        # reset all the variance fractions computed
        self.trees_variance_fractions = {}
        self.V_U_total = {}
        self.V_U_individual = {}

        self.cutoffs = cutoffs
        self.set_cutoffs(cutoffs)
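
Hypothetical end-to-end use of this class, assuming it is exposed as fANOVA and offers quantify_importance as in the fanova package:

import numpy as np

X = np.random.rand(100, 3)
Y = np.random.rand(100)
f = fANOVA(X, Y, n_trees=16, seed=1)
# fraction of variance attributable to the first parameter
print(f.quantify_importance((0,)))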