def test_adjust_rtree_learners(self):
    """Exercise the leaf-learner setting on a single tree and on random forests"""

    # Requiring 16 instances per leaf is meant to allow only one split on the 32 data points
    single_split_tree = RegressionTreeLearner(min_leaf_instances=16)

    # Linearly related data: y = x + 1
    X, y = _make_linear_data()

    # Default leaves: a single split shows up as exactly two distinct predicted values
    single_split_tree.fit(X, y)
    distinct_predictions = set(single_split_tree.predict(X))
    self.assertEqual(2, len(distinct_predictions))

    # Linear-regression leaves should reproduce the linear data perfectly
    single_split_tree.leaf_learner = LinearRegression()
    single_split_tree.fit(X, y)
    tree_r2 = r2_score(y, single_split_tree.predict(X))
    self.assertAlmostEqual(1.0, tree_r2)

    # Both forests below are trained on the first 16 rows only
    X_head, y_head = X[:16, :], y[:16]

    # With linear leaves the forest should still fit the whole dataset perfectly
    linear_forest = RandomForestRegressor(
        leaf_learner=LinearRegression(),
        min_leaf_instances=16,
        random_seed=23478,
    )
    linear_forest.fit(X_head, y_head)
    linear_forest_r2 = r2_score(y, linear_forest.predict(X))
    self.assertAlmostEqual(1.0, linear_forest_r2)

    # With the default leaves it should not fit the whole dataset perfectly
    default_forest = RandomForestRegressor(random_seed=7834)
    default_forest.fit(X_head, y_head)
    default_forest_r2 = r2_score(y, default_forest.predict(X))
    self.assertLess(default_forest_r2, 1.0)
def test_rf_multioutput_regressor(self):
    """Check shapes and internal consistency of multi-output predictions and uncertainties"""
    rf = RandomForestRegressor(random_seed=810355)

    # A regression dataset with 3 outputs
    X, y = load_linnerud(return_X_y=True)
    num_data = len(X)
    num_outputs = y.shape[1]

    rf.fit(X, y)
    y_pred, y_std = rf.predict(X, return_std=True)
    _, y_cov = rf.predict(X, return_cov_matrix=True)

    # All returned values must have the correct shape.
    # unittest assertions are used instead of bare `assert` so the checks are
    # not stripped under `python -O` and failures report both operands.
    self.assertEqual((num_data, num_outputs), y_pred.shape)
    self.assertEqual((num_data, num_outputs), y_std.shape)
    self.assertEqual((num_data, num_outputs, num_outputs), y_cov.shape)

    # Spot-check the covariance matrices: the (0, 1) and (1, 0) entries must agree (symmetry)
    # and the first diagonal entry must be the square of the first standard deviation.
    self.assertTrue(np.all(y_cov[:, 0, 1] == y_cov[:, 1, 0]))
    self.assertTrue(np.all(y_cov[:, 0, 0] == y_std[:, 0] ** 2))

    # The user cannot call predict with both return_std and return_cov_matrix True
    with self.assertRaises(ValueError):
        rf.predict(X, return_std=True, return_cov_matrix=True)
def test_rf_regressor(self):
    """Test fitting, prediction, importance scores and uncertainties of the random forest regressor"""
    rf = RandomForestRegressor(random_seed=31247895)

    # Load the training data
    X, y = load_diabetes(return_X_y=True)

    # Predicting before fitting must raise a NotFittedError
    with self.assertRaises(NotFittedError):
        rf.predict(X)

    # Fit the model
    rf.fit(X, y)

    # Run some predictions
    y_pred = rf.predict(X)
    self.assertEqual(len(y_pred), len(y))

    # Test the ability to get importance scores:
    # one row per queried point, one column per training point
    y_import = rf.get_importance_scores(X[:100, :])
    self.assertEqual((100, len(X)), y_import.shape)

    # Basic test for functionality. R^2 above 0.88 was measured on 2021-12-09.
    # r2_score expects (y_true, y_pred); the arguments were previously swapped.
    # NOTE(review): the 0.88 threshold was measured with the swapped order — re-measure if this check fails
    score = r2_score(y, y_pred)
    print('R^2:', score)
    self.assertGreater(score, 0.88)

    # Test with weights (make sure it doesn't crash)
    rf.fit(X, y, [2.0] * len(y))

    # Make sure feature importances are stored: one per input feature, summing to 1
    self.assertEqual(np.shape(rf.feature_importances_), (X.shape[1],))
    self.assertAlmostEqual(1.0, np.sum(rf.feature_importances_))

    # Run predictions with std dev
    y_pred, y_std = rf.predict(X, return_std=True)
    self.assertEqual(len(y_pred), len(y_std))
    self.assertTrue((y_std >= 0).all())  # They must be non-negative
    self.assertGreater(np.std(y_std), 0)  # Must have a variety of values

    # For a single output, the covariance matrix is just the standard deviation squared.
    # A unittest assertion is used so the check is not stripped under `python -O`.
    _, y_cov = rf.predict(X, return_cov_matrix=True)
    self.assertTrue(np.all(y_cov.flatten() == y_std ** 2))

    # Make sure the detach operation functions
    rf.clear_model()
    self.assertIsNone(rf.model_)
def fit_lolo(
    df,
    md=None,
    var=None,
    out=None,
    domain=None,
    density=None,
    seed=None,
    return_std=True,
    suppress_warnings=True,
    **kwargs
):
    r"""Fit a random forest

    Fit a random forest to given data. Specify inputs and outputs, or inherit
    from an existing model. One forest is fitted per output.

    Args:
        df (DataFrame): Data for function fitting
        md (gr.Model): Model from which to inherit metadata; when given, its
            domain, density, and out replace the corresponding arguments
        var (list(str) or None): List of features or None for all except
            outputs (taken in the column order of df)
        out (list(str)): List of outputs to fit
        domain (gr.Domain): Domain for new model
        density (gr.Density): Density for new model
        seed (int or None): Random seed for fitting process
        return_std (bool): Return predictive standard deviations?
        suppress_warnings (bool): Suppress warnings when fitting?

    Keyword Arguments:
        Forwarded unchanged to RandomForestRegressor.

        num_trees (int):
        use_jackknife (bool):
        bias_learner ():
        leaf_learner ():
        subset_strategy (str):
        min_leaf_instances (int):
        max_depth (int):
        uncertainty_calibration (bool):
        randomize_pivot_location (bool):
        randomly_rotate_features (bool):

    Returns:
        gr.Model: A grama model with fitted function(s)

    Raises:
        ValueError: If df has fewer than 8 rows, if out is not provided
            (directly or via md), if out or var is not a subset of
            df.columns, or if out and var overlap.

    Notes:
        - Wrapper for lolopy.learners.RandomForestRegressor

    """
    if suppress_warnings:
        # NOTE: this installs an "ignore" filter that is not removed when the
        # function returns, so warnings stay silenced afterwards as well.
        filterwarnings("ignore")

    ## Check minimum rows
    n_obs = df.shape[0]
    if n_obs < 8:
        raise ValueError("The lolo random forest requires at least 8 rows")

    ## Infer fitting metadata, if available
    if md is not None:
        domain = md.domain
        density = md.density
        out = md.out

    ## Check invariants
    if out is None:
        # Fail with a clear message instead of a TypeError from set(None)
        raise ValueError("out must be provided, either directly or via md")
    columns = set(df.columns)
    outputs = set(out)
    if not outputs.issubset(columns):
        raise ValueError("out must be subset of df.columns")

    ## Default input value
    if var is None:
        # Keep the column order of df (dropping duplicates) rather than
        # relying on set iteration order, so that fitting with a fixed seed
        # sees the features in the same order on every run.
        var = list(dict.fromkeys(
            col for col in df.columns if col not in outputs
        ))

    ## Check more invariants
    set_inter = outputs.intersection(set(var))
    if len(set_inter) > 0:
        raise ValueError(
            "outputs and inputs must be disjoint; intersect = {}".format(
                set_inter))
    if not set(var).issubset(columns):
        raise ValueError("var must be subset of df.columns")

    ## Construct a random forest for each output
    features = df[var].values  # identical for every output
    functions = []
    for output in out:
        rf = RandomForestRegressor(**kwargs)
        # Re-seed before every fit so each output is fitted from the same seed
        set_seed(seed)
        rf.fit(features, df[output].values)
        name = "RF"
        fun = FunctionRFR(rf, var, [output], name, 0, return_std)
        functions.append(fun)

    ## Construct model
    return gr.Model(functions=functions, domain=domain, density=density)
def test_rf_regressor(self):
    """Smoke-test fitting, prediction and uncertainty estimates of the random forest regressor"""
    rf = RandomForestRegressor()

    # Load the data and train the model.
    # return_X_y is passed by keyword: newer scikit-learn versions reject it positionally.
    # NOTE(review): load_boston is no longer shipped by recent scikit-learn releases — confirm the pinned version
    X, y = load_boston(return_X_y=True)
    rf.fit(X, y)

    # Run some predictions
    y_pred = rf.predict(X)
    self.assertEqual(len(y_pred), len(y))

    # Basic test for functionality. R^2 above 0.98 was measured on 27Dec18.
    # r2_score expects (y_true, y_pred); the arguments were previously swapped.
    # NOTE(review): the 0.98 threshold was measured with the swapped order — re-measure if this check fails
    score = r2_score(y, y_pred)
    print('R^2:', score)
    self.assertGreater(score, 0.98)

    # Test with weights (make sure it doesn't crash)
    rf.fit(X, y, [1.0] * len(y))

    # Run predictions with std dev
    y_pred, y_std = rf.predict(X, return_std=True)
    self.assertEqual(len(y_pred), len(y_std))
    self.assertTrue((y_std >= 0).all())  # They must be non-negative
    self.assertGreater(np.std(y_std), 0)  # Must have a variety of values

    # Make sure the detach operation functions
    rf.clear_model()
    self.assertIsNone(rf.model_)

    # Test removing Jackknife, which should produce equal uncertainties for all entries
    # NOTE(review): this only has an effect if the learner reads an attribute named
    # `useJackknife` — confirm against the lolopy version under test
    rf.useJackknife = False
    rf.fit(X, y)
    y_pred, y_std = rf.predict(X, return_std=True)
    self.assertAlmostEqual(np.std(y_std), 0)
def __init__(
    self,
    num_trees: int = -1,
    use_jackknife: bool = True,
    bias_learner: Optional[BaseLoloLearner] = None,
    leaf_learner: Optional[BaseLoloLearner] = None,
    subset_strategy: Union[str, int, float] = "auto",
    min_leaf_instances: int = 1,
    max_depth: int = 2 ** 30,
    uncertainty_calibration: bool = False,
    randomize_pivot_location: bool = False,
    # randomly_rotate_features: bool = False,  currently in develop branch
    **kwargs
):
    """Initialize random forest model.

    See lolo Scala source code for initialization parameters:
    https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

    When using `uncertainty_calibration=False` (the default), the number of trees
    `num_trees` should be set to a multiple of the number n of training samples,
    `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
    `num_trees = 64` is sufficient.

    Parameters:
        num_trees: number of trees in the forest; -1 uses number of training samples
        use_jackknife: whether to use jackknife-based variance estimates
        bias_learner: algorithm used to model bias
        leaf_learner: algorithm used at each leaf of the random forest
        subset_strategy: strategy to determine number of features used at each split
            "auto": use the default for lolo (all features for regression, sqrt for classification)
            "log2": use the base 2 log of the number of features
            "sqrt": use the square root of the number of features
            integer: set the number of features explicitly
            float: use a certain fraction of the features
        min_leaf_instances: minimum number of training instances at each leaf
        max_depth: maximum depth of decision trees
        uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
            based on out-of-bag residuals
        randomize_pivot_location: whether to draw pivots randomly or always select the midpoint

    Not yet accepted (currently in the lolo develop branch):
        randomly_rotate_features: whether to rotate real scalar features for each tree

    Raises:
        BenchmarkError: if the underlying lolo model cannot be instantiated
    """

    super().__init__(**kwargs)

    def learner_or_none(candidate):
        # a learner argument is either a lolo learner instance or None
        return params.any_(
            candidate, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

    # validate parameters; entries are evaluated top to bottom
    forest_options = {
        # a positive integer, or exactly -1
        "num_trees": params.any_(
            num_trees,
            lambda i: params.integer(i, above=0),
            lambda i: params.integer(i, from_=-1, to=-1),
        ),
        "use_jackknife": params.boolean(use_jackknife),
        "bias_learner": learner_or_none(bias_learner),
        "leaf_learner": learner_or_none(leaf_learner),
        "subset_strategy": params.any_(
            subset_strategy,
            lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
            lambda s: params.integer(s, above=0),
            lambda s: params.real(s, above=0),
        ),
        "min_leaf_instances": params.integer(min_leaf_instances, above=0),
        # the default 2**30 works for 32 bit or larger architectures
        "max_depth": params.integer(max_depth, above=0),
        "uncertainty_calibration": params.boolean(uncertainty_calibration),
        "randomize_pivot_location": params.boolean(randomize_pivot_location),
        # "randomly_rotate_features": params.boolean(randomly_rotate_features),
    }

    # set up model
    try:
        self._model = RandomForestRegressor(**forest_options)
    except Py4JJavaError as e:
        raise BenchmarkError("instantiating lolo model failed") from e

    # otherwise, deviations will be zero
    self._with_uncertainties = forest_options["use_jackknife"]
class RandomForestRegressionLolo(SupervisedLearner):
    """Random forest regression, lolo implementation.

    See https://github.com/CitrineInformatics/lolo

    Supports only numeric (vector) inputs and labels.
    """

    def __init__(
        self,
        num_trees: int = -1,
        use_jackknife: bool = True,
        bias_learner: Optional[BaseLoloLearner] = None,
        leaf_learner: Optional[BaseLoloLearner] = None,
        subset_strategy: Union[str, int, float] = "auto",
        min_leaf_instances: int = 1,
        max_depth: int = 2 ** 30,
        uncertainty_calibration: bool = False,
        randomize_pivot_location: bool = False,
        # randomly_rotate_features: bool = False,  currently in develop branch
        **kwargs
    ):
        """Initialize random forest model.

        See lolo Scala source code for initialization parameters:
        https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

        When using `uncertainty_calibration=False` (the default), the number of trees
        `num_trees` should be set to a multiple of the number n of training samples,
        `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
        `num_trees = 64` is sufficient.

        Parameters:
            num_trees: number of trees in the forest; -1 uses number of training samples
            use_jackknife: whether to use jackknife-based variance estimates
            bias_learner: algorithm used to model bias
            leaf_learner: algorithm used at each leaf of the random forest
            subset_strategy: strategy to determine number of features used at each split
                "auto": use the default for lolo (all features for regression, sqrt for classification)
                "log2": use the base 2 log of the number of features
                "sqrt": use the square root of the number of features
                integer: set the number of features explicitly
                float: use a certain fraction of the features
            min_leaf_instances: minimum number of training instances at each leaf
            max_depth: maximum depth of decision trees
            uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
                based on out-of-bag residuals
            randomize_pivot_location: whether to draw pivots randomly or always select the midpoint

        Not yet accepted (currently in the lolo develop branch):
            randomly_rotate_features: whether to rotate real scalar features for each tree

        Raises:
            BenchmarkError: if the underlying lolo model cannot be instantiated
        """

        super().__init__(**kwargs)

        def learner_or_none(candidate):
            # a learner argument is either a lolo learner instance or None
            return params.any_(
                candidate, lambda arg: params.instance(arg, BaseLoloLearner), params.none
            )

        # validate parameters; entries are evaluated top to bottom
        forest_options = {
            # a positive integer, or exactly -1
            "num_trees": params.any_(
                num_trees,
                lambda i: params.integer(i, above=0),
                lambda i: params.integer(i, from_=-1, to=-1),
            ),
            "use_jackknife": params.boolean(use_jackknife),
            "bias_learner": learner_or_none(bias_learner),
            "leaf_learner": learner_or_none(leaf_learner),
            "subset_strategy": params.any_(
                subset_strategy,
                lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
                lambda s: params.integer(s, above=0),
                lambda s: params.real(s, above=0),
            ),
            "min_leaf_instances": params.integer(min_leaf_instances, above=0),
            # the default 2**30 works for 32 bit or larger architectures
            "max_depth": params.integer(max_depth, above=0),
            "uncertainty_calibration": params.boolean(uncertainty_calibration),
            "randomize_pivot_location": params.boolean(randomize_pivot_location),
            # "randomly_rotate_features": params.boolean(randomly_rotate_features),
        }

        # set up model
        try:
            self._model = RandomForestRegressor(**forest_options)
        except Py4JJavaError as e:
            raise BenchmarkError("instantiating lolo model failed") from e

        # otherwise, deviations will be zero
        self._with_uncertainties = forest_options["use_jackknife"]

    def fit(self, data: Data) -> "RandomForestRegressionLolo":
        """Fits the model using training data.

        Parameters:
            data: labeled tabular data to train on

        Returns:
            self (allows chaining)

        Raises:
            BenchmarkError: if training the underlying lolo model fails
        """

        # todo: params.data(..., is_labeled=True, is_finite=True)
        data = params.instance(data, Data)

        # samples must form a real matrix with one row per sample,
        # labels a real vector with one entry per sample
        num_samples = data.num_samples
        inputs = params.real_matrix(data.samples(), nrows=num_samples)
        labels = params.real_vector(data.labels(), dimensions=num_samples)

        try:
            self._model.fit(inputs, labels)
        except Py4JJavaError as e:
            raise BenchmarkError("training lolo model failed") from e

        return self

    def apply(self, data: Data) -> PredictiveDistribution:
        """Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distributions if predictive uncertainties were requested,
            otherwise delta distributions

        Raises:
            BenchmarkError: if applying the underlying lolo model fails
        """

        # todo: params.data(..., is_labeled=True, is_finite=True)
        data = params.instance(data, Data)

        inputs = params.real_matrix(data.samples())

        try:
            if self._with_uncertainties:
                means, stddevs = self._model.predict(inputs, return_std=True)
                return NormalPredictiveDistribution(mean=means, stddev=stddevs)

            means = self._model.predict(inputs, return_std=False)
            return DeltaPredictiveDistribution(mean=means)
        except Py4JJavaError as e:
            raise BenchmarkError("applying lolo model failed") from e