Example #1
    def test_adjust_rtree_learners(self):
        """Test modifying the bias and leaf learners of decision trees"""

        # Make a tree learner that will perform only one split on 32 data points
        tree = RegressionTreeLearner(min_leaf_instances=16)

        # Make y = x + 1
        X, y = _make_linear_data()

        # Fit the model
        tree.fit(X, y)
        self.assertEqual(2, len(set(tree.predict(X))))  # Only one split

        # Use linear regression on the splits
        tree.leaf_learner = LinearRegression()
        tree.fit(X, y)
        self.assertAlmostEqual(1.0, r2_score(y, tree.predict(X)))  # Linear leaves means perfect fit

        # Test that passing a leaf learner to the random forest takes effect
        rf = RandomForestRegressor(leaf_learner=LinearRegression(), min_leaf_instances=16, random_seed=23478)
        rf.fit(X[:16, :], y[:16])  # Train only on a subset
        self.assertAlmostEqual(1.0, r2_score(y, rf.predict(X)))  # Should fit perfectly on whole dataset

        rf = RandomForestRegressor(random_seed=7834)
        rf.fit(X[:16, :], y[:16])
        self.assertLess(r2_score(y, rf.predict(X)), 1.0)  # Should not fit the whole dataset perfectly
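A minimal sketch of the `_make_linear_data` helper this test assumes: the comments above imply 32 points on the line y = x + 1 with a single feature column, but the helper itself is not shown, so its exact signature is an assumption.

    import numpy as np

    def _make_linear_data(n=32):
        """Hypothetical helper: n points that satisfy y = x + 1 exactly."""
        X = np.arange(n, dtype=float).reshape(-1, 1)  # one feature column
        y = X[:, 0] + 1.0
        return X, y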
Example #2
    def test_rf_multioutput_regressor(self):
        rf = RandomForestRegressor(random_seed=810355)
        # A regression dataset with 3 outputs
        X, y = load_linnerud(return_X_y=True)
        num_data = len(X)
        num_outputs = y.shape[1]

        rf.fit(X, y)
        y_pred, y_std = rf.predict(X, return_std=True)
        _, y_cov = rf.predict(X, return_cov_matrix=True)

        # Assert that all returned values have the correct shape
        assert y_pred.shape == (num_data, num_outputs)
        assert y_std.shape == (num_data, num_outputs)
        assert y_cov.shape == (num_data, num_outputs, num_outputs)

        # The covariance matrices should be symmetric and the diagonals should be the squares of the standard deviations.
        assert np.all(y_cov[:, 0, 1] == y_cov[:, 1, 0])
        assert np.all(y_cov[:, 0, 0] == y_std[:, 0] ** 2)

        # Make sure the user cannot call predict with both return_std and return_cov_matrix True
        with self.assertRaises(ValueError):
            rf.predict(X, return_std=True, return_cov_matrix=True)
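The assertions above pin down how the covariance output relates to the standard deviations; going the other way, per-output standard deviations can be recovered from the covariance matrices alone. A self-contained numpy sketch with synthetic stand-ins of the same shapes (no lolopy calls, just the algebra the test checks):

    import numpy as np

    # Stand-ins shaped like the test's outputs: (num_data, num_outputs, num_outputs)
    rng = np.random.default_rng(0)
    A = rng.normal(size=(20, 3, 3))
    y_cov = A @ A.transpose(0, 2, 1)  # symmetric PSD, one matrix per sample

    # Per-output standard deviations are square roots of the per-sample diagonals
    y_std = np.sqrt(np.diagonal(y_cov, axis1=1, axis2=2))  # (num_data, num_outputs)

    assert np.allclose(y_cov[:, 0, 1], y_cov[:, 1, 0])    # symmetry
    assert np.allclose(y_cov[:, 0, 0], y_std[:, 0] ** 2)  # diagonal == std**2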
Example #3
    def test_rf_regressor(self):
        rf = RandomForestRegressor(random_seed=31247895)

        # Train the model
        X, y = load_diabetes(return_X_y=True)

        # Make sure we get a NotFittedError
        with self.assertRaises(NotFittedError):
            rf.predict(X)

        # Fit the model
        rf.fit(X, y)

        # Run some predictions
        y_pred = rf.predict(X)
        self.assertEqual(len(y_pred), len(y))

        # Test the ability to get importance scores
        y_import = rf.get_importance_scores(X[:100, :])
        self.assertEqual((100, len(X)), y_import.shape)

        # Basic test for functionality. R^2 above 0.88 was measured on 2021-12-09
        score = r2_score(y, y_pred)
        print('R^2:', score)
        self.assertGreater(score, 0.88)

        # Test with weights (make sure it doesn't crash)
        rf.fit(X, y, [2.0]*len(y))

        # Make sure feature importances are stored
        self.assertEqual(np.shape(rf.feature_importances_), (X.shape[1],))
        self.assertAlmostEqual(1.0, np.sum(rf.feature_importances_))

        # Run predictions with std dev
        y_pred, y_std = rf.predict(X, return_std=True)
        self.assertEqual(len(y_pred), len(y_std))
        self.assertTrue((y_std >= 0).all())  # They must be non-negative
        self.assertGreater(np.std(y_std), 0)  # Must have a variety of values

        # For a single output, the covariance matrix is just the standard deviation squared
        _, y_cov = rf.predict(X, return_cov_matrix=True)
        assert np.all(y_cov.flatten() == y_std ** 2)

        # Make sure the detach operation functions
        rf.clear_model()
        self.assertIsNone(rf.model_)
Example #4
def fit_lolo(df,
             md=None,
             var=None,
             out=None,
             domain=None,
             density=None,
             seed=None,
             return_std=True,
             suppress_warnings=True,
             **kwargs):
    r"""Fit a random forest

    Fit a random forest to given data. Specify inputs and outputs, or inherit
    from an existing model.

    Args:
        df (DataFrame): Data for function fitting
        md (gr.Model): Model from which to inherit metadata
        var (list(str) or None): List of features or None for all except outputs
        out (list(str)): List of outputs to fit
        domain (gr.Domain): Domain for new model
        density (gr.Density): Density for new model
        seed (int or None): Random seed for fitting process
        return_std (bool): Return predictive standard deviations?
        suppress_warnings (bool): Suppress warnings when fitting?

    Keyword Arguments:

        num_trees (int): Number of trees in the forest; -1 uses the
            number of training samples
        use_jackknife (bool): Use jackknife-based variance estimates?
        bias_learner (BaseLoloLearner or None): Algorithm used to model bias
        leaf_learner (BaseLoloLearner or None): Algorithm used at each leaf
            of the random forest
        subset_strategy (str, int, or float): Strategy to determine the
            number of features used at each split
        min_leaf_instances (int): Minimum number of training instances
            required at each leaf
        max_depth (int): Maximum depth of decision trees
        uncertainty_calibration (bool): Empirically re-calibrate predicted
            uncertainties based on out-of-bag residuals?
        randomize_pivot_location (bool): Draw pivots randomly rather than
            always selecting the midpoint?
        randomly_rotate_features (bool): Randomly rotate real scalar
            features for each tree?

    Returns:
        gr.Model: A grama model with fitted function(s)

    Notes:
        - Wrapper for lolopy.learners.RandomForestRegressor

    """
    if suppress_warnings:
        filterwarnings("ignore")

    n_obs, n_in = df.shape

    ## Check minimum rows
    if n_obs < 8:
        raise ValueError("The lolo random forest requires at least 8 rows")

    ## Infer fitting metadata, if available
    if md is not None:
        domain = md.domain
        density = md.density
        out = md.out

    ## Check invariants
    if not set(out).issubset(set(df.columns)):
        raise ValueError("out must be subset of df.columns")
    ## Default input value
    if var is None:
        var = list(set(df.columns).difference(set(out)))
    ## Check more invariants
    set_inter = set(out).intersection(set(var))
    if len(set_inter) > 0:
        raise ValueError(
            "outputs and inputs must be disjoint; intersect = {}".format(
                set_inter))
    if not set(var).issubset(set(df.columns)):
        raise ValueError("var must be subset of df.columns")

    ## Construct a random forest for each output
    functions = []

    for output in out:
        rf = RandomForestRegressor(**kwargs)
        set_seed(seed)
        rf.fit(df[var].values, df[output].values)
        name = "RF"

        fun = FunctionRFR(rf, var, [output], name, 0, return_std)
        functions.append(fun)

    ## Construct model
    return gr.Model(functions=functions, domain=domain, density=density)
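A usage sketch for `fit_lolo`; the column names, seed, and hyperparameters below are illustrative assumptions, not taken from the source:

    import numpy as np
    import pandas as pd

    # Build a small training frame (the check above requires at least 8 rows)
    rng = np.random.default_rng(42)
    df_train = pd.DataFrame({"x": rng.uniform(0.0, 1.0, 32)})
    df_train["y"] = 2.0 * df_train["x"] + rng.normal(0.0, 0.05, 32)

    # Fit a forest for output "y"; var is inferred as the remaining columns.
    # Extra keyword arguments pass through to lolopy's RandomForestRegressor.
    md_rf = fit_lolo(df_train, out=["y"], seed=101,
                     num_trees=64, uncertainty_calibration=True)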
Example #5
    def test_rf_regressor(self):
        rf = RandomForestRegressor()

        # Train the model
        X, y = load_boston(return_X_y=True)
        rf.fit(X, y)

        # Run some predictions
        y_pred = rf.predict(X)
        self.assertEqual(len(y_pred), len(y))

        # Basic test for functionality. R^2 above 0.98 was measured on 2018-12-27
        score = r2_score(y, y_pred)
        print('R^2:', score)
        self.assertGreater(score, 0.98)

        # Test with weights (make sure it doesn't crash)
        rf.fit(X, y, [1.0] * len(y))

        # Run predictions with std dev
        y_pred, y_std = rf.predict(X, return_std=True)
        self.assertEqual(len(y_pred), len(y_std))
        self.assertTrue((y_std >= 0).all())  # They must be non-negative
        self.assertGreater(np.std(y_std), 0)  # Must have a variety of values

        # Make sure the detach operation functions
        rf.clear_model()
        self.assertIsNone(rf.model_)

        # Test removing Jackknife, which should produce equal uncertainties for all entries
        rf.use_jackknife = False
        rf.fit(X, y)
        y_pred, y_std = rf.predict(X, return_std=True)
        self.assertAlmostEqual(np.std(y_std), 0)
Example #6
    def __init__(
        self,
        num_trees: int = -1,
        use_jackknife: bool = True,
        bias_learner: Optional[BaseLoloLearner] = None,
        leaf_learner: Optional[BaseLoloLearner] = None,
        subset_strategy: Union[str, int, float] = "auto",
        min_leaf_instances: int = 1,
        max_depth: int = 2 ** 30,
        uncertainty_calibration: bool = False,
        randomize_pivot_location: bool = False,
        # randomly_rotate_features: bool = False, currently in develop branch
        **kwargs
    ):
        """Initialize random forest model.

        See lolo Scala source code for initialization parameters:
        https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

        When using `uncertainty_calibration=False` (the default), the number of trees
        `num_trees` should be set to a multiple of the number `n` of training samples,
        e.g. `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
        `num_trees = 64` is sufficient.

        Parameters:
            num_trees: number of trees in the forest; -1 uses number of training samples
            use_jackknife: whether to use jackknife-based variance estimates
            bias_learner: algorithm used to model bias
            leaf_learner: algorithm used at each leaf of the random forest
            subset_strategy: strategy to determine number of features used at each split
                "auto": use the default for lolo (all features for regression, sqrt for classification)
                "log2": use the base 2 log of the number of features
                "sqrt": use the square root of the number of features
                integer: set the number of features explicitly
                float: use a certain fraction of the features
            min_leaf_instances: minimum number of training instances required at each leaf
            max_depth: maximum depth of decision trees
            uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
                based on out-of-bag residuals
            randomize_pivot_location: whether to draw pivots randomly or always select the midpoint
            randomly_rotate_features: whether to rotate real scalar features for each tree
        """

        super().__init__(**kwargs)

        # validate parameters

        num_trees = params.any_(
            num_trees,
            lambda i: params.integer(i, above=0),
            lambda i: params.integer(i, from_=-1, to=-1),
        )

        use_jackknife = params.boolean(use_jackknife)

        bias_learner = params.any_(
            bias_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        leaf_learner = params.any_(
            leaf_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        subset_strategy = params.any_(
            subset_strategy,
            lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
            lambda s: params.integer(s, above=0),
            lambda s: params.real(s, above=0),
        )

        min_leaf_instances = params.integer(min_leaf_instances, above=0)

        # the default 2**30 works for 32 bit or larger architectures
        max_depth = params.integer(max_depth, above=0)

        uncertainty_calibration = params.boolean(uncertainty_calibration)

        randomize_pivot_location = params.boolean(randomize_pivot_location)

        # randomly_rotate_features = params.boolean(randomly_rotate_features)

        # set up model

        try:
            self._model = RandomForestRegressor(
                num_trees=num_trees,
                use_jackknife=use_jackknife,
                bias_learner=bias_learner,
                leaf_learner=leaf_learner,
                subset_strategy=subset_strategy,
                min_leaf_instances=min_leaf_instances,
                max_depth=max_depth,
                uncertainty_calibration=uncertainty_calibration,
                randomize_pivot_location=randomize_pivot_location,
                # randomly_rotate_features=randomly_rotate_features,
            )
        except Py4JJavaError as e:
            raise BenchmarkError("instantiating lolo model failed") from e

        self._with_uncertainties = use_jackknife  # otherwise, deviations will be zero
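A short sketch of the two instantiation patterns the docstring above recommends; the class name follows Example #7's `RandomForestRegressionLolo`, and the sample count is illustrative:

    n_train = 500  # assumed number of training samples

    # Without calibration (the default): scale the forest with the training set size
    model_jackknife = RandomForestRegressionLolo(num_trees=4 * n_train)

    # With calibration: a small fixed forest is sufficient
    model_calibrated = RandomForestRegressionLolo(num_trees=64, uncertainty_calibration=True)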
Example #7
0
class RandomForestRegressionLolo(SupervisedLearner):
    """Random forest regression, lolo implementation.

    See https://github.com/CitrineInformatics/lolo

    Supports only numeric (vector) inputs and labels.
    """

    def __init__(
        self,
        num_trees: int = -1,
        use_jackknife: bool = True,
        bias_learner: Optional[BaseLoloLearner] = None,
        leaf_learner: Optional[BaseLoloLearner] = None,
        subset_strategy: Union[str, int, float] = "auto",
        min_leaf_instances: int = 1,
        max_depth: int = 2 ** 30,
        uncertainty_calibration: bool = False,
        randomize_pivot_location: bool = False,
        # randomly_rotate_features: bool = False, currently in develop branch
        **kwargs
    ):
        """Initialize random forest model.

        See lolo Scala source code for initialization parameters:
        https://github.com/CitrineInformatics/lolo/blob/develop/src/main/scala/io/citrine/lolo/learners/RandomForest.scala

        When using `uncertainty_calibration=False` (the default), the number of trees
        `num_trees` should be set to a multiple of the number `n` of training samples,
        e.g. `num_trees = 4 * n` or higher. When using `uncertainty_calibration=True`,
        `num_trees = 64` is sufficient.

        Parameters:
            num_trees: number of trees in the forest; -1 uses number of training samples
            use_jackknife: whether to use jackknife-based variance estimates
            bias_learner: algorithm used to model bias
            leaf_learner: algorithm used at each leaf of the random forest
            subset_strategy: strategy to determine number of features used at each split
                "auto": use the default for lolo (all features for regression, sqrt for classification)
                "log2": use the base 2 log of the number of features
                "sqrt": use the square root of the number of features
                integer: set the number of features explicitly
                float: use a certain fraction of the features
            min_leaf_instances: minimum number of training instances required at each leaf
            max_depth: maximum depth of decision trees
            uncertainty_calibration: whether to empirically re-calibrate predicted uncertainties
                based on out-of-bag residuals
            randomize_pivot_location: whether to draw pivots randomly or always select the midpoint
            randomly_rotate_features: whether to rotate real scalar features for each tree
        """

        super().__init__(**kwargs)

        # validate parameters

        num_trees = params.any_(
            num_trees,
            lambda i: params.integer(i, above=0),
            lambda i: params.integer(i, from_=-1, to=-1),
        )

        use_jackknife = params.boolean(use_jackknife)

        bias_learner = params.any_(
            bias_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        leaf_learner = params.any_(
            leaf_learner, lambda arg: params.instance(arg, BaseLoloLearner), params.none
        )

        subset_strategy = params.any_(
            subset_strategy,
            lambda s: params.enumeration(s, {"auto", "log2", "sqrt"}),
            lambda s: params.integer(s, above=0),
            lambda s: params.real(s, above=0),
        )

        min_leaf_instances = params.integer(min_leaf_instances, above=0)

        # the default 2**30 works for 32 bit or larger architectures
        max_depth = params.integer(max_depth, above=0)

        uncertainty_calibration = params.boolean(uncertainty_calibration)

        randomize_pivot_location = params.boolean(randomize_pivot_location)

        # randomly_rotate_features = params.boolean(randomly_rotate_features)

        # set up model

        try:
            self._model = RandomForestRegressor(
                num_trees=num_trees,
                use_jackknife=use_jackknife,
                bias_learner=bias_learner,
                leaf_learner=leaf_learner,
                subset_strategy=subset_strategy,
                min_leaf_instances=min_leaf_instances,
                max_depth=max_depth,
                uncertainty_calibration=uncertainty_calibration,
                randomize_pivot_location=randomize_pivot_location,
                # randomly_rotate_features=randomly_rotate_features,
            )
        except Py4JJavaError as e:
            raise BenchmarkError("instantiating lolo model failed") from e

        self._with_uncertainties = use_jackknife  # otherwise, deviations will be zero

    def fit(self, data: Data) -> "RandomForestRegressionLolo":
        """Fits the model using training data.

        Parameters:
            data: labeled tabular data to train on

        Returns:
            self (allows chaining)
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)
        n = data.num_samples

        xtrain = params.real_matrix(data.samples(), nrows=n)
        ytrain = params.real_vector(data.labels(), dimensions=n)

        try:
            self._model.fit(xtrain, ytrain)
        except Py4JJavaError as e:
            raise BenchmarkError("training lolo model failed") from e

        return self

    def apply(self, data: Data) -> PredictiveDistribution:
        """Predicts new inputs.

        Parameters:
            data: finite indexed data to predict

        Returns:
            predictive normal distributions if predictive uncertainties were requested,
            otherwise delta distributions
        """

        data = params.instance(
            data, Data
        )  # todo: params.data(..., is_labeled=True, is_finite=True)

        xpred = params.real_matrix(data.samples())

        if self._with_uncertainties:
            try:
                preds, stddevs = self._model.predict(xpred, return_std=True)
                return NormalPredictiveDistribution(mean=preds, stddev=stddevs)
            except Py4JJavaError as e:
                raise BenchmarkError("applying lolo model failed") from e
        else:
            try:
                preds = self._model.predict(xpred, return_std=False)
                return DeltaPredictiveDistribution(mean=preds)
            except Py4JJavaError as e:
                raise BenchmarkError("applying lolo model failed") from e