Example #1
    def fit(self,
            X,
            y,
            sample_weight=None,
            cluster=None,
            compute_oob_predictions=False):
        """Fit the grf forest using training data.

        :param array2d X: training input features
        :param array1d y: training input targets
        :param array1d sample_weight: optional weights for input samples
        :param array1d cluster: optional cluster assignments for input samples
        :param bool compute_oob_predictions: whether to compute out-of-bag
            predictions while training
        """
        X, y = self._validate_data(X, y, force_all_finite="allow-nan")
        self._check_num_samples(X)
        self._check_n_features(X, reset=True)

        self._check_sample_fraction(oob=compute_oob_predictions)
        self._check_alpha()

        sample_weight, use_sample_weight = check_sample_weight(
            sample_weight, X)

        cluster_ = self._check_cluster(X=X, cluster=cluster)
        self.samples_per_cluster_ = self._check_equalize_cluster_weights(
            cluster=cluster_, sample_weight=sample_weight)
        self.mtry_ = self._check_mtry(X=X)

        train_matrix = self._create_train_matrices(X=X,
                                                   y=y,
                                                   sample_weight=sample_weight)

        self.grf_forest_ = grf.regression_train(
            np.asfortranarray(train_matrix.astype("float64")),
            self.outcome_index_,
            self.sample_weight_index_,
            use_sample_weight,
            self.mtry_,
            1,  # num_trees
            self.min_node_size,
            self.sample_fraction,
            self.honesty,
            self.honesty_fraction,
            self.honesty_prune_leaves,
            1,  # ci_group_size
            self.alpha,
            self.imbalance_penalty,
            cluster_,
            self.samples_per_cluster_,
            compute_oob_predictions,
            1,  # num_threads
            self.seed,
        )
        self._ensure_ptr()
        if sample_weight is None:
            sample_weight = np.ones(len(X))
        self._set_node_values(y, sample_weight)
        self._set_n_classes()
        return self
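A minimal usage sketch (not from the source); the class name, import path, and constructor arguments are assumptions, standing in for whichever estimator defines the fit above:

    import numpy as np
    from skgrf.tree import GRFTreeRegressor  # import path is an assumption

    rng = np.random.default_rng(42)
    X = rng.normal(size=(100, 5))
    y = X[:, 0] + rng.normal(size=100)
    cluster = np.repeat(np.arange(10), 10)  # ten clusters of ten samples

    est = GRFTreeRegressor(seed=42)  # hypothetical constructor arguments
    est.fit(X, y, cluster=cluster, compute_oob_predictions=True)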
Example #2
    def _estimate_using_regression(self,
                                   X,
                                   y,
                                   sample_weight=None,
                                   cluster=None):
        """Generate target estimates using a regression.

        In the R package, forest tuning is performed here. For now, we fit a
        single regression forest without tuning, and we don't expose any of
        its parametrization on the estimator.

        # TODO consider implementing tuning, exposing parameters.
        """
        train_matrix = self._create_train_matrices(
            X=X,
            y=y,
            sample_weight=sample_weight,
        )
        sample_weight, use_sample_weight = check_sample_weight(
            sample_weight, X)
        n_estimators = 50
        regression_forest = grf.regression_train(
            np.asfortranarray(train_matrix.astype("float64")),
            self.outcome_index_,
            self.sample_weight_index_,
            use_sample_weight,
            self.mtry_,
            n_estimators,  # num_trees
            5,  # min_node_size
            self.sample_fraction,
            True,  # honesty
            0.5,  # honesty_fraction
            self.honesty_prune_leaves,
            1,  # ci_group_size
            self.alpha,
            self.imbalance_penalty,
            cluster,
            self.samples_per_cluster_,
            True,  # compute_oob_predictions,
            self._get_num_threads(),  # num_threads,
            self.seed,
        )
        return np.atleast_1d(
            np.squeeze(np.array(regression_forest["predictions"])))
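The estimates returned here are used to orthogonalize the targets before training (see Example #4). A minimal sketch of that centering step, assuming est is an estimator mid-fit (so attributes like mtry_ and samples_per_cluster_ are already set):

    # per-sample OOB estimates of E[y | X] from the small regression forest
    y_hat = est._estimate_using_regression(X=X, y=y)
    y_centered = y - y_hat  # residualized targets for the downstream forest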
Example #3
    def fit(self, X, y, sample_weight=None, cluster=None):
        """Fit the grf tree using training data.

        :param array2d X: training input features
        :param array2d y: training input targets, rows of (bool, float) representing
            (survival, time)
        :param array1d sample_weight: optional weights for input samples
        :param array1d cluster: optional cluster assignments for input samples
        """
        X = check_array(X, force_all_finite="allow-nan")
        self._check_num_samples(X)
        self._check_n_features(X, reset=True)
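        # coerce (event, time) rows, possibly a structured array, into a
        # plain 2d float array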
        y = np.array(y.tolist())

        self._check_sample_fraction()
        self._check_alpha()

        cluster = self._check_cluster(X=X, cluster=cluster)
        self.samples_per_cluster_ = self._check_equalize_cluster_weights(
            cluster=cluster, sample_weight=sample_weight)

        sample_weight, use_sample_weight = check_sample_weight(
            sample_weight, X)

        self.mtry_ = self._check_mtry(X=X)

        # Extract the failure times from the training targets
        self.failure_times_ = np.sort(np.unique(y[:, 1][y[:, 0] == 1]))
        self.num_failures_ = len(self.failure_times_)

        # Relabel the failure times to consecutive integers
        y_times_relabeled = np.searchsorted(self.failure_times_, y[:, 1])
        y_censor = y[:, 0]

        train_matrix = self._create_train_matrices(X,
                                                   y_times_relabeled,
                                                   sample_weight=sample_weight,
                                                   censor=y_censor)
        self.train_ = train_matrix

        self.grf_forest_ = grf.survival_train(
            np.asfortranarray(train_matrix.astype("float64")),
            self.outcome_index_,
            self.censor_index_,
            self.sample_weight_index_,
            use_sample_weight,
            self.mtry_,
            1,  # num_trees
            self.min_node_size,
            self.sample_fraction,
            self.honesty,
            self.honesty_fraction,
            self.honesty_prune_leaves,
            self.alpha,
            self.num_failures_,
            cluster,
            self.samples_per_cluster_,
            False,  # compute_oob_predictions,
            1,  # num_threads,
            self.seed,
        )
        self._ensure_ptr()
        if sample_weight is None:
            sample_weight = np.ones(len(X))
        self._set_node_values(y, sample_weight)
        self._set_n_classes()
        return self
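A brief sketch (not from the source) of assembling the (survival, time) targets this fit expects; the class name is hypothetical:

    import numpy as np

    event = np.array([1, 0, 1, 1, 0])           # 1 = failure observed, 0 = censored
    time = np.array([2.0, 3.5, 1.2, 4.8, 2.2])  # observed or censoring times
    y = np.column_stack([event, time])          # rows of (survival, time)
    X = np.random.default_rng(0).normal(size=(5, 3))

    # est = GRFTreeSurvival()  # hypothetical class defining the fit above
    # est.fit(X, y)

A structured array of (bool, float) records also works, since the fit coerces y via y.tolist().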
Example #4
    def fit(
        self,
        X,
        y,
        w,  # treatment
        z,  # instrument
        y_hat=None,
        w_hat=None,
        z_hat=None,
        sample_weight=None,
        cluster=None,
    ):
        """Fit the grf forest using training data.

        :param array2d X: training input features
        :param array1d y: training input targets
        :param array1d w: training input treatments
        :param array1d z: training input instruments
        :param array1d y_hat: estimated expected target responses
        :param array1d w_hat: estimated treatment propensities
        :param array1d z_hat: estimated instrument propensities
        :param array1d sample_weight: optional weights for input samples
        :param array1d cluster: optional cluster assignments for input samples
        """
        X, y = self._validate_data(X, y, force_all_finite="allow-nan")
        self._check_num_samples(X)
        self._check_n_features(X, reset=True)

        self._check_sample_fraction()
        self._check_alpha()

        sample_weight, use_sample_weight = check_sample_weight(
            sample_weight, X)

        cluster = self._check_cluster(X=X, cluster=cluster)
        self.samples_per_cluster_ = self._check_equalize_cluster_weights(
            cluster=cluster, sample_weight=sample_weight)
        self.mtry_ = self._check_mtry(X=X)
        self._check_reduced_form_weight()

        if y_hat is None:
            logger.debug("estimating y_hat")
            y_hat = self._estimate_using_regression(
                X=X, y=y, sample_weight=sample_weight, cluster=cluster)

        if w_hat is None:
            logger.debug("estimating w_hat")
            w_hat = self._estimate_using_regression(
                X=X, y=w, sample_weight=sample_weight, cluster=cluster)

        # don't repeat calculations for the causal case, where the instrument
        # equals the treatment; a user-supplied z_hat is respected
        if z_hat is None and np.all(w == z):
            z_hat = w_hat

        if z_hat is None:
            logger.debug("estimating z_hat")
            z_hat = self._estimate_using_regression(
                X=X, y=z, sample_weight=sample_weight, cluster=cluster)

        y_centered = y - y_hat
        w_centered = w - w_hat
        z_centered = z - z_hat

        train_matrix = self._create_train_matrices(
            X=X,
            y=y_centered,
            sample_weight=sample_weight,
            treatment=w_centered,
            instrument=z_centered,
        )

        self.grf_forest_ = grf.instrumental_train(
            np.asfortranarray(train_matrix.astype("float64")),
            self.outcome_index_,
            self.treatment_index_,
            self.instrument_index_,
            self.sample_weight_index_,
            use_sample_weight,
            self.mtry_,
            1,  # num_trees
            self.min_node_size,
            self.sample_fraction,
            self.honesty,
            self.honesty_fraction,
            self.honesty_prune_leaves,
            1,  # ci_group_size
            self.reduced_form_weight,
            self.alpha,
            self.imbalance_penalty,
            self.stabilize_splits,
            cluster,
            self.samples_per_cluster_,
            False,  # compute_oob_predictions,
            1,  # num_threads,
            self.seed,
        )
        self._ensure_ptr()
        if sample_weight is None:
            sample_weight = np.ones(len(X))
        self._set_node_values(y, sample_weight)
        self._set_n_classes()
        return self
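For a causal forest the instrument equals the treatment, so the w == z shortcut above reuses w_hat. A hedged sketch of that case; the class name is an assumption:

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 4))
    w = rng.binomial(1, 0.5, size=200).astype("float64")  # treatment
    y = 2.0 * w + X[:, 0] + rng.normal(size=200)          # constant effect of 2

    # est = GRFTreeInstrumentalRegressor(seed=42)  # hypothetical class name
    # est.fit(X, y, w=w, z=w)  # causal case: instrument equals treatment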
Example #5
    def fit(self, X, y, sample_weight=None, cluster=None):
        """Fit the grf forest using training data.

        :param array2d X: training input features
        :param array1d y: training input targets
        :param array1d sample_weight: optional weights for input samples
        :param array1d cluster: optional cluster assignments for input samples
        """
        X, y = self._validate_data(X, y)
        self._check_num_samples(X)
        self._check_n_features(X, reset=True)

        self._check_sample_fraction()
        self._check_alpha()

        sample_weight, use_sample_weight = check_sample_weight(
            sample_weight, X)

        cluster = self._check_cluster(X=X, cluster=cluster)
        self.samples_per_cluster_ = self._check_equalize_cluster_weights(
            cluster=cluster, sample_weight=sample_weight)
        self.mtry_ = self._check_mtry(X=X)

        train_matrix = self._create_train_matrices(X,
                                                   y,
                                                   sample_weight=sample_weight)
        self.train_ = train_matrix

        if self.ll_split_variables is None:
            self.ll_split_variables_ = list(range(X.shape[1]))
        else:
            self.ll_split_variables_ = self.ll_split_variables

        # calculate overall beta
        if self.ll_split_cutoff is None:
            self.ll_split_cutoff_ = int(X.shape[0]**0.5)
        else:
            self.ll_split_cutoff_ = self.ll_split_cutoff

        if self.ll_split_cutoff_ > 0:
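            # ridge penalty matrix; zeroing the (0, 0) entry leaves the
            # intercept unpenalized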
            J = np.eye(X.shape[1] + 1)
            J[0, 0] = 0
            D = np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)
            self.overall_beta_ = np.linalg.solve(
                D.T @ D + self.ll_split_lambda * J, D.T @ y)
        else:
            self.overall_beta_ = np.empty((0, ), dtype=float, order="F")

        self.grf_forest_ = grf.ll_regression_train(
            np.asfortranarray(train_matrix.astype("float64")),
            self.outcome_index_,
            self.sample_weight_index_,
            self.ll_split_lambda,
            self.ll_split_weight_penalty,
            self.ll_split_variables_,
            self.ll_split_cutoff_,
            self.overall_beta_,
            use_sample_weight,
            self.mtry_,
            self.n_estimators,  # num_trees
            self.min_node_size,
            self.sample_fraction,
            self.honesty,
            self.honesty_fraction,
            self.honesty_prune_leaves,
            self.ci_group_size,
            self.alpha,
            self.imbalance_penalty,
            cluster,
            self.samples_per_cluster_,
            self._get_num_threads(),  # num_threads,
            self.seed,
        )
        self._ensure_ptr()

        if self.enable_tree_details:
            if sample_weight is None:
                sample_weight = np.ones(len(X))
            self._set_node_values(y, sample_weight)
            self._set_n_classes()

        return self
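The overall beta above is the closed-form ridge solution (D'D + lambda * J)^-1 D'y with an unpenalized intercept. A quick standalone check (illustrative, not from the source) against sklearn's Ridge, which likewise leaves the intercept unpenalized:

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.default_rng(0)
    X = rng.normal(size=(50, 3))
    y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=50)
    lam = 0.1

    D = np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)
    J = np.eye(X.shape[1] + 1)
    J[0, 0] = 0  # do not penalize the intercept
    beta = np.linalg.solve(D.T @ D + lam * J, D.T @ y)

    ridge = Ridge(alpha=lam).fit(X, y)
    assert np.allclose(beta[0], ridge.intercept_)
    assert np.allclose(beta[1:], ridge.coef_)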
Example #6
    def fit(self, X, y, sample_weight=None, cluster=None):
        """Fit the grf forest using training data.

        :param array2d X: training input features
        :param array1d y: training input targets
        :param array1d sample_weight: optional weights for input samples
        :param array1d cluster: optional cluster assignments for input samples
        """
        X, y = self._validate_data(X, y, force_all_finite="allow-nan")
        self._check_n_features(X, reset=True)

        self._check_boost_error_reduction()

        self._check_sample_fraction(oob=True)
        self._check_alpha()

        sample_weight, _ = check_sample_weight(sample_weight, X)

        cluster_ = self._check_cluster(X=X, cluster=cluster)
        self.samples_per_cluster_ = self._check_equalize_cluster_weights(
            cluster=cluster_, sample_weight=sample_weight)
        self.mtry_ = self._check_mtry(X=X)

        # the returned matrix is unused here; the call is made for its
        # validation side effects
        _ = self._create_train_matrices(X=X, y=y, sample_weight=sample_weight)

        # region tuning a regression forest
        regression_forest = GRFForestRegressor(
            n_estimators=self.tune_n_estimators,
            equalize_cluster_weights=self.equalize_cluster_weights,
            sample_fraction=self.sample_fraction,
            mtry=self.mtry,
            min_node_size=self.min_node_size,
            honesty=self.honesty,
            honesty_fraction=self.honesty_fraction,
            honesty_prune_leaves=self.honesty_prune_leaves,
            alpha=self.alpha,
            imbalance_penalty=self.imbalance_penalty,
            ci_group_size=self.ci_group_size,
            n_jobs=self.n_jobs,
            seed=self.seed,
        )
        if self.tune_params is None:
            logger.debug("not tuning boosted forest")
            regression_forest.fit(
                X=X,
                y=y,
                sample_weight=sample_weight,
                cluster=cluster,
                compute_oob_predictions=True,
            )
            params = regression_forest.get_params(deep=True)
            forest = regression_forest
        else:
            logger.debug("tuning boosted forest")
            tunable_params = (
                "sample_fraction",
                "mtry",
                "min_node_size",
                "honesty_fraction",
                "honesty_prune_leaves",
                "alpha",
                "imbalance_penalty",
            )
            param_distributions = {}
            for param in self.tune_params:
                if param not in tunable_params:
                    raise ValueError(
                        f"tuning param {param} not found in {str(tunable_params)}"
                    )
                param_distributions[param] = PARAM_DISTRIBUTIONS[param](
                    *X.shape)

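            # draw candidate points on the unit hypercube, then map each
            # coordinate through its parameter's distribution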
            uniform_samples = random.uniform(size=(self.tune_n_draws,
                                                   len(self.tune_params)))
            param_samples = np.zeros(shape=(self.tune_n_draws,
                                            len(self.tune_params)))
            for idx, param in enumerate(self.tune_params):
                param_samples[:, idx] = param_distributions[param].dist(
                    uniform_samples[:, idx])

            errors = []
            for draw in range(self.tune_n_draws):
                params = {
                    p: param_samples[draw, idx]
                    for idx, p in enumerate(self.tune_params)
                }
                regression_forest.set_params(**params)
                regression_forest.fit(
                    X=X,
                    y=y,
                    sample_weight=sample_weight,
                    cluster=cluster,
                    compute_oob_predictions=True,
                )
                errors.append(
                    np.nanmean(
                        regression_forest.grf_forest_["debiased_error"]))

            if np.any(np.isnan(errors)):
                raise ValueError(
                    "unable to tune because of NaN-valued forest error estimates; consider more trees"
                )

            if np.std(errors) == 0 or np.std(errors) / np.mean(errors) < 1e-10:
                raise ValueError(
                    "unable to tune because of constant errors for forests; consider more trees"
                )

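            # fit a Gaussian-process surrogate to the (draw, error) pairs and
            # minimize its predicted error surface over fresh random points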
            variance_guess = np.var(errors) / 2
            gp = GaussianProcessRegressor(alpha=variance_guess)
            gp.fit(uniform_samples, errors)

            opt_samples = random.uniform(size=(self.tune_n_draws,
                                               len(self.tune_params)))

            model_surface = gp.predict(opt_samples)
            tuned_params = np.zeros(shape=(self.tune_n_draws,
                                           len(self.tune_params)))
            for idx, param in enumerate(self.tune_params):
                tuned_params[:, idx] = param_distributions[param].dist(
                    opt_samples[:, idx])

            opt_idx = np.argmin(model_surface)
            params = {
                p: tuned_params[opt_idx, idx]
                for idx, p in enumerate(self.tune_params)
            }
            params["n_estimators"] = self.tune_n_estimators * 4
            regression_forest.set_params(**params)
            regression_forest.fit(
                X,
                y,
                sample_weight=sample_weight,
                cluster=cluster,
                compute_oob_predictions=True,
            )
            retrained_error = np.nanmean(
                regression_forest.grf_forest_["debiased_error"])

            default_params = {
                "sample_fraction": 0.5,
                "mtry": min(np.ceil(np.sqrt(X.shape[1]) + 20), X.shape[1]),
                "min_node_size": 5,
                "honesty_fraction": 0.5,
                "honesty_prune_leaves": True,
                "alpha": 0.05,
                "imbalance_penalty": 0,
            }
            default_forest = clone(regression_forest)
            default_forest.set_params(**default_params)
            default_forest.fit(
                X=X,
                y=y,
                sample_weight=sample_weight,
                cluster=cluster,
                compute_oob_predictions=True,
            )
            default_error = np.nanmean(
                default_forest.grf_forest_["debiased_error"])

            if default_error < retrained_error:
                params = default_forest.get_params()
                forest = default_forest
            else:
                params = regression_forest.get_params()
                forest = regression_forest
        # endregion

        # region boosting with the tuned forest
        logger.debug("boosting forest")
        current_pred = {
            "predictions": forest.grf_forest_["predictions"],
            "debiased_error": forest.grf_forest_["debiased_error"],
            "excess_error": forest.grf_forest_["excess_error"],
        }

        y_hat = np.atleast_1d(np.squeeze(np.array(
            current_pred["predictions"])))
        debiased_error = current_pred["debiased_error"]
        boosted_forests = {
            "forest": [forest],
            "error": [np.mean(debiased_error)],
        }

        step = 1
        while True:
            y_residual = y - y_hat
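            # with an explicit boost_steps, run exactly that many steps;
            # otherwise stop at boost_max_steps, or sooner if a small probe
            # forest no longer reduces the error by boost_error_reduction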
            if self.boost_steps is not None:
                if step > self.boost_steps:
                    break
            elif step > self.boost_max_steps:
                break
            else:
                forest_small = GRFForestRegressor(
                    sample_fraction=params["sample_fraction"],
                    mtry=params["mtry"],
                    n_estimators=self.boost_trees_tune,
                    n_jobs=self.n_jobs,
                    min_node_size=params["min_node_size"],
                    honesty=self.honesty,
                    honesty_fraction=params["honesty_fraction"],
                    honesty_prune_leaves=params["honesty_prune_leaves"],
                    seed=self.seed,
                    ci_group_size=self.ci_group_size,
                    alpha=params["alpha"],
                    imbalance_penalty=params["imbalance_penalty"],
                    equalize_cluster_weights=self.equalize_cluster_weights,
                )
                forest_small.fit(
                    X=X,
                    y=y_residual,
                    sample_weight=sample_weight,
                    cluster=cluster,
                    compute_oob_predictions=True,
                )
                step_error = forest_small.grf_forest_["debiased_error"]
                # stop if the probe forest fails to cut the error by the
                # boost_error_reduction factor
                if not (np.nanmean(step_error) <= self.boost_error_reduction *
                        np.nanmean(debiased_error)):
                    break

            forest_residual = GRFForestRegressor(
                sample_fraction=params["sample_fraction"],
                mtry=params["mtry"],
                n_estimators=self.n_estimators,
                n_jobs=self.n_jobs,
                min_node_size=params["min_node_size"],
                honesty=self.honesty,
                honesty_fraction=params["honesty_fraction"],
                honesty_prune_leaves=params["honesty_prune_leaves"],
                seed=self.seed,
                ci_group_size=self.ci_group_size,
                alpha=params["alpha"],
                imbalance_penalty=params["imbalance_penalty"],
                equalize_cluster_weights=self.equalize_cluster_weights,
            )
            forest_residual.fit(
                X,
                y_residual,
                sample_weight=sample_weight,
                cluster=cluster,
                compute_oob_predictions=True,
            )
            current_pred = {
                "predictions": forest_residual.grf_forest_["predictions"],
                "debiased_error":
                forest_residual.grf_forest_["debiased_error"],
                "excess_error": forest_residual.grf_forest_["excess_error"],
            }
            y_hat = y_hat + np.atleast_1d(
                np.squeeze(np.array(current_pred["predictions"])))
            debiased_error = current_pred["debiased_error"]
            boosted_forests["forest"].append(forest_residual)
            boosted_forests["error"].append(np.mean(debiased_error))
            step += 1
        # endregion

        boosted_forests["predictions"] = y_hat
        self.boosted_forests_ = boosted_forests
        return self
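A hedged usage sketch; the class name and import path are assumptions:

    import numpy as np
    from skgrf.ensemble import GRFBoostedForestRegressor  # path assumed

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    y = X[:, 0] ** 2 + rng.normal(size=200)

    booster = GRFBoostedForestRegressor(n_estimators=100, seed=42)
    booster.fit(X, y)

    # one fitted forest and one error value per boosting step, plus the
    # accumulated OOB predictions
    print(len(booster.boosted_forests_["forest"]))
    print(booster.boosted_forests_["predictions"][:5])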