def fit(self, X, y, sample_weight=None, cluster=None, compute_oob_predictions=False):
    """Fit the grf tree using training data.

    :param array2d X: training input features
    :param array1d y: training input targets
    :param array1d sample_weight: optional weights for input samples
    :param array1d cluster: optional cluster assignments for input samples
    :param bool compute_oob_predictions: whether to compute out-of-bag predictions
    """
    X, y = self._validate_data(X, y, force_all_finite="allow-nan")
    self._check_num_samples(X)
    self._check_n_features(X, reset=True)

    self._check_sample_fraction(oob=compute_oob_predictions)
    self._check_alpha()

    sample_weight, use_sample_weight = check_sample_weight(sample_weight, X)

    cluster_ = self._check_cluster(X=X, cluster=cluster)
    self.samples_per_cluster_ = self._check_equalize_cluster_weights(
        cluster=cluster_, sample_weight=sample_weight
    )
    self.mtry_ = self._check_mtry(X=X)

    train_matrix = self._create_train_matrices(X=X, y=y, sample_weight=sample_weight)

    self.grf_forest_ = grf.regression_train(
        np.asfortranarray(train_matrix.astype("float64")),
        self.outcome_index_,
        self.sample_weight_index_,
        use_sample_weight,
        self.mtry_,
        1,  # num_trees
        self.min_node_size,
        self.sample_fraction,
        self.honesty,
        self.honesty_fraction,
        self.honesty_prune_leaves,
        1,  # ci_group_size
        self.alpha,
        self.imbalance_penalty,
        cluster_,
        self.samples_per_cluster_,
        compute_oob_predictions,
        1,  # num_threads
        self.seed,
    )
    self._ensure_ptr()

    sample_weight = sample_weight if sample_weight is not None else np.ones(len(X))
    self._set_node_values(y, sample_weight)
    self._set_n_classes()
    return self
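# A minimal usage sketch for the fit method above. The class name
# GRFTreeRegressor and the import path are assumptions (this method matches a
# single-tree regression wrapper, since num_trees is hardcoded to 1); only the
# fit signature is taken from the source.
import numpy as np
from skgrf.tree import GRFTreeRegressor  # assumed import path

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 5))        # 100 samples, 5 features
y = X[:, 0] + rng.normal(size=100)   # noisy linear target

tree = GRFTreeRegressor(seed=42)
tree.fit(X, y, compute_oob_predictions=False)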
def _estimate_using_regression(self, X, y, sample_weight=None, cluster=None):
    """Generate target estimates using a regression.

    In the R package, they perform forest tuning here. For now, we just
    perform a single regression without tuning. We also don't expose any of
    the forest parametrization for this process in the estimator.

    # TODO consider implementing tuning, exposing parameters.
    """
    train_matrix = self._create_train_matrices(
        X=X,
        y=y,
        sample_weight=sample_weight,
    )
    sample_weight, use_sample_weight = check_sample_weight(sample_weight, X)
    n_estimators = 50
    regression_forest = grf.regression_train(
        np.asfortranarray(train_matrix.astype("float64")),
        self.outcome_index_,
        self.sample_weight_index_,
        use_sample_weight,
        self.mtry_,
        n_estimators,  # num_trees
        5,  # min_node_size
        self.sample_fraction,
        True,  # honesty
        0.5,  # honesty_fraction
        self.honesty_prune_leaves,
        1,  # ci_group_size
        self.alpha,
        self.imbalance_penalty,
        cluster,
        self.samples_per_cluster_,
        True,  # compute_oob_predictions
        self._get_num_threads(),  # num_threads
        self.seed,
    )
    return np.atleast_1d(np.squeeze(np.array(regression_forest["predictions"])))
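# The helper above amounts to fitting a small honest regression forest and
# returning its out-of-bag predictions as nuisance estimates. A rough
# public-API equivalent, sketched with GRFForestRegressor (the class used
# elsewhere in this module); the import path is an assumption.
import numpy as np
from skgrf.ensemble import GRFForestRegressor  # assumed import path

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = X[:, 1] ** 2 + rng.normal(size=200)

rf = GRFForestRegressor(n_estimators=50, honesty=True, seed=0)
rf.fit(X, y, compute_oob_predictions=True)
y_hat = np.atleast_1d(np.squeeze(np.array(rf.grf_forest_["predictions"])))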
def fit(self, X, y, sample_weight=None, cluster=None):
    """Fit the grf tree using training data.

    :param array2d X: training input features
    :param array2d y: training input targets, rows of (bool, float)
        representing (survival, time)
    :param array1d sample_weight: optional weights for input samples
    :param array1d cluster: optional cluster assignments for input samples
    """
    X = check_array(X, force_all_finite="allow-nan")
    self._check_num_samples(X)
    self._check_n_features(X, reset=True)

    y = np.array(y.tolist())

    self._check_sample_fraction()
    self._check_alpha()

    cluster = self._check_cluster(X=X, cluster=cluster)
    self.samples_per_cluster_ = self._check_equalize_cluster_weights(
        cluster=cluster, sample_weight=sample_weight
    )
    sample_weight, use_sample_weight = check_sample_weight(sample_weight, X)
    self.mtry_ = self._check_mtry(X=X)

    # Extract the unique failure times from the training targets
    self.failure_times_ = np.sort(np.unique(y[:, 1][y[:, 0] == 1]))
    self.num_failures_ = len(self.failure_times_)

    # Relabel the failure times to consecutive integers
    y_times_relabeled = np.searchsorted(self.failure_times_, y[:, 1])
    y_censor = y[:, 0]

    train_matrix = self._create_train_matrices(
        X, y_times_relabeled, sample_weight=sample_weight, censor=y_censor
    )
    self.train_ = train_matrix

    self.grf_forest_ = grf.survival_train(
        np.asfortranarray(train_matrix.astype("float64")),
        self.outcome_index_,
        self.censor_index_,
        self.sample_weight_index_,
        use_sample_weight,
        self.mtry_,
        1,  # num_trees
        self.min_node_size,
        self.sample_fraction,
        self.honesty,
        self.honesty_fraction,
        self.honesty_prune_leaves,
        self.alpha,
        self.num_failures_,
        cluster,
        self.samples_per_cluster_,
        False,  # compute_oob_predictions
        1,  # num_threads
        self.seed,
    )
    self._ensure_ptr()

    sample_weight = sample_weight if sample_weight is not None else np.ones(len(X))
    self._set_node_values(y, sample_weight)
    self._set_n_classes()
    return self
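# Usage sketch for the survival fit above. The class name GRFTreeSurvival,
# the import path, and the structured dtype of y are assumptions; the
# (event, time) layout follows the y.tolist() conversion and the
# (survival, time) rows described in the docstring.
import numpy as np
from skgrf.tree import GRFTreeSurvival  # assumed import path

rng = np.random.default_rng(7)
X = rng.normal(size=(100, 3))
time = rng.exponential(scale=5.0, size=100)
event = rng.random(100) < 0.7  # ~70% observed failures, rest censored
y = np.array(list(zip(event, time)), dtype=[("event", "?"), ("time", "f8")])

est = GRFTreeSurvival(seed=7)
est.fit(X, y)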
def fit(
    self,
    X,
    y,
    w,  # treatment
    z,  # instrument
    y_hat=None,
    w_hat=None,
    z_hat=None,
    sample_weight=None,
    cluster=None,
):
    """Fit the grf tree using training data.

    :param array2d X: training input features
    :param array1d y: training input targets
    :param array1d w: training input treatments
    :param array1d z: training input instruments
    :param array1d y_hat: estimated expected target responses
    :param array1d w_hat: estimated treatment propensities
    :param array1d z_hat: estimated instrument propensities
    :param array1d sample_weight: optional weights for input samples
    :param array1d cluster: optional cluster assignments for input samples
    """
    X, y = self._validate_data(X, y, force_all_finite="allow-nan")
    self._check_num_samples(X)
    self._check_n_features(X, reset=True)

    self._check_sample_fraction()
    self._check_alpha()

    sample_weight, use_sample_weight = check_sample_weight(sample_weight, X)

    cluster = self._check_cluster(X=X, cluster=cluster)
    self.samples_per_cluster_ = self._check_equalize_cluster_weights(
        cluster=cluster, sample_weight=sample_weight
    )
    self.mtry_ = self._check_mtry(X=X)
    self._check_reduced_form_weight()

    if y_hat is None:
        logger.debug("estimating y_hat")
        y_hat = self._estimate_using_regression(
            X=X, y=y, sample_weight=sample_weight, cluster=cluster
        )

    if w_hat is None:
        logger.debug("estimating w_hat")
        w_hat = self._estimate_using_regression(
            X=X, y=w, sample_weight=sample_weight, cluster=cluster
        )

    # don't repeat calculations for the causal case, where w == z
    if np.all(w == z):
        z_hat = w_hat

    if z_hat is None:
        logger.debug("estimating z_hat")
        z_hat = self._estimate_using_regression(
            X=X, y=z, sample_weight=sample_weight, cluster=cluster
        )

    y_centered = y - y_hat
    w_centered = w - w_hat
    z_centered = z - z_hat

    train_matrix = self._create_train_matrices(
        X=X,
        y=y_centered,
        sample_weight=sample_weight,
        treatment=w_centered,
        instrument=z_centered,
    )

    self.grf_forest_ = grf.instrumental_train(
        np.asfortranarray(train_matrix.astype("float64")),
        self.outcome_index_,
        self.treatment_index_,
        self.instrument_index_,
        self.sample_weight_index_,
        use_sample_weight,
        self.mtry_,
        1,  # num_trees
        self.min_node_size,
        self.sample_fraction,
        self.honesty,
        self.honesty_fraction,
        self.honesty_prune_leaves,
        1,  # ci_group_size
        self.reduced_form_weight,
        self.alpha,
        self.imbalance_penalty,
        self.stabilize_splits,
        cluster,
        self.samples_per_cluster_,
        False,  # compute_oob_predictions
        1,  # num_threads
        self.seed,
    )
    self._ensure_ptr()

    sample_weight = sample_weight if sample_weight is not None else np.ones(len(X))
    self._set_node_values(y, sample_weight)
    self._set_n_classes()
    return self
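# Usage sketch for the instrumental fit above, in the causal special case
# where the treatment serves as its own instrument (w == z), exercising the
# z_hat = w_hat shortcut. The class name GRFTreeInstrumentalRegressor and the
# import path are assumptions.
import numpy as np
from skgrf.tree import GRFTreeInstrumentalRegressor  # assumed import path

rng = np.random.default_rng(3)
X = rng.normal(size=(200, 4))
w = (rng.random(200) < 0.5).astype(float)      # binary treatment
y = X[:, 0] + 2.0 * w + rng.normal(size=200)   # constant treatment effect of 2

est = GRFTreeInstrumentalRegressor(seed=3)
est.fit(X, y, w, w)  # y_hat, w_hat, z_hat are estimated internally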
def fit(self, X, y, sample_weight=None, cluster=None):
    """Fit the grf forest using training data.

    :param array2d X: training input features
    :param array1d y: training input targets
    :param array1d sample_weight: optional weights for input samples
    :param array1d cluster: optional cluster assignments for input samples
    """
    X, y = self._validate_data(X, y)
    self._check_num_samples(X)
    self._check_n_features(X, reset=True)

    self._check_sample_fraction()
    self._check_alpha()

    sample_weight, use_sample_weight = check_sample_weight(sample_weight, X)

    cluster = self._check_cluster(X=X, cluster=cluster)
    self.samples_per_cluster_ = self._check_equalize_cluster_weights(
        cluster=cluster, sample_weight=sample_weight
    )
    self.mtry_ = self._check_mtry(X=X)

    train_matrix = self._create_train_matrices(X, y, sample_weight=sample_weight)
    self.train_ = train_matrix

    if self.ll_split_variables is None:
        self.ll_split_variables_ = list(range(X.shape[1]))
    else:
        self.ll_split_variables_ = self.ll_split_variables

    # calculate the overall beta
    if self.ll_split_cutoff is None:
        self.ll_split_cutoff_ = int(X.shape[0] ** 0.5)
    else:
        self.ll_split_cutoff_ = self.ll_split_cutoff

    if self.ll_split_cutoff_ > 0:
        # ridge regression on the design [1, X] with an unpenalized intercept
        J = np.eye(X.shape[1] + 1)
        J[0, 0] = 0
        D = np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)
        self.overall_beta_ = (
            np.linalg.solve(D.T @ D + self.ll_split_lambda * J, np.eye(X.shape[1] + 1))
            @ D.T
            @ y
        )
    else:
        self.overall_beta_ = np.empty((0,), dtype=float, order="F")

    self.grf_forest_ = grf.ll_regression_train(
        np.asfortranarray(train_matrix.astype("float64")),
        self.outcome_index_,
        self.sample_weight_index_,
        self.ll_split_lambda,
        self.ll_split_weight_penalty,
        self.ll_split_variables_,
        self.ll_split_cutoff_,
        self.overall_beta_,
        use_sample_weight,
        self.mtry_,
        self.n_estimators,  # num_trees
        self.min_node_size,
        self.sample_fraction,
        self.honesty,
        self.honesty_fraction,
        self.honesty_prune_leaves,
        self.ci_group_size,
        self.alpha,
        self.imbalance_penalty,
        cluster,
        self.samples_per_cluster_,
        self._get_num_threads(),  # num_threads
        self.seed,
    )
    self._ensure_ptr()

    if self.enable_tree_details:
        sample_weight = sample_weight if sample_weight is not None else np.ones(len(X))
        self._set_node_values(y, sample_weight)
        self._set_n_classes()
    return self
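# The overall_beta_ computed above is a ridge fit on the design [1, X] with an
# unpenalized intercept (J zeroes out the intercept's penalty). A standalone
# sketch of the same algebra, using an assumed ll_split_lambda of 0.1:
import numpy as np

rng = np.random.default_rng(5)
X = rng.normal(size=(50, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=50)
ll_split_lambda = 0.1

J = np.eye(X.shape[1] + 1)
J[0, 0] = 0  # leave the intercept unpenalized
D = np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)
beta = np.linalg.solve(D.T @ D + ll_split_lambda * J, D.T @ y)  # (intercept, coefs)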
def fit(self, X, y, sample_weight=None, cluster=None):
    """Fit the grf forest using training data.

    :param array2d X: training input features
    :param array1d y: training input targets
    :param array1d sample_weight: optional weights for input samples
    :param array1d cluster: optional cluster assignments for input samples
    """
    X, y = self._validate_data(X, y, force_all_finite="allow-nan")
    self._check_n_features(X, reset=True)

    self._check_boost_error_reduction()
    self._check_sample_fraction(oob=True)
    self._check_alpha()

    sample_weight, _ = check_sample_weight(sample_weight, X)

    cluster_ = self._check_cluster(X=X, cluster=cluster)
    self.samples_per_cluster_ = self._check_equalize_cluster_weights(
        cluster=cluster_, sample_weight=sample_weight
    )
    self.mtry_ = self._check_mtry(X=X)

    _ = self._create_train_matrices(X=X, y=y, sample_weight=sample_weight)

    # region tuning a regression forest
    regression_forest = GRFForestRegressor(
        n_estimators=self.tune_n_estimators,
        equalize_cluster_weights=self.equalize_cluster_weights,
        sample_fraction=self.sample_fraction,
        mtry=self.mtry,
        min_node_size=self.min_node_size,
        honesty=self.honesty,
        honesty_fraction=self.honesty_fraction,
        honesty_prune_leaves=self.honesty_prune_leaves,
        alpha=self.alpha,
        imbalance_penalty=self.imbalance_penalty,
        ci_group_size=self.ci_group_size,
        n_jobs=self.n_jobs,
        seed=self.seed,
    )
    if self.tune_params is None:
        logger.debug("not tuning boosted forest")
        regression_forest.fit(
            X=X,
            y=y,
            sample_weight=sample_weight,
            cluster=cluster,
            compute_oob_predictions=True,
        )
        params = regression_forest.get_params(deep=True)
        forest = regression_forest
    else:
        logger.debug("tuning boosted forest")
        tunable_params = (
            "sample_fraction",
            "mtry",
            "min_node_size",
            "honesty_fraction",
            "honesty_prune_leaves",
            "alpha",
            "imbalance_penalty",
        )
        param_distributions = {}
        for param in self.tune_params:
            if param not in tunable_params:
                raise ValueError(
                    f"tuning param {param} not found in {str(tunable_params)}"
                )
            param_distributions[param] = PARAM_DISTRIBUTIONS[param](*X.shape)

        # draw uniform samples and map them through each parameter's distribution
        uniform_samples = random.uniform(
            size=(self.tune_n_draws, len(self.tune_params))
        )
        param_samples = np.zeros(shape=(self.tune_n_draws, len(self.tune_params)))
        for idx, param in enumerate(self.tune_params):
            param_samples[:, idx] = param_distributions[param].dist(
                uniform_samples[:, idx]
            )

        errors = []
        for draw in range(self.tune_n_draws):
            params = {
                p: param_samples[draw, idx]
                for idx, p in enumerate(self.tune_params)
            }
            regression_forest.set_params(**params)
            regression_forest.fit(
                X=X,
                y=y,
                sample_weight=sample_weight,
                cluster=cluster,
                compute_oob_predictions=True,
            )
            errors.append(
                np.nanmean(regression_forest.grf_forest_["debiased_error"])
            )

        if np.any(np.isnan(errors)):
            raise ValueError(
                "unable to tune because of NaN-valued forest error estimates; "
                "consider more trees"
            )
        if np.std(errors) == 0 or np.std(errors) / np.mean(errors) < 1e-10:
            raise ValueError(
                "unable to tune because of constant errors for forests; "
                "consider more trees"
            )

        # model the error surface with a Gaussian process and minimize it
        variance_guess = np.var(errors) / 2
        gp = GaussianProcessRegressor(alpha=variance_guess)
        gp.fit(uniform_samples, errors)

        opt_samples = random.uniform(
            size=(self.tune_n_draws, len(self.tune_params))
        )
        model_surface = gp.predict(opt_samples)
        tuned_params = np.zeros(shape=(self.tune_n_draws, len(self.tune_params)))
        for idx, param in enumerate(self.tune_params):
            tuned_params[:, idx] = param_distributions[param].dist(
                opt_samples[:, idx]
            )
        opt_idx = np.argmin(model_surface)
        params = {
            p: tuned_params[opt_idx, idx]
            for idx, p in enumerate(self.tune_params)
        }
        params.update({"n_estimators": self.tune_n_estimators * 4})

        # retrain with the tuned parameters and compare against defaults
        regression_forest.set_params(**params)
        regression_forest.fit(
            X,
            y,
            sample_weight=sample_weight,
            cluster=cluster,
            compute_oob_predictions=True,
        )
        retrained_error = np.nanmean(
            regression_forest.grf_forest_["debiased_error"]
        )

        default_params = {
            "sample_fraction": 0.5,
            "mtry": min(np.ceil(np.sqrt(X.shape[1]) + 20), X.shape[1]),
            "min_node_size": 5,
            "honesty_fraction": 0.5,
            "honesty_prune_leaves": True,
            "alpha": 0.05,
            "imbalance_penalty": 0,
        }
        default_forest = clone(regression_forest)
        default_forest.set_params(**default_params)
        default_forest.fit(
            X=X,
            y=y,
            sample_weight=sample_weight,
            cluster=cluster,
            compute_oob_predictions=True,
        )
        default_error = np.nanmean(default_forest.grf_forest_["debiased_error"])

        if default_error < retrained_error:
            params = default_forest.get_params()
            forest = default_forest
        else:
            params = regression_forest.get_params()
            forest = regression_forest
    # endregion

    # region boosting with the tuned forest
    logger.debug("boosting forest")
    current_pred = {
        "predictions": forest.grf_forest_["predictions"],
        "debiased_error": forest.grf_forest_["debiased_error"],
        "excess_error": forest.grf_forest_["excess_error"],
    }
    y_hat = np.atleast_1d(np.squeeze(np.array(current_pred["predictions"])))
    debiased_error = current_pred["debiased_error"]
    boosted_forests = {
        "forest": [forest],
        "error": [np.mean(debiased_error)],
    }

    step = 1
    while True:
        y_residual = y - y_hat
        if self.boost_steps is not None:
            if step > self.boost_steps:
                break
        elif step > self.boost_max_steps:
            break
        else:
            # check whether another boosting step reduces the error enough
            forest_small = GRFForestRegressor(
                sample_fraction=params["sample_fraction"],
                mtry=params["mtry"],
                n_estimators=self.boost_trees_tune,
                n_jobs=self.n_jobs,
                min_node_size=params["min_node_size"],
                honesty=self.honesty,
                honesty_fraction=params["honesty_fraction"],
                honesty_prune_leaves=params["honesty_prune_leaves"],
                seed=self.seed,
                ci_group_size=self.ci_group_size,
                alpha=params["alpha"],
                imbalance_penalty=params["imbalance_penalty"],
                equalize_cluster_weights=self.equalize_cluster_weights,
            )
            forest_small.fit(
                X=X,
                y=y_residual,
                sample_weight=sample_weight,
                cluster=cluster,
                compute_oob_predictions=True,
            )
            step_error = forest_small.grf_forest_["debiased_error"]
            if not np.nanmean(step_error) <= self.boost_error_reduction * np.nanmean(
                debiased_error
            ):
                break

        # fit a full-size forest on the residuals and accumulate its predictions
        forest_residual = GRFForestRegressor(
            sample_fraction=params["sample_fraction"],
            mtry=params["mtry"],
            n_estimators=self.n_estimators,
            n_jobs=self.n_jobs,
            min_node_size=params["min_node_size"],
            honesty=self.honesty,
            honesty_fraction=params["honesty_fraction"],
            honesty_prune_leaves=params["honesty_prune_leaves"],
            seed=self.seed,
            ci_group_size=self.ci_group_size,
            alpha=params["alpha"],
            imbalance_penalty=params["imbalance_penalty"],
            equalize_cluster_weights=self.equalize_cluster_weights,
        )
        forest_residual.fit(
            X,
            y_residual,
            sample_weight=sample_weight,
            cluster=cluster,
            compute_oob_predictions=True,
        )
        current_pred = {
            "predictions": forest_residual.grf_forest_["predictions"],
            "debiased_error": forest_residual.grf_forest_["debiased_error"],
            "excess_error": forest_residual.grf_forest_["excess_error"],
        }
        y_hat = y_hat + np.atleast_1d(
            np.squeeze(np.array(current_pred["predictions"]))
        )
        debiased_error = current_pred["debiased_error"]
        boosted_forests["forest"].append(forest_residual)
        boosted_forests["error"].append(np.mean(debiased_error))
        step += 1
    # endregion

    boosted_forests["predictions"] = y_hat
    self.boosted_forests_ = boosted_forests
    return self
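# Usage sketch for the boosted fit above. The class name
# GRFBoostedForestRegressor and the import path are assumptions; the
# tune_params values come from the tunable_params tuple in the source, and
# boosted_forests_["predictions"] holds the accumulated out-of-bag estimates.
import numpy as np
from skgrf.ensemble import GRFBoostedForestRegressor  # assumed import path

rng = np.random.default_rng(11)
X = rng.normal(size=(300, 6))
y = np.sin(X[:, 0]) + rng.normal(scale=0.1, size=300)

boosted = GRFBoostedForestRegressor(
    tune_params=["min_node_size", "sample_fraction"],  # subset of tunable_params
    seed=11,
)
boosted.fit(X, y)
y_hat = boosted.boosted_forests_["predictions"]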