def test_BaseRRegressor_without_p(generate_regression_data):
    """Check ``BaseRRegressor`` ATE and CATE estimates when no propensity is passed.

    The ATE point estimate must lie inside its own confidence interval and be
    within ``ERROR_THRESHOLD`` (absolute percentage error) of the mean true
    effect. The CATE predictions must accumulate more gain than random
    targeting.

    Args:
        generate_regression_data: pytest fixture returning a callable that
            yields ``(y, X, treatment, tau, b, e)``.
    """
    y, X, treatment, tau, b, e = generate_regression_data()

    learner = BaseRRegressor(learner=XGBRegressor())

    # check the accuracy of the ATE estimation
    ate_p, lb, ub = learner.estimate_ate(X=X, treatment=treatment, y=y)
    assert (ate_p >= lb) and (ate_p <= ub)
    assert ape(tau.mean(), ate_p) < ERROR_THRESHOLD

    # check the accuracy of the CATE estimation with the bootstrap CI
    cate_p, _, _ = learner.fit_predict(
        X=X, treatment=treatment, y=y, return_ci=True, n_bootstraps=10
    )

    # The true effect has to live under the same column name that is handed to
    # ``get_cumgain`` as ``treatment_effect_col``. It used to be stored under
    # the literal key 'treatment_effect_col', so the 'tau' column referenced
    # below did not exist in the frame.
    auuc_metrics = pd.DataFrame({
        'cate_p': cate_p.flatten(),
        'W': treatment,
        'y': y,
        'tau': tau,
    })

    cumgain = get_cumgain(
        auuc_metrics,
        outcome_col='y',
        treatment_col='W',
        treatment_effect_col='tau',
    )

    # Check if the cumulative gain when using the model's prediction is
    # higher than it would be under random targeting
    assert cumgain['cate_p'].sum() > cumgain['Random'].sum()
def __init__(
    self,
    learner=None,
    outcome_learner=None,
    effect_learner=None,
    random_state: StateType = None,
):
    """Create the wrapped ``BaseRRegressor``.

    When none of the three learners is supplied, a ``LinearRegression`` is
    used as the shared default learner.

    Args:
        learner: default learner for both outcome and effect
        outcome_learner: specific learner for outcome
        effect_learner: specific learner for effect
        random_state: RandomState or int to be used for K-fold splitting. NOT
            used in the learners, this has to be done by the user.
    """
    # Imported lazily so causalml is only required once a learner is built.
    from causalml.inference.meta import BaseRRegressor

    nothing_supplied = all(
        candidate is None
        for candidate in (learner, outcome_learner, effect_learner)
    )
    if nothing_supplied:
        learner = LinearRegression()

    # The normalised state is kept on the wrapper; the raw argument is what
    # gets forwarded to the underlying regressor.
    self.random_state = check_random_state(random_state)
    self.model = BaseRRegressor(
        learner,
        outcome_learner,
        effect_learner,
        random_state=random_state,
    )
def test_BaseRRegressor_without_p(generate_regression_data):
    """Exercise ``BaseRRegressor`` without an explicit propensity score.

    Verifies that the ATE estimate sits inside its confidence bounds and is
    within ``ERROR_THRESHOLD`` of the mean true effect, and that the CATE
    predictions reach a gini score above 0.5 against the true effect.
    """
    y, X, treatment, tau, b, e = generate_regression_data()
    r_learner = BaseRRegressor(learner=XGBRegressor())

    # ATE: the point estimate must fall within its own interval ...
    ate_estimate, lower, upper = r_learner.estimate_ate(
        X=X, treatment=treatment, y=y
    )
    assert ate_estimate >= lower
    assert ate_estimate <= upper
    # ... and be close to the mean of the true effects.
    ate_error = ape(tau.mean(), ate_estimate)
    assert ate_error < ERROR_THRESHOLD

    # CATE: fit with bootstrap confidence intervals, keep only the estimates.
    cate_estimate, _, _ = r_learner.fit_predict(
        X=X,
        treatment=treatment,
        y=y,
        return_ci=True,
        n_bootstraps=10,
    )
    ranking_quality = gini(tau, cate_estimate.flatten())
    assert ranking_quality > 0.5
def test_BaseRRegressor(generate_regression_data):
    """Check ``BaseRRegressor`` ATE and CATE estimates with a given propensity.

    Covers three things: the ATE estimate (inside its confidence interval and
    within ``ERROR_THRESHOLD`` of the mean true effect), that a second call
    with ``pretrain=True`` reproduces the same ATE and bounds, and that the
    CATE predictions accumulate more gain than random targeting.

    Args:
        generate_regression_data: pytest fixture returning a callable that
            yields ``(y, X, treatment, tau, b, e)``; ``e`` is passed as the
            propensity ``p``.
    """
    y, X, treatment, tau, b, e = generate_regression_data()

    learner = BaseRRegressor(learner=XGBRegressor())

    # check the accuracy of the ATE estimation
    ate_p, lb, ub = learner.estimate_ate(X=X, treatment=treatment, y=y, p=e)
    assert (ate_p >= lb) and (ate_p <= ub)
    assert ape(tau.mean(), ate_p) < ERROR_THRESHOLD

    # check pre-train model
    ate_p_pt, lb_pt, ub_pt = learner.estimate_ate(
        X=X, treatment=treatment, y=y, p=e, pretrain=True
    )
    assert (ate_p_pt == ate_p) and (lb_pt == lb) and (ub_pt == ub)

    # check the accuracy of the CATE estimation with the bootstrap CI
    cate_p, _, _ = learner.fit_predict(
        X=X, treatment=treatment, y=y, p=e, return_ci=True, n_bootstraps=10
    )

    # The true effect has to live under the same column name that is handed to
    # ``get_cumgain`` as ``treatment_effect_col``. It used to be stored under
    # the literal key "treatment_effect_col", so the "tau" column referenced
    # below did not exist in the frame.
    auuc_metrics = pd.DataFrame({
        "cate_p": cate_p.flatten(),
        "W": treatment,
        "y": y,
        "tau": tau,
    })

    cumgain = get_cumgain(
        auuc_metrics,
        outcome_col="y",
        treatment_col="W",
        treatment_effect_col="tau",
    )

    # Check if the cumulative gain when using the model's prediction is
    # higher than it would be under random targeting
    assert cumgain["cate_p"].sum() > cumgain["Random"].sum()
# Tail of a helper whose ``def`` line lies outside this excerpt: each prediction
# set is stored in ``predictions`` under its own key, then the result is returned.
    predictions['predictions_easy_treatment'] = predictions_easy_treatment
    predictions[
        'predictions_easy_treatment_test'] = predictions_easy_treatment_test
    predictions['predictions_randomized_trial'] = predictions_randomized_trial
    predictions[
        'predictions_randomized_trial_test'] = predictions_randomized_trial_test
    predictions['predictions_easy_propensity'] = predictions_easy_propensity
    predictions[
        'predictions_easy_propensity_test'] = predictions_easy_propensity_test
    return predictions


# R-learner configurations to evaluate, keyed by a short label
# (the decision-tree variant is currently disabled).
estimators_R = {
    #'learner_dtr': BaseRRegressor(learner=DecisionTreeRegressor()),
    'learner_xgb': BaseRRegressor(learner=XGBRegressor()),
    'learner_lr': BaseRRegressor(learner=LinearRegression())
}

# T-learner configurations using the same two base learners.
estimators_T = {
    'learner_xgb': BaseTRegressor(learner=XGBRegressor()),
    'learner_lr': BaseTRegressor(learner=LinearRegression())
}

import stacking_helpers

# Produce the predictions for every configured estimator of each family.
# NOTE(review): presumably this is the helper whose tail appears above - confirm.
predictions_R = generate_predicitons_by_learner(estimators_R)
predictions_T = generate_predicitons_by_learner(estimators_T)

# Pull out the R-learner results stored under the randomized-trial keys.
pred_R = predictions_R['predictions_randomized_trial']
pred_R_test = predictions_R['predictions_randomized_trial_test']
class RLearner:
    """A wrapper of the BaseRRegressor from ``causalml``

    Defaults to ``LinearRegression`` as a base learner if not specified
    otherwise. Allows to either specify one learner for both tasks or two
    distinct learners for the task outcome and effect learning.

    References:
        CausalML Framework `on Github <https://github.com/uber/causalml/>`_.

        [1] X. Nie and S. Wager,
            “Quasi-Oracle Estimation of Heterogeneous Treatment Effects.”
    """

    def __init__(
        self,
        learner=None,
        outcome_learner=None,
        effect_learner=None,
        random_state: StateType = None,
    ):
        """Setup an RLearner

        Args:
            learner: default learner for both outcome and effect
            outcome_learner: specific learner for outcome
            effect_learner: specific learner for effect
            random_state: RandomState or int to be used for K-fold splitting. NOT
                used in the learners, this has to be done by the user.
        """
        # Imported lazily so causalml is only required once a learner is built.
        from causalml.inference.meta import BaseRRegressor

        if learner is None and (outcome_learner is None and effect_learner is None):
            learner = LinearRegression()

        # The normalised state is kept on the wrapper; the raw argument is
        # what gets forwarded to the underlying regressor.
        self.random_state = check_random_state(random_state)
        self.model = BaseRRegressor(
            learner, outcome_learner, effect_learner, random_state=random_state
        )

    def __str__(self):
        """Simple string representation for logs and outputs"""
        outcome_name = type(self.model.model_mu).__name__
        effect_name = type(self.model.model_tau).__name__
        return f"{type(self).__name__}(outcome={outcome_name}, effect={effect_name})"

    def __repr__(self):
        """Same representation as ``__str__``."""
        return self.__str__()

    def fit(
        self,
        x: np.array,
        t: np.array,
        y: np.array,
        p: Optional[np.array] = None,
    ) -> None:
        """Fits the RLearner on given samples.

        Defaults to `justcause.learners.propensities.estimate_propensities`
        for ``p`` if not given explicitly, in order to allow a generic call
        to the fit() method

        Args:
            x: covariate matrix of shape (num_instances, num_features)
            t: treatment indicator vector, shape (num_instances)
            y: factual outcomes, (num_instances)
            p: propensities, shape (num_instances)
        """
        if p is None:
            # Propensity is needed by CausalML, so we estimate it,
            # if it was not provided
            p = estimate_propensities(x, t)

        # Pass everything by keyword: the call then does not depend on the
        # positional order of ``treatment``, ``y`` and ``p`` in the wrapped
        # regressor's ``fit`` signature.
        self.model.fit(X=x, treatment=t, y=y, p=p)

    def predict_ite(self, x: np.array, *args) -> np.array:
        """Predicts ITE for given samples; ignores the factual outcome and treatment

        Args:
            x: covariates used for prediction
            *args: NOT USED but kept to work with the standard ``fit(x, t, y)`` call

        Returns:
            the flattened predictions of the wrapped model for ``x``
        """
        # The R-Learner does not use factual outcomes, so ``*args`` is ignored.
        return self.model.predict(x).flatten()

    def estimate_ate(
        self,
        x: np.array,
        t: np.array,
        y: np.array,
        p: Optional[np.array] = None,
    ) -> float:
        """Estimate the average treatment effect (ATE) by fit and predict on given data

        Estimates the ATE as the mean of ITE predictions on the given data.

        Args:
            x: covariates of shape (num_samples, num_covariates)
            t: treatment indicator vector, shape (num_instances)
            y: factual outcomes, (num_instances)
            p: propensities, shape (num_instances)

        Returns:
            the average treatment effect estimate
        """
        self.fit(x, t, y, p)
        ite = self.predict_ite(x, t, y)
        return float(np.mean(ite))