def test_predict_proba(self):
    """Check that probability predictions sum to 1 within each treatment value.

    Builds a ``Standardization`` model around a logistic regression with
    ``predict_proba=True``, runs the shared multiclass output checks, and
    then asserts that for every treatment value the predicted outcome
    columns of each sample add up to 1.
    """
    self.estimator = Standardization(LogisticRegression(C=1e6, solver='lbfgs'),
                                     predict_proba=True)
    ind_outcome = self.verify_individual_multiclass_output()
    with self.subTest("Test results are probabilities - sum to 1:"):
        expected = pd.Series(1.0, index=ind_outcome.index)
        # Group the transposed frame by the "a" level instead of using
        # ``groupby(..., axis="columns")``, which recent pandas releases
        # deprecate. Summing over the rows of each transposed group is the
        # same per-sample sum over that treatment's outcome columns.
        for _, y_pred in ind_outcome.T.groupby(level="a"):
            pd.testing.assert_series_equal(expected,
                                           y_pred.sum(axis="index"))
def test_predict(self):
    """Check the output width when class labels (not probabilities) are predicted.

    With ``predict_proba=False`` the individual-outcome frame is expected to
    have one column per treatment value, and not one per outcome class.
    """
    X = self.data_3cls["X"]
    a = self.data_3cls["a"]
    y = self.data_3cls["y"]

    learner = LogisticRegression(C=1e6, solver='lbfgs')
    self.estimator = Standardization(learner, predict_proba=False)
    self.estimator.fit(X, a, y)
    ind_outcome = self.estimator.estimate_individual_outcome(X, a)

    with self.subTest("Output size, # predictions:"):
        n_columns = ind_outcome.shape[1]
        self.assertEqual(a.nunique(), n_columns)
        self.assertNotEqual(y.nunique(), n_columns)
def test_treatment_encoding(self):
    """Check fitting with string-valued treatments and ``encode_treatment=True``.

    The 0/1 treatment values of ``data_lin`` are relabelled to "p"/"q"
    before fitting. The test then verifies that a treatment encoder was
    created, that it holds exactly those two categories, and that the
    fitted learner has one coefficient per covariate plus one per
    treatment value.
    """
    X, y = self.data_lin["X"], self.data_lin["y"]
    a = self.data_lin["a"].replace({0: "p", 1: "q"})

    self.estimator = Standardization(LinearRegression(), encode_treatment=True)
    self.estimator.fit(X, a, y)

    with self.subTest("Treatment encoder created:"):
        self.assertTrue(hasattr(self.estimator, "treatment_encoder_"))

    with self.subTest("Treatment categories properly encoded"):
        encoded_categories = set(*self.estimator.treatment_encoder_.categories_)
        self.assertSetEqual({"p", "q"}, encoded_categories)

    with self.subTest("Fitted model has the right size"):
        expected_n_coefs = X.shape[1] + a.nunique()
        self.assertEqual(len(self.estimator.learner.coef_), expected_n_coefs)
def setUpClass(cls):
    """Create the ``DoublyRobustIpFeature`` estimator shared by the test class.

    Runs the base-class fixture setup, then combines an unstabilized ``IPW``
    weight model with a linear-regression ``Standardization`` outcome model.
    """
    TestDoublyRobustBase.setUpClass()
    # Avoids regularization of the model:
    ipw = IPW(LogisticRegression(C=1e6, solver='lbfgs'), use_stabilized=False)
    # ``normalize`` is not passed: scikit-learn removed that argument from
    # LinearRegression (passing it raises TypeError on recent releases), and
    # an unregularized least-squares fit does not depend on it.
    std = Standardization(LinearRegression())
    cls.estimator = DoublyRobustIpFeature(std, ipw)
def fit_and_predict_all_learners(self, data, estimator):
    """Fit the doubly robust estimator and collect three population outcomes.

    Args:
        data: Mapping with the keys ``"X"``, ``"a"`` and ``"y"``.
        estimator: The doubly robust estimator to fit. When ``None``,
            ``self.estimator`` is used instead.

    Returns:
        A tuple ``(doubly_res, std_res, ipw_res)``: the population outcome
        estimated by the doubly robust estimator, by a freshly fitted
        linear-regression ``Standardization`` model, and by the estimator's
        own ``weight_model``.
    """
    # Use the estimator the caller passed in (it used to be ignored in
    # favour of ``self.estimator``); keep ``self.estimator`` as the fallback.
    if estimator is None:
        estimator = self.estimator

    X, a, y = data["X"], data["a"], data["y"]
    estimator.fit(X, a, y)
    doubly_res = estimator.estimate_population_outcome(X, a)

    # ``normalize`` is not passed: scikit-learn removed that argument from
    # LinearRegression, and an unregularized least-squares fit does not
    # depend on it.
    std_model = Standardization(LinearRegression()).fit(X, a, y)
    std_res = std_model.estimate_population_outcome(X, a)

    ipw_res = estimator.weight_model.estimate_population_outcome(X, a, y)
    return doubly_res, std_res, ipw_res
class TestStandardization(TestStandardizationCommon):
    """Tests for ``Standardization`` wrapping a linear-regression learner.

    Most tests delegate to the ``ensure_*`` helpers, which are not defined in
    this class (presumably inherited from ``TestStandardizationCommon``).
    """

    @classmethod
    def setUpClass(cls):
        """Run the common fixture setup and create the shared estimator."""
        TestStandardizationCommon.setUpClass()
        # Avoids regularization of the model:
        # (``normalize`` is not passed: scikit-learn removed that argument
        # from LinearRegression, and an unregularized least-squares fit does
        # not depend on it.)
        cls.estimator = Standardization(LinearRegression())

    def setUp(self):
        """Refit the estimator on the linear dataset before every test."""
        self.estimator.fit(self.data_lin["X"], self.data_lin["a"],
                           self.data_lin["y"])

    def test_is_fitted(self):
        """The wrapped learner exposes ``coef_`` after fitting."""
        self.assertTrue(hasattr(self.estimator.learner, "coef_"))

    def test_effect_estimation(self):
        """The first coefficient matches ``data_lin["beta"]`` to 5 places."""
        with self.subTest("Check by model coefficient:"):
            self.assertAlmostEqual(self.estimator.learner.coef_[0],
                                   self.data_lin["beta"],
                                   places=5)
        self.ensure_effect_estimation()

    def test_observed_prediction(self):
        """Delegate to the shared observed-prediction check."""
        self.ensure_observed_prediction()

    def test_counterfactual_outcomes(self):
        """Delegate to the shared counterfactual-outcomes check."""
        self.ensure_counterfactual_outcomes()

    def test_treatment_encoding(self):
        """Check fitting with string treatments and ``encode_treatment=True``."""
        # Assigning on the instance leaves the class-level estimator untouched.
        self.estimator = Standardization(LinearRegression(),
                                         encode_treatment=True)
        a = self.data_lin["a"].replace({0: "p", 1: "q"})
        self.estimator.fit(self.data_lin["X"], a, self.data_lin["y"])
        with self.subTest("Treatment encoder created:"):
            self.assertTrue(hasattr(self.estimator, "treatment_encoder_"))
        with self.subTest("Treatment categories properly encoded"):
            self.assertSetEqual(
                {"p", "q"},
                set(*self.estimator.treatment_encoder_.categories_))
        with self.subTest("Fitted model has the right size"):
            # One coefficient per covariate plus one per treatment value.
            self.assertEqual(len(self.estimator.learner.coef_),
                             self.data_lin["X"].shape[1] + a.nunique())

    def test_pipeline_learner(self):
        """Delegate to the shared pipeline-learner check."""
        self.ensure_pipeline_learner()

    def test_many_models(self):
        """Delegate to the shared many-models check."""
        self.ensure_many_models()
def ensure_many_models(self, clip_min=None, clip_max=None):
    """Smoke-test fit and predict over many learner combinations.

    Each propensity learner is wrapped in an ``IPW`` model and paired with
    each outcome learner wrapped in ``Standardization``. Every pair is used
    to build an estimator of the same class as ``self.estimator``, which is
    fitted and asked for individual outcomes. A sub-test passes when nothing
    raises.

    Args:
        clip_min: Forwarded to ``IPW`` as ``clip_min``.
        clip_max: Forwarded to ``IPW`` as ``clip_max``.
    """
    from sklearn.ensemble import (
        GradientBoostingClassifier,
        GradientBoostingRegressor,
        RandomForestClassifier,
        RandomForestRegressor,
    )
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.linear_model import (
        ElasticNet,
        HuberRegressor,
        PassiveAggressiveRegressor,
        RANSACRegressor,
    )
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor
    from sklearn.svm import SVR, LinearSVR

    # Suppress convergence warnings only for the duration of this check,
    # rather than permanently altering the process-wide warning filters.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        data = self.create_uninformative_ox_dataset()
        for propensity_learner in [
                GradientBoostingClassifier(n_estimators=10),
                RandomForestClassifier(n_estimators=100),
                MLPClassifier(hidden_layer_sizes=(5, )),
                KNeighborsClassifier(n_neighbors=20)
        ]:
            weight_model = IPW(propensity_learner,
                               clip_min=clip_min,
                               clip_max=clip_max)
            # Class name only, e.g. "RandomForestClassifier".
            propensity_learner_name = str(propensity_learner).split(
                "(", maxsplit=1)[0]
            for outcome_learner in [
                    GradientBoostingRegressor(n_estimators=10),
                    RandomForestRegressor(n_estimators=10),
                    MLPRegressor(hidden_layer_sizes=(5, )),
                    ElasticNet(),
                    RANSACRegressor(),
                    HuberRegressor(),
                    PassiveAggressiveRegressor(),
                    KNeighborsRegressor(),
                    SVR(),
                    LinearSVR()
            ]:
                outcome_learner_name = str(outcome_learner).split(
                    "(", maxsplit=1)[0]
                outcome_model = Standardization(outcome_learner)

                with self.subTest("Test fit & predict using {} & {}".format(
                        propensity_learner_name, outcome_learner_name)):
                    model = self.estimator.__class__(outcome_model,
                                                     weight_model)
                    model.fit(data["X"],
                              data["a"],
                              data["y"],
                              refit_weight_model=False)
                    model.estimate_individual_outcome(data["X"], data["a"])
                    self.assertTrue(True)  # Fit did not crash
def test_standardization_matches_causallib(linear_data_pandas):
    """Compare ``StandardizationEstimator`` against causallib's ``Standardization``.

    A reference ``Standardization(LinearRegression())`` model is fitted on
    the fixture data. Its ITE estimates (difference of individual outcomes
    under treatment 1 and 0) and ATE estimate (the same difference of
    population outcomes) must match what a default
    ``StandardizationEstimator`` reports.
    """
    w, t, y = linear_data_pandas

    # Reference estimates computed directly with causallib.
    reference = Standardization(LinearRegression())
    reference.fit(w, t, y)

    potential_outcomes = reference.estimate_individual_outcome(w, t)
    expected_ite = potential_outcomes[1] - potential_outcomes[0]

    population_outcomes = reference.estimate_population_outcome(w, t)
    expected_ate = population_outcomes[1] - population_outcomes[0]

    # Estimator under test.
    standardization = StandardizationEstimator()
    standardization.fit(w, t, y)

    assert expected_ate == standardization.estimate_ate()
    pd.testing.assert_series_equal(expected_ite,
                                   standardization.estimate_ite())
def __init__(self, outcome_model=None):
    """Wrap ``outcome_model`` in a causallib ``Standardization`` estimator.

    Args:
        outcome_model: Learner passed to ``Standardization`` as ``learner``.
            When ``None`` (the default), a new ``LinearRegression()`` is
            created for this instance.
    """
    # A ``LinearRegression()`` default in the signature would be evaluated
    # once, so every instance built with the default would hold the very
    # same learner object. Create a fresh one per instance instead.
    if outcome_model is None:
        outcome_model = LinearRegression()
    super().__init__(causallib_estimator=Standardization(learner=outcome_model))
def test_many_models(self):
    """Smoke-test fitting over many propensity/outcome learner combinations.

    For every propensity learner (wrapped in ``IPW``), two groups of outcome
    learners (wrapped in ``Standardization``) are tried with an estimator of
    the same class as ``self.estimator``:

    * the first group is expected to fit without raising;
    * the second group is expected to raise ``TypeError`` on ``fit``.
    """
    from sklearn.ensemble import (
        GradientBoostingClassifier,
        GradientBoostingRegressor,
        RandomForestClassifier,
        RandomForestRegressor,
    )
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.linear_model import (
        ElasticNet,
        HuberRegressor,
        PassiveAggressiveRegressor,
        RANSACRegressor,
    )
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor
    from sklearn.svm import SVR, LinearSVR

    # Suppress convergence warnings only for the duration of this test,
    # rather than permanently altering the process-wide warning filters.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        data = self.create_uninformative_ox_dataset()
        for propensity_learner in [
                GradientBoostingClassifier(n_estimators=10),
                RandomForestClassifier(n_estimators=100),
                MLPClassifier(hidden_layer_sizes=(5, )),
                KNeighborsClassifier(n_neighbors=20)
        ]:
            weight_model = IPW(propensity_learner)
            # Class name only, e.g. "RandomForestClassifier".
            propensity_learner_name = str(propensity_learner).split(
                "(", maxsplit=1)[0]
            for outcome_learner in [
                    GradientBoostingRegressor(n_estimators=10),
                    RandomForestRegressor(n_estimators=10),
                    RANSACRegressor(),
                    HuberRegressor(),
                    SVR(),
                    LinearSVR()
            ]:
                outcome_learner_name = str(outcome_learner).split(
                    "(", maxsplit=1)[0]
                outcome_model = Standardization(outcome_learner)

                with self.subTest("Test fit using {} & {}".format(
                        propensity_learner_name, outcome_learner_name)):
                    model = self.estimator.__class__(outcome_model,
                                                     weight_model)
                    model.fit(data["X"],
                              data["a"],
                              data["y"],
                              refit_weight_model=False)
                    self.assertTrue(True)  # Fit did not crash

            for outcome_learner in [
                    MLPRegressor(hidden_layer_sizes=(5, )),
                    # ElasticNet(),  # supports sample_weights since v0.23, remove to support v<0.23
                    PassiveAggressiveRegressor(),
                    KNeighborsRegressor()
            ]:
                outcome_learner_name = str(outcome_learner).split(
                    "(", maxsplit=1)[0]
                outcome_model = Standardization(outcome_learner)

                with self.subTest("Test fit using {} & {}".format(
                        propensity_learner_name, outcome_learner_name)):
                    model = self.estimator.__class__(outcome_model,
                                                     weight_model)
                    with self.assertRaises(TypeError):
                        # Joffe forces learning with sample_weights,
                        # not all ML models support that and so calling should fail
                        model.fit(data["X"],
                                  data["a"],
                                  data["y"],
                                  refit_weight_model=False)
def test_decision_function(self):
    """Run the shared multiclass output checks with an SVC learner.

    The estimator is built with ``predict_proba=True`` around an
    ``SVC(decision_function_shape='ovr')``.
    """
    svc_learner = SVC(decision_function_shape='ovr')
    self.estimator = Standardization(svc_learner, predict_proba=True)
    self.verify_individual_multiclass_output()
class TestStandardizationClassification(TestStandardizationCommon):
    """Tests for ``Standardization`` with classifiers on a three-class outcome."""

    @classmethod
    def setUpClass(cls):
        """Generate the three-class dataset stored in ``cls.data_3cls``."""
        # Three-class outcome, since decision_function might return a vector when n_classes=2, and we wish to check the
        # matrix form of the output behaves as expected:
        X, y = make_classification(n_features=3,
                                   n_informative=2,
                                   n_redundant=0,
                                   n_repeated=0,
                                   n_classes=3,
                                   n_clusters_per_class=1,
                                   flip_y=0.0,
                                   class_sep=10.0)
        # The last generated feature becomes a binary treatment by
        # thresholding at its median; the remaining features are covariates.
        X, a = X[:, :-1], X[:, -1]
        a = (a > np.median(a)).astype(int)
        cls.data_3cls = {
            "X": pd.DataFrame(X),
            "a": pd.Series(a),
            "y": pd.Series(y)
        }

        # X, y = make_classification(n_features=2, n_informative=1, n_redundant=0, n_repeated=0, n_classes=2,
        #                            n_clusters_per_class=1, flip_y=0.0, class_sep=10.0)
        # X, a = X[:, :-1], X[:, -1]
        # a = (a > np.median(a)).astype(int)
        # cls.data_2cls = {"X": pd.DataFrame(X), "a": pd.Series(a), "y": pd.Series(y)}

    def verify_individual_multiclass_output(self):
        """Fit ``self.estimator`` on the 3-class data and check the output shape.

        Returns:
            The individual-outcome frame produced by the fitted estimator,
            so callers can run further assertions on it.
        """
        self.estimator.fit(self.data_3cls["X"], self.data_3cls["a"],
                           self.data_3cls["y"])
        ind_outcome = self.estimator.estimate_individual_outcome(
            self.data_3cls["X"], self.data_3cls["a"])

        with self.subTest("Output size, # samples:"):
            self.assertEqual(self.data_3cls["X"].shape[0],
                             ind_outcome.shape[0])
        with self.subTest("Output size, # predictions:"):
            with self.subTest(
                    "Output's multiindex level names are describing treatment and outcome"
            ):
                self.assertEqual(["a", "y"], ind_outcome.columns.names)
            with self.subTest(
                    "Output's number of predictions is the same as number of outcome and treatment values"
            ):
                # One column per (treatment value, outcome class) pair.
                self.assertEqual(
                    self.data_3cls["a"].nunique() *
                    self.data_3cls["y"].nunique(), ind_outcome.shape[1])
                self.assertEqual(
                    self.data_3cls["a"].nunique(),
                    ind_outcome.columns.get_level_values("a").unique().size)
                self.assertEqual(
                    self.data_3cls["y"].nunique(),
                    ind_outcome.columns.get_level_values("y").unique().size)
        return ind_outcome

    def test_predict_proba(self):
        """Probability predictions sum to 1 within each treatment value."""
        self.estimator = Standardization(LogisticRegression(C=1e6,
                                                            solver='lbfgs'),
                                         predict_proba=True)
        ind_outcome = self.verify_individual_multiclass_output()
        with self.subTest("Test results are probabilities - sum to 1:"):
            expected = pd.Series(1.0, index=ind_outcome.index)
            # Group the transposed frame by the "a" level instead of using
            # ``groupby(..., axis="columns")``, which recent pandas releases
            # deprecate. Summing over the rows of each transposed group is
            # the same per-sample sum over that treatment's outcome columns.
            for _, y_pred in ind_outcome.T.groupby(level="a"):
                pd.testing.assert_series_equal(expected,
                                               y_pred.sum(axis="index"))

    def test_decision_function(self):
        """Run the multiclass output checks with an SVC learner."""
        self.estimator = Standardization(SVC(decision_function_shape='ovr'),
                                         predict_proba=True)
        self.verify_individual_multiclass_output()

    def test_predict(self):
        """With ``predict_proba=False`` there is one column per treatment value."""
        self.estimator = Standardization(LogisticRegression(C=1e6,
                                                            solver='lbfgs'),
                                         predict_proba=False)
        self.estimator.fit(self.data_3cls["X"], self.data_3cls["a"],
                           self.data_3cls["y"])
        ind_outcome = self.estimator.estimate_individual_outcome(
            self.data_3cls["X"], self.data_3cls["a"])
        with self.subTest("Output size, # predictions:"):
            self.assertEqual(self.data_3cls["a"].nunique(),
                             ind_outcome.shape[1])
            self.assertNotEqual(self.data_3cls["y"].nunique(),
                                ind_outcome.shape[1])
def setUpClass(cls):
    """Run the common fixture setup and create the shared estimator.

    The estimator is a ``Standardization`` model wrapping a plain
    ``LinearRegression`` learner.
    """
    TestStandardizationCommon.setUpClass()
    # Avoids regularization of the model:
    # (``normalize`` is not passed: scikit-learn removed that argument from
    # LinearRegression, and an unregularized least-squares fit does not
    # depend on it.)
    cls.estimator = Standardization(LinearRegression())
def init(self, reduced, importance_sampling):
    """Build the ``TMLE`` estimator and store it on ``self._estimator``.

    Args:
        reduced: Forwarded to ``TMLE`` as ``reduced``.
        importance_sampling: Forwarded to ``TMLE`` as ``importance_sampling``.
    """
    outcome_model = Standardization(self.outcome_model_cont)
    weight_model = IPW(self.treatment_model)
    self._estimator = TMLE(
        outcome_model,
        weight_model,
        reduced=reduced,
        importance_sampling=importance_sampling,
    )