Example #1
    def test_accuracy(self):
        np.random.seed(123)
        # data-generating process (binary treatment T, binary instrument Z)

        def dgp(n, p, true_fn):
            X = np.random.normal(0, 1, size=(n, p))
            Z = np.random.binomial(1, 0.5, size=(n,))
            nu = np.random.uniform(0, 10, size=(n,))
            coef_Z = 0.8
            C = np.random.binomial(
                1, coef_Z * special.expit(0.4 * X[:, 0] + nu)
            )  # Compliers when recommended
            C0 = np.random.binomial(
                1, 0.06 * np.ones(X.shape[0])
            )  # Non-compliers: take treatment even when not recommended
            T = C * Z + C0 * (1 - Z)
            y = true_fn(X) * T + 2 * nu + 5 * (X[:, 3] > 0) + 0.1 * np.random.uniform(0, 1, size=(n,))
            return y, T, Z, X

        ests_list = [
            LinearIntentToTreatDRIV(
                flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                fit_cate_intercept=True,
            ),
            LinearDRIV(
                fit_cate_intercept=True,
                projection=False,
                discrete_instrument=True,
                discrete_treatment=True,
                flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
            ),
        ]

        # no heterogeneity
        n = 1000
        p = 10
        true_ate = 10

        def true_fn(X):
            return true_ate
        y, T, Z, X = dgp(n, p, true_fn)
        for est in ests_list:
            with self.subTest(est=est):
                est.fit(y, T, Z=Z, X=None, W=X, inference="auto")
                ate_lb, ate_ub = est.ate_interval()
                np.testing.assert_array_less(ate_lb, true_ate)
                np.testing.assert_array_less(true_ate, ate_ub)

        # with heterogeneity
        true_coef = 10

        def true_fn(X):
            return true_coef * X[:, 0]
        y, T, Z, X = dgp(n, p, true_fn)
        for est in ests_list:
            with self.subTest(est=est):
                est.fit(y, T, Z=Z, X=X[:, [0]], W=X[:, 1:], inference="auto")
                coef_lb, coef_ub = est.coef__interval()
                intercept_lb, intercept_ub = est.intercept__interval(alpha=0.05)
                np.testing.assert_array_less(coef_lb, true_coef)
                np.testing.assert_array_less(true_coef, coef_ub)
                np.testing.assert_array_less(intercept_lb, 0)
                np.testing.assert_array_less(0, intercept_ub)
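
Note: the snippet is a test method, so its import header is not shown. A minimal sketch of the imports it relies on, assuming EconML's public module layout (not the exact test-file header):

import numpy as np
from scipy import special
from econml.iv.dr import LinearDRIV, LinearIntentToTreatDRIV
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression

Both estimators use the binary instrument Z to identify the effect among compliers; the assertions only check that the true ATE (and, in the heterogeneous case, the true coefficient and a zero intercept) fall inside the estimated confidence intervals.
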
Example #2
    def test_auto_inference(self):
        Y, T, X, W = TestInference.Y, TestInference.T, TestInference.X, TestInference.W
        est = DRLearner(model_regression=LinearRegression(),
                        model_propensity=LogisticRegression(),
                        model_final=StatsModelsLinearRegression())
        est.fit(Y, T, X=X, W=W)
        est.effect_inference(X).summary_frame()
        est.effect_inference(X).population_summary()
        est.const_marginal_effect_inference(X).summary_frame()
        est.marginal_effect_inference(T, X).summary_frame()
        est = DRLearner(model_regression=LinearRegression(),
                        model_propensity=LogisticRegression(),
                        model_final=LinearRegression(),
                        multitask_model_final=True)
        est.fit(Y, T, X=X, W=W)
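        # the final model here is a plain sklearn LinearRegression, which carries
        # no inference machinery, so inference calls are expected to raise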
        with pytest.raises(AttributeError):
            est.effect_inference(X)

        est = DML(model_y=LinearRegression(),
                  model_t=LinearRegression(),
                  model_final=StatsModelsLinearRegression(fit_intercept=False),
                  random_state=123)
        est.fit(Y, T, X=X, W=W)
        est.summary()
        est.coef__inference().summary_frame()
        assert est.coef__inference().stderr is not None
        est.intercept__inference().summary_frame()
        assert est.intercept__inference().stderr is not None
        est.effect_inference(X).summary_frame()
        assert est.effect_inference(X).stderr is not None
        est.effect_inference(X).population_summary()
        est.const_marginal_effect_inference(X).summary_frame()
        assert est.const_marginal_effect_inference(X).stderr is not None
        est.marginal_effect_inference(T, X).summary_frame()
        assert est.marginal_effect_inference(T, X).stderr is not None

        est = NonParamDML(model_y=LinearRegression(),
                          model_t=LinearRegression(),
                          model_final=DebiasedLasso(),
                          random_state=123)
        est.fit(Y, T, X=X, W=W)
        est.effect_inference(X).summary_frame()
        assert est.effect_inference(X).stderr is not None
        est.effect_inference(X).population_summary()
        est.const_marginal_effect_inference(X).summary_frame()
        assert est.const_marginal_effect_inference(X).stderr is not None
        est.marginal_effect_inference(T, X).summary_frame()
        assert est.marginal_effect_inference(T, X).stderr is not None
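
Note: TestInference.Y/T/X/W are fixtures of the surrounding test class. Assuming EconML's public layout, the snippet needs roughly:

import pytest
from sklearn.linear_model import LinearRegression, LogisticRegression
from econml.dml import DML, NonParamDML
from econml.dr import DRLearner
from econml.sklearn_extensions.linear_model import DebiasedLasso, StatsModelsLinearRegression

With inference left at its "auto" default, final models that support it (StatsModelsLinearRegression, DebiasedLasso) yield standard errors, while the plain-LinearRegression multitask variant does not.
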
Example #3
    def test_dowhy(self):
        def reg():
            return LinearRegression()

        def clf():
            return LogisticRegression()

        Y, T, X, W, Z = self._get_data()
        # test at least one estimator from each category
        models = {"dml": LinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True,
                                   linear_first_stages=False),
                  "dr": DRLearner(model_propensity=clf(), model_regression=reg(),
                                  model_final=reg()),
                  "forestdr": ForestDRLearner(model_propensity=clf(), model_regression=reg()),
                  "xlearner": XLearner(models=reg(), cate_models=reg(), propensity_model=clf()),
                  "cfdml": CausalForestDML(model_y=reg(), model_t=clf(), discrete_treatment=True),
                  "orf": DROrthoForest(n_trees=10, propensity_model=clf(), model_Y=reg()),
                  "orthoiv": OrthoIV(model_y_xw=reg(),
                                     model_t_xw=clf(),
                                     model_z_xw=reg(),
                                     discrete_treatment=True,
                                     discrete_instrument=False),
                  "dmliv": DMLIV(fit_cate_intercept=True,
                                 discrete_treatment=True,
                                 discrete_instrument=False),
                  "driv": LinearDRIV(flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                                     fit_cate_intercept=True,
                                     discrete_instrument=False,
                                     discrete_treatment=True)}
        for name, model in models.items():
            with self.subTest(name=name):
                est = model
                if name == "xlearner":
                    est_dowhy = est.dowhy.fit(Y, T, X=np.hstack((X, W)), W=None)
                elif name in ["orthoiv", "dmliv", "driv"]:
                    est_dowhy = est.dowhy.fit(Y, T, Z=Z, X=X, W=W)
                else:
                    est_dowhy = est.dowhy.fit(Y, T, X=X, W=W)
                # test causal graph
                est_dowhy.view_model()
                # test refutation estimate
                est_dowhy.refute_estimate(method_name="random_common_cause")
                if name != "orf":
                    est_dowhy.refute_estimate(method_name="add_unobserved_common_cause",
                                              confounders_effect_on_treatment="binary_flip",
                                              confounders_effect_on_outcome="linear",
                                              effect_strength_on_treatment=0.1,
                                              effect_strength_on_outcome=0.1)
                    est_dowhy.refute_estimate(method_name="placebo_treatment_refuter", placebo_type="permute",
                                              num_simulations=3)
                    est_dowhy.refute_estimate(method_name="data_subset_refuter", subset_fraction=0.8,
                                              num_simulations=3)
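
Note: self._get_data is a helper of the test class, and the .dowhy wrapper requires the separate dowhy package to be installed. Assumed imports (module paths per EconML's public layout; a sketch, not the test file's header):

import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from econml.dml import CausalForestDML, LinearDML
from econml.dr import DRLearner, ForestDRLearner
from econml.iv.dml import DMLIV, OrthoIV
from econml.iv.dr import LinearDRIV
from econml.metalearners import XLearner
from econml.orf import DROrthoForest
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
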
Example #4
    def test_rlearner_residuals(self):
        y, T, X, W = self._get_data()

        dml = DML(model_y=LinearRegression(),
                  model_t=LinearRegression(),
                  cv=1,
                  model_final=StatsModelsLinearRegression(fit_intercept=False),
                  linear_first_stages=False,
                  random_state=123)
        with pytest.raises(AttributeError):
            y_res, T_res, X_res, W_res = dml.residuals_
        dml.fit(y, T, X=X, W=W)
        with pytest.raises(AttributeError):
            y_res, T_res, X_res, W_res = dml.residuals_
        dml.fit(y, T, X=X, W=W, cache_values=True)
        y_res, T_res, X_res, W_res = dml.residuals_
        np.testing.assert_array_equal(X, X_res)
        np.testing.assert_array_equal(W, W_res)
        XW = np.hstack([X, W])
        np.testing.assert_array_equal(y_res, y - LinearRegression().fit(XW, y).predict(XW))
        np.testing.assert_array_equal(T_res, T - LinearRegression().fit(XW, T).predict(XW))
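
Note: residuals_ is only populated when fit is called with cache_values=True. Because cv=1 disables cross-fitting, each first-stage model is fit once on the full sample, which is why the cached residuals must match a single LinearRegression fit on the stacked [X, W], exactly what the last two assertions verify. Assumed imports:

import numpy as np
import pytest
from sklearn.linear_model import LinearRegression
from econml.dml import DML
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
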
Example #5
    def test_orthoiv_random_state(self):
        Y, T, X, W, X_test = self._make_data(500, 2)
        for est in [
            OrthoIV(
                model_y_xw=RandomForestRegressor(n_estimators=10, max_depth=4, random_state=123),
                model_t_xw=RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123),
                model_z_xw=RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123),
                discrete_treatment=True,
                discrete_instrument=True,
                cv=2,
                random_state=123),
            NonParamDMLIV(
                model_y_xw=RandomForestRegressor(n_estimators=10, max_depth=4, random_state=123),
                model_t_xw=RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123),
                model_t_xwz=RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123),
                model_final=LinearRegression(),
                discrete_treatment=True,
                discrete_instrument=True,
                cv=2,
                random_state=123),
            LinearDRIV(
                model_y_xw=RandomForestRegressor(n_estimators=10, max_depth=4, random_state=123),
                model_t_xw=RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123),
                model_z_xw=RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123),
                model_tz_xw=RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123),
                flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                discrete_treatment=True,
                discrete_instrument=True,
                cv=2,
                random_state=123),
            IntentToTreatDRIV(
                model_y_xw=RandomForestRegressor(n_estimators=10, max_depth=4, random_state=123),
                model_t_xwz=RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123),
                flexible_model_effect=RandomForestRegressor(n_estimators=10, max_depth=4, random_state=123),
                cv=2,
                random_state=123),
            LinearIntentToTreatDRIV(
                model_y_xw=RandomForestRegressor(n_estimators=10, max_depth=4, random_state=123),
                model_t_xwz=RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123),
                flexible_model_effect=RandomForestRegressor(n_estimators=10, max_depth=4, random_state=123),
                cv=2,
                random_state=123),
        ]:
            TestRandomState._test_random_state(est, X_test, Y, T, X=X, W=W, Z=T)
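
Note: _test_random_state and _make_data are helpers of the surrounding TestRandomState class; pinning random_state=123 on every nuisance model and on each estimator lets the helper confirm that repeated fits reproduce identical results. Assumed imports:

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from econml.iv.dml import NonParamDMLIV, OrthoIV
from econml.iv.dr import IntentToTreatDRIV, LinearDRIV, LinearIntentToTreatDRIV
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
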
Example #6
    def test_dml(self):
        """Test setting attributes and refitting"""
        y, T, X, W = self._get_data()

        dml = DML(model_y=LinearRegression(),
                  model_t=LinearRegression(),
                  model_final=StatsModelsLinearRegression(fit_intercept=False),
                  linear_first_stages=False,
                  random_state=123)
        dml.fit(y, T, X=X, W=W)
        with pytest.raises(Exception):
            dml.refit_final()
        dml.fit(y, T, X=X, W=W, cache_values=True)
        dml.model_final = StatsModelsRLM(fit_intercept=False)
        dml.refit_final()
        assert isinstance(dml.model_cate, StatsModelsRLM)
        np.testing.assert_array_equal(dml.model_cate.coef_[1:].flatten(), dml.coef_.flatten())
        lb, ub = dml.model_cate.coef__interval(alpha=0.01)
        lbt, ubt = dml.coef__interval(alpha=0.01)
        np.testing.assert_array_equal(lb[1:].flatten(), lbt.flatten())
        np.testing.assert_array_equal(ub[1:].flatten(), ubt.flatten())
        intcpt = dml.intercept_
        dml.fit_cate_intercept = False
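        # setting the attribute does not change already-fitted results until refit_final() is called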
        np.testing.assert_equal(dml.intercept_, intcpt)
        dml.refit_final()
        np.testing.assert_array_equal(dml.model_cate.coef_.flatten(), dml.coef_.flatten())
        lb, ub = dml.model_cate.coef__interval(alpha=0.01)
        lbt, ubt = dml.coef__interval(alpha=0.01)
        np.testing.assert_array_equal(lb.flatten(), lbt.flatten())
        np.testing.assert_array_equal(ub.flatten(), ubt.flatten())
        with pytest.raises(AttributeError):
            dml.intercept_
        with pytest.raises(AttributeError):
            dml.intercept__interval()
        dml.model_final = DebiasedLasso(fit_intercept=False)
        dml.refit_final()
        assert isinstance(dml.model_cate, DebiasedLasso)
        dml.featurizer = PolynomialFeatures(degree=2, include_bias=False)
        dml.model_final = StatsModelsLinearRegression(fit_intercept=False)
        dml.refit_final()
        assert isinstance(dml.featurizer_, PolynomialFeatures)
        dml.fit_cate_intercept = True
        dml.refit_final()
        assert isinstance(dml.featurizer_, Pipeline)
        np.testing.assert_array_equal(dml.coef_.shape, (X.shape[1]**2,))
        np.testing.assert_array_equal(dml.coef__interval()[0].shape, (X.shape[1]**2,))
        coefpre = dml.coef_
        coefpreint = dml.coef__interval()
        dml.fit(y, T, X=X, W=W)
        np.testing.assert_array_equal(coefpre, dml.coef_)
        np.testing.assert_array_equal(coefpreint[0], dml.coef__interval()[0])
        dml.discrete_treatment = True
        dml.featurizer = None
        dml.linear_first_stages = True
        dml.model_t = LogisticRegression()
        dml.fit(y, T, X=X, W=W)
        newdml = DML(model_y=LinearRegression(),
                     model_t=LogisticRegression(),
                     model_final=StatsModelsLinearRegression(fit_intercept=False),
                     discrete_treatment=True,
                     linear_first_stages=True,
                     random_state=123).fit(y, T, X=X, W=W)
        np.testing.assert_array_equal(dml.coef_, newdml.coef_)
        np.testing.assert_array_equal(dml.coef__interval()[0], newdml.coef__interval()[0])

        ldml = LinearDML(model_y=LinearRegression(),
                         model_t=LinearRegression(),
                         linear_first_stages=False)
        ldml.fit(y, T, X=X, W=W, cache_values=True)
        # can set the final model for plain DML, but not for LinearDML (hardcoded to StatsModelsLinearRegression)
        with pytest.raises(ValueError):
            ldml.model_final = StatsModelsRLM()

        ldml = SparseLinearDML(model_y=LinearRegression(),
                               model_t=LinearRegression(),
                               linear_first_stages=False)
        ldml.fit(y, T, X=X, W=W, cache_values=True)
        # can set the final model for plain DML, but not for SparseLinearDML (hardcoded to DebiasedLasso)
        with pytest.raises(ValueError):
            ldml.model_final = StatsModelsRLM()
        ldml.alpha = 0.01
        ldml.max_iter = 10
        ldml.tol = 0.01
        ldml.refit_final()
        np.testing.assert_equal(ldml.model_cate.estimators_[0].alpha, 0.01)
        np.testing.assert_equal(ldml.model_cate.estimators_[0].max_iter, 10)
        np.testing.assert_equal(ldml.model_cate.estimators_[0].tol, 0.01)
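
Note: a sketch of the imports this snippet assumes (the StatsModelsRLM path in particular is taken from EconML's sklearn extensions and may differ across versions):

import numpy as np
import pytest
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from econml.dml import DML, LinearDML, SparseLinearDML
from econml.sklearn_extensions.linear_model import (DebiasedLasso,
                                                    StatsModelsLinearRegression,
                                                    StatsModelsRLM)
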
Example #7
    def test_comp_with_statsmodels(self):
        """ Comparing with confidence intervals and standard errors of statsmodels in the un-weighted case """
        np.random.seed(123)

        # Single dimensional output y
        n = 1000
        d = 3
        X = np.random.binomial(1, .8, size=(n, d))
        T = np.random.binomial(1, .5 * X[:, 0] + .25, size=(n,))

        def true_effect(x):
            return x[:, 0] + .5
        y = true_effect(X) * T + X[:, 0] + X[:, 2] + np.random.normal(0, 1, size=(n,))
        X_test = np.unique(np.random.binomial(1, .5, size=(n, d)), axis=0)
        for fit_intercept in [True, False]:
            for cov_type in ['nonrobust', 'HC0', 'HC1']:
                est = OLS(fit_intercept=fit_intercept, cov_type=cov_type).fit(X, y)
                lr = StatsModelsOLS(fit_intercept=fit_intercept, fit_args={
                                    'cov_type': cov_type, 'use_t': False}).fit(X, y)
                _compare_classes(est, lr, X_test)

        n = 1000
        d = 3
        X = np.random.normal(0, 1, size=(n, d))
        y = X[:, 0] + X[:, 2] + np.random.normal(0, 1, size=(n,))
        X_test = np.unique(np.random.binomial(1, .5, size=(n, d)), axis=0)
        for fit_intercept in [True, False]:
            for cov_type in ['nonrobust', 'HC0', 'HC1']:
                est = OLS(fit_intercept=fit_intercept, cov_type=cov_type).fit(X, y)
                lr = StatsModelsOLS(fit_intercept=fit_intercept, fit_args={
                                    'cov_type': cov_type, 'use_t': False}).fit(X, y)
                _compare_classes(est, lr, X_test)

        d = 3
        X = np.vstack([np.eye(d)])
        y = np.concatenate((X[:, 0] - 1, X[:, 0] + 1))
        X = np.vstack([X, X])
        X_test = np.unique(np.random.binomial(1, .5, size=(n, d)), axis=0)

        for cov_type in ['nonrobust', 'HC0', 'HC1']:
            for alpha in [.01, .05, .1]:
                _compare_classes(OLS(fit_intercept=False, cov_type=cov_type).fit(X, y),
                                 StatsModelsOLS(fit_intercept=False, fit_args={
                                                'cov_type': cov_type, 'use_t': False}).fit(X, y),
                                 X_test, alpha=alpha)

        d = 3
        X = np.vstack([np.eye(d), np.ones((1, d)), np.zeros((1, d))])
        y = np.concatenate((X[:, 0] - 1, X[:, 0] + 1))
        X = np.vstack([X, X])
        X_test = np.unique(np.random.binomial(1, .5, size=(n, d)), axis=0)
        for cov_type in ['nonrobust', 'HC0', 'HC1']:
            _compare_classes(OLS(fit_intercept=True, cov_type=cov_type).fit(X, y),
                             StatsModelsOLS(fit_intercept=True,
                                            fit_args={'cov_type': cov_type, 'use_t': False}).fit(X, y), X_test)

        # Multi-dimensional output y
        n = 1000
        d = 3
        for p in np.arange(1, 4):
            X = np.random.binomial(1, .8, size=(n, d))
            T = np.random.binomial(1, .5 * X[:, 0] + .25, size=(n,))

            def true_effect(x):
                return np.hstack([x[:, [0]] + .5 + t for t in range(p)])
            y = true_effect(X) * T.reshape(-1, 1) + X[:, [0] * p] + \
                np.random.normal(0, 1, size=(n, p))

            for cov_type in ['nonrobust', 'HC0', 'HC1']:
                for fit_intercept in [True, False]:
                    for alpha in [.01, .05, .2]:
                        est = OLS(fit_intercept=fit_intercept, cov_type=cov_type).fit(X, y)
                        lr = [StatsModelsOLS(fit_intercept=fit_intercept, fit_args={
                                             'cov_type': cov_type, 'use_t': False}).fit(X, y[:, t]) for t in range(p)]
                        for t in range(p):
                            assert np.all(np.abs(est.coef_[t] - lr[t].coef_) < 1e-12),\
                                "{}, {}, {}: {}, {}".format(cov_type, fit_intercept, t, est.coef_[t], lr[t].coef_)
                            assert np.all(np.abs(np.array(est.coef__interval(alpha=alpha))[:, t] -
                                                 lr[t].coef__interval(alpha=alpha)) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t,
                                                              np.array(est.coef__interval(alpha=alpha))[:, t],
                                                              lr[t].coef__interval(alpha=alpha))
                            assert np.all(np.abs(est.intercept_[t] - lr[t].intercept_) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t,
                                                              est.intercept_[t], lr[t].intercept_)
                            assert np.all(np.abs(np.array(est.intercept__interval(alpha=alpha))[:, t] -
                                                 lr[t].intercept__interval(alpha=alpha)) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t,
                                                              np.array(est.intercept__interval(alpha=alpha))[:, t],
                                                              lr[t].intercept__interval(alpha=alpha))
                            assert np.all(np.abs(est.predict(X_test)[:, t] - lr[t].predict(X_test)) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t, est.predict(X_test)[
                                                              :, t], lr[t].predict(X_test))
                            assert np.all(np.abs(np.array(est.predict_interval(X_test, alpha=alpha))[:, :, t] -
                                                 lr[t].predict_interval(X_test, alpha=alpha)) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t,
                                                              np.array(est.predict_interval(X_test,
                                                                                            alpha=alpha))[:, :, t],
                                                              lr[t].predict_interval(X_test, alpha=alpha))
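
Note: OLS, StatsModelsOLS, and _compare_classes are helpers of the surrounding test module, not public API. Plausibly, OLS is an alias for EconML's StatsModelsLinearRegression and StatsModelsOLS is a thin wrapper around statsmodels' OLS used as the reference implementation; a hedged sketch:

import numpy as np
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression as OLS  # assumed alias
# StatsModelsOLS and _compare_classes are defined alongside the test itself.
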
Example #8
    def test_inference(self):
        """ Testing that we recover the expected standard errors and confidence intervals in a known example """

        # 1-d output
        d = 3
        X = np.vstack([np.eye(d)])
        y = X[:, 0]
        est = OLS(fit_intercept=False).fit(X, y)
        assert np.all(np.abs(est.coef_ - [1, 0, 0]) <= 1e-12), "{}, {}".format(est.coef_, [1, 0, 0])
        assert np.all(np.abs(est.coef__interval() - np.array([[1, 0, 0], [1, 0, 0]])) <= 1e-12),\
            "{}, {}".format(est.coef__interval(), np.array([[1, 0, 0], [1, 0, 0]]))
        assert np.all(est.coef_stderr_ <= 1e-12)
        assert np.all(est._param_var <= 1e-12)

        d = 3
        X = np.vstack([np.eye(d), np.ones((1, d)), np.zeros((1, d))])
        y = X[:, 0]
        est = OLS(fit_intercept=True).fit(X, y)
        assert np.all(np.abs(est.coef_ - np.array([1] + [0] * (d - 1))) <=
                      1e-12), "{}, {}".format(est.coef_, [1] + [0] * (d - 1))
        assert np.all(np.abs(est.coef__interval() - np.array([[1] + [0] * (d - 1), [1] + [0] * (d - 1)])) <= 1e-12),\
            "{}, {}".format(est.coef__interval(), np.array([[1] + [0] * (d - 1), [1] + [0] * (d - 1)]))
        assert np.all(est.coef_stderr_ <= 1e-12)
        assert np.all(est._param_var <= 1e-12)
        assert np.abs(est.intercept_) <= 1e-12
        assert np.all(np.abs(est.intercept__interval()) <= 1e-12)

        d = 3
        X = np.vstack([np.eye(d)])
        y = np.concatenate((X[:, 0] - 1, X[:, 0] + 1))
        X = np.vstack([X, X])
        est = OLS(fit_intercept=False).fit(X, y)
        assert np.all(np.abs(est.coef_ - ([1] + [0] * (d - 1))) <=
                      1e-12), "{}, {}".format(est.coef_, [1] + [0] * (d - 1))
        assert np.all(np.abs(est.coef_stderr_ - np.array([1] * d)) <= 1e-12)
        assert np.all(np.abs(est.coef__interval()[0] -
                             np.array([scipy.stats.norm.ppf(.025, loc=1, scale=1)] +
                                      [scipy.stats.norm.ppf(.025, loc=0, scale=1)] * (d - 1))) <= 1e-12),\
            "{}, {}".format(est.coef__interval()[0], np.array([scipy.stats.norm.ppf(.025, loc=1, scale=1)] +
                                                              [scipy.stats.norm.ppf(.025, loc=0, scale=1)] * (d - 1)))
        assert np.all(np.abs(est.coef__interval()[1] -
                             np.array([scipy.stats.norm.ppf(.975, loc=1, scale=1)] +
                                      [scipy.stats.norm.ppf(.975, loc=0, scale=1)] * (d - 1))) <= 1e-12),\
            "{}, {}".format(est.coef__interval()[1], np.array([scipy.stats.norm.ppf(.975, loc=1, scale=1)] +
                                                              [scipy.stats.norm.ppf(.975, loc=0, scale=1)] * (d - 1)))

        # 2-d output
        d = 3
        p = 4
        X = np.vstack([np.eye(d)])
        y = np.vstack((X[:, [0] * p] - 1, X[:, [0] * p] + 1))
        X = np.vstack([X, X])
        est = OLS(fit_intercept=False).fit(X, y)
        for t in range(p):
            assert np.all(np.abs(est.coef_[t] - ([1] + [0] * (d - 1))) <=
                          1e-12), "{}, {}".format(est.coef_[t], [1] + [0] * (d - 1))
            assert np.all(np.abs(est.coef_stderr_[t] - np.array([1] * d)) <= 1e-12), "{}".format(est.coef_stderr_[t])
            assert np.all(np.abs(est.coef__interval()[0][t] -
                                 np.array([scipy.stats.norm.ppf(.025, loc=1, scale=1)] +
                                          [scipy.stats.norm.ppf(.025, loc=0, scale=1)] * (d - 1))) <= 1e-12),\
                "{}, {}".format(est.coef__interval()[0][t],
                                np.array([scipy.stats.norm.ppf(.025, loc=1, scale=1)] +
                                         [scipy.stats.norm.ppf(.025, loc=0, scale=1)] * (d - 1)))
            assert np.all(np.abs(est.coef__interval()[1][t] -
                                 np.array([scipy.stats.norm.ppf(.975, loc=1, scale=1)] +
                                          [scipy.stats.norm.ppf(.975, loc=0, scale=1)] * (d - 1))) <= 1e-12),\
                "{}, {}".format(est.coef__interval()[1][t],
                                np.array([scipy.stats.norm.ppf(.975, loc=1, scale=1)] +
                                         [scipy.stats.norm.ppf(.975, loc=0, scale=1)] * (d - 1)))
            assert np.all(np.abs(est.intercept_[t]) <= 1e-12), "{}".format(est.intercept_[t])
            assert np.all(np.abs(est.intercept_stderr_[t]) <= 1e-12), "{}".format(est.intercept_stderr_[t])
            assert np.all(np.abs(est.intercept__interval()[0][t]) <=
                          1e-12), "{}".format(est.intercept__interval()[0][t])

        d = 3
        p = 4
        X = np.vstack([np.eye(d), np.zeros((1, d))])
        y = np.vstack((X[:, [0] * p] - 1, X[:, [0] * p] + 1))
        X = np.vstack([X, X])
        est = OLS(fit_intercept=True).fit(X, y)
        for t in range(p):
            assert np.all(np.abs(est.coef_[t] - ([1] + [0] * (d - 1))) <=
                          1e-12), "{}, {}".format(est.coef_[t], [1] + [0] * (d - 1))
            assert np.all(np.abs(est.coef_stderr_[t] - np.array([np.sqrt(2)] * d)) <=
                          1e-12), "{}".format(est.coef_stderr_[t])
            assert np.all(np.abs(est.coef__interval()[0][t] -
                                 np.array([scipy.stats.norm.ppf(.025, loc=1, scale=np.sqrt(2))] +
                                          [scipy.stats.norm.ppf(.025, loc=0, scale=np.sqrt(2))] * (d - 1))) <= 1e-12),\
                "{}, {}".format(est.coef__interval()[0][t],
                                np.array([scipy.stats.norm.ppf(.025, loc=1, scale=np.sqrt(2))] +
                                         [scipy.stats.norm.ppf(.025, loc=0, scale=np.sqrt(2))] * (d - 1)))
            assert np.all(np.abs(est.coef__interval()[1][t] -
                                 np.array([scipy.stats.norm.ppf(.975, loc=1, scale=np.sqrt(2))] +
                                          [scipy.stats.norm.ppf(.975, loc=0, scale=np.sqrt(2))] * (d - 1))) <= 1e-12),\
                "{}, {}".format(est.coef__interval()[1][t],
                                np.array([scipy.stats.norm.ppf(.975, loc=1, scale=np.sqrt(2))] +
                                         [scipy.stats.norm.ppf(.975, loc=0, scale=np.sqrt(2))] * (d - 1)))
            assert np.all(np.abs(est.intercept_[t]) <= 1e-12), "{}".format(est.intercept_[t])
            assert np.all(np.abs(est.intercept_stderr_[t] - 1) <= 1e-12), "{}".format(est.intercept_stderr_[t])
            assert np.all(np.abs(est.intercept__interval()[0][t] -
                                 scipy.stats.norm.ppf(.025, loc=0, scale=1)) <= 1e-12),\
                "{}, {}".format(est.intercept__interval()[0][t], scipy.stats.norm.ppf(.025, loc=0, scale=1))
Example #9
    def test_cate_api(self):
        def const_marg_eff_shape(n, d_x, binary_T):
            return (n if d_x else 1, ) + ((1, ) if binary_T else ())

        def marg_eff_shape(n, binary_T):
            return (n, ) + ((1, ) if binary_T else ())

        def eff_shape(n, d_x):
            return (n if d_x else 1, )

        n = 1000
        y = np.random.normal(size=(n, ))

        for d_w in [None, 10]:
            if d_w is None:
                W = None
            else:
                W = np.random.normal(size=(n, d_w))
            for d_x in [None, 3]:
                if d_x is None:
                    X = None
                else:
                    X = np.random.normal(size=(n, d_x))
                for binary_T in [True, False]:
                    if binary_T:
                        T = np.random.choice(["a", "b"], size=(n, ))
                    else:
                        T = np.random.normal(size=(n, ))
                    for binary_Z in [True, False]:
                        if binary_Z:
                            Z = np.random.choice(["c", "d"], size=(n, ))
                        else:
                            Z = np.random.normal(size=(n, ))
                        for projection in [True, False]:
                            for featurizer in [
                                    None,
                                    PolynomialFeatures(degree=2,
                                                       include_bias=False),
                            ]:
                                est_list = [
                                    DRIV(
                                        flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                                        model_final=StatsModelsLinearRegression(fit_intercept=False),
                                        fit_cate_intercept=True,
                                        projection=projection,
                                        discrete_instrument=binary_Z,
                                        discrete_treatment=binary_T,
                                        featurizer=featurizer,
                                    ),
                                    LinearDRIV(
                                        flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                                        fit_cate_intercept=True,
                                        projection=projection,
                                        discrete_instrument=binary_Z,
                                        discrete_treatment=binary_T,
                                        featurizer=featurizer,
                                    ),
                                    SparseLinearDRIV(
                                        flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                                        fit_cate_intercept=True,
                                        projection=projection,
                                        discrete_instrument=binary_Z,
                                        discrete_treatment=binary_T,
                                        featurizer=featurizer,
                                    ),
                                    ForestDRIV(
                                        flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                                        projection=projection,
                                        discrete_instrument=binary_Z,
                                        discrete_treatment=binary_T,
                                        featurizer=featurizer,
                                    ),
                                ]

                                if X is None:
                                    est_list = est_list[:-1]

                                if binary_T and binary_Z:
                                    est_list += [
                                        IntentToTreatDRIV(
                                            flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                                            fit_cate_intercept=True,
                                            featurizer=featurizer,
                                        ),
                                        LinearIntentToTreatDRIV(
                                            flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                                            featurizer=featurizer,
                                        ),
                                    ]

                                for est in est_list:
                                    with self.subTest(d_w=d_w,
                                                      d_x=d_x,
                                                      binary_T=binary_T,
                                                      binary_Z=binary_Z,
                                                      projection=projection,
                                                      featurizer=featurizer,
                                                      est=est):

                                        # ensure we can serialize unfit estimator
                                        pickle.dumps(est)

                                        est.fit(y, T, Z=Z, X=X, W=W)

                                        # ensure we can serialize fit estimator
                                        pickle.dumps(est)

                                        # expected effect size
                                        const_marginal_effect_shape = const_marg_eff_shape(
                                            n, d_x, binary_T)
                                        marginal_effect_shape = marg_eff_shape(
                                            n, binary_T)
                                        effect_shape = eff_shape(n, d_x)
                                        # test effect
                                        const_marg_eff = est.const_marginal_effect(
                                            X)
                                        self.assertEqual(
                                            shape(const_marg_eff),
                                            const_marginal_effect_shape)
                                        marg_eff = est.marginal_effect(T, X)
                                        self.assertEqual(
                                            shape(marg_eff),
                                            marginal_effect_shape)
                                        T0 = "a" if binary_T else 0
                                        T1 = "b" if binary_T else 1
                                        eff = est.effect(X, T0=T0, T1=T1)
                                        self.assertEqual(
                                            shape(eff), effect_shape)

                                        # test inference
                                        const_marg_eff_int = est.const_marginal_effect_interval(
                                            X)
                                        marg_eff_int = est.marginal_effect_interval(
                                            T, X)
                                        eff_int = est.effect_interval(X,
                                                                      T0=T0,
                                                                      T1=T1)
                                        self.assertEqual(
                                            shape(const_marg_eff_int), (2, ) +
                                            const_marginal_effect_shape)
                                        self.assertEqual(
                                            shape(marg_eff_int),
                                            (2, ) + marginal_effect_shape)
                                        self.assertEqual(
                                            shape(eff_int),
                                            (2, ) + effect_shape)

                                        # test can run score
                                        est.score(y, T, Z=Z, X=X, W=W)

                                        if X is not None:
                                            # test cate_feature_names
                                            expect_feat_len = featurizer.fit(
                                                X
                                            ).n_output_features_ if featurizer else d_x
                                            self.assertEqual(
                                                len(est.cate_feature_names()),
                                                expect_feat_len)

                                            # test can run shap values
                                            shap_values = est.shap_values(
                                                X[:10])
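
Note: ForestDRIV is dropped from est_list when X is None because its forest final stage needs features, and the two intent-to-treat estimators are only added when both T and Z are binary. Assumed imports (shape is EconML's shape helper):

import pickle
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from econml.iv.dr import (DRIV, ForestDRIV, IntentToTreatDRIV, LinearDRIV,
                          LinearIntentToTreatDRIV, SparseLinearDRIV)
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
from econml.utilities import shape
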
Example #10
    def test_cate_api(self):
        def const_marg_eff_shape(n, d_x, binary_T):
            """Constant marginal effect shape."""
            return (n if d_x else 1,) + ((1,) if binary_T else ())

        def marg_eff_shape(n, binary_T):
            """Marginal effect shape."""
            return (n,) + ((1,) if binary_T else ())

        def eff_shape(n, d_x):
            "Effect shape."
            return (n if d_x else 1,)

        n = 500
        y = np.random.normal(size=(n,))

        # parameter combinations to test
        for d_w, d_x, binary_T, binary_Z, projection, featurizer\
            in itertools.product(
                [None, 10],     # d_w
                [None, 3],      # d_x
                [True, False],  # binary_T
                [True, False],  # binary_Z
                [True, False],  # projection
                [None, PolynomialFeatures(degree=2, include_bias=False), ]):    # featurizer

            if d_w is None:
                W = None
            else:
                W = np.random.normal(size=(n, d_w))

            if d_x is None:
                X = None
            else:
                X = np.random.normal(size=(n, d_x))

            if binary_T:
                T = np.random.choice(["a", "b"], size=(n,))
            else:
                T = np.random.normal(size=(n,))

            if binary_Z:
                Z = np.random.choice(["c", "d"], size=(n,))
            else:
                Z = np.random.normal(size=(n,))

            est_list = [
                DRIV(
                    flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                    model_final=StatsModelsLinearRegression(
                        fit_intercept=False
                    ),
                    fit_cate_intercept=True,
                    projection=projection,
                    discrete_instrument=binary_Z,
                    discrete_treatment=binary_T,
                    featurizer=featurizer,
                ),
                LinearDRIV(
                    flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                    fit_cate_intercept=True,
                    projection=projection,
                    discrete_instrument=binary_Z,
                    discrete_treatment=binary_T,
                    featurizer=featurizer,
                ),
                SparseLinearDRIV(
                    flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                    fit_cate_intercept=True,
                    projection=projection,
                    discrete_instrument=binary_Z,
                    discrete_treatment=binary_T,
                    featurizer=featurizer,
                ),
                ForestDRIV(
                    flexible_model_effect=StatsModelsLinearRegression(fit_intercept=False),
                    projection=projection,
                    discrete_instrument=binary_Z,
                    discrete_treatment=binary_T,
                    featurizer=featurizer,
                ),
            ]

            if X is None:
                est_list = est_list[:-1]

            if binary_T and binary_Z:
                est_list += [
                    IntentToTreatDRIV(
                        flexible_model_effect=StatsModelsLinearRegression(
                            fit_intercept=False
                        ),
                        fit_cate_intercept=True,
                        featurizer=featurizer,
                    ),
                    LinearIntentToTreatDRIV(
                        flexible_model_effect=StatsModelsLinearRegression(
                            fit_intercept=False
                        ),
                        featurizer=featurizer,
                    ),
                ]

            for est in est_list:
                with self.subTest(d_w=d_w, d_x=d_x, binary_T=binary_T,
                                  binary_Z=binary_Z, projection=projection, featurizer=featurizer,
                                  est=est):

                    # TODO: serializing/deserializing for every combination -- is this necessary?
                    # ensure we can serialize unfit estimator
                    pickle.dumps(est)

                    est.fit(y, T, Z=Z, X=X, W=W)

                    # ensure we can serialize fit estimator
                    pickle.dumps(est)

                    # expected effect size
                    exp_const_marginal_effect_shape = const_marg_eff_shape(n, d_x, binary_T)
                    marginal_effect_shape = marg_eff_shape(n, binary_T)
                    effect_shape = eff_shape(n, d_x)

                    # assert calculated constant marginal effect shape is expected
                    # const_marginal effect is defined in LinearCateEstimator class
                    const_marg_eff = est.const_marginal_effect(X)
                    self.assertEqual(shape(const_marg_eff), exp_const_marginal_effect_shape)

                    # assert calculated marginal effect shape is expected
                    marg_eff = est.marginal_effect(T, X)
                    self.assertEqual(shape(marg_eff), marginal_effect_shape)

                    T0 = "a" if binary_T else 0
                    T1 = "b" if binary_T else 1
                    eff = est.effect(X, T0=T0, T1=T1)
                    self.assertEqual(shape(eff), effect_shape)

                    # test inference
                    const_marg_eff_int = est.const_marginal_effect_interval(X)
                    marg_eff_int = est.marginal_effect_interval(T, X)
                    eff_int = est.effect_interval(X, T0=T0, T1=T1)
                    self.assertEqual(shape(const_marg_eff_int), (2,) + exp_const_marginal_effect_shape)
                    self.assertEqual(shape(marg_eff_int), (2,) + marginal_effect_shape)
                    self.assertEqual(shape(eff_int), (2,) + effect_shape)

                    # test can run score
                    est.score(y, T, Z=Z, X=X, W=W)

                    if X is not None:
                        # test cate_feature_names
                        expect_feat_len = featurizer.fit(
                            X).n_output_features_ if featurizer else d_x
                        self.assertEqual(len(est.cate_feature_names()), expect_feat_len)

                        # test can run shap values
                        _ = est.shap_values(X[:10])
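
Note: this example is a flattened rewrite of Example #9: itertools.product replaces the six nested loops and n drops from 1000 to 500, but the estimator list and assertions are otherwise the same. On top of the imports listed under Example #9, it additionally assumes:

import itertools
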