Example 1
    def __init__(self,
                 model_propensity=LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto'),
                 model_regression=WeightedLassoCV(cv=3),
                 featurizer=None,
                 fit_cate_intercept=True,
                 n_splits=2, random_state=None):
        super().__init__(model_propensity=model_propensity,
                         model_regression=model_regression,
                         model_final=StatsModelsLinearRegression(fit_intercept=fit_cate_intercept),
                         featurizer=featurizer,
                         multitask_model_final=False,
                         n_splits=n_splits,
                         random_state=random_state)
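
For context, a minimal usage sketch of the estimator this constructor configures (synthetic data; the import path below matches this era of the econml API and may differ in newer releases):

import numpy as np
from econml.drlearner import LinearDRLearner

np.random.seed(0)
X = np.random.normal(size=(500, 3))           # heterogeneity features
T = np.random.binomial(1, .5, size=(500,))    # binary treatment
y = (1 + .5 * X[:, 0]) * T + X[:, 1] + np.random.normal(size=(500,))

est = LinearDRLearner()                       # defaults as in the __init__ above
est.fit(y, T, X, None, inference='statsmodels')
print(est.effect(X[:5]))                      # CATE point estimates
print(est.effect_interval(X[:5], alpha=.05))  # intervals from the 'statsmodels' inference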
Example 2
    def test_cate_api(self):
        """Test that we correctly implement the CATE API."""
        n = 20

        def make_random(is_discrete, d):
            if d is None:
                return None
            sz = (n, d) if d > 0 else (n,)
            if is_discrete:
                while True:
                    arr = np.random.choice(['a', 'b', 'c'], size=sz)
                    # ensure that we've got at least two of every element
                    _, counts = np.unique(arr, return_counts=True)
                    if len(counts) == 3 and counts.min() > 1:
                        return arr
            else:
                return np.random.normal(size=sz)

        for d_y in [0, 1]:
            is_discrete = True
            for d_t in [0, 1]:
                for d_x in [2, None]:
                    for d_w in [2, None]:
                        W, X, Y, T = [make_random(is_discrete, d)
                                      for is_discrete, d in [(False, d_w),
                                                             (False, d_x),
                                                             (False, d_y),
                                                             (is_discrete, d_t)]]

                        if (X is None) and (W is None):
                            continue
                        d_t_final = 2 if is_discrete else d_t

                        effect_shape = (n,) + ((d_y,) if d_y > 0 else ())
                        effect_summaryframe_shape = (
                            n * (d_y if d_y > 0 else 1), 6)
                        marginal_effect_shape = ((n,) +
                                                 ((d_y,) if d_y > 0 else ()) +
                                                 ((d_t_final,) if d_t_final > 0 else ()))
                        marginal_effect_summaryframe_shape = (n * (d_y if d_y > 0 else 1),
                                                              6 * (d_t_final if d_t_final > 0 else 1))

                        # since T isn't passed to const_marginal_effect, defaults to one row if X is None
                        const_marginal_effect_shape = ((n if d_x else 1,) +
                                                       ((d_y,) if d_y > 0 else ()) +
                                                       ((d_t_final,) if d_t_final > 0 else ()))
                        const_marginal_effect_summaryframe_shape = (
                            (n if d_x else 1) * (d_y if d_y > 0 else 1),
                            6 * (d_t_final if d_t_final > 0 else 1))

                        for est in [LinearDRLearner(model_propensity=LogisticRegression(C=1000, solver='lbfgs',
                                                                                        multi_class='auto')),
                                    DRLearner(model_propensity=LogisticRegression(multi_class='auto'),
                                              model_regression=LinearRegression(),
                                              model_final=StatsModelsLinearRegression(),
                                              multitask_model_final=True)]:

                            # TODO: add stratification to bootstrap so that we can use it even with discrete treatments
                            infs = [None]
                            if isinstance(est, LinearDRLearner):
                                infs.append('statsmodels')

                            for inf in infs:
                                with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t,
                                                  is_discrete=is_discrete, est=est, inf=inf):
                                    est.fit(Y, T, X, W, inference=inf)
                                    # make sure we can call the marginal_effect and effect methods
                                    const_marg_eff = est.const_marginal_effect(X)
                                    marg_eff = est.marginal_effect(T, X)
                                    self.assertEqual(shape(marg_eff), marginal_effect_shape)
                                    self.assertEqual(shape(const_marg_eff), const_marginal_effect_shape)

                                    np.testing.assert_array_equal(
                                        marg_eff if d_x else marg_eff[0:1], const_marg_eff)

                                    T0 = np.full_like(T, 'a')
                                    eff = est.effect(X, T0=T0, T1=T)
                                    self.assertEqual(shape(eff), effect_shape)
                                    if inf is not None:
                                        const_marg_eff_int = est.const_marginal_effect_interval(X)
                                        marg_eff_int = est.marginal_effect_interval(T, X)
                                        const_marg_effect_inf = est.const_marginal_effect_inference(X)
                                        T1 = np.full_like(T, 'b')
                                        effect_inf = est.effect_inference(X, T0=T0, T1=T1)
                                        marg_effect_inf = est.marginal_effect_inference(T, X)
                                        self.assertEqual(shape(marg_eff_int),
                                                         (2,) + marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_eff_int),
                                                         (2,) + const_marginal_effect_shape)
                                        self.assertEqual(shape(est.effect_interval(X, T0=T0, T1=T)),
                                                         (2,) + effect_shape)

                                        # test const marginal inference
                                        self.assertEqual(shape(const_marg_effect_inf.summary_frame()),
                                                         const_marginal_effect_summaryframe_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.point_estimate),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.stderr),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.var),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.pvalue()),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.zstat()),
                                                         const_marginal_effect_shape)
                                        self.assertEqual(shape(const_marg_effect_inf.conf_int()),
                                                         (2,) + const_marginal_effect_shape)
                                        np.testing.assert_array_almost_equal(const_marg_effect_inf.conf_int()[0],
                                                                             const_marg_eff_int[0], decimal=5)
                                        const_marg_effect_inf.population_summary()._repr_html_()

                                        # test effect inference
                                        self.assertEqual(shape(effect_inf.summary_frame()),
                                                         effect_summaryframe_shape)
                                        self.assertEqual(shape(effect_inf.point_estimate),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.stderr),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.var),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.pvalue()),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.zstat()),
                                                         effect_shape)
                                        self.assertEqual(shape(effect_inf.conf_int()),
                                                         (2,) + effect_shape)
                                        np.testing.assert_array_almost_equal(effect_inf.conf_int()[0],
                                                                             est.effect_interval(X, T0=T0, T1=T1)[0],
                                                                             decimal=5)
                                        effect_inf.population_summary()._repr_html_()

                                        # test marginal effect inference
                                        self.assertEqual(shape(marg_effect_inf.summary_frame()),
                                                         marginal_effect_summaryframe_shape)
                                        self.assertEqual(shape(marg_effect_inf.point_estimate),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.stderr),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.var),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.pvalue()),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.zstat()),
                                                         marginal_effect_shape)
                                        self.assertEqual(shape(marg_effect_inf.conf_int()),
                                                         (2,) + marginal_effect_shape)
                                        np.testing.assert_array_almost_equal(marg_effect_inf.conf_int()[0],
                                                                             marg_eff_int[0], decimal=5)
                                        marg_effect_inf.population_summary()._repr_html_()

                                    est.score(Y, T, X, W)

                                    # make sure we can call effect with implied scalar treatments, no matter the
                                    # dimensions of T, and also that we warn when there are multiple treatments
                                    if d_t > 1:
                                        cm = self.assertWarns(Warning)
                                    else:
                                        cm = ExitStack()  # ExitStack can be used as a "do nothing" ContextManager
                                    with cm:
                                        effect_shape2 = (n if d_x else 1,) + ((d_y,) if d_y > 0 else ())
                                        eff = est.effect(X, T0='a', T1='b')
                                        self.assertEqual(shape(eff), effect_shape2)
Example 3
    def test_drlearner_all_attributes(self):
        from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor
        from sklearn.linear_model import LinearRegression, LogisticRegression
        from econml.utilities import StatsModelsLinearRegression
        import scipy.special
        np.random.seed(123)
        controls = np.random.uniform(-1, 1, size=(5000, 3))
        T = np.random.binomial(2, scipy.special.expit(controls[:, 0]))
        sigma = 0.01
        y = (1 + .5 * controls[:, 0]) * T + controls[:, 0] + np.random.normal(0, sigma, size=(5000,))
        for X in [controls]:
            for W in [None, controls]:
                for sample_weight in [None, 1 + np.random.randint(10, size=X.shape[0])]:
                    for sample_var in [None, 1 + np.random.randint(10, size=X.shape[0])]:
                        for featurizer in [None, PolynomialFeatures(degree=2, include_bias=False)]:
                            for models in [(GradientBoostingClassifier(), GradientBoostingRegressor(),
                                            RandomForestRegressor(n_estimators=100,
                                                                  max_depth=5, min_samples_leaf=50)),
                                           (LogisticRegression(solver='lbfgs', multi_class='auto'),
                                            LinearRegression(), StatsModelsLinearRegression())]:
                                for multitask_model_final in [False, True]:
                                    if (not isinstance(models[2], StatsModelsLinearRegression))\
                                            and (sample_var is not None):
                                        continue
                                    with self.subTest(X=X, W=W, sample_weight=sample_weight, sample_var=sample_var,
                                                      featurizer=featurizer, models=models,
                                                      multitask_model_final=multitask_model_final):
                                        est = DRLearner(model_propensity=models[0],
                                                        model_regression=models[1],
                                                        model_final=models[2],
                                                        featurizer=featurizer,
                                                        multitask_model_final=multitask_model_final)
                                        if (X is None) and (W is None):
                                            with pytest.raises(AttributeError) as e_info:
                                                est.fit(y, T, X=X, W=W,
                                                        sample_weight=sample_weight, sample_var=sample_var)
                                            continue
                                        est.fit(
                                            y, T, X=X, W=W, sample_weight=sample_weight, sample_var=sample_var)
                                        np.testing.assert_allclose(est.effect(X[:3], T0=0, T1=1), 1 + .5 * X[:3, 0],
                                                                   rtol=0, atol=.15)
                                        np.testing.assert_allclose(est.const_marginal_effect(X[:3]),
                                                                   np.hstack(
                                                                       [1 + .5 * X[:3, [0]],
                                                                        2 * (1 + .5 * X[:3, [0]])]),
                                                                   rtol=0, atol=.15)
                                        for t in [1, 2]:
                                            np.testing.assert_allclose(est.marginal_effect(t, X[:3]),
                                                                       np.hstack([1 + .5 * X[:3, [0]],
                                                                                  2 * (1 + .5 * X[:3, [0]])]),
                                                                       rtol=0, atol=.15)
                                        assert isinstance(est.score_, float)
                                        assert isinstance(est.score(y, T, X=X, W=W), float)

                                        feat_names = ['A', 'B', 'C']
                                        out_feat_names = feat_names
                                        if featurizer is not None:
                                            out_feat_names = featurizer.fit(X).get_feature_names(feat_names)
                                            np.testing.assert_array_equal(est.featurizer.n_input_features_, 3)
                                        np.testing.assert_array_equal(est.cate_feature_names(feat_names),
                                                                      out_feat_names)

                                        if isinstance(models[0], GradientBoostingClassifier):
                                            np.testing.assert_array_equal(np.array([mdl.feature_importances_
                                                                                    for mdl
                                                                                    in est.models_regression]).shape,
                                                                          [2, 2 + X.shape[1] +
                                                                           (W.shape[1] if W is not None else 0)])
                                            np.testing.assert_array_equal(np.array([mdl.feature_importances_
                                                                                    for mdl
                                                                                    in est.models_propensity]).shape,
                                                                          [2, X.shape[1] +
                                                                           (W.shape[1] if W is not None else 0)])
                                        else:
                                            np.testing.assert_array_equal(np.array([mdl.coef_
                                                                                    for mdl
                                                                                    in est.models_regression]).shape,
                                                                          [2, 2 + X.shape[1] +
                                                                           (W.shape[1] if W is not None else 0)])
                                            np.testing.assert_array_equal(np.array([mdl.coef_
                                                                                    for mdl
                                                                                    in est.models_propensity]).shape,
                                                                          [2, 3, X.shape[1] +
                                                                           (W.shape[1] if W is not None else 0)])
                                        if multitask_model_final:
                                            if isinstance(models[2], RandomForestRegressor):
                                                np.testing.assert_equal(np.argsort(
                                                    est.multitask_model_cate.feature_importances_)[-1], 0)
                                            else:
                                                true_coef = np.zeros(
                                                    (2, len(out_feat_names)))
                                                true_coef[:, 0] = [.5, 1]
                                                np.testing.assert_allclose(
                                                    est.multitask_model_cate.coef_, true_coef, rtol=0, atol=.15)
                                                np.testing.assert_allclose(
                                                    est.multitask_model_cate.intercept_, [1, 2], rtol=0, atol=.15)
                                        else:
                                            for t in [1, 2]:
                                                if isinstance(models[2], RandomForestRegressor):
                                                    np.testing.assert_equal(np.argsort(
                                                        est.model_cate(T=t).feature_importances_)[-1], 0)
                                                else:
                                                    true_coef = np.zeros(
                                                        len(out_feat_names))
                                                    true_coef[0] = .5 * t
                                                    np.testing.assert_allclose(
                                                        est.model_cate(T=t).coef_, true_coef, rtol=0, atol=.15)
                                                    np.testing.assert_allclose(
                                                        est.model_cate(T=t).intercept_, t, rtol=0, atol=.15)
Example 4
    def __init__(self, model_propensity=LogisticRegressionCV(cv=3, solver='lbfgs', multi_class='auto'),
                 model_regression=WeightedLassoCV(cv=3),
                 model_final=StatsModelsLinearRegression(),
                 multitask_model_final=False,
                 featurizer=None,
                 n_splits=2,
                 random_state=None):
        class ModelNuisance:
            def __init__(self, model_propensity, model_regression):
                self._model_propensity = model_propensity
                self._model_regression = model_regression

            def _combine(self, X, W):
                return np.hstack([arr for arr in [X, W] if arr is not None])

            def fit(self, Y, T, X=None, W=None, *, sample_weight=None):
                # TODO Allow for non-vector y, i.e. of shape (n, 1)
                assert np.ndim(Y) == 1, "Can only accept single dimensional outcomes Y! Use Y.ravel()."
                if (X is None) and (W is None):
                    raise AttributeError("At least one of X or W has to not be None!")
                if np.any(np.all(T == 0, axis=0)) or (not np.any(np.all(T == 0, axis=1))):
                    raise AttributeError("Provided crossfit folds contain training splits that " +
                                         "don't contain all treatments")
                XW = self._combine(X, W)
                filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight)
                self._model_propensity.fit(XW, inverse_onehot(T), **filtered_kwargs)
                self._model_regression.fit(np.hstack([XW, T]), Y, **filtered_kwargs)
                return self

            def predict(self, Y, T, X=None, W=None, *, sample_weight=None):
                XW = self._combine(X, W)
                propensities = self._model_propensity.predict_proba(XW)
                Y_pred = np.zeros((T.shape[0], T.shape[1] + 1))
                T_counter = np.zeros(T.shape)
                Y_pred[:, 0] = self._model_regression.predict(np.hstack([XW, T_counter]))
                Y_pred[:, 0] += (Y - Y_pred[:, 0]) * np.all(T == 0, axis=1) / propensities[:, 0]
                for t in np.arange(T.shape[1]):
                    T_counter = np.zeros(T.shape)
                    T_counter[:, t] = 1
                    Y_pred[:, t + 1] = self._model_regression.predict(np.hstack([XW, T_counter]))
                    Y_pred[:, t + 1] += (Y - Y_pred[:, t + 1]) * (T[:, t] == 1) / propensities[:, t + 1]
                return Y_pred

        class ModelFinal:
            # Coding remark: the handling of multitask_model_final could have been simplified by
            # wrapping model_final in a MultiOutputRegressor. However, because we also want to
            # allow model_final objects whose fit(X, y) can accept X=None
            # (e.g. StatsModelsLinearRegression), we cannot take that route, since
            # MultiOutputRegressor checks that X is a 2D array.
            def __init__(self, model_final, featurizer, multitask_model_final):
                self._model_final = clone(model_final, safe=False)
                self._featurizer = clone(featurizer, safe=False)
                self._multitask_model_final = multitask_model_final

            def fit(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None):
                Y_pred, = nuisances
                if (X is not None) and (self._featurizer is not None):
                    X = self._featurizer.fit_transform(X)
                filtered_kwargs = _filter_none_kwargs(sample_weight=sample_weight, sample_var=sample_var)
                if self._multitask_model_final:
                    self.model_cate = clone(self._model_final, safe=False).fit(
                        X, Y_pred[:, 1:] - Y_pred[:, [0]], **filtered_kwargs)
                else:
                    self.models_cate = [clone(self._model_final, safe=False).fit(X, Y_pred[:, t] - Y_pred[:, 0],
                                                                                 **filtered_kwargs)
                                        for t in np.arange(1, Y_pred.shape[1])]
                return self

            def predict(self, X=None):
                if (X is not None) and (self._featurizer is not None):
                    X = self._featurizer.transform(X)
                if self._multitask_model_final:
                    return self.model_cate.predict(X)
                else:
                    return np.array([mdl.predict(X) for mdl in self.models_cate]).T

            def score(self, Y, T, X=None, W=None, *, nuisances, sample_weight=None, sample_var=None):
                if (X is not None) and (self._featurizer is not None):
                    X = self._featurizer.transform(X)
                Y_pred, = nuisances
                if self._multitask_model_final:
                    return np.mean(np.average((Y_pred[:, 1:] - Y_pred[:, [0]] - self.model_cate.predict(X))**2,
                                              weights=sample_weight, axis=0))
                else:
                    return np.mean([np.average((Y_pred[:, t] - Y_pred[:, 0] - self.models_cate[t - 1].predict(X))**2,
                                               weights=sample_weight, axis=0)
                                    for t in np.arange(1, Y_pred.shape[1])])

        self._multitask_model_final = multitask_model_final
        super().__init__(ModelNuisance(model_propensity, model_regression),
                         ModelFinal(model_final, featurizer, multitask_model_final),
                         n_splits=n_splits, discrete_treatment=True,
                         random_state=random_state)
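
The nuisance stage above builds, for each treatment level t, the doubly-robust pseudo-outcome Y_pred[:, t] = g(X, W, t) + (Y - g(X, W, t)) * 1{T = t} / p_t(X, W), and the final stage regresses Y_pred[:, t] - Y_pred[:, 0] on the featurized X. A self-contained numpy sketch of that transformation (dr_pseudo_outcome and its argument names are illustrative, not part of the source):

import numpy as np

def dr_pseudo_outcome(Y, T_onehot, reg_pred, propensities):
    # Y:            (n,) outcomes
    # T_onehot:     (n, d_t) one-hot treatments; the all-zero row is the baseline
    # reg_pred:     (n, d_t + 1) regression predictions g(X, W, t), baseline column first
    # propensities: (n, d_t + 1) estimated treatment probabilities, baseline column first
    indicator = np.column_stack([np.all(T_onehot == 0, axis=1)] +
                                [T_onehot[:, t] == 1 for t in range(T_onehot.shape[1])])
    # regression prediction plus an inverse-propensity-weighted residual correction
    return reg_pred + (Y[:, np.newaxis] - reg_pred) * indicator / propensities

rng = np.random.default_rng(0)
n, d_t = 6, 2
T_onehot = np.eye(d_t + 1)[rng.integers(0, d_t + 1, size=n)][:, 1:]  # baseline = all-zero row
Y = rng.normal(size=n)
reg_pred = rng.normal(size=(n, d_t + 1))
propensities = np.full((n, d_t + 1), 1 / (d_t + 1))
Y_dr = dr_pseudo_outcome(Y, T_onehot, reg_pred, propensities)
cate_targets = Y_dr[:, 1:] - Y_dr[:, [0]]  # what ModelFinal regresses on the featurized X
print(Y_dr.shape, cate_targets.shape)      # (6, 3) (6, 2)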
Example 5
    def test_comp_with_statsmodels(self):
        """ Comparing with confidence intervals and standard errors of statsmodels in the un-weighted case """
        np.random.seed(123)

        # Single dimensional output y
        n = 1000
        d = 3
        X = np.random.binomial(1, .8, size=(n, d))
        T = np.random.binomial(1, .5 * X[:, 0] + .25, size=(n,))

        def true_effect(x):
            return x[:, 0] + .5
        y = true_effect(X) * T + X[:, 0] + X[:, 2] + np.random.normal(0, 1, size=(n,))
        X_test = np.unique(np.random.binomial(1, .5, size=(n, d)), axis=0)
        for fit_intercept in [True, False]:
            for cov_type in ['nonrobust', 'HC0', 'HC1']:
                est = OLS(fit_intercept=fit_intercept, cov_type=cov_type).fit(X, y)
                lr = StatsModelsOLS(fit_intercept=fit_intercept,
                                    fit_args={'cov_type': cov_type, 'use_t': False}).fit(X, y)
                _compare_classes(est, lr, X_test)

        n = 1000
        d = 3
        X = np.random.normal(0, 1, size=(n, d))
        y = X[:, 0] + X[:, 2] + np.random.normal(0, 1, size=(n,))
        X_test = np.unique(np.random.binomial(1, .5, size=(n, d)), axis=0)
        for fit_intercept in [True, False]:
            for cov_type in ['nonrobust', 'HC0', 'HC1']:
                est = OLS(fit_intercept=fit_intercept, cov_type=cov_type).fit(X, y)
                lr = StatsModelsOLS(fit_intercept=fit_intercept,
                                    fit_args={'cov_type': cov_type, 'use_t': False}).fit(X, y)
                _compare_classes(est, lr, X_test)

        d = 3
        X = np.vstack([np.eye(d)])
        y = np.concatenate((X[:, 0] - 1, X[:, 0] + 1))
        X = np.vstack([X, X])
        X_test = np.unique(np.random.binomial(1, .5, size=(n, d)), axis=0)

        for cov_type in ['nonrobust', 'HC0', 'HC1']:
            for alpha in [.01, .05, .1]:
                _compare_classes(OLS(fit_intercept=False, cov_type=cov_type).fit(X, y),
                                 StatsModelsOLS(fit_intercept=False,
                                                fit_args={'cov_type': cov_type, 'use_t': False}).fit(X, y),
                                 X_test, alpha=alpha)

        d = 3
        X = np.vstack([np.eye(d), np.ones((1, d)), np.zeros((1, d))])
        y = np.concatenate((X[:, 0] - 1, X[:, 0] + 1))
        X = np.vstack([X, X])
        X_test = np.unique(np.random.binomial(1, .5, size=(n, d)), axis=0)
        for cov_type in ['nonrobust', 'HC0', 'HC1']:
            _compare_classes(OLS(fit_intercept=True, cov_type=cov_type).fit(X, y),
                             StatsModelsOLS(fit_intercept=True,
                                            fit_args={'cov_type': cov_type, 'use_t': False}).fit(X, y), X_test)

        # Multi-dimensional output y
        n = 1000
        d = 3
        for p in np.arange(1, 4):
            X = np.random.binomial(1, .8, size=(n, d))
            T = np.random.binomial(1, .5 * X[:, 0] + .25, size=(n,))

            def true_effect(x):
                return np.hstack([x[:, [0]] + .5 + t for t in range(p)])
            y = true_effect(X) * T.reshape(-1, 1) + X[:, [0] * p] + np.random.normal(0, 1, size=(n, p))

            for cov_type in ['nonrobust', 'HC0', 'HC1']:
                for fit_intercept in [True, False]:
                    for alpha in [.01, .05, .2]:
                        est = OLS(fit_intercept=fit_intercept, cov_type=cov_type).fit(X, y)
                        lr = [StatsModelsOLS(fit_intercept=fit_intercept,
                                             fit_args={'cov_type': cov_type, 'use_t': False}).fit(X, y[:, t])
                              for t in range(p)]
                        for t in range(p):
                            assert np.all(np.abs(est.coef_[t] - lr[t].coef_) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t, est.coef_[t], lr[t].coef_)
                            assert np.all(np.abs(np.array(est.coef__interval(alpha=alpha))[:, t] -
                                                 lr[t].coef__interval(alpha=alpha)) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t,
                                                              np.array(est.coef__interval(alpha=alpha))[:, t],
                                                              lr[t].coef__interval(alpha=alpha))
                            assert np.all(np.abs(est.intercept_[t] - lr[t].intercept_) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t,
                                                              est.intercept_[t], lr[t].intercept_)
                            assert np.all(np.abs(np.array(est.intercept__interval(alpha=alpha))[:, t] -
                                                 lr[t].intercept__interval(alpha=alpha)) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t,
                                                              np.array(est.intercept__interval(alpha=alpha))[:, t],
                                                              lr[t].intercept__interval(alpha=alpha))
                            assert np.all(np.abs(est.predict(X_test)[:, t] - lr[t].predict(X_test)) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t,
                                                              est.predict(X_test)[:, t], lr[t].predict(X_test))
                            assert np.all(np.abs(np.array(est.predict_interval(X_test, alpha=alpha))[:, :, t] -
                                                 lr[t].predict_interval(X_test, alpha=alpha)) < 1e-12),\
                                "{}, {}, {}: {} vs {}".format(cov_type, fit_intercept, t,
                                                              np.array(est.predict_interval(X_test,
                                                                                            alpha=alpha))[:, :, t],
                                                              lr[t].predict_interval(X_test, alpha=alpha))
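
As a compact illustration of the comparison _compare_classes performs, a hedged sketch checking econml's StatsModelsLinearRegression (aliased OLS in this test) against statsmodels directly (the data and tolerances here are illustrative):

import numpy as np
import statsmodels.api as sm
from econml.utilities import StatsModelsLinearRegression

np.random.seed(123)
X = np.random.normal(size=(200, 3))
y = X[:, 0] + np.random.normal(size=(200,))

est = StatsModelsLinearRegression(fit_intercept=True, cov_type='HC0').fit(X, y)
res = sm.OLS(y, sm.add_constant(X)).fit(cov_type='HC0', use_t=False)

# statsmodels puts the constant first; econml exposes it separately
np.testing.assert_allclose(est.intercept_, res.params[0], atol=1e-10)
np.testing.assert_allclose(est.coef_, res.params[1:], atol=1e-10)
np.testing.assert_allclose(est.coef_stderr_, res.bse[1:], atol=1e-10)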
Example 6
    def test_inference(self):
        """ Testing that we recover the expected standard errors and confidence intervals in a known example """

        # 1-d output
        d = 3
        X = np.vstack([np.eye(d)])
        y = X[:, 0]
        est = OLS(fit_intercept=False).fit(X, y)
        assert np.all(np.abs(est.coef_ - [1, 0, 0]) <= 1e-12), "{}, {}".format(est.coef_, [1, 0, 0])
        assert np.all(np.abs(est.coef__interval() - np.array([[1, 0, 0], [1, 0, 0]])) <= 1e-12),\
            "{}, {}".format(est.coef__interval(), np.array([[1, 0, 0], [1, 0, 0]]))
        assert np.all(est.coef_stderr_ <= 1e-12)
        assert np.all(est._param_var <= 1e-12)

        d = 3
        X = np.vstack([np.eye(d), np.ones((1, d)), np.zeros((1, d))])
        y = X[:, 0]
        est = OLS(fit_intercept=True).fit(X, y)
        assert np.all(np.abs(est.coef_ - np.array([1] + [0] * (d - 1))) <=
                      1e-12), "{}, {}".format(est.coef_, [1] + [0] * (d - 1))
        assert np.all(np.abs(est.coef__interval() - np.array([[1] + [0] * (d - 1), [1] + [0] * (d - 1)])) <= 1e-12),\
            "{}, {}".format(est.coef__interval(), np.array([[1] + [0] * (d - 1), [1] + [0] * (d - 1)]))
        assert np.all(est.coef_stderr_ <= 1e-12)
        assert np.all(est._param_var <= 1e-12)
        assert np.abs(est.intercept_) <= 1e-12
        assert np.all(np.abs(est.intercept__interval()) <= 1e-12)

        d = 3
        X = np.vstack([np.eye(d)])
        y = np.concatenate((X[:, 0] - 1, X[:, 0] + 1))
        X = np.vstack([X, X])
        est = OLS(fit_intercept=False).fit(X, y)
        assert np.all(np.abs(est.coef_ - ([1] + [0] * (d - 1))) <=
                      1e-12), "{}, {}".format(est.coef_, [1] + [0] * (d - 1))
        assert np.all(np.abs(est.coef_stderr_ - np.array([1] * d)) <= 1e-12)
        assert np.all(np.abs(est.coef__interval()[0] -
                             np.array([scipy.stats.norm.ppf(.025, loc=1, scale=1)] +
                                      [scipy.stats.norm.ppf(.025, loc=0, scale=1)] * (d - 1))) <= 1e-12),\
            "{}, {}".format(est.coef__interval()[0], np.array([scipy.stats.norm.ppf(.025, loc=1, scale=1)] +
                                                              [scipy.stats.norm.ppf(.025, loc=0, scale=1)] * (d - 1)))
        assert np.all(np.abs(est.coef__interval()[1] -
                             np.array([scipy.stats.norm.ppf(.975, loc=1, scale=1)] +
                                      [scipy.stats.norm.ppf(.975, loc=0, scale=1)] * (d - 1))) <= 1e-12),\
            "{}, {}".format(est.coef__interval()[1], np.array([scipy.stats.norm.ppf(.975, loc=1, scale=1)] +
                                                              [scipy.stats.norm.ppf(.975, loc=0, scale=1)] * (d - 1)))

        # 2-d output
        d = 3
        p = 4
        X = np.vstack([np.eye(d)])
        y = np.vstack((X[:, [0] * p] - 1, X[:, [0] * p] + 1))
        X = np.vstack([X, X])
        est = OLS(fit_intercept=False).fit(X, y)
        for t in range(p):
            assert np.all(np.abs(est.coef_[t] - ([1] + [0] * (d - 1))) <=
                          1e-12), "{}, {}".format(est.coef_[t], [1] + [0] * (d - 1))
            assert np.all(np.abs(est.coef_stderr_[t] - np.array([1] * d)) <= 1e-12), "{}".format(est.coef_stderr_[t])
            assert np.all(np.abs(est.coef__interval()[0][t] -
                                 np.array([scipy.stats.norm.ppf(.025, loc=1, scale=1)] +
                                          [scipy.stats.norm.ppf(.025, loc=0, scale=1)] * (d - 1))) <= 1e-12),\
                "{}, {}".format(est.coef__interval()[0][t],
                                np.array([scipy.stats.norm.ppf(.025, loc=1, scale=1)] +
                                         [scipy.stats.norm.ppf(.025, loc=0, scale=1)] * (d - 1)))
            assert np.all(np.abs(est.coef__interval()[1][t] -
                                 np.array([scipy.stats.norm.ppf(.975, loc=1, scale=1)] +
                                          [scipy.stats.norm.ppf(.975, loc=0, scale=1)] * (d - 1))) <= 1e-12),\
                "{}, {}".format(est.coef__interval()[1][t],
                                np.array([scipy.stats.norm.ppf(.975, loc=1, scale=1)] +
                                         [scipy.stats.norm.ppf(.975, loc=0, scale=1)] * (d - 1)))
            assert np.all(np.abs(est.intercept_[t]) <= 1e-12), "{}".format(est.intercept_[t])
            assert np.all(np.abs(est.intercept_stderr_[t]) <= 1e-12), "{}".format(est.intercept_stderr_[t])
            assert np.all(np.abs(est.intercept__interval()[0][t]) <=
                          1e-12), "{}".format(est.intercept__interval()[0][t])

        d = 3
        p = 4
        X = np.vstack([np.eye(d), np.zeros((1, d))])
        y = np.vstack((X[:, [0] * p] - 1, X[:, [0] * p] + 1))
        X = np.vstack([X, X])
        est = OLS(fit_intercept=True).fit(X, y)
        for t in range(p):
            assert np.all(np.abs(est.coef_[t] - ([1] + [0] * (d - 1))) <=
                          1e-12), "{}, {}".format(est.coef_[t], [1] + [0] * (d - 1))
            assert np.all(np.abs(est.coef_stderr_[t] - np.array([np.sqrt(2)] * d)) <=
                          1e-12), "{}".format(est.coef_stderr_[t])
            assert np.all(np.abs(est.coef__interval()[0][t] -
                                 np.array([scipy.stats.norm.ppf(.025, loc=1, scale=np.sqrt(2))] +
                                          [scipy.stats.norm.ppf(.025, loc=0, scale=np.sqrt(2))] * (d - 1))) <= 1e-12),\
                "{}, {}".format(est.coef__interval()[0][t],
                                np.array([scipy.stats.norm.ppf(.025, loc=1, scale=np.sqrt(2))] +
                                         [scipy.stats.norm.ppf(.025, loc=0, scale=np.sqrt(2))] * (d - 1)))
            assert np.all(np.abs(est.coef__interval()[1][t] -
                                 np.array([scipy.stats.norm.ppf(.975, loc=1, scale=np.sqrt(2))] +
                                          [scipy.stats.norm.ppf(.975, loc=0, scale=np.sqrt(2))] * (d - 1))) <= 1e-12),\
                "{}, {}".format(est.coef__interval()[1][t],
                                np.array([scipy.stats.norm.ppf(.975, loc=1, scale=np.sqrt(2))] +
                                         [scipy.stats.norm.ppf(.975, loc=0, scale=np.sqrt(2))] * (d - 1)))
            assert np.all(np.abs(est.intercept_[t]) <= 1e-12), "{}".format(est.intercept_[t])
            assert np.all(np.abs(est.intercept_stderr_[t] - 1) <= 1e-12), "{}".format(est.intercept_stderr_[t])
            assert np.all(np.abs(est.intercept__interval()[0][t] -
                                 scipy.stats.norm.ppf(.025, loc=0, scale=1)) <= 1e-12),\
                "{}, {}".format(est.intercept__interval()[0][t], scipy.stats.norm.ppf(.025, loc=0, scale=1))