Example #1
    def test_can_use_vectors(self):
        """Test that we can pass vectors for T and Y (not only 2-dimensional arrays)."""
        dml = DMLCateEstimator(LinearRegression(),
                               LinearRegression(),
                               featurizer=FunctionTransformer())
        dml.fit(np.array([1, 2, 3, 1, 2, 3]), np.array([1, 2, 3, 1, 2, 3]),
                np.ones((6, 1)))
        self.assertAlmostEqual(dml.coef_.reshape(())[()], 1)
Example #2
    def test_cate_api(self):
        """Test that we correctly implement the CATE API."""
        for d_t in [2, -1]:
            for d_y in [2, -1]:
                for d_x in [2, None]:
                    for d_w in [2, None]:
                        for est in [
                                DMLCateEstimator(model_y=LinearRegression(),
                                                 model_t=LinearRegression()),
                                SparseLinearDMLCateEstimator(
                                    linear_model_y=LinearRegression(),
                                    linear_model_t=LinearRegression()),
                                KernelDMLCateEstimator(
                                    model_y=LinearRegression(),
                                    model_t=LinearRegression())
                        ]:
                            n = 20
                            with self.subTest(d_w=d_w,
                                              d_x=d_x,
                                              d_y=d_y,
                                              d_t=d_t):
                                # d > 0 -> 2-D array, d < 0 -> vector, None -> argument omitted
                                W, X, Y, T = [np.random.normal(size=(n, d)) if (d and d >= 0)
                                              else np.random.normal(size=(n,)) if (d and d < 0)
                                              else None
                                              for d in [d_w, d_x, d_y, d_t]]

                                est.fit(Y, T, X, W)
                                # just make sure we can call the marginal_effect and effect methods
                                est.marginal_effect(None, X)
                                est.effect(0, T, X)
Example #3
    def test_discrete_treatments(self):
        """Test that we can use discrete treatments"""
        dml = DMLCateEstimator(LinearRegression(),
                               LogisticRegression(C=1000),
                               featurizer=FunctionTransformer(),
                               discrete_treatment=True)
        # create a simple artificial setup where the effect of moving from treatment
        #     1 -> 2 is 2,
        #     1 -> 3 is 1, and
        #     2 -> 3 is -1 (necessarily, by composing the previous two effects)
        # Using an uneven number of examples from different classes,
        # and having the treatments in non-lexicographic order,
        # should rule out some basic issues.
        dml.fit(np.array([2, 3, 1, 3, 2, 1, 1, 1]),
                np.array([3, 2, 1, 2, 3, 1, 1, 1]), np.ones((8, 1)))
        np.testing.assert_almost_equal(
            dml.effect(np.array([1, 1, 1, 2, 2, 2, 3, 3, 3]),
                       np.array([1, 2, 3, 1, 2, 3, 1, 2, 3]), np.ones((9, 1))),
            [0, 2, 1, -2, 0, -1, -1, 1, 0])
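
A quick sanity check on the expected vector in the assertion above: the training data pairs each treatment level with a fixed outcome (1 -> 1, 2 -> 3, 3 -> 2), so each pairwise effect is just a difference of those baselines. A minimal sketch of that arithmetic (the baseline dict below is read off the training arrays, not part of the econml API):

import numpy as np

# mean outcome per treatment level, read off the training data above
baseline = {1: 1, 2: 3, 3: 2}

T0 = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])
T1 = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])

# effect(T0 -> T1) = baseline[T1] - baseline[T0]
expected = np.array([baseline[t1] - baseline[t0] for t0, t1 in zip(T0, T1)])
print(expected)  # [ 0  2  1 -2  0 -1 -1  1  0]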
Example #4
    def test_ignores_final_intercept(self):
        """Test that final model intercepts are ignored (with a warning)"""
        class InterceptModel:
            def fit(self, Y, X):
                pass

            def predict(self, X):
                return X + 1

        # (incorrectly) use a final model with an intercept
        dml = DMLCateEstimator(LinearRegression(),
                               LinearRegression(),
                               model_final=InterceptModel())
        # Because the final model is fixed, the actual values of T and Y don't matter
        t = np.random.normal(size=100)
        y = np.random.normal(size=100)
        with self.assertWarns(Warning):  # we should warn whenever there's an intercept
            dml.fit(y, t)
        assert dml.const_marginal_effect() == 1  # coefficient on X in InterceptModel is 1
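
Why is an intercept in the final model suspect at all? In double ML the final stage regresses outcome residuals on treatment residuals, and both residuals are (approximately) mean-zero by construction, so a correctly specified final model has nothing for an intercept to pick up. A minimal sketch of that residual-on-residual stage (synthetic data, no cross-fitting; purely illustrative, not econml's internal code):

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
x = rng.normal(size=(1000, 1))
t = 0.7 * x[:, 0] + rng.normal(size=1000)
y = 2.0 * t + 0.5 * x[:, 0] + rng.normal(size=1000)

# first stage: residualize Y and T on X (cross-fitting omitted for brevity)
res_y = y - LinearRegression().fit(x, y).predict(x)
res_t = t - LinearRegression().fit(x, t).predict(x)

# final stage: residual-on-residual regression; the residuals are mean-zero,
# so the intercept is deliberately forced to zero
final = LinearRegression(fit_intercept=False).fit(res_t.reshape(-1, 1), res_y)
print(final.coef_)  # close to the true effect of 2.0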
Example #5
    def test_complex_features(self):
        # recover simple features by initializing complex features appropriately
        for _ in range(10):
            d_w = np.random.randint(0, 4)
            d_x = np.random.randint(1, 3)
            d_y = np.random.randint(1, 3)
            d_t = np.random.randint(1, 3)
            n = 20
            with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t):
                W, X, Y, T = [np.random.normal(size=(n, d)) for d in [d_w, d_x, d_y, d_t]]
                # using full set of matrix features should be equivalent to using non-matrix featurizer
                dml = DMLCateEstimator(model_y=LinearRegression(), model_t=LinearRegression(),
                                       featurizer=MatrixFeatures(d_y, d_t))
                dml.fit(Y, T, X, W)
                coef1 = dml.coef_

                dml = DMLCateEstimator(model_y=LinearRegression(), model_t=LinearRegression())
                dml.fit(Y, T, X, W)
                coef2 = dml.coef_

                np.testing.assert_allclose(coef1, reshape(coef2, -1))
Example #6
    def test_with_econml(self):
        """
        Test that we can bootstrap econml estimators
        """
        x = np.random.normal(size=(1000, 2))
        t = np.random.normal(size=(1000, 1))
        t2 = np.random.normal(size=(1000, 1))
        y = x[:, 0:1] * 0.5 + t + np.random.normal(size=(1000, 1))  # x[:, 0:1] keeps y 2-D, matching t

        est = DMLCateEstimator(LinearRegression(), LinearRegression())
        est.fit(y, t, x)

        bs = BootstrapEstimator(est, 50)
        # test that we can fit with the same arguments as the base estimator
        bs.fit(y, t, x)

        # test that we can get the same attribute for the bootstrap as the original, with the same shape
        self.assertEqual(np.shape(est.coef_), np.shape(bs.coef_))

        # test that we can get an interval for the same attribute for the bootstrap as the original,
        # with the same shape for the lower and upper bounds
        lower, upper = bs.coef__interval()
        for bound in [lower, upper]:
            self.assertEqual(np.shape(est.coef_), np.shape(bound))

        # test that we can do the same thing once we provide percentile bounds
        lower, upper = bs.coef__interval(lower=10, upper=90)
        for bound in [lower, upper]:
            self.assertEqual(np.shape(est.coef_), np.shape(bound))

        # test that we can do the same thing with the results of a method, rather than an attribute
        self.assertEqual(np.shape(est.effect(t, t2, x)),
                         np.shape(bs.effect(t, t2, x)))

        # test that we can get an interval for the same attribute for the bootstrap as the original,
        # with the same shape for the lower and upper bounds
        lower, upper = bs.effect_interval(t, t2, x)
        for bound in [lower, upper]:
            self.assertEqual(np.shape(est.effect(t, t2, x)), np.shape(bound))

        # test that we can do the same thing once we provide percentile bounds
        lower, upper = bs.effect_interval(t, t2, x, lower=10, upper=90)
        for bound in [lower, upper]:
            self.assertEqual(np.shape(est.effect(t, t2, x)), np.shape(bound))
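
The interval methods above return percentile bootstrap bounds. A minimal sketch of how such bounds can be computed from resample-and-refit estimates (an illustration of the general technique, not BootstrapEstimator's actual implementation):

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
x = rng.normal(size=(1000, 1))
y = 0.5 * x[:, 0] + rng.normal(size=1000)

# resample-and-refit: one coefficient vector per bootstrap replication
coefs = []
for _ in range(50):
    idx = rng.integers(0, len(y), size=len(y))
    coefs.append(LinearRegression().fit(x[idx], y[idx]).coef_)
coefs = np.array(coefs)

# percentile interval over the replication axis, matching lower=10, upper=90
lower, upper = np.percentile(coefs, [10, 90], axis=0)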
Example #7
    def test_internal(self):
        """Test that the internal use of bootstrap within an estimator works."""
        x = np.random.normal(size=(1000, 2))
        t = np.random.normal(size=(1000, 1))
        t2 = np.random.normal(size=(1000, 1))
        y = x[:, 0:1] * 0.5 + t + np.random.normal(size=(1000, 1))  # x[:, 0:1] keeps y 2-D, matching t

        est = DMLCateEstimator(LinearRegression(),
                               LinearRegression(),
                               inference='bootstrap')
        est.fit(y, t, x)

        # test that we can get an interval for the same attribute for the bootstrap as the original,
        # with the same shape for the lower and upper bounds
        lower, upper = est.effect_interval(x, t, t2)
        for bound in [lower, upper]:
            self.assertEqual(np.shape(est.effect(x, t, t2)), np.shape(bound))
        self.assertFalse(np.allclose(lower, upper))

        # test that we can do the same thing once we provide percentile bounds
        lower, upper = est.effect_interval(x, t, t2, lower=10, upper=90)
        for bound in [lower, upper]:
            self.assertEqual(np.shape(est.effect(x, t, t2)), np.shape(bound))
        self.assertFalse(np.allclose(lower, upper))
Example #8
'''
EconML from Microsoft:
a Python econometrics library with ML automation for causal inference
'''

# Double ML
from econml.dml import DMLCateEstimator
from sklearn.linear_model import LassoCV

est = DMLCateEstimator(model_y=LassoCV(), model_t=LassoCV())
est.fit(Y, T, X, W) # W -> high-dimensional confounders, X -> features
treatment_effects = est.effect(X_test)

# Orthogonal Random Forest

from econml.ortho_forest import ContinuousTreatmentOrthoForest
# Use defaults
est = ContinuousTreatmentOrthoForest()
# Or specify hyperparameters
est = ContinuousTreatmentOrthoForest(n_trees=500, min_leaf_size=10, max_depth=10, 
                                     subsample_ratio=0.7, lambda_reg=0.01,
                                     model_T=LassoCV(cv=3), model_Y=LassoCV(cv=3)
                                     )
est.fit(Y, T, X, W)
treatment_effects = est.effect(X_test)
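
The snippet above assumes Y, T, X, W, and X_test already exist. A minimal sketch of synthetic inputs with compatible shapes (all dimensions and the data-generating process are assumptions, chosen only so the calls run):

import numpy as np

n = 1000
X = np.random.normal(size=(n, 5))        # features driving effect heterogeneity
W = np.random.normal(size=(n, 30))       # high-dimensional confounders
T = X[:, 0] + W[:, 0] + np.random.normal(size=n)             # treatment
Y = (1 + X[:, 0]) * T + W[:, 0] + np.random.normal(size=n)   # outcome
X_test = np.random.normal(size=(10, 5))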
Example #9
    def test_cate_api(self):
        """Test that we correctly implement the CATE API."""
        d_w = 2
        d_x = 2
        d_y = 2
        d_t = 2
        n = 20
        with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t):
            W, X, Y, T = [np.random.normal(size=(n, d)) for d in [d_w, d_x, d_y, d_t]]

            dml = DMLCateEstimator(model_y=LinearRegression(), model_t=LinearRegression())
            dml.fit(Y, T, X, W)
            # just make sure we can call the marginal_effect and effect methods
            dml.marginal_effect(None, X)
            dml.effect(0, T, X)

            dml = DMLCateEstimator(model_y=LinearRegression(), model_t=LinearRegression(),
                                   featurizer=MatrixFeatures(d_y, d_t))
            dml.fit(Y, T, X, W)
            # just make sure we can call the marginal_effect and effect methods
            dml.marginal_effect(None, X)
            dml.effect(0, T, X)
Example #10
def test_dominicks():
    file_name = "oj_large.csv"
    if not os.path.isfile(file_name):
        print("Downloading file (this might take a few seconds)...")
        urllib.request.urlretrieve(
            "https://msalicedatapublic.blob.core.windows.net/datasets/OrangeJuice/oj_large.csv",
            file_name)
    oj_data = pd.read_csv(file_name)

    brands = sorted(set(oj_data["brand"]))
    stores = sorted(set(oj_data["store"]))

    featnames = ["week", "feat"] + list(oj_data.columns[6:])

    # Preprocess data
    import datetime
    import numpy as np

    # Convert 'week' to a date
    # week_zero = datetime.datetime.strptime("09/07/89", "%m/%d/%y")
    # oj_data["week"] = pd.to_timedelta(oj_data["week"], unit='w') + week_zero

    # Take log of price
    oj_data["logprice"] = np.log(oj_data["price"])
    oj_data.drop("price", axis=1, inplace=True)

    # Make brand numeric
    oj_data["brand"] = [brands.index(b) for b in oj_data["brand"]]

    class PriceFeaturizer(TransformerMixin):
        def __init__(self,
                     n_prods,
                     own_price=True,
                     cross_price_groups=False,
                     cross_price_indiv=True,
                     per_product_effects=True):
            base_arrays = []
            effect_names = []
            one_hots = [(0, ) * p + (1, ) + (0, ) * (n_prods - p - 1)
                        for p in range(n_prods)]
            if own_price:
                base_arrays.append(np.eye(n_prods))
                effect_names.append("own price")
            if cross_price_groups:
                base_arrays.append((np.ones(
                    (n_prods, n_prods)) - np.eye(n_prods)) / (n_prods - 1))
                effect_names.append("group cross price")
            if cross_price_indiv:
                for p in range(n_prods):
                    base_arrays.append(one_hots[p] * np.ones((n_prods, 1)) -
                                       np.diag(one_hots[p]))
                    effect_names.append("cross price effect {} ->".format(p))
            if per_product_effects:
                per_prod = [(np.diag(one_hots[p]) @ arr, nm + " {}".format(p))
                            for arr, nm in zip(base_arrays, effect_names)
                            for p in range(n_prods)]
                # remove meaningless features (e.g. cross-price effects of products on themselves),
                # which have all zero coeffs
                nonempty = [(arr, nm) for arr, nm in per_prod
                            if np.count_nonzero(arr) > 0]
                self._features = [arr for arr, _ in nonempty]
                self._names = [nm for _, nm in nonempty]
            else:
                self._features = base_arrays
                self._names = effect_names

        def fit(self, X):
            self._is_fitted = True
            assert shape(X)[1] == 0
            return self

        def transform(self, X):
            assert self._is_fitted
            assert shape(X)[1] == 0
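            # tile the fixed per-product effect matrices across rows:
            # result has shape (n_rows, n_features, n_prods, n_prods)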
            return np.tile(self._features, (shape(X)[0], 1, 1, 1))

        @property
        def names(self):
            return self._names

    for name, op, xp_g, xp_i, pp in [
        ("Homogeneous treatment effect", True, False, False, False),
        ("Heterogeneous treatment effects", True, False, False, True),
        (("Heterogeneous treatment effects"
          " with group effects"), True, True, False, True),
        (("Heterogeneous treatment effects"
          " with cross price effects"), True, False, True, True)
    ]:

        print(name)
        np.random.seed(42)

        ft = PriceFeaturizer(n_prods=3,
                             own_price=op,
                             cross_price_groups=xp_g,
                             cross_price_indiv=xp_i,
                             per_product_effects=pp)
        names = ft.names
        dml = DMLCateEstimator(model_y=RandomForestRegressor(),
                               model_t=RandomForestRegressor(),
                               featurizer=ft,
                               n_splits=2)

        effects = []
        for store in stores:
            data = oj_data[oj_data['store'] == store].sort_values(
                by=['week', 'brand'])
            dml.fit(T=reshape(data[["logprice"]].values, (-1, 3)),
                    Y=reshape(data[["logmove"]].values, (-1, 3)),
                    W=reshape(data[featnames].values,
                              (-1, 3 * len(featnames))))
            effects.append(dml.coef_)
        effects = np.array(effects)
        for nm, eff in zip(names, effects.T):
            print(" Effect: {}".format(nm))
            print("   Mean: {}".format(np.mean(eff)))
            print("   Std.: {}".format(np.std(eff)))

    class ConstFt(TransformerMixin):
        def fit(self, X):
            return self

        def transform(self, X):
            return np.ones((shape(X)[0], 1))

    print("Vanilla HTE+XP")

    np.random.seed(42)
    dml = DMLCateEstimator(model_y=RandomForestRegressor(),
                           model_t=RandomForestRegressor(),
                           featurizer=ConstFt(),
                           n_splits=2)

    effects = []
    for store in stores:
        data = oj_data[oj_data['store'] == store].sort_values(
            by=['week', 'brand'])
        dml.fit(T=reshape(data[["logprice"]].values, (-1, 3)),
                Y=reshape(data[["logmove"]].values, (-1, 3)),
                W=reshape(data[featnames].values, (-1, 3 * len(featnames))))
        effects.append(dml.coef_)
    effects = np.array(effects)
    names = ["{} on {}".format(i, j) for j in range(3) for i in range(3)]
    for nm, eff in zip(names, reshape(effects, (-1, 9)).T):
        print(" Effect: {}".format(nm))
        print("   Mean: {}".format(np.mean(eff)))
        print("   Std.: {}".format(np.std(eff)))
Example #11
    def test_access_to_internal_models(self):
        """
        Test that the API related to accessing the nuisance models, cate_model and featurizer is working.
        """
        Y = np.array([2, 3, 1, 3, 2, 1, 1, 1])
        T = np.array([3, 2, 1, 2, 1, 2, 1, 3])
        X = np.ones((8, 1))
        est = DMLCateEstimator(model_y=WeightedLasso(),
                               model_t=LogisticRegression(),
                               model_final=WeightedLasso(),
                               featurizer=PolynomialFeatures(degree=2, include_bias=False),
                               fit_cate_intercept=True,
                               discrete_treatment=True)
        est.fit(Y, T, X)
        assert isinstance(est.original_featurizer, PolynomialFeatures)
        assert isinstance(est.featurizer, Pipeline)
        assert isinstance(est.model_cate, WeightedLasso)
        for mdl in est.models_y:
            assert isinstance(mdl, WeightedLasso)
        for mdl in est.models_t:
            assert isinstance(mdl, LogisticRegression)
        np.testing.assert_array_equal(est.cate_feature_names(['A']),
                                      ['A', 'A^2'])
        np.testing.assert_array_equal(est.cate_feature_names(), ['x0', 'x0^2'])
        est = DMLCateEstimator(model_y=WeightedLasso(),
                               model_t=LogisticRegression(),
                               model_final=WeightedLasso(),
                               featurizer=None,
                               fit_cate_intercept=True,
                               discrete_treatment=True)
        est.fit(Y, T, X)
        assert est.original_featurizer is None
        assert isinstance(est.featurizer, FunctionTransformer)
        assert isinstance(est.model_cate, WeightedLasso)
        for mdl in est.models_y:
            assert isinstance(mdl, WeightedLasso)
        for mdl in est.models_t:
            assert isinstance(mdl, LogisticRegression)
        np.testing.assert_array_equal(est.cate_feature_names(['A']), ['A'])