Code example #1
    def transform(self, X):
        # prepend a column of ones to X so an intercept feature is included
        X = hstack([np.ones((shape(X)[0], 1)), X])
        d_x = shape(X)[1]  # note: this is the original d_x + 1 after adding the ones column
        d_y, d_t = self._d_y, self._d_t
        # for each row, create d_y * d_t * d_x features, each a d_y-by-d_t matrix
        # (self._fts holds the d_y * d_t basis matrices, so f * d_x = d_y * d_t * d_x below)
        return reshape(np.einsum('nx,fyt->nfxyt', X, self._fts),
                       (shape(X)[0], d_y * d_t * d_x, d_y, d_t))
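For context, a minimal standalone sketch of the same computation in plain NumPy. `shape`, `reshape`, and `hstack` above are EconML utility helpers that behave like their NumPy counterparts here, and the `fts` array below is a stand-in for `self._fts`; treat the exact basis construction as an assumption.

import numpy as np

def matrix_transform(X, fts, d_y, d_t):
    # prepend an intercept column, then pair every (augmented) input column
    # with every basis matrix to get matrix-valued features
    X = np.hstack([np.ones((X.shape[0], 1)), X])
    out = np.einsum('nx,fyt->nfxyt', X, fts)
    return out.reshape(X.shape[0], d_y * d_t * X.shape[1], d_y, d_t)

X = np.random.normal(size=(5, 2))            # n=5 rows, 2 raw features
fts = np.eye(2 * 3).reshape(2 * 3, 2, 3)     # the 6 basis matrices for d_y=2, d_t=3
print(matrix_transform(X, fts, 2, 3).shape)  # (5, 18, 2, 3)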
Code example #2
File: test_statsmodels.py  Project: tommylegit/EconML
    def fit(self, X, y, sample_weight=None):
        """
        Fit the ordinary least squares model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data
        y : array-like, shape (n_samples, 1) or (n_samples,)
            Target values
        sample_weight : array-like, shape (n_samples,)
            Individual weights for each sample

        Returns
        -------
        self
        """
        # y must be a vector or a single-column matrix; flatten it to 1-D
        assert ndim(y) == 1 or (ndim(y) == 2 and shape(y)[1] == 1)
        y = reshape(y, (-1,))
        if self.fit_intercept:
            # statsmodels does not add an intercept column automatically
            X = add_constant(X, has_constant='add')
        if sample_weight is not None:
            ols = WLS(y, X, weights=sample_weight, hasconst=self.fit_intercept)
        else:
            ols = WLS(y, X, hasconst=self.fit_intercept)
        self.results = ols.fit(**self.fit_args)
        return self
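A self-contained version of this pattern. The wrapper class name and its `fit_intercept`/`fit_args` attributes are hypothetical stand-ins for the test's class; only the statsmodels calls (`WLS`, `add_constant`) are the real API.

import numpy as np
from statsmodels.api import WLS
from statsmodels.tools import add_constant

class StatsModelsOLS:
    """Hypothetical minimal wrapper mirroring the fit method above."""

    def __init__(self, fit_intercept=True, fit_args=None):
        self.fit_intercept = fit_intercept
        self.fit_args = fit_args or {}

    def fit(self, X, y, sample_weight=None):
        y = np.asarray(y).reshape(-1)
        if self.fit_intercept:
            X = add_constant(X, has_constant='add')
        # unweighted OLS is WLS with unit weights
        weights = np.ones(len(y)) if sample_weight is None else sample_weight
        self.results = WLS(y, X, weights=weights, hasconst=self.fit_intercept).fit(**self.fit_args)
        return self

X = np.random.normal(size=(100, 2))
y = X @ np.array([1.0, -2.0]) + 0.5 + 0.1 * np.random.normal(size=100)
print(StatsModelsOLS().fit(X, y).results.params)  # approximately [0.5, 1.0, -2.0]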
Code example #3
    def test_hermite_results(self):
        inputs = np.random.normal(size=(5, 1))
        hf = HermiteFeatures(3).fit_transform(inputs)
        # the first probabilists' Hermite polynomials are 1, x, x^2 - 1, x^3 - 3x
        ones = np.ones(shape(inputs))
        polys = np.hstack([ones, inputs, inputs * inputs - ones, inputs * inputs * inputs - 3 * inputs])
        assert np.allclose(hf, polys * np.exp(-inputs * inputs / 2))

        for j in [True, False]:
            hf = HermiteFeatures(1, shift=1, joint=j).fit_transform(inputs)
            # the first derivatives of e^(-x^2/2) and x*e^(-x^2/2) are -x*e^(-x^2/2)
            # and (1 - x^2)*e^(-x^2/2); with just one column, jointness doesn't matter
            polys = np.hstack([-inputs, -inputs * inputs + ones])
            assert np.allclose(hf, reshape(polys * np.exp(-inputs * inputs / 2), (5, 1, 2)))
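For reference, the expected values in the first assertion can also be produced with NumPy's probabilists' Hermite basis (numpy.polynomial.hermite_e). This sketch only reproduces the target values; it is not HermiteFeatures itself.

import numpy as np
from numpy.polynomial.hermite_e import hermeval

x = np.random.normal(size=(5,))
degree = 3
# He_k(x) * exp(-x^2/2) for k = 0..3, matching the polynomials in the test above
expected = np.stack([hermeval(x, [0] * k + [1]) for k in range(degree + 1)], axis=1)
expected *= np.exp(-x * x / 2)[:, np.newaxis]
print(expected.shape)  # (5, 4)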
Code example #4
    def _test_sparse(n_p, d_w, n_r):
        # need at least as many rows in e_y as there are distinct columns
        # in [X;X⊗W;W⊗W;X⊗e_t] to find a solution for e_t
        assert n_p * n_r >= 2 * n_p + n_p * d_w + d_w * (d_w + 1) / 2
        a = np.random.normal(size=(n_p,))  # one effect per product
        n = n_p * n_r
        p = np.tile(range(n_p), n_r)  # product id

        b = np.random.normal(size=(d_w + n_p,))
        g = np.random.normal(size=(d_w + n_p,))

        x = np.empty((2 * n, n_p))  # product dummies
        w = np.empty((2 * n, d_w))
        y = np.empty(2 * n)
        t = np.empty(2 * n)

        for fold in range(0, 2):
            x_f = OneHotEncoder().fit_transform(np.reshape(p, (-1, 1))).toarray()
            w_f = np.random.normal(size=(n, d_w))
            xw_f = hstack([x_f, w_f])
            e_t_f, e_y_f = TestDML._generate_recoverable_errors(a, x_f, W=w_f)

            # treatment is linear in the product dummies and controls plus noise;
            # the outcome adds the per-product treatment effect a[p]
            t_f = xw_f @ b + e_t_f
            y_f = t_f * np.choose(p, a) + xw_f @ g + e_y_f

            x[fold * n:(fold + 1) * n, :] = x_f
            w[fold * n:(fold + 1) * n, :] = w_f
            y[fold * n:(fold + 1) * n] = y_f
            t[fold * n:(fold + 1) * n] = t_f

        dml = SparseLinearDMLCateEstimator(LinearRegression(fit_intercept=False), LinearRegression(
            fit_intercept=False), featurizer=FunctionTransformer())
        dml.fit(y, t, x, w)

        # note that this would fail for the non-sparse DMLCateEstimator

        np.testing.assert_allclose(a, dml.coef_.reshape(-1))
        eff = reshape(t * np.choose(np.tile(p, 2), a), (-1, 1))
        np.testing.assert_allclose(eff, dml.effect(0, t, x))

        dml = SparseLinearDMLCateEstimator(LinearRegression(fit_intercept=False),
                                           LinearRegression(fit_intercept=False),
                                           featurizer=Pipeline([("id", FunctionTransformer()),
                                                                ("matrix", MatrixFeatures(1, 1))]))
        dml.fit(y, t, x, w)
        np.testing.assert_allclose(eff, dml.effect(0, t, x))
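A minimal sketch of calling the estimator outside the test harness. The class and constructor names follow the older EconML release this test targets (newer releases rename these estimators), so treat the import path as an assumption; the DGP below is illustrative only.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from econml.dml import SparseLinearDMLCateEstimator  # import path is an assumption

n, d_x, d_w = 200, 3, 2
X = np.random.normal(size=(n, d_x))
W = np.random.normal(size=(n, d_w))
beta = np.array([1.0, -0.5, 0.25])                 # heterogeneous effect coefficients
T = X @ np.ones(d_x) + W[:, 0] + np.random.normal(size=n)
Y = (X @ beta) * T + W @ np.ones(d_w) + np.random.normal(size=n)

est = SparseLinearDMLCateEstimator(LinearRegression(fit_intercept=False),
                                   LinearRegression(fit_intercept=False),
                                   featurizer=FunctionTransformer())
est.fit(Y, T, X, W)
print(est.coef_)  # should roughly recover beta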
Code example #5
    def test_complex_features(self):
        # recover simple features by initializing complex features appropriately
        for _ in range(10):
            d_w = np.random.randint(0, 4)
            d_x = np.random.randint(1, 3)
            d_y = np.random.randint(1, 3)
            d_t = np.random.randint(1, 3)
            n = 20
            with self.subTest(d_w=d_w, d_x=d_x, d_y=d_y, d_t=d_t):
                W, X, Y, T = [np.random.normal(size=(n, d)) for d in [d_w, d_x, d_y, d_t]]
                # using the full set of matrix features should be equivalent to using
                # the default (non-matrix) featurizer
                dml = DMLCateEstimator(model_y=LinearRegression(), model_t=LinearRegression(),
                                       featurizer=MatrixFeatures(d_y, d_t))
                dml.fit(Y, T, X, W)
                coef1 = dml.coef_

                dml = DMLCateEstimator(model_y=LinearRegression(), model_t=LinearRegression())
                dml.fit(Y, T, X, W)
                coef2 = dml.coef_

                np.testing.assert_allclose(coef1, reshape(coef2, -1))
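The final assertion flattens coef2 before comparing. A shape-only sketch of that flattening, with hypothetical dimensions (the actual coef_ layout is determined by EconML and is not asserted here):

import numpy as np

d_y, d_t, d_f = 2, 3, 4                         # hypothetical dimensions
coef2 = np.random.normal(size=(d_y, d_t, d_f))  # stand-in for the default featurizer's coef_
coef1 = np.reshape(coef2, -1)                   # row-major flattening, as in the assertion
assert coef1.shape == (d_y * d_t * d_f,)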
Code example #6
File: test_dominicks.py  Project: ashlili0401/econmic
def test_dominicks():
    file_name = "oj_large.csv"
    if not os.path.isfile(file_name):
        print("Downloading file (this might take a few seconds)...")
        urllib.request.urlretrieve(
            "https://msalicedatapublic.blob.core.windows.net/datasets/OrangeJuice/oj_large.csv", file_name)
    oj_data = pd.read_csv(file_name)

    brands = sorted(set(oj_data["brand"]))
    stores = sorted(set(oj_data["store"]))

    featnames = ["week", "feat"] + list(oj_data.columns[6:])

    # Preprocess data
    import numpy as np

    # Optionally convert 'week' to a date (left disabled; would require `import datetime`)
    # week_zero = datetime.datetime.strptime("09/07/89", "%m/%d/%y")
    # oj_data["week"] = pd.to_timedelta(oj_data["week"], unit='w') + week_zero

    # Take log of price
    oj_data["logprice"] = np.log(oj_data["price"])
    oj_data.drop("price", axis=1, inplace=True)

    # Make brand numeric
    oj_data["brand"] = [brands.index(b) for b in oj_data["brand"]]

    class PriceFeaturizer(TransformerMixin):
        def __init__(self, n_prods, own_price=True,
                     cross_price_groups=False, cross_price_indiv=True, per_product_effects=True):
            base_arrays = []
            effect_names = []
            one_hots = [(0,) * p + (1,) + (0,) * (n_prods - p - 1) for p in range(n_prods)]
            if own_price:
                base_arrays.append(np.eye(n_prods))
                effect_names.append("own price")
            if cross_price_groups:
                base_arrays.append((np.ones((n_prods, n_prods)) - np.eye(n_prods)) / (n_prods - 1))
                effect_names.append("group cross price")
            if cross_price_indiv:
                for p in range(n_prods):
                    base_arrays.append(one_hots[p] * np.ones((n_prods, 1)) - np.diag(one_hots[p]))
                    effect_names.append("cross price effect {} ->".format(p))
            if per_product_effects:
                candidates = [(np.diag(one_hots[p]) @ arr, nm + " {}".format(p))
                              for arr, nm in zip(base_arrays, effect_names) for p in range(n_prods)]
                # remove meaningless features (e.g. cross-price effects of products on themselves),
                # which have all-zero coefficients
                nonempty = [(arr, nm) for arr, nm in candidates if np.count_nonzero(arr) > 0]
                self._features = [arr for arr, _ in nonempty]
                self._names = [nm for _, nm in nonempty]
            else:
                self._features = base_arrays
                self._names = effect_names

        def fit(self, X):
            self._is_fitted = True
            # this featurizer ignores the contents of X (it must have zero columns);
            # the emitted features depend only on the number of rows
            assert shape(X)[1] == 0
            return self

        def transform(self, X):
            assert self._is_fitted
            assert shape(X)[1] == 0
            # emit one copy of the fixed feature matrices per row
            return np.tile(self._features, (shape(X)[0], 1, 1, 1))

        @property
        def names(self):
            return self._names

    for name, op, xp_g, xp_i, pp in [("Homogeneous treatment effect", True, False, False, False),
                                     ("Heterogeneous treatment effects", True, False, False, True),
                                     (("Heterogeneous treatment effects"
                                       " with group effects"), True, True, False, True),
                                     (("Heterogeneous treatment effects"
                                       " with cross price effects"), True, False, True, True)]:

        print(name)
        np.random.seed(42)

        ft = PriceFeaturizer(n_prods=3, own_price=op, cross_price_groups=xp_g,
                             cross_price_indiv=xp_i, per_product_effects=pp)
        names = ft.names
        dml = LinearDMLCateEstimator(model_y=RandomForestRegressor(),
                                     model_t=RandomForestRegressor(),
                                     featurizer=ft,
                                     n_splits=2)

        effects = []
        for store in stores:
            data = oj_data[oj_data['store'] == store].sort_values(by=['week', 'brand'])
            dml.fit(T=reshape(data[["logprice"]].values, (-1, 3)),
                    Y=reshape(data[["logmove"]].values, (-1, 3)),
                    W=reshape(data[featnames].values, (-1, 3 * len(featnames))))
            effects.append(dml.coef_)
        effects = np.array(effects)
        for nm, eff in zip(names, effects.T):
            print(" Effect: {}".format(nm))
            print("   Mean: {}".format(np.mean(eff)))
            print("   Std.: {}".format(np.std(eff)))

    class ConstFt(TransformerMixin):
        def fit(self, X):
            return self

        def transform(self, X):
            return np.ones((shape(X)[0], 1))

    print("Vanilla HTE+XP")

    np.random.seed(42)
    dml = LinearDMLCateEstimator(model_y=RandomForestRegressor(),
                                 model_t=RandomForestRegressor(),
                                 featurizer=ConstFt(),
                                 n_splits=2)

    effects = []
    for store in stores:
        data = oj_data[oj_data['store'] == store].sort_values(by=['week', 'brand'])
        dml.fit(T=reshape(data[["logprice"]].values, (-1, 3)),
                Y=reshape(data[["logmove"]].values, (-1, 3)),
                W=reshape(data[featnames].values, (-1, 3 * len(featnames))))
        effects.append(dml.coef_)
    effects = np.array(effects)
    names = ["{} on {}".format(i, j) for j in range(3) for i in range(3)]
    for nm, eff in zip(names, reshape(effects, (-1, 9)).T):
        print(" Effect: {}".format(nm))
        print("   Mean: {}".format(np.mean(eff)))
        print("   Std.: {}".format(np.std(eff)))
Code example #7
File: linear_model.py  Project: liwusen/EconML
    def predict(self, X):
        predictions = self.model.predict(X)
        # restore the single-column shape that was flattened at fit time, if needed
        return reshape(predictions, (-1, 1)) if self.needs_unravel else predictions
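The snippet relies on a needs_unravel flag set at fit time. A hypothetical self-contained wrapper showing the whole pattern (the class name and fit logic are illustrative, not the project's code): sklearn models return 1-D predictions when fit on 1-D targets, so the (n, 1) target shape is recorded at fit time and restored at predict time.

import numpy as np
from sklearn.linear_model import LinearRegression

class ShapePreservingModel:
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        # remember whether y arrived as a single-column matrix
        self.needs_unravel = np.ndim(y) == 2 and np.shape(y)[1] == 1
        self.model.fit(X, np.ravel(y) if self.needs_unravel else y)
        return self

    def predict(self, X):
        predictions = self.model.predict(X)
        return np.reshape(predictions, (-1, 1)) if self.needs_unravel else predictions

X = np.random.normal(size=(10, 2))
y = np.random.normal(size=(10, 1))
print(ShapePreservingModel(LinearRegression()).fit(X, y).predict(X).shape)  # (10, 1)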