def _nuisance_estimates(self, y, T, X, Z):
    """
    Compute out-of-fold nuisance quantities, one fold at a time.

    For every (train, test) pair yielded by ``self._get_split_enum`` the
    fold-specific models are fitted on the train rows and evaluated on the
    test rows only.

    Parameters
    ----------
    y : outcome
    T : treatment
    X : features/controls
    Z : instrument; the 0/1 counterfactual columns built below and the
        returned ``2 * Z - 1`` suggest a 0/1 coding (assumption - confirm
        against the caller)

    Returns
    -------
    prel_theta : preliminary effect estimates from ``self.prel_model_effect``
    res_t : T minus the average of the treatment predictions at Z=1 and Z=0
    res_y : y minus the prediction of ``self.model_Y_X``
    2 * Z - 1 : affine recoding of the instrument
    delta : half the difference between the Z=1 and Z=0 treatment predictions
    """
    n_rows = y.shape[0]
    theta_prelim = np.zeros(n_rows)
    treatment_resid = np.zeros(n_rows)
    outcome_resid = np.zeros(n_rows)
    half_gap = np.zeros(n_rows)

    for fold, (train, test) in enumerate(self._get_split_enum(y, T, X, Z)):
        # Preliminary theta, fitted on the train rows and evaluated on the test rows
        fitted_prel = self.prel_model_effect[fold].fit(
            y[train], T[train], X[train], Z[train])
        theta_prelim[test] = fitted_prel.effect(X[test]).flatten()

        # Treatment model on [X, Z]; it is then queried with the instrument
        # column forced to one and to zero for every test row
        self.model_T_XZ[fold].fit(
            hstack([X[train], Z[train].reshape(-1, 1)]), T[train])
        n_test = Z[test].shape[0]
        all_ones = np.ones((n_test, 1))
        all_zeros = np.zeros((n_test, 1))
        t_given_one = self.model_T_XZ[fold].predict(hstack([X[test], all_ones]))
        t_given_zero = self.model_T_XZ[fold].predict(hstack([X[test], all_zeros]))

        half_gap[test] = (t_given_one - t_given_zero) / 2
        # p(X) is taken as the midpoint of the two counterfactual predictions
        t_midpoint = (t_given_one + t_given_zero) / 2
        treatment_resid[test] = T[test] - t_midpoint

        # Residual Y_res = Y - q(X) = Y - E[Y | X], again out of fold
        fitted_y = self.model_Y_X[fold].fit(X[train], y[train])
        outcome_resid[test] = y[test] - fitted_y.predict(X[test])

    return theta_prelim, treatment_resid, outcome_resid, 2 * Z - 1, half_gap
def _generate_recoverable_errors(a_X, X, a_W=None, W=None, featurizer=None):
    """
    Return error vectors e_t and e_y such that OLS can recover the true coefficients from both stages.

    Parameters
    ----------
    a_X : coefficients applied to the cross product of the featurized X with e_t
    X : feature matrix
    a_W : coefficients applied to the cross product of W with e_t;
        zeros (one per column of W) when omitted
    W : control matrix; an empty matrix with as many rows as X when omitted
    featurizer : transformer applied to X through ``fit_transform``. When
        omitted a fresh ``FunctionTransformer()`` is created on every call,
        so no fitted transformer instance is shared between calls.

    Returns
    -------
    e_t, e_y : the two error vectors obtained from ``rand_sol``
    """
    # A default instance in the signature would be created once and re-fitted
    # by every call; build it here instead.
    if featurizer is None:
        featurizer = FunctionTransformer()
    if W is None:
        W = np.empty((shape(X)[0], 0))
    if a_W is None:
        a_W = np.zeros((shape(W)[1], ))

    # to correctly recover coefficients for T via OLS, we need e_t to be orthogonal to [W;X]
    WX = hstack([W, X])
    e_t = rand_sol(WX.T, np.zeros((shape(WX)[1], )))

    # to correctly recover coefficients for Y via OLS, we need ([X; W]⊗[1; ϕ(X); W])⁺ e_y =
    #                                                          -([X; W]⊗[1; ϕ(X); W])⁺ ((ϕ(X)⊗e_t)a_X+(W⊗e_t)a_W)
    # then, to correctly recover a in the third stage, we additionally need (ϕ(X)⊗e_t)ᵀ e_y = 0
    phi = featurizer.fit_transform(X)
    v_X = cross_product(phi, e_t)
    v_W = cross_product(W, e_t)

    M = np.linalg.pinv(
        cross_product(WX, hstack([np.ones((shape(WX)[0], 1)), phi, W])))
    e_y = rand_sol(
        vstack([M, v_X.T]),
        vstack([-M @ (v_X @ a_X + v_W @ a_W), np.zeros((shape(v_X)[1], ))]))

    return e_t, e_y
def fit(self, y, T, X, Z):
    """
    Cross-fit the nuisance models and store a single effect estimate.

    Parameters
    ----------
    y : outcome
    T : treatment (single dimensional)
    X : features/controls
    Z : instrument

    Returns
    -------
    self, with ``_effect`` and ``_std`` set.

    Raises
    ------
    AssertionError
        If the treatment or outcome has more than one column, or if the
        instrument has more than one column while ``self.binary_instrument``
        is set.
    """
    if len(T.shape) > 1 and T.shape[1] > 1:
        raise AssertionError("Can only accept single dimensional treatment")
    if len(y.shape) > 1 and y.shape[1] > 1:
        raise AssertionError("Can only accept single dimensional outcome")
    if len(Z.shape) == 1:
        Z = Z.reshape(-1, 1)
    if (Z.shape[1] > 1) and self.binary_instrument:
        raise AssertionError(
            "Binary instrument flag is True, but instrument is multi-dimensional")

    T = T.flatten()
    y = y.flatten()

    n_rows = y.shape[0]
    t_given_x = np.zeros(n_rows)
    t_given_xz = np.zeros(n_rows)
    y_resid = np.zeros(n_rows)

    if self.n_splits == 1:
        # No cross-fitting: a single fold that trains and predicts on every row
        folds = [tuple(np.arange(X.shape[0]) for _ in range(2))]
    # TODO. Deal with multi-class instrument
    elif self.binary_instrument or self.binary_treatment:
        # Stratify on the flagged binary variables; an unflagged variable
        # is multiplied by zero and drops out of the label
        strata = 2 * T * self.binary_treatment + \
            Z.flatten() * self.binary_instrument
        folds = StratifiedKFold(n_splits=self.n_splits,
                                shuffle=True).split(X, strata)
    else:
        folds = KFold(n_splits=self.n_splits, shuffle=True).split(X)

    for fold, (train, test) in enumerate(folds):
        # Nuisances: fitted on the train rows, predicted on the test rows
        t_x_model = self.model_T_X[fold].fit(X[train], T[train])
        t_given_x[test] = t_x_model.predict(X[test])

        t_xz_model = self.model_T_XZ[fold].fit(
            hstack([X[train], Z[train]]), T[train])
        t_given_xz[test] = t_xz_model.predict(hstack([X[test], Z[test]]))

        y_x_model = self.model_Y_X[fold].fit(X[train], y[train])
        y_resid[test] = y[test] - y_x_model.predict(X[test])

    # Estimate E[T_res | Z_res]
    z_resid = t_given_xz - t_given_x
    t_resid = T - t_given_x

    numerator_terms = y_resid * z_resid
    denominator = np.mean(t_resid * z_resid)
    self._effect = np.mean(numerator_terms) / denominator
    self._std = np.std(numerator_terms) / \
        (np.sqrt(y_resid.shape[0]) * np.abs(denominator))

    return self
def effect(self, X, T0=0, T1=1):
    """
    Difference in the final model's predictions between treatments T1 and T0.

    Parameters
    ----------
    X : features
    T0 : baseline treatment; either a scalar, which is repeated for every
        row of X, or a sequence/array with one value per row
    T1 : target treatment; same conventions as T0

    Returns
    -------
    ``self.model_final.predict`` evaluated on [T1, X] minus the same
    prediction on [T0, X].
    """
    n_rows = X.shape[0]

    def as_column(treatment):
        # np.asarray lets plain lists and tuples through, and testing ndim
        # (rather than the presence of __len__) keeps 0-d arrays on the
        # scalar path instead of reshaping them into a single row.
        values = np.asarray(treatment)
        if values.ndim == 0:
            values = np.ones(n_rows) * values
        return values.reshape(-1, 1)

    # The treatment goes first, followed by the features
    X0 = hstack([as_column(T0), X])
    X1 = hstack([as_column(T1), X])
    return self.model_final.predict(X1) - self.model_final.predict(X0)
def transform(self, X):
    """
    Expand each row of X into a stack of d_y-by-d_t matrix-valued features.

    A column of ones is prepended to X, and every entry of the augmented row
    is then multiplied with every matrix in ``self._fts``.

    Parameters
    ----------
    X : feature matrix with one row per sample

    Returns
    -------
    Array reshaped to (n_rows, d_y * d_t * n_cols, d_y, d_t), where n_cols
    counts the columns of X plus the added column of ones.
    """
    n_rows = shape(X)[0]
    # add column of ones to X
    augmented = hstack([np.ones((n_rows, 1)), X])
    n_cols = shape(augmented)[1]

    # for each row, create the features (which are matrices of size d_y by d_t):
    # an outer product with no summed index, ordered template-major within a row
    outer = np.einsum('nx,fyt->nfxyt', augmented, self._fts)
    target_shape = (n_rows, self._d_y * self._d_t * n_cols, self._d_y, self._d_t)
    return reshape(outer, target_shape)
def _nuisance_estimates(self, y, T, X, Z):
    """
    Compute out-of-fold nuisance quantities in two passes over the same folds.

    The first pass fills the projected treatment h(X, Z) for every row; the
    second pass needs that projection on its *train* rows, which is why the
    two passes cannot be merged.

    Parameters
    ----------
    y : outcome
    T : treatment
    X : features/controls
    Z : instrument

    Returns
    -------
    prel_theta : preliminary effect estimates from ``self.prel_model_effect``
    res_t : T minus the prediction of ``self.model_T_X``
    res_y : y minus the prediction of ``self.model_Y_X``
    res_z : projected treatment minus the prediction of ``self.model_T_X``
    cov : prediction of ``self.model_TZ_X`` (fitted on T times the projected
        treatment) minus the squared ``self.model_T_X`` prediction
    """
    n_rows = y.shape[0]
    theta_prelim = np.zeros(n_rows)
    treatment_resid = np.zeros(n_rows)
    outcome_resid = np.zeros(n_rows)
    instrument_resid = np.zeros(n_rows)
    covariance = np.zeros(n_rows)
    projected_t = np.zeros(n_rows)

    # TODO. The solution below is not really a valid cross-fitting
    # as the test data are used to create the projected treatment on the train
    # which in the second train-test loop is used to create the nuisance
    # cov on the test data. Hence the T variable of some sample
    # is implicitly correlated with its cov nuisance, through this flow
    # of information. However, this seems a rather weak correlation.
    # The more kosher would be to do an internal nested cv loop for the T_XZ
    # model.
    # Duplicate the fold iterator so that both passes see the same folds
    second_pass, first_pass = tee(self._get_split_enum(y, T, X, Z))

    # Estimate h(X, Z) = E[T | X, Z] in cross fitting manner
    for fold, (train, test) in enumerate(first_pass):
        self.model_T_XZ[fold].fit(hstack([X[train], Z[train]]), T[train])
        projected_t[test] = self.model_T_XZ[fold].predict(
            hstack([X[test], Z[test]]))

    for fold, (train, test) in enumerate(second_pass):
        # Estimate preliminary theta in cross fitting manner
        fitted_prel = self.prel_model_effect[fold].fit(
            y[train], T[train], X[train], Z[train])
        theta_prelim[test] = fitted_prel.effect(X[test]).flatten()

        # Estimate p(X) = E[T | X] in cross fitting manner
        self.model_T_X[fold].fit(X[train], T[train])
        t_given_x = self.model_T_X[fold].predict(X[test])

        # Calculate residual T_res = T - p(X) and Z_res = h(Z, X) - p(X)
        treatment_resid[test] = T[test] - t_given_x
        instrument_resid[test] = projected_t[test] - t_given_x

        # Estimate residual Y_res = Y - q(X) = Y - E[Y | X] in cross fitting manner
        fitted_y = self.model_Y_X[fold].fit(X[train], y[train])
        outcome_resid[test] = y[test] - fitted_y.predict(X[test])

        # Estimate cov[T, E[T|X,Z] | X] = E[T * E[T|X,Z]] - E[T|X]^2
        fitted_tz = self.model_TZ_X[fold].fit(
            X[train], T[train] * projected_t[train])
        covariance[test] = fitted_tz.predict(X[test]) - t_given_x**2

    return theta_prelim, treatment_resid, outcome_resid, instrument_resid, covariance
def fit(self, y, T, X, Z):
    """
    Fit the treatment model on [X, Z], then the final model on [predicted T, X].

    Parameters
    ----------
    y : outcome
    T : treatment (single dimensional)
    X : features/controls
    Z : instrument

    Returns
    -------
    self

    Raises
    ------
    AssertionError
        If the treatment or the outcome has more than one column.
    """
    if len(T.shape) > 1 and T.shape[1] > 1:
        raise AssertionError("Can only accept single dimensional treatment")
    if len(y.shape) > 1 and y.shape[1] > 1:
        raise AssertionError("Can only accept single dimensional outcome")
    if len(Z.shape) == 1:
        Z = Z.reshape(-1, 1)

    T = T.flatten()
    y = y.flatten()

    # First stage: in-sample prediction of the treatment from features and instrument
    first_stage = self.model_T_XZ.fit(hstack([X, Z]), T)
    t_hat = first_stage.predict(hstack([X, Z]))

    # Second stage: the predicted treatment is the leading column of the design
    second_stage_design = hstack([t_hat.reshape(-1, 1), X])
    self.model_final.fit(second_stage_design, y)
    return self
def _test_sparse(n_p, d_w, n_r):
    """
    Check that the sparse linear DML estimator recovers per-product effects.

    Two folds of synthetic data are generated with errors from
    ``TestDML._generate_recoverable_errors``, stacked, and fitted twice:
    once with an identity featurizer and once with an identity featurizer
    followed by ``MatrixFeatures(1, 1)``.

    Parameters
    ----------
    n_p : number of products (one effect per product)
    d_w : number of columns in the random control matrix
    n_r : number of repetitions of each product within a fold
    """
    # need at least as many rows in e_y as there are distinct columns
    # in [X;X⊗W;W⊗W;X⊗e_t] to find a solution for e_t
    assert n_p * n_r >= 2 * n_p + n_p * d_w + d_w * (d_w + 1) / 2

    a = np.random.normal(size=(n_p,))  # one effect per product
    n = n_p * n_r
    p = np.tile(range(n_p), n_r)  # product id
    b = np.random.normal(size=(d_w + n_p,))
    g = np.random.normal(size=(d_w + n_p,))

    x_parts, w_parts, y_parts, t_parts = [], [], [], []
    for _ in range(2):
        # product dummies
        dummies = OneHotEncoder().fit_transform(np.reshape(p, (-1, 1))).toarray()
        controls = np.random.normal(size=(n, d_w))
        design = hstack([dummies, controls])
        e_t_f, e_y_f = TestDML._generate_recoverable_errors(a, dummies, W=controls)
        treatment = design @ b + e_t_f
        outcome = treatment * np.choose(p, a) + design @ g + e_y_f
        x_parts.append(dummies)
        w_parts.append(controls)
        y_parts.append(outcome)
        t_parts.append(treatment)

    # Stack the folds one after the other, in generation order
    x = np.concatenate(x_parts)
    w = np.concatenate(w_parts)
    y = np.concatenate(y_parts)
    t = np.concatenate(t_parts)

    dml = SparseLinearDMLCateEstimator(
        LinearRegression(fit_intercept=False),
        LinearRegression(fit_intercept=False),
        featurizer=FunctionTransformer())
    dml.fit(y, t, x, w)

    # note that this would fail for the non-sparse DMLCateEstimator
    np.testing.assert_allclose(a, dml.coef_.reshape(-1))
    expected_effect = reshape(t * np.choose(np.tile(p, 2), a), (-1, 1))
    np.testing.assert_allclose(expected_effect, dml.effect(0, t, x))

    matrix_featurizer = Pipeline([("id", FunctionTransformer()),
                                  ("matrix", MatrixFeatures(1, 1))])
    dml = SparseLinearDMLCateEstimator(
        LinearRegression(fit_intercept=False),
        LinearRegression(fit_intercept=False),
        featurizer=matrix_featurizer)
    dml.fit(y, t, x, w)
    np.testing.assert_allclose(expected_effect, dml.effect(0, t, x))
def fit(self, y, T, X, Z, store_final=False):
    """
    Cross-fit the nuisance models, then fit the effect model on the residuals.

    Parameters
    ----------
    y : outcome
    T : treatment (single dimensional)
    X : features/controls
    Z : instrument (single dimensional)
    store_final (bool) : whether to store the estimated nuisance values for
        fitting a different final stage model without the need of refitting
        the nuisance values. Increases memory usage.

    Returns
    -------
    self

    Raises
    ------
    AssertionError
        If the treatment or outcome has more than one column, or if the
        instrument has more than one column while ``self.binary_instrument``
        is set.
    """
    if len(T.shape) > 1 and T.shape[1] > 1:
        raise AssertionError("Can only accept single dimensional treatment")
    if len(y.shape) > 1 and y.shape[1] > 1:
        raise AssertionError("Can only accept single dimensional outcome")
    if len(Z.shape) == 1:
        Z = Z.reshape(-1, 1)
    if (Z.shape[1] > 1) and self.binary_instrument:
        raise AssertionError(
            "Binary instrument flag is True, but instrument is multi-dimensional")

    T = T.flatten()
    y = y.flatten()

    n_rows = y.shape[0]
    t_given_xz = np.zeros(n_rows)
    t_given_x = np.zeros(n_rows)
    y_resid = np.zeros(n_rows)

    if self.n_splits == 1:
        # No cross-fitting: a single fold that trains and predicts on every row
        folds = [tuple(np.arange(X.shape[0]) for _ in range(2))]
    # TODO. Deal with multi-class instrument/treatment
    elif self.binary_instrument or self.binary_treatment:
        # Stratify on the flagged binary variables; an unflagged variable
        # is multiplied by zero and drops out of the label
        strata = 2 * T * self.binary_treatment + \
            Z.flatten() * self.binary_instrument
        folds = StratifiedKFold(
            n_splits=self.n_splits, shuffle=True).split(X, strata)
    else:
        folds = KFold(n_splits=self.n_splits, shuffle=True).split(X)

    for fold, (train, test) in enumerate(folds):
        # Estimate h(Z, X) = E[T | Z, X] in cross-fitting manner
        t_xz_model = self.model_T_XZ[fold].fit(
            hstack([X[train], Z[train]]), T[train])
        t_given_xz[test] = t_xz_model.predict(hstack([X[test], Z[test]]))

        # Estimate residual Y_res = Y - q(X) = Y - E[Y | X] in cross-fitting manner
        y_x_model = self.model_Y_X[fold].fit(X[train], y[train])
        y_resid[test] = y[test] - y_x_model.predict(X[test])

        # Estimate p(X) = E[T | X] in cross-fitting manner
        t_x_model = self.model_T_X[fold].fit(X[train], T[train])
        t_given_x[test] = t_x_model.predict(X[test])

    # Estimate theta by minimizing square loss (Y_res - theta(X) * (h(Z, X) - p(X)))^2
    projected_resid = (t_given_xz - t_given_x).reshape(-1, 1)
    self.model_effect.fit(y_resid, projected_resid, X)

    if store_final:
        self.stored_final_data = True
        self.X = X
        # Recomputed so that the stored array is not the object handed to model_effect
        self.res_t = (t_given_xz - t_given_x).reshape(-1, 1)
        self.res_y = y_resid

    return self