def test_make_pliv_CHS2015_return_types():
    """Check the object type ``make_pliv_CHS2015`` returns per ``return_type``.

    Covers ``'DoubleMLData'``, ``'DataFrame'`` and ``'array'`` (which must
    unpack into four numpy arrays), and verifies that the unsupported value
    ``'matrix'`` raises a ``ValueError`` matching ``msg_inv_return_type``.
    """
    np.random.seed(3141)

    as_dml_data = make_pliv_CHS2015(n_obs=100, return_type='DoubleMLData')
    assert isinstance(as_dml_data, DoubleMLData)

    as_frame = make_pliv_CHS2015(n_obs=100, return_type='DataFrame')
    assert isinstance(as_frame, pd.DataFrame)

    # The 'array' variant must unpack into exactly four pieces.
    x, y, d, z = make_pliv_CHS2015(n_obs=100, return_type='array')
    for piece in (x, y, d, z):
        assert isinstance(piece, np.ndarray)

    # Anything else is rejected.
    with pytest.raises(ValueError, match=msg_inv_return_type):
        _ = make_pliv_CHS2015(n_obs=100, return_type='matrix')
def test_pliv_callable_not_implemented():
    """Check that ``fit()`` refuses a non-string score for three PLIV variants.

    Models are built via ``DoubleMLPLIV._partialX``, ``._partialZ`` and
    ``._partialXZ`` on data with two instruments, passing the module-level
    ``dml_pliv._score_elements`` as ``score``. Construction happens outside
    the ``pytest.raises`` block, so the ``NotImplementedError`` is expected
    from ``fit()`` itself.
    """
    np.random.seed(3141)
    data_two_instruments = make_pliv_CHS2015(n_obs=100, dim_z=2)
    score_obj = dml_pliv._score_elements

    # (model factory, expected error message); factories are invoked lazily
    # and in order, so models are constructed in the original sequence.
    cases = (
        (lambda: DoubleMLPLIV._partialX(data_two_instruments,
                                        Lasso(), Lasso(), Lasso(),
                                        score=score_obj),
         'Callable score not implemented for DoubleMLPLIV.partialX with several instruments.'),
        (lambda: DoubleMLPLIV._partialZ(data_two_instruments,
                                        Lasso(),
                                        score=score_obj),
         'Callable score not implemented for DoubleMLPLIV.partialZ.'),
        (lambda: DoubleMLPLIV._partialXZ(data_two_instruments,
                                         Lasso(), Lasso(), Lasso(),
                                         score=score_obj),
         'Callable score not implemented for DoubleMLPLIV.partialXZ.'),
    )

    for build_model, msg in cases:
        model = build_model()
        with pytest.raises(NotImplementedError, match=msg):
            model.fit()
def test_obj_vs_from_arrays():
    """Check that ``DoubleMLData.from_arrays`` reproduces existing data objects.

    Each generated ``DoubleMLData`` object is split into its x / y / d
    (and, where present, z) columns and rebuilt through ``from_arrays``;
    the rebuilt ``.data`` is then compared with the original ``.data``.
    """

    def _rebuild(source, with_z=False):
        # Feed the column groups of ``source`` back through from_arrays.
        frame = source.data
        pieces = [frame[source.x_cols], frame[source.y_col], frame[source.d_cols]]
        if with_z:
            pieces.append(frame[source.z_cols])
        return DoubleMLData.from_arrays(*pieces)

    np.random.seed(3141)

    # PLR data: exact DataFrame equality.
    plr_obj = make_plr_CCDDHNR2018(n_obs=100)
    assert _rebuild(plr_obj).data.equals(plr_obj.data)

    # PLIV data from the file-level helper: exact DataFrame equality.
    pliv_obj = _make_pliv_data(n_obs=100)
    assert _rebuild(pliv_obj, with_z=True).data.equals(pliv_obj.data)

    # PLIV data with five instruments: compared via np.array_equal rather
    # than DataFrame.equals (the original notes that the z_cols names differ).
    pliv_multi_z = make_pliv_CHS2015(n_obs=100, dim_z=5)
    assert np.array_equal(_rebuild(pliv_multi_z, with_z=True).data, pliv_multi_z.data)

    # Two treatment columns: take the first ten PLR columns and relabel them
    # as seven covariates, the outcome and two treatments.
    x_names = [f'X{i+1}' for i in np.arange(7)]
    relabelled = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    relabelled.columns = x_names + ['y', 'd1', 'd2']
    two_treat_obj = DoubleMLData(relabelled, 'y', ['d1', 'd2'], list(x_names))
    assert np.array_equal(_rebuild(two_treat_obj).data, two_treat_obj.data)
def test_obj_vs_from_arrays():
    """Check that the ``from_arrays`` constructors reproduce existing data objects.

    Covers ``DoubleMLData.from_arrays`` (PLR, PLIV, multi-instrument PLIV and
    a two-treatment PLR variant) and ``DoubleMLClusterData.from_arrays`` with
    two cluster variables as well as with a single one.
    """

    def _rebuild(source, with_z=False):
        # Feed the column groups of ``source`` back through from_arrays.
        frame = source.data
        pieces = [frame[source.x_cols], frame[source.y_col], frame[source.d_cols]]
        if with_z:
            pieces.append(frame[source.z_cols])
        return DoubleMLData.from_arrays(*pieces)

    np.random.seed(3141)

    # PLR data: exact DataFrame equality.
    plr_obj = make_plr_CCDDHNR2018(n_obs=100)
    assert _rebuild(plr_obj).data.equals(plr_obj.data)

    # PLIV data from the file-level helper: exact DataFrame equality.
    pliv_obj = _make_pliv_data(n_obs=100)
    assert _rebuild(pliv_obj, with_z=True).data.equals(pliv_obj.data)

    # PLIV data with five instruments: compared via np.array_equal rather
    # than DataFrame.equals (the original notes that the z_cols names differ).
    pliv_multi_z = make_pliv_CHS2015(n_obs=100, dim_z=5)
    assert np.array_equal(_rebuild(pliv_multi_z, with_z=True).data, pliv_multi_z.data)

    # Two treatment columns: take the first ten PLR columns and relabel them
    # as seven covariates, the outcome and two treatments.
    x_names = [f'X{i+1}' for i in np.arange(7)]
    relabelled = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    relabelled.columns = x_names + ['y', 'd1', 'd2']
    two_treat_obj = DoubleMLData(relabelled, 'y', ['d1', 'd2'], list(x_names))
    assert np.array_equal(_rebuild(two_treat_obj).data, two_treat_obj.data)

    # Cluster data with both cluster variables.
    cluster_obj = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
    cluster_frame = cluster_obj.data
    rebuilt_two_way = DoubleMLClusterData.from_arrays(
        cluster_frame[cluster_obj.x_cols],
        cluster_frame[cluster_obj.y_col],
        cluster_frame[cluster_obj.d_cols],
        cluster_frame[cluster_obj.cluster_cols],
        cluster_frame[cluster_obj.z_cols])
    expected_two_way = cluster_obj.data.copy().rename(
        columns={'cluster_var_i': 'cluster_var1',
                 'cluster_var_j': 'cluster_var2',
                 'Y': 'y', 'D': 'd', 'Z': 'z'})
    assert rebuilt_two_way.data.equals(expected_two_way)

    # with a single cluster variable (only cluster_cols[1] is passed on)
    rebuilt_one_way = DoubleMLClusterData.from_arrays(
        cluster_frame[cluster_obj.x_cols],
        cluster_frame[cluster_obj.y_col],
        cluster_frame[cluster_obj.d_cols],
        cluster_frame[cluster_obj.cluster_cols[1]],
        cluster_frame[cluster_obj.z_cols])
    expected_one_way = cluster_obj.data.copy().drop(columns='cluster_var_i')
    expected_one_way = expected_one_way.rename(
        columns={'cluster_var_j': 'cluster_var',
                 'Y': 'y', 'D': 'd', 'Z': 'z'})
    assert rebuilt_one_way.data.equals(expected_one_way)
def generate_data_pliv_partialX(request):
    """Generate one seeded PLIV data set with five instruments.

    ``request.param`` is passed as the first positional argument of
    ``make_pliv_CHS2015``; ``alpha`` is fixed at 1.0 and ``dim_z`` at 5.
    The numpy global RNG is seeded with 1111 before the data are drawn.
    (Presumably used as a parametrized pytest fixture; the decorator is not
    part of this block.)
    """
    sample_size = request.param
    np.random.seed(1111)
    return make_pliv_CHS2015(sample_size, alpha=1., dim_z=5)
def generate_data_iv(request):
    """Generate one seeded single-instrument PLIV data set as a DataFrame.

    ``request.param[0]`` is used as ``n_obs`` and ``request.param[1]`` as
    ``dim_x``; ``alpha`` is fixed at 0.5 and ``dim_z`` at 1. The numpy global
    RNG is seeded with 1111 before the data are drawn. (Presumably used as a
    parametrized pytest fixture; the decorator is not part of this block.)
    """
    shape = request.param
    np.random.seed(1111)
    return make_pliv_CHS2015(n_obs=shape[0],
                             dim_x=shape[1],
                             alpha=0.5,
                             dim_z=1,
                             return_type=pd.DataFrame)
def generate_data_pliv_partialX(request):
    """Generate ``n_datasets`` seeded PLIV data sets with five instruments.

    ``request.param`` is passed as the first positional argument of
    ``make_pliv_CHS2015``; ``alpha`` is fixed at 1.0 and ``dim_z`` at 5.
    The numpy global RNG is seeded once with 1111, so the data sets are
    consecutive draws from the same stream. ``n_datasets`` is a module-level
    name defined outside this block.
    """
    sample_size = request.param
    np.random.seed(1111)
    return [make_pliv_CHS2015(sample_size, alpha=1., dim_z=5)
            for _ in range(n_datasets)]
def generate_data_iv(request):
    """Generate ``n_datasets`` seeded single-instrument PLIV DataFrames.

    ``request.param[0]`` is used as ``n_obs`` and ``request.param[1]`` as
    ``dim_x``; ``alpha`` is fixed at 0.5 and ``dim_z`` at 1. The numpy global
    RNG is seeded once with 1111, so the data sets are consecutive draws from
    the same stream. ``n_datasets`` is a module-level name defined outside
    this block.
    """
    shape = request.param
    np.random.seed(1111)
    n_rows = shape[0]
    n_covariates = shape[1]
    return [make_pliv_CHS2015(n_obs=n_rows,
                              dim_x=n_covariates,
                              alpha=0.5,
                              dim_z=1,
                              return_type=pd.DataFrame)
            for _ in range(n_datasets)]
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data,\ make_pliv_multiway_cluster_CKMS2021 from sklearn.linear_model import Lasso, LogisticRegression from sklearn.base import BaseEstimator np.random.seed(3141) dml_data = make_plr_CCDDHNR2018(n_obs=10) ml_g = Lasso() ml_m = Lasso() ml_r = Lasso() dml_plr = DoubleMLPLR(dml_data, ml_g, ml_m) dml_data_irm = make_irm_data(n_obs=10) dml_data_iivm = make_iivm_data(n_obs=10) dml_data_pliv = make_pliv_CHS2015(n_obs=10, dim_z=1) dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) (x, y, d, z) = make_iivm_data(n_obs=30, return_type="array") y[y > 0] = 1 y[y < 0] = 0 dml_data_irm_binary_outcome = DoubleMLData.from_arrays(x, y, d) dml_data_iivm_binary_outcome = DoubleMLData.from_arrays(x, y, d, z) @pytest.mark.ci def test_doubleml_exception_data(): msg = 'The data must be of DoubleMLData type.' with pytest.raises(TypeError, match=msg): _ = DoubleMLPLR(pd.DataFrame(), ml_g, ml_m) # PLR with IV