def test_obj_vs_from_arrays():
    """Rebuild data objects via ``DoubleMLData.from_arrays`` and compare them with their source.

    Four data sets are round-tripped: generated PLR data, the output of
    ``_make_pliv_data``, generated PLIV data with five instruments, and a
    renamed ten-column slice of PLR data with two treatment columns.
    """
    np.random.seed(3141)

    # PLR data: no instruments are passed on.
    source = make_plr_CCDDHNR2018(n_obs=100)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols])
    assert rebuilt.data.equals(frame)

    # PLIV helper data: instruments are passed as fourth argument.
    source = _make_pliv_data(n_obs=100)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols],
                                       frame[source.z_cols])
    assert rebuilt.data.equals(frame)

    # PLIV data with several instruments: the z_cols names differ, hence
    # only the values are compared.
    source = make_pliv_CHS2015(n_obs=100, dim_z=5)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols],
                                       frame[source.z_cols])
    assert np.array_equal(rebuilt.data, frame)

    # Two treatment columns: relabel the first ten columns of PLR data.
    covariates = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = covariates + ['y', 'd1', 'd2']
    source = DoubleMLData(df, 'y', ['d1', 'd2'], covariates)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols])
    assert np.array_equal(rebuilt.data, frame)
def test_dml_data_no_instr():
    """Data objects built without instruments report ``z is None`` and ``n_instr == 0``.

    Checked both for the object returned by the PLR generator and for an
    object assembled from the generator's array output.
    """
    np.random.seed(3141)

    # Object returned directly by the data generator.
    generated = make_plr_CCDDHNR2018(n_obs=100)
    assert generated.z is None
    assert generated.n_instr == 0

    # Object assembled from the (x, y, d) arrays; no z is supplied.
    x, y, d = make_plr_CCDDHNR2018(n_obs=100, return_type='array')
    assembled = DoubleMLData.from_arrays(x, y, d)
    assert assembled.z is None
    assert assembled.n_instr == 0
def test_obj_vs_from_arrays():
    """Rebuild data objects via ``from_arrays`` and compare them with their source.

    Covers ``DoubleMLData`` (PLR data, ``_make_pliv_data`` output, PLIV data
    with five instruments, a renamed slice with two treatments) and
    ``DoubleMLClusterData`` (two cluster variables and a single one).
    """
    np.random.seed(3141)

    # PLR data: no instruments are passed on.
    source = make_plr_CCDDHNR2018(n_obs=100)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols])
    assert rebuilt.data.equals(frame)

    # PLIV helper data: instruments are passed as fourth argument.
    source = _make_pliv_data(n_obs=100)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols],
                                       frame[source.z_cols])
    assert rebuilt.data.equals(frame)

    # PLIV data with several instruments: the z_cols names differ, hence
    # only the values are compared.
    source = make_pliv_CHS2015(n_obs=100, dim_z=5)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols],
                                       frame[source.z_cols])
    assert np.array_equal(rebuilt.data, frame)

    # Two treatment columns: relabel the first ten columns of PLR data.
    covariates = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = covariates + ['y', 'd1', 'd2']
    source = DoubleMLData(df, 'y', ['d1', 'd2'], covariates)
    frame = source.data
    rebuilt = DoubleMLData.from_arrays(frame[source.x_cols],
                                       frame[source.y_col],
                                       frame[source.d_cols])
    assert np.array_equal(rebuilt.data, frame)

    # Cluster data with both cluster variables; the expected frame is the
    # source frame with the column labels mapped as below.
    source = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
    frame = source.data
    rebuilt = DoubleMLClusterData.from_arrays(frame[source.x_cols],
                                              frame[source.y_col],
                                              frame[source.d_cols],
                                              frame[source.cluster_cols],
                                              frame[source.z_cols])
    expected = frame.rename(columns={'cluster_var_i': 'cluster_var1',
                                     'cluster_var_j': 'cluster_var2',
                                     'Y': 'y', 'D': 'd', 'Z': 'z'})
    assert rebuilt.data.equals(expected)

    # with a single cluster variable
    rebuilt = DoubleMLClusterData.from_arrays(frame[source.x_cols],
                                              frame[source.y_col],
                                              frame[source.d_cols],
                                              frame[source.cluster_cols[1]],
                                              frame[source.z_cols])
    expected = frame.drop(columns='cluster_var_i').rename(
        columns={'cluster_var_j': 'cluster_var', 'Y': 'y', 'D': 'd', 'Z': 'z'})
    assert rebuilt.data.equals(expected)
def test_make_plr_CCDDHNR2018_return_types():
    """Check the ``return_type`` argument of ``make_plr_CCDDHNR2018``.

    ``DoubleMLData`` and ``pd.DataFrame`` must yield an instance of the
    requested class, ``np.ndarray`` must yield three arrays, and the string
    ``'matrix'`` must raise a ``ValueError``.
    """
    np.random.seed(3141)

    # Single-object return types.
    for requested in (DoubleMLData, pd.DataFrame):
        result = make_plr_CCDDHNR2018(n_obs=100, return_type=requested)
        assert isinstance(result, requested)

    # Array return type unpacks into exactly three arrays.
    x, y, d = make_plr_CCDDHNR2018(n_obs=100, return_type=np.ndarray)
    for array in (x, y, d):
        assert isinstance(array, np.ndarray)

    # Unsupported return type.
    with pytest.raises(ValueError, match=msg_inv_return_type):
        make_plr_CCDDHNR2018(n_obs=100, return_type='matrix')
def test_d_cols_setter():
    """Exercise the ``d_cols`` setter of ``DoubleMLData``.

    Checks that ``d`` follows a reordering of ``d_cols``, that unknown column
    names raise ``ValueError``, that a non-str/list value raises
    ``TypeError``, and that a single string yields ``n_treat == 1``.
    """
    np.random.seed(3141)
    covariates = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = covariates + ['y', 'd1', 'd2']
    data_obj = DoubleMLData(df, 'y', ['d1', 'd2'], covariates)

    # check that after changing d_cols, the d array gets updated
    expected_d = data_obj.data['d2'].values
    data_obj.d_cols = ['d2', 'd1']
    assert np.array_equal(data_obj.d, expected_d)

    # names that are not columns of the data are rejected
    not_a_column = r'Invalid treatment variable\(s\) d_cols. At least one treatment variable is no data column.'
    for invalid in (['d1', 'd13'], 'd13'):
        with pytest.raises(ValueError, match=not_a_column):
            data_obj.d_cols = invalid

    # values that are neither str nor list are rejected
    wrong_type = (r'The treatment variable\(s\) d_cols must be of str or list type. '
                  "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=wrong_type):
        data_obj.d_cols = 5

    # check single treatment variable given as a plain string
    expected_d = data_obj.data['d2'].values
    data_obj.d_cols = 'd2'
    assert np.array_equal(data_obj.d, expected_d)
    assert data_obj.n_treat == 1
def test_x_cols_setter():
    """Exercise the ``x_cols`` setter of ``DoubleMLData``.

    Checks that ``x`` follows a new list of covariates, that an unknown
    column raises ``ValueError``, that an int raises ``TypeError``, that a
    single string is accepted, and that ``None`` gives back the ``x`` array
    of the initially reported ``x_cols``.
    """
    np.random.seed(3141)
    data_obj = make_plr_CCDDHNR2018(n_obs=100)
    initial_x_cols = data_obj.x_cols

    # check that after changing the x_cols, the x array gets updated
    subset = ['X1', 'X11', 'X13']
    expected_x = data_obj.data[subset].values
    data_obj.x_cols = ['X1', 'X11', 'X13']
    assert np.array_equal(data_obj.x, expected_x)

    # a name that is not a column of the data is rejected
    not_a_column = 'Invalid covariates x_cols. At least one covariate is no data column.'
    with pytest.raises(ValueError, match=not_a_column):
        data_obj.x_cols = ['X1', 'X11', 'A13']

    # values that are neither str, list nor None are rejected
    wrong_type = (r'The covariates x_cols must be of str or list type \(or None\). '
                  "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=wrong_type):
        data_obj.x_cols = 5

    # check single covariate
    expected_x = data_obj.data[['X13']].values
    data_obj.x_cols = 'X13'
    assert np.array_equal(data_obj.x, expected_x)

    # check setting None brings us back to the initial x_cols
    expected_x = data_obj.data[initial_x_cols].values
    data_obj.x_cols = None
    assert np.array_equal(data_obj.x, expected_x)
def test_cluster_cols_setter():
    """Exercise the ``cluster_cols`` setter of ``DoubleMLClusterData``.

    Checks the initial ``cluster_vars`` array, that it follows a reordering
    of ``cluster_cols``, that unknown column names raise ``ValueError``, that
    an int raises ``TypeError``, and that a single string yields
    ``n_cluster_vars == 1``.
    """
    np.random.seed(3141)
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = [f'X{k}' for k in range(1, 8)] + ['y', 'd1', 'd2']
    data_obj = DoubleMLClusterData(df, 'y', ['d1', 'd2'],
                                   cluster_cols=['X6', 'X7'],
                                   x_cols=[f'X{k}' for k in range(1, 6)])

    # state right after construction
    expected_clusters = df[['X6', 'X7']].values
    assert np.array_equal(data_obj.cluster_vars, expected_clusters)
    assert data_obj.n_cluster_vars == 2

    # check that after changing cluster_cols, the cluster_vars array gets updated
    expected_clusters = df[['X7', 'X6']].values
    data_obj.cluster_cols = ['X7', 'X6']
    assert np.array_equal(data_obj.cluster_vars, expected_clusters)

    # names that are not columns of the data are rejected
    not_a_column = r'Invalid cluster variable\(s\) cluster_cols. At least one cluster variable is no data column.'
    for invalid in (['X6', 'X13'], 'X13'):
        with pytest.raises(ValueError, match=not_a_column):
            data_obj.cluster_cols = invalid

    # values that are neither str nor list are rejected
    wrong_type = (r'The cluster variable\(s\) cluster_cols must be of str or list type. '
                  "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=wrong_type):
        data_obj.cluster_cols = 5

    # check single cluster variable
    expected_clusters = df[['X7']].values
    data_obj.cluster_cols = 'X7'
    assert np.array_equal(data_obj.cluster_vars, expected_clusters)
    assert data_obj.n_cluster_vars == 1
def test_z_cols_setter():
    """Exercise the ``z_cols`` setter of ``DoubleMLData``.

    Checks that ``z`` follows a new list of instruments, that unknown column
    names raise ``ValueError``, that an int raises ``TypeError``, that a
    single string is accepted, and that ``None`` removes all instruments.
    """
    np.random.seed(3141)
    covariates = [f'X{k}' for k in range(1, 5)]
    instruments = [f'z{k}' for k in range(1, 4)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = covariates + instruments + ['y', 'd1', 'd2']
    data_obj = DoubleMLData(df, 'y', ['d1', 'd2'], covariates, instruments)

    # check that after changing z_cols, the z array gets updated
    expected_z = data_obj.data[['z1', 'z2']].values
    data_obj.z_cols = ['z1', 'z2']
    assert np.array_equal(data_obj.z, expected_z)

    # names that are not columns of the data are rejected
    not_a_column = r'Invalid instrumental variable\(s\) z_cols. At least one instrumental variable is no data column.'
    for invalid in (['z1', 'a13'], 'a13'):
        with pytest.raises(ValueError, match=not_a_column):
            data_obj.z_cols = invalid

    # values that are neither str, list nor None are rejected
    wrong_type = (r'The instrumental variable\(s\) z_cols must be of str or list type \(or None\). '
                  "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=wrong_type):
        data_obj.z_cols = 5

    # check single instrument
    expected_z = data_obj.data[['z2']].values
    data_obj.z_cols = 'z2'
    assert np.array_equal(data_obj.z, expected_z)

    # check None
    data_obj.z_cols = None
    assert data_obj.n_instr == 0
    assert data_obj.z is None
def test_add_vars_in_df():
    """Additional, unused variables in the data frame shouldn't affect results.

    Two ``DoubleMLPLR`` models are fitted with the same sample splitting: one
    on the full data frame and one on a frame reduced to the used columns.
    Coefficients and standard errors must agree within tolerance.
    """
    np.random.seed(3141)
    covariates = ['X1', 'X11', 'X13']
    df = make_plr_CCDDHNR2018(n_obs=100, return_type='DataFrame')

    # Each data object receives its own copy of the covariate list.
    data_full = DoubleMLData(df, 'y', 'd', list(covariates))
    data_reduced = DoubleMLData(df[covariates + ['y', 'd']], 'y', 'd', list(covariates))

    # The reduced model draws no splitting of its own; it reuses the
    # partition of the full model so both fits see identical folds.
    plr_full = DoubleMLPLR(data_full, Lasso(), Lasso())
    plr_reduced = DoubleMLPLR(data_reduced, Lasso(), Lasso(), draw_sample_splitting=False)
    plr_reduced.set_sample_splitting(plr_full.smpls)

    plr_full.fit()
    plr_reduced.fit()

    tolerance = {'rtol': 1e-9, 'atol': 1e-4}
    assert np.allclose(plr_full.coef, plr_reduced.coef, **tolerance)
    assert np.allclose(plr_full.se, plr_reduced.se, **tolerance)
def test_use_other_treat_as_covariate():
    """Check ``set_x_d`` with and without ``use_other_treat_as_covariate``.

    With the flag set, the non-selected treatment column is expected as the
    last column of ``x``; without it, ``x`` holds the covariates only. Also
    covers the error paths for a non-bool flag, an unknown treatment name and
    a non-str ``treatment_var``.
    """
    np.random.seed(3141)
    covariates = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = covariates + ['y', 'd1', 'd2']
    treatment_pairs = (('d1', 'd2'), ('d2', 'd1'))

    # Flag enabled: the other treatment is appended to the covariates.
    # Every data object gets its own copy of the covariate list.
    data_obj = DoubleMLData(df, 'y', ['d1', 'd2'], list(covariates),
                            use_other_treat_as_covariate=True)
    for selected, other in treatment_pairs:
        data_obj.set_x_d(selected)
        assert np.array_equal(data_obj.d, df[selected].values)
        assert np.array_equal(data_obj.x, df[covariates + [other]].values)

    # Flag disabled: x consists of the covariates only.
    data_obj = DoubleMLData(df, 'y', ['d1', 'd2'], list(covariates),
                            use_other_treat_as_covariate=False)
    for selected, _ in treatment_pairs:
        data_obj.set_x_d(selected)
        assert np.array_equal(data_obj.d, df[selected].values)
        assert np.array_equal(data_obj.x, df[covariates].values)

    # The flag itself must be a bool.
    flag_msg = 'use_other_treat_as_covariate must be True or False. Got 1.'
    with pytest.raises(TypeError, match=flag_msg):
        DoubleMLData(df, 'y', ['d1', 'd2'], list(covariates),
                     use_other_treat_as_covariate=1)

    # Unknown treatment name.
    unknown_msg = 'Invalid treatment_var. d3 is not in d_cols.'
    with pytest.raises(ValueError, match=unknown_msg):
        data_obj.set_x_d('d3')

    # treatment_var must be a single string.
    type_msg = r"treatment_var must be of str type. \['d1', 'd2'\] of type <class 'list'> was passed."
    with pytest.raises(TypeError, match=type_msg):
        data_obj.set_x_d(['d1', 'd2'])
def test_y_col_setter():
    """Exercise the ``y_col`` setter of ``DoubleMLData``.

    Checks that ``y`` follows a change of ``y_col``, that an unknown column
    raises ``ValueError`` and that an int raises ``TypeError``.
    """
    np.random.seed(3141)
    covariates = [f'X{k}' for k in range(1, 8)]
    df = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    df.columns = covariates + ['y', 'y123', 'd']
    data_obj = DoubleMLData(df, 'y', 'd', covariates)

    # check that after changing y_col, the y array gets updated
    expected_y = data_obj.data['y123'].values
    data_obj.y_col = 'y123'
    assert np.array_equal(data_obj.y, expected_y)

    # a name that is not a column of the data is rejected
    not_a_column = r'Invalid outcome variable y_col. d13 is no data column.'
    with pytest.raises(ValueError, match=not_a_column):
        data_obj.y_col = 'd13'

    # a non-str value is rejected
    wrong_type = (r'The outcome variable y_col must be of str type. '
                  "5 of type <class 'int'> was passed.")
    with pytest.raises(TypeError, match=wrong_type):
        data_obj.y_col = 5
def test_duplicates():
    """Duplicate entries in column lists or data frame columns raise ``ValueError``.

    For ``d_cols``, ``x_cols``, ``z_cols`` and ``cluster_cols`` both the
    constructor and the property setter are checked; duplicate column names
    in the data frame are checked for both data classes.
    """
    np.random.seed(3141)
    plain_data = make_plr_CCDDHNR2018(n_obs=100)
    cluster_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)

    def frame_with_duplicate_names():
        # Fresh frame per use; the label 'y' occurs twice.
        return pd.DataFrame(np.zeros((100, 5)), columns=['y', 'd', 'X3', 'X2', 'y'])

    # treatment variables
    dup_treat_msg = r'Invalid treatment variable\(s\) d_cols: Contains duplicate values.'
    with pytest.raises(ValueError, match=dup_treat_msg):
        DoubleMLData(plain_data.data, y_col='y', d_cols=['d', 'd', 'X1'], x_cols=['X3', 'X2'])
    with pytest.raises(ValueError, match=dup_treat_msg):
        plain_data.d_cols = ['d', 'd', 'X1']

    # covariates
    dup_covar_msg = 'Invalid covariates x_cols: Contains duplicate values.'
    with pytest.raises(ValueError, match=dup_covar_msg):
        DoubleMLData(plain_data.data, y_col='y', d_cols=['d'], x_cols=['X3', 'X2', 'X3'])
    with pytest.raises(ValueError, match=dup_covar_msg):
        plain_data.x_cols = ['X3', 'X2', 'X3']

    # instruments
    dup_instr_msg = r'Invalid instrumental variable\(s\) z_cols: Contains duplicate values.'
    with pytest.raises(ValueError, match=dup_instr_msg):
        DoubleMLData(plain_data.data, y_col='y', d_cols=['d'], x_cols=['X3', 'X2'],
                     z_cols=['X15', 'X12', 'X12', 'X15'])
    with pytest.raises(ValueError, match=dup_instr_msg):
        plain_data.z_cols = ['X15', 'X12', 'X12', 'X15']

    # cluster variables
    dup_cluster_msg = r'Invalid cluster variable\(s\) cluster_cols: Contains duplicate values.'
    with pytest.raises(ValueError, match=dup_cluster_msg):
        DoubleMLClusterData(cluster_data.data, y_col='y', d_cols=['d'],
                            cluster_cols=['X3', 'X2', 'X3'])
    with pytest.raises(ValueError, match=dup_cluster_msg):
        cluster_data.cluster_cols = ['X3', 'X2', 'X3']

    # duplicate column names in the data frame itself
    dup_frame_msg = 'Invalid pd.DataFrame: Contains duplicate column names.'
    with pytest.raises(ValueError, match=dup_frame_msg):
        DoubleMLData(frame_with_duplicate_names(), y_col='y', d_cols=['d'], x_cols=['X3', 'X2'])
    with pytest.raises(ValueError, match=dup_frame_msg):
        DoubleMLClusterData(frame_with_duplicate_names(), y_col='y', d_cols=['d'],
                            cluster_cols=['X2'])
import pytest import pandas as pd import numpy as np from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, DoubleMLClusterData from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data,\ make_pliv_multiway_cluster_CKMS2021 from sklearn.linear_model import Lasso, LogisticRegression from sklearn.base import BaseEstimator np.random.seed(3141) dml_data = make_plr_CCDDHNR2018(n_obs=10) ml_g = Lasso() ml_m = Lasso() ml_r = Lasso() dml_plr = DoubleMLPLR(dml_data, ml_g, ml_m) dml_data_irm = make_irm_data(n_obs=10) dml_data_iivm = make_iivm_data(n_obs=10) dml_data_pliv = make_pliv_CHS2015(n_obs=10, dim_z=1) dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) (x, y, d, z) = make_iivm_data(n_obs=30, return_type="array") y[y > 0] = 1 y[y < 0] = 0 dml_data_irm_binary_outcome = DoubleMLData.from_arrays(x, y, d) dml_data_iivm_binary_outcome = DoubleMLData.from_arrays(x, y, d, z) @pytest.mark.ci def test_doubleml_exception_data():
import pytest
import pandas as pd
import numpy as np

from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data

from sklearn.linear_model import Lasso, LogisticRegression

# Module-level fixtures: one generated data set and one (unfitted) model per
# model class. The seed is fixed before any data is generated.
np.random.seed(3141)
dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
dml_data_pliv = make_pliv_CHS2015(n_obs=100, dim_z=1)
dml_data_irm = make_irm_data(n_obs=100)
dml_data_iivm = make_iivm_data(n_obs=100)

dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso())
dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso())
dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression())
dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression())


@pytest.mark.ci
@pytest.mark.parametrize('dml_obj, cls',
                         [(dml_plr, DoubleMLPLR),
                          (dml_pliv, DoubleMLPLIV),
                          (dml_irm, DoubleMLIRM),
                          (dml_iivm, DoubleMLIIVM)])
def test_plr_return_types(dml_obj, cls):
    """Check basic return types of a model object.

    Parameters
    ----------
    dml_obj :
        One of the module-level model instances.
    cls :
        The model class ``dml_obj`` was constructed from.
    """
    # ToDo: A second test case with multiple treatment variables would be helpful
    # The parametrized class was previously unused; verify the pairing.
    assert isinstance(dml_obj, cls)
    # __str__ is called directly so that a non-str return value surfaces as a
    # failed assertion.
    assert isinstance(dml_obj.__str__(), str)
    assert isinstance(dml_obj.summary, pd.DataFrame)