def test_doubleml_exception_scores():
    """Check that every model class rejects unsupported ``score`` arguments.

    For DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM and DoubleMLPLIV the test first
    passes a score string that the class is expected to refuse (``ValueError``)
    and then the integer ``0`` (``TypeError``). The expected messages are
    handed to ``pytest.raises`` as regular-expression patterns.
    """
    # The same TypeError pattern is expected from all four model classes.
    bad_type_pattern = 'score should be either a string or a callable. 0 was passed.'

    # DoubleMLPLR
    with pytest.raises(ValueError, match='Invalid score IV. Valid score IV-type or partialling out.'):
        DoubleMLPLR(dml_data, ml_g, ml_m, score='IV')
    with pytest.raises(TypeError, match=bad_type_pattern):
        DoubleMLPLR(dml_data, ml_g, ml_m, score=0)

    # DoubleMLIRM
    with pytest.raises(ValueError, match='Invalid score IV. Valid score ATE or ATTE.'):
        DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), score='IV')
    with pytest.raises(TypeError, match=bad_type_pattern):
        DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), score=0)

    # DoubleMLIIVM
    with pytest.raises(ValueError, match='Invalid score ATE. Valid score LATE.'):
        DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(), score='ATE')
    with pytest.raises(TypeError, match=bad_type_pattern):
        DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(), score=0)

    # DoubleMLPLIV
    with pytest.raises(ValueError, match='Invalid score IV. Valid score partialling out.'):
        DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso(), score='IV')
    with pytest.raises(TypeError, match=bad_type_pattern):
        DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso(), score=0)
def test_pliv_callable_not_implemented():
    """Check that unsupported PLIV variants refuse a callable score on ``fit``.

    A data set with two instruments is simulated and the ``_partialX``,
    ``_partialZ`` and ``_partialXZ`` constructors are each given a callable
    score. Every resulting model is expected to raise ``NotImplementedError``
    when ``fit()`` is called.
    """
    np.random.seed(3141)
    data_two_instruments = make_pliv_CHS2015(n_obs=100, dim_z=2)
    callable_score = dml_pliv._score_elements

    # Each entry pairs a lazy constructor with the expected error pattern.
    # Construction is deferred so the models are built one at a time, in order.
    scenarios = [
        (lambda: DoubleMLPLIV._partialX(data_two_instruments, Lasso(), Lasso(), Lasso(),
                                        score=callable_score),
         'Callable score not implemented for DoubleMLPLIV.partialX with several instruments.'),
        (lambda: DoubleMLPLIV._partialZ(data_two_instruments, Lasso(),
                                        score=callable_score),
         'Callable score not implemented for DoubleMLPLIV.partialZ.'),
        (lambda: DoubleMLPLIV._partialXZ(data_two_instruments, Lasso(), Lasso(), Lasso(),
                                         score=callable_score),
         'Callable score not implemented for DoubleMLPLIV.partialXZ.'),
    ]

    for build_model, expected_pattern in scenarios:
        # Build outside the ``raises`` context: only ``fit`` is expected to fail.
        model = build_model()
        with pytest.raises(NotImplementedError, match=expected_pattern):
            model.fit()
def test_doubleml_cluster_not_yet_implemented():
    """Check the ``NotImplementedError`` paths of clustered PLIV models.

    Covered cases: bootstrapping a fitted clustered model, setting an
    external sample splitting, constructing a model on data with three
    cluster variables, and disabling cross-fitting (via ``n_folds=1`` or
    ``apply_cross_fitting=False``).
    """
    clustered_model = DoubleMLPLIV(dml_cluster_data_pliv, ml_g, ml_m, ml_r)
    clustered_model.fit()

    with pytest.raises(NotImplementedError, match='bootstrap not yet implemented with clustering.'):
        clustered_model.bootstrap()

    # Re-use the splitting of the (non-clustered) PLR fixture as external samples.
    external_smpls = dml_plr.smpls
    external_split_pattern = ('Externally setting the sample splitting for DoubleML is '
                              'not yet implemented with clustering.')
    with pytest.raises(NotImplementedError, match=external_split_pattern):
        clustered_model.set_sample_splitting(external_smpls)

    # Derive a third cluster variable from the two existing ones.
    three_way_df = dml_cluster_data_pliv.data.copy()
    three_way_df['cluster_var_k'] = three_way_df['cluster_var_i'] + three_way_df['cluster_var_j'] - 2
    three_way_data = DoubleMLClusterData(
        three_way_df,
        y_col='Y',
        d_cols='D',
        x_cols=['X1', 'X5'],
        z_cols='Z',
        cluster_cols=['cluster_var_i', 'cluster_var_j', 'cluster_var_k'],
    )
    assert three_way_data.n_cluster_vars == 3

    multiway_pattern = r'Multi-way \(n_ways > 2\) clustering not yet implemented.'
    with pytest.raises(NotImplementedError, match=multiway_pattern):
        DoubleMLPLIV(three_way_data, ml_g, ml_m, ml_r)

    # Both ways of switching off cross-fitting must produce the same error.
    no_cross_fit_pattern = (r'No cross-fitting \(`apply_cross_fitting = False`\) '
                            'is not yet implemented with clustering.')
    with pytest.raises(NotImplementedError, match=no_cross_fit_pattern):
        DoubleMLPLIV(dml_cluster_data_pliv, ml_g, ml_m, ml_r, n_folds=1)
    with pytest.raises(NotImplementedError, match=no_cross_fit_pattern):
        DoubleMLPLIV(dml_cluster_data_pliv, ml_g, ml_m, ml_r,
                     apply_cross_fitting=False, n_folds=2)
def test_doubleml_exception_smpls():
    """Check that accessing sample splits before drawing them raises.

    Models are created with ``draw_sample_splitting=False``; reading ``smpls``
    (and, for clustered data, ``smpls_cluster``) is then expected to raise a
    ``ValueError``. The expected messages are regular-expression patterns, so
    literal parentheses are escaped.
    """
    msg = ('Sample splitting not specified. '
           r'Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\).')
    dml_plr_no_smpls = DoubleMLPLR(dml_data, ml_g, ml_m, draw_sample_splitting=False)
    with pytest.raises(ValueError, match=msg):
        _ = dml_plr_no_smpls.smpls

    # ``match`` is a regex: an unescaped ``()`` is an empty group and would
    # not require the literal parentheses to be present in the message.
    msg = r'Sample splitting not specified. Draw samples via .draw_sample splitting\(\).'
    dml_pliv_cluster_no_smpls = DoubleMLPLIV(dml_cluster_data_pliv, ml_g, ml_m, ml_r,
                                             draw_sample_splitting=False)
    with pytest.raises(ValueError, match=msg):
        _ = dml_pliv_cluster_no_smpls.smpls_cluster
    with pytest.raises(ValueError, match=msg):
        _ = dml_pliv_cluster_no_smpls.smpls
def test_pliv_callable_vs_str_score():
    """Compare a PLIV fit using a callable score with the ``dml_pliv`` fixture.

    A second model is built with ``dml_pliv._score_elements`` as score, given
    the sample splitting of ``dml_pliv`` and fitted. Its ``psi`` and ``coef``
    must agree with those of ``dml_pliv`` within the tolerances below.
    """
    callable_model = DoubleMLPLIV(dml_data_pliv,
                                  Lasso(), Lasso(), Lasso(),
                                  score=dml_pliv._score_elements,
                                  draw_sample_splitting=False)
    # Use identical folds so both fits are directly comparable.
    callable_model.set_sample_splitting(dml_pliv.smpls)
    callable_model.fit()

    for attr in ('psi', 'coef'):
        reference = getattr(dml_pliv, attr)
        candidate = getattr(callable_model, attr)
        assert np.allclose(reference, candidate, rtol=1e-9, atol=1e-4)
def test_doubleml_exception_data():
    """Check that model classes reject data that does not fit the model.

    Covered cases: a plain ``DataFrame`` instead of a DoubleML data object,
    instruments passed to PLR / IRM, missing instruments for PLIV / IIVM, and
    non-binary or multiple treatment / instrument variables for IRM and IIVM.
    The expected messages are regular-expression patterns.
    """
    with pytest.raises(TypeError, match='The data must be of DoubleMLData type.'):
        DoubleMLPLR(pd.DataFrame(), ml_g, ml_m)

    # PLR with IV
    plr_iv_pattern = (r'Incompatible data. Z1 have been set as instrumental variable\(s\). '
                      'To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR.')
    with pytest.raises(ValueError, match=plr_iv_pattern):
        DoubleMLPLR(dml_data_pliv, ml_g, ml_m)

    # PLIV without IV
    pliv_no_iv_pattern = ('Incompatible data. '
                          'At least one variable must be set as instrumental variable. '
                          r'To fit a partially linear regression model without instrumental variable\(s\) '
                          'use DoubleMLPLR instead of DoubleMLPLIV.')
    with pytest.raises(ValueError, match=pliv_no_iv_pattern):
        DoubleMLPLIV(dml_data, Lasso(), Lasso(), Lasso())

    # IRM with IV
    irm_iv_pattern = (r'Incompatible data. z have been set as instrumental variable\(s\). '
                      'To fit an interactive IV regression model use DoubleMLIIVM instead of DoubleMLIRM.')
    with pytest.raises(ValueError, match=irm_iv_pattern):
        DoubleMLIRM(dml_data_iivm, Lasso(), LogisticRegression())

    # --- IRM: treatment variable checks ---
    irm_treat_pattern = ('Incompatible data. To fit an IRM model with DML exactly one binary variable with values 0 and 1 '
                         'needs to be specified as treatment variable.')

    # non-binary D for IRM
    irm_frame = dml_data_irm.data.copy()
    irm_frame['d'] = irm_frame['d'] * 2
    with pytest.raises(ValueError, match=irm_treat_pattern):
        DoubleMLIRM(DoubleMLData(irm_frame, 'y', 'd'),
                    Lasso(), LogisticRegression())

    # multiple D for IRM
    irm_frame = dml_data_irm.data.copy()
    with pytest.raises(ValueError, match=irm_treat_pattern):
        DoubleMLIRM(DoubleMLData(irm_frame, 'y', ['d', 'X1']),
                    Lasso(), LogisticRegression())

    # --- IIVM: treatment variable checks ---
    iivm_treat_pattern = ('Incompatible data. To fit an IIVM model with DML exactly one binary variable with values 0 and 1 '
                          'needs to be specified as treatment variable.')

    # non-binary D for IIVM
    iivm_frame = dml_data_iivm.data.copy()
    iivm_frame['d'] = iivm_frame['d'] * 2
    with pytest.raises(ValueError, match=iivm_treat_pattern):
        DoubleMLIIVM(DoubleMLData(iivm_frame, 'y', 'd', z_cols='z'),
                     Lasso(), LogisticRegression(), LogisticRegression())

    # multiple D for IIVM
    iivm_frame = dml_data_iivm.data.copy()
    with pytest.raises(ValueError, match=iivm_treat_pattern):
        DoubleMLIIVM(DoubleMLData(iivm_frame, 'y', ['d', 'X1'], z_cols='z'),
                     Lasso(), LogisticRegression(), LogisticRegression())

    # --- IIVM: instrument checks ---
    iivm_instr_pattern = ('Incompatible data. To fit an IIVM model with DML exactly one binary variable with values 0 and 1 '
                          'needs to be specified as instrumental variable.')

    # IIVM without IV
    with pytest.raises(ValueError, match=iivm_instr_pattern):
        DoubleMLIIVM(dml_data_irm, Lasso(), LogisticRegression(), LogisticRegression())

    # non-binary Z for IIVM
    iivm_frame = dml_data_iivm.data.copy()
    iivm_frame['z'] = iivm_frame['z'] * 2
    with pytest.raises(ValueError, match=iivm_instr_pattern):
        DoubleMLIIVM(DoubleMLData(iivm_frame, 'y', 'd', z_cols='z'),
                     Lasso(), LogisticRegression(), LogisticRegression())

    # multiple Z for IIVM
    iivm_frame = dml_data_iivm.data.copy()
    with pytest.raises(ValueError, match=iivm_instr_pattern):
        DoubleMLIIVM(DoubleMLData(iivm_frame, 'y', 'd', z_cols=['z', 'X1']),
                     Lasso(), LogisticRegression(), LogisticRegression())
import pytest
import pandas as pd
import numpy as np

from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data

from sklearn.linear_model import Lasso, LogisticRegression

# Seed NumPy's global RNG before the data sets are simulated.
np.random.seed(3141)
dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
dml_data_pliv = make_pliv_CHS2015(n_obs=100, dim_z=1)
dml_data_irm = make_irm_data(n_obs=100)
dml_data_iivm = make_iivm_data(n_obs=100)

# One (unfitted) model object per model class, shared by the tests below.
dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso())
dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso())
dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression())
dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression())


@pytest.mark.ci
@pytest.mark.parametrize('dml_obj, cls',
                         [(dml_plr, DoubleMLPLR),
                          (dml_pliv, DoubleMLPLIV),
                          (dml_irm, DoubleMLIRM),
                          (dml_iivm, DoubleMLIIVM)])
def test_plr_return_types(dml_obj, cls):
    """Check the return types of basic accessors for each model class.

    Parameters
    ----------
    dml_obj :
        Model object under test (one of the module-level fixtures).
    cls :
        Class that ``draw_sample_splitting()`` is expected to return an
        instance of.
    """
    # ToDo: A second test case with multiple treatment variables would be helpful
    assert isinstance(dml_obj.__str__(), str)
    assert isinstance(dml_obj.summary, pd.DataFrame)
    assert isinstance(dml_obj.draw_sample_splitting(), cls)
import pytest
import numpy as np

from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data

from sklearn.linear_model import Lasso, LogisticRegression

# Seed NumPy's global RNG before the data sets are simulated.
np.random.seed(3141)
dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
dml_data_pliv = make_pliv_CHS2015(n_obs=100, dim_z=1)
dml_data_irm = make_irm_data(n_obs=100)
dml_data_iivm = make_iivm_data(n_obs=100)

# Model objects created with default settings only.
dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso())
dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso())
dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression())
dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression())


def _assert_resampling_default_settings(dml_obj):
    """Assert that ``dml_obj`` carries the default resampling settings.

    Checks five folds, one repetition, that a sample splitting has been
    drawn, and that cross-fitting is enabled.
    """
    assert dml_obj.n_folds == 5
    assert dml_obj.n_rep == 1
    # ``draw_sample_splitting`` is a method on the model objects, so asserting
    # its truthiness can never fail. Check instead that a splitting is
    # actually available; ``smpls`` raises a ValueError when none was drawn.
    assert dml_obj.smpls is not None
    assert dml_obj.apply_cross_fitting


@pytest.mark.ci
def test_plr_defaults():
    """Check the default resampling settings of a DoubleMLPLR object."""
    _assert_resampling_default_settings(dml_plr)
# %% # Initialize the objects of class DoubleMLData and DoubleMLPLIV # ------------------------------------------------------------- # Set machine learning methods for m & g learner = RandomForestRegressor(max_depth=2, n_estimators=10) ml_g = clone(learner) ml_m = clone(learner) ml_r = clone(learner) # initialize the DoubleMLPLIV object dml_pliv_obj = DoubleMLPLIV(obj_dml_data, ml_g, ml_m, ml_r, score='partialling out', dml_procedure='dml1', draw_sample_splitting=False) # %% # Split samples and transfer the sample splitting to the object # ------------------------------------------------------------- K = 3 # number of folds smpl_sizes = [N, M] obj_dml_multiway_resampling = DoubleMLMultiwayResampling(K, smpl_sizes) smpls_multi_ind, smpls_lin_ind = obj_dml_multiway_resampling.split_samples() dml_pliv_obj.set_sample_splitting([smpls_lin_ind])