def test_doubleml_exception_scores():
    """Check that invalid ``score`` arguments raise informative exceptions.

    For each model class, an unsupported score string raises ``ValueError``
    and a non-string, non-callable score raises ``TypeError``.
    """
    # PLR: valid scores are 'IV-type' and 'partialling out'.
    msg = 'Invalid score IV. Valid score IV-type or partialling out.'
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLPLR(dml_data, ml_g, ml_m, score='IV')
    msg = 'score should be either a string or a callable. 0 was passed.'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLPLR(dml_data, ml_g, ml_m, score=0)

    # IRM: valid scores are 'ATE' and 'ATTE'.
    msg = 'Invalid score IV. Valid score ATE or ATTE.'
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), score='IV')
    msg = 'score should be either a string or a callable. 0 was passed.'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), score=0)

    # IIVM: the only valid score is 'LATE'.
    msg = 'Invalid score ATE. Valid score LATE.'
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(),
                         score='ATE')
    msg = 'score should be either a string or a callable. 0 was passed.'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(),
                         score=0)

    # PLIV: the only valid score is 'partialling out'.
    msg = 'Invalid score IV. Valid score partialling out.'
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso(), score='IV')
    msg = 'score should be either a string or a callable. 0 was passed.'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso(), score=0)
def test_doubleml_exception_data():
    """Check that incompatible data objects raise informative exceptions.

    Covers: wrong data type, instrumental variables passed to non-IV models,
    and non-binary / multiple treatment or instrumental variables for the
    interactive models (IRM, IIVM).
    """
    msg = 'The data must be of DoubleMLData type.'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLPLR(pd.DataFrame(), ml_g, ml_m)

    # PLR does not support instrumental variables; point users to PLIV.
    msg = (r'Incompatible data. Z1 have been set as instrumental variable\(s\). '
           'To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR.')
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLPLR(dml_data_pliv, ml_g, ml_m)

    # IRM does not support instrumental variables; point users to IIVM.
    msg = (r'Incompatible data. z have been set as instrumental variable\(s\). '
           'To fit an interactive IV regression model use DoubleMLIIVM instead of DoubleMLIRM.')
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIRM(dml_data_iivm, Lasso(), LogisticRegression())

    # IRM requires exactly one binary treatment variable with values 0 and 1.
    msg = ('Incompatible data. To fit an IRM model with DML exactly one binary variable with values 0 and 1 '
           'needs to be specified as treatment variable.')
    df_irm = dml_data_irm.data.copy()
    df_irm['d'] = df_irm['d'] * 2  # treatment becomes non-binary {0, 2}
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIRM(DoubleMLData(df_irm, 'y', 'd'), Lasso(), LogisticRegression())
    df_irm = dml_data_irm.data.copy()
    with pytest.raises(ValueError, match=msg):
        # more than one treatment variable
        _ = DoubleMLIRM(DoubleMLData(df_irm, 'y', ['d', 'X1']), Lasso(), LogisticRegression())

    # IIVM requires exactly one binary treatment variable with values 0 and 1.
    msg = ('Incompatible data. To fit an IIVM model with DML exactly one binary variable with values 0 and 1 '
           'needs to be specified as treatment variable.')
    df_iivm = dml_data_iivm.data.copy()
    df_iivm['d'] = df_iivm['d'] * 2  # treatment becomes non-binary {0, 2}
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIIVM(DoubleMLData(df_iivm, 'y', 'd', z_cols='z'),
                         Lasso(), LogisticRegression(), LogisticRegression())
    df_iivm = dml_data_iivm.data.copy()
    with pytest.raises(ValueError, match=msg):
        # more than one treatment variable
        _ = DoubleMLIIVM(DoubleMLData(df_iivm, 'y', ['d', 'X1'], z_cols='z'),
                         Lasso(), LogisticRegression(), LogisticRegression())

    # IIVM also requires exactly one binary instrumental variable with values 0 and 1.
    # NOTE(review): this string literal was broken across lines in the source;
    # reconstructed by analogy with the treatment-variable message above.
    msg = ('Incompatible data. To fit an IIVM model with DML exactly one binary variable with values 0 and 1 '
           'needs to be specified as instrumental variable.')
    df_iivm = dml_data_iivm.data.copy()
    df_iivm['z'] = df_iivm['z'] * 2  # instrument becomes non-binary {0, 2}
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIIVM(DoubleMLData(df_iivm, 'y', 'd', z_cols='z'),
                         Lasso(), LogisticRegression(), LogisticRegression())
    df_iivm = dml_data_iivm.data.copy()
    with pytest.raises(ValueError, match=msg):
        # more than one instrumental variable
        _ = DoubleMLIIVM(DoubleMLData(df_iivm, 'y', 'd', z_cols=['z', 'X1']),
                         Lasso(), LogisticRegression(), LogisticRegression())
def test_doubleml_exception_subgroups():
    """Check validation of the ``subgroups`` argument of ``DoubleMLIIVM``.

    ``subgroups`` must be a dict with exactly the keys ``always_takers`` and
    ``never_takers``, each mapped to a bool; anything else raises.
    """
    # not a dictionary at all
    msg = 'Invalid subgroups True. subgroups must be of type dictionary.'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(),
                         subgroups=True)

    # wrong key
    msg = "Invalid subgroups {'abs': True}. subgroups must be a dictionary with keys always_takers and never_takers."
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(),
                         subgroups={'abs': True})

    # extra key besides the two required ones
    msg = ("Invalid subgroups {'always_takers': True, 'never_takers': False, 'abs': 5}. "
           "subgroups must be a dictionary with keys always_takers and never_takers.")
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(),
                         subgroups={'always_takers': True, 'never_takers': False, 'abs': 5})

    # missing key
    msg = ("Invalid subgroups {'always_takers': True}. "
           "subgroups must be a dictionary with keys always_takers and never_takers.")
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(),
                         subgroups={'always_takers': True})

    # non-boolean values
    msg = r"subgroups\['always_takers'\] must be True or False. Got 5."
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(),
                         subgroups={'always_takers': 5, 'never_takers': False})
    msg = r"subgroups\['never_takers'\] must be True or False. Got 5."
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(),
                         subgroups={'always_takers': True, 'never_takers': 5})
def test_doubleml_exception_trimming_rule():
    """Check that an unsupported ``trimming_rule`` is rejected by IRM and IIVM."""
    # only 'truncate' is a valid trimming rule
    msg = 'Invalid trimming_rule discard. Valid trimming_rule truncate.'
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(),
                        trimming_rule='discard')
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression(),
                         trimming_rule='discard')
def test_doubleml_exception_learner():
    """Check that invalid or mismatched nuisance learners raise errors/warnings.

    Covers: classes passed instead of instances, learners missing required
    methods, classifiers passed where regressors are expected (and vice
    versa), and "hidden" classifiers (``_estimator_type`` unset) whose label
    predictions are detected at ``fit()`` time.
    """
    err_msg_prefix = 'Invalid learner provided for ml_g: '
    warn_msg_prefix = 'Learner provided for ml_g is probably invalid: '

    # a class (not an instance) is rejected
    msg = err_msg_prefix + 'provide an instance of a learner instead of a class.'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLPLR(dml_data, Lasso, ml_m)
    # a learner without fit() is rejected
    msg = err_msg_prefix + r'BaseEstimator\(\) has no method .fit\(\).'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLPLR(dml_data, BaseEstimator(), ml_m)
    # msg = err_msg_prefix + r'_DummyNoSetParams\(\) has no method .set_params\(\).'
    with pytest.raises(TypeError):
        _ = DoubleMLPLR(dml_data, _DummyNoSetParams(), ml_m)
    # msg = err_msg_prefix + r'_DummyNoSetParams\(\) has no method .get_params\(\).'
    with pytest.raises(TypeError):
        _ = DoubleMLPLR(dml_data, _DummyNoGetParams(), ml_m)
    # msg = 'Learner provided for ml_m is probably invalid: ' + r'_DummyNoClassifier\(\) is \(probably\) no classifier.'
    with pytest.warns(UserWarning):
        _ = DoubleMLIRM(dml_data_irm, Lasso(), _DummyNoClassifier())

    # ToDo: Currently for ml_g (and others) we only check whether the learner can be identified as regressor. However,
    # we do not check whether it can instead be identified as classifier, which could be used to throw an error.
    msg = warn_msg_prefix + r'LogisticRegression\(\) is \(probably\) no regressor.'
    with pytest.warns(UserWarning, match=msg):
        _ = DoubleMLPLR(dml_data, LogisticRegression(), Lasso())

    # we allow classifiers for ml_m in PLR, but only for binary treatment variables
    msg = (r'The ml_m learner LogisticRegression\(\) was identified as classifier '
           'but at least one treatment variable is not binary with values 0 and 1.')
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLPLR(dml_data, Lasso(), LogisticRegression())

    # we allow classifiers for ml_g for binary treatment variables in IRM
    msg = (r'The ml_g learner LogisticRegression\(\) was identified as classifier '
           'but the outcome variable is not binary with values 0 and 1.')
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIRM(dml_data_irm, LogisticRegression(), LogisticRegression())

    # we allow classifiers for ml_g for binary treatment variables in IRM
    msg = (r'The ml_g learner LogisticRegression\(\) was identified as classifier '
           'but the outcome variable is not binary with values 0 and 1.')
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLIIVM(dml_data_iivm, LogisticRegression(), LogisticRegression(),
                         LogisticRegression())

    # construct a classifier which is not identifiable as classifier via is_classifier by sklearn
    # it then predicts labels and therefore an exception will be thrown
    log_reg = LogisticRegression()
    log_reg._estimator_type = None
    msg = (r'Learner provided for ml_m is probably invalid: LogisticRegression\(\) is \(probably\) neither a regressor '
           'nor a classifier. Method predict is used for prediction.')
    with pytest.warns(UserWarning, match=msg):
        dml_plr_hidden_classifier = DoubleMLPLR(dml_data_irm, Lasso(), log_reg)
    # NOTE(review): this string literal was broken across lines in the source; reconstructed.
    msg = (r'For the binary treatment variable d, predictions obtained with the ml_m learner LogisticRegression\(\) '
           'are also observed to be binary with values 0 and 1. Make sure that for classifiers probabilities and not '
           'labels are predicted.')
    with pytest.raises(ValueError, match=msg):
        dml_plr_hidden_classifier.fit()

    # construct a classifier which is not identifiable as classifier via is_classifier by sklearn
    # it then predicts labels and therefore an exception will be thrown
    # whether predict() or predict_proba() is being called can also be manipulated via the unrelated max_iter variable
    log_reg = LogisticRegressionManipulatedPredict()
    log_reg._estimator_type = None
    msg = (r'Learner provided for ml_g is probably invalid: LogisticRegressionManipulatedPredict\(\) is \(probably\) '
           'neither a regressor nor a classifier. Method predict is used for prediction.')
    with pytest.warns(UserWarning, match=msg):
        dml_irm_hidden_classifier = DoubleMLIRM(dml_data_irm_binary_outcome, log_reg,
                                                LogisticRegression())
    msg = (r'For the binary outcome variable y, predictions obtained with the ml_g learner '
           r'LogisticRegressionManipulatedPredict\(\) are also observed to be binary with values 0 and 1. Make sure '
           'that for classifiers probabilities and not labels are predicted.')
    with pytest.raises(ValueError, match=msg):
        dml_irm_hidden_classifier.fit()
    with pytest.raises(ValueError, match=msg):
        dml_irm_hidden_classifier.set_ml_nuisance_params('ml_g0', 'd', {'max_iter': 314})
        dml_irm_hidden_classifier.fit()

    msg = (r'Learner provided for ml_g is probably invalid: LogisticRegressionManipulatedPredict\(\) is \(probably\) '
           'neither a regressor nor a classifier. Method predict is used for prediction.')
    with pytest.warns(UserWarning, match=msg):
        dml_iivm_hidden_classifier = DoubleMLIIVM(dml_data_iivm_binary_outcome, log_reg,
                                                  LogisticRegression(), LogisticRegression())
    # NOTE(review): this string literal was broken across lines in the source; reconstructed.
    msg = (r'For the binary outcome variable y, predictions obtained with the ml_g learner '
           r'LogisticRegressionManipulatedPredict\(\) are also observed to be binary with values 0 and 1. Make sure '
           'that for classifiers probabilities and not labels are predicted.')
    with pytest.raises(ValueError, match=msg):
        dml_iivm_hidden_classifier.fit()
    with pytest.raises(ValueError, match=msg):
        dml_iivm_hidden_classifier.set_ml_nuisance_params('ml_g0', 'd', {'max_iter': 314})
        dml_iivm_hidden_classifier.fit()
from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data from sklearn.linear_model import Lasso, LogisticRegression np.random.seed(3141) dml_data_plr = make_plr_CCDDHNR2018(n_obs=100) dml_data_pliv = make_pliv_CHS2015(n_obs=100, dim_z=1) dml_data_irm = make_irm_data(n_obs=100) dml_data_iivm = make_iivm_data(n_obs=100) dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso()) dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso()) dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression()) dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression()) @pytest.mark.ci @pytest.mark.parametrize('dml_obj, cls', [(dml_plr, DoubleMLPLR), (dml_pliv, DoubleMLPLIV), (dml_irm, DoubleMLIRM), (dml_iivm, DoubleMLIIVM)]) def test_plr_return_types(dml_obj, cls): # ToDo: A second test case with multiple treatment variables would be helpful assert isinstance(dml_obj.__str__(), str) assert isinstance(dml_obj.summary, pd.DataFrame) assert isinstance(dml_obj.draw_sample_splitting(), cls) assert isinstance(dml_obj.set_sample_splitting(dml_obj.smpls), cls) assert isinstance(dml_obj.fit(), cls) assert isinstance(dml_obj.__str__(),
from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data
from sklearn.linear_model import Lasso, LogisticRegression

# Module-level fixtures: one small simulated data set and one model object
# per DoubleML model class, built with default settings.
np.random.seed(3141)
dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
dml_data_pliv = make_pliv_CHS2015(n_obs=100, dim_z=1)
dml_data_irm = make_irm_data(n_obs=100)
dml_data_iivm = make_iivm_data(n_obs=100)

dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso())
dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso())
dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression())
dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression())


def _assert_resampling_default_settings(dml_obj):
    # Shared helper: the resampling defaults common to every model class.
    assert dml_obj.n_folds == 5
    assert dml_obj.n_rep == 1
    assert dml_obj.draw_sample_splitting
    assert dml_obj.apply_cross_fitting


@pytest.mark.ci
def test_plr_defaults():
    """Check the default settings of a freshly constructed DoubleMLPLR."""
    _assert_resampling_default_settings(dml_plr)
    assert dml_plr.score == 'partialling out'
    assert dml_plr.dml_procedure == 'dml2'
from sklearn.linear_model import Lasso, LogisticRegression np.random.seed(3141) dml_data_plr = make_plr_CCDDHNR2018(n_obs=100) dml_data_pliv = make_pliv_CHS2015(n_obs=100, dim_z=1) dml_data_irm = make_irm_data(n_obs=100) dml_data_iivm = make_iivm_data(n_obs=100) dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso()) dml_plr.fit() dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso()) dml_pliv.fit() dml_irm = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression()) dml_irm.fit() dml_iivm = DoubleMLIIVM(dml_data_iivm, Lasso(), LogisticRegression(), LogisticRegression()) dml_iivm.fit() # fit models with callable scores plr_score = dml_plr._score_elements dml_plr_callable_score = DoubleMLPLR(dml_data_plr, Lasso(), Lasso(), score=plr_score, draw_sample_splitting=False) dml_plr_callable_score.set_sample_splitting(dml_plr.smpls) dml_plr_callable_score.fit(store_predictions=True) irm_score = dml_irm._score_elements dml_irm_callable_score = DoubleMLIRM(dml_data_irm, Lasso(),