def test_irm_exception_with_missings(generate_data_irm_w_missings, learner_sklearn):
    """Plain sklearn learners must raise when the covariates contain NaNs.

    The data backend is told to tolerate NaNs (``force_all_x_finite='allow-nan'``),
    so the error has to come from the learners themselves during ``fit``.
    """
    # collect data
    x, y, d = generate_data_irm_w_missings

    # nuisance learners for g (outcome) and m (treatment)
    ml_g = clone(learner_sklearn[0])
    ml_m = clone(learner_sklearn[1])

    np.random.seed(3141)
    dml_data = dml.DoubleMLData.from_arrays(x, y, d, force_all_x_finite='allow-nan')
    dml_irm = dml.DoubleMLIRM(dml_data, ml_g, ml_m)

    msg = r"Input contains NaN, infinity or a value too large for dtype\('float64'\)."
    with pytest.raises(ValueError, match=msg):
        dml_irm.fit()
def dml_irm_pyvsr_fixture(generate_data_irm, idx, score, dml_procedure):
    """Fit DoubleMLIRM in Python and in R on the same folds and collect both results.

    Returns a dict with the Python and R coefficient and standard-error estimates
    so that tests can assert their equality.
    """
    n_folds = 2

    # collect data
    X, y, d = generate_data_irm[idx]
    x_cols = [f'X{i + 1}' for i in np.arange(X.shape[1])]
    data = pd.DataFrame(np.column_stack((X, y, d)), columns=x_cols + ['y', 'd'])

    # Set machine learning methods for m & g
    learner_classif = LogisticRegression(penalty='none', solver='newton-cg')
    learner_reg = LinearRegression()
    ml_m = clone(learner_classif)
    ml_g = clone(learner_reg)

    dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_irm = dml.DoubleMLIRM(dml_data,
                              ml_g, ml_m,
                              n_folds,
                              score=score,
                              dml_procedure=dml_procedure)
    np.random.seed(3141)
    dml_irm.fit()

    # fit the DML model in R on exactly the same sample split
    all_train, all_test = export_smpl_split_to_r(dml_irm.smpls[0])
    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_IRM(r_dataframe, score, dml_procedure, all_train, all_test)

    return {'coef_py': dml_irm.coef,
            'coef_r': res_r[0],
            'se_py': dml_irm.se,
            'se_r': res_r[1]}
def dml_irm_fixture(generate_data_irm, idx, learner_g, learner_m, score, dml_procedure, tune_on_folds):
    """Fit a tuned DoubleMLIRM model and replicate it with the manual implementation.

    Tunes the nuisance learners (either per fold or once on the full sample),
    fits the package model and a manual re-implementation on the same folds, and
    returns a dict with both sets of coefficients, standard errors and bootstrap
    results for comparison.
    """
    par_grid = {'ml_g': get_par_grid(learner_g),
                'ml_m': get_par_grid(learner_m)}
    n_folds_tune = 4
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 499

    # collect data
    (X, y, d) = generate_data_irm[idx]

    # Set machine learning methods for m & g
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData.from_arrays(X, y, d)
    dml_irm_obj = dml.DoubleMLIRM(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    # tune hyperparameters (return value is not needed below)
    dml_irm_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune)

    dml_irm_obj.fit()

    # rebuild the same sample split for the manual implementation
    np.random.seed(3141)
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]

    if tune_on_folds:
        # tune separately on every fold; per-fold parameter lists go straight in
        g0_params, g1_params, m_params = tune_nuisance_irm(
            y, X, d, clone(learner_m), clone(learner_g), smpls, score, n_folds_tune,
            par_grid['ml_g'], par_grid['ml_m'])
        g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
            y, X, d, clone(learner_m), clone(learner_g), smpls, score,
            g0_params, g1_params, m_params)
    else:
        # tune once on the full sample (single "fold" with an empty test set)
        # and replicate the parameters across all folds
        xx = [(np.arange(len(y)), np.array([]))]
        g0_params, g1_params, m_params = tune_nuisance_irm(
            y, X, d, clone(learner_m), clone(learner_g), xx, score, n_folds_tune,
            par_grid['ml_g'], par_grid['ml_m'])
        if score == 'ATE':
            g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
                y, X, d, clone(learner_m), clone(learner_g), smpls, score,
                g0_params * n_folds, g1_params * n_folds, m_params * n_folds)
        else:
            # fail fast on unknown scores instead of a NameError further down
            assert score == 'ATTE'
            # the g1 nuisance part is not used for ATTE
            g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
                y, X, d, clone(learner_m), clone(learner_g), smpls, score,
                g0_params * n_folds, None, m_params * n_folds)

    if dml_procedure == 'dml1':
        res_manual, se_manual = irm_dml1(y, X, d,
                                         g_hat0, g_hat1, m_hat, p_hat,
                                         smpls, score)
    else:
        # fail fast on unknown procedures instead of a NameError further down
        assert dml_procedure == 'dml2'
        res_manual, se_manual = irm_dml2(y, X, d,
                                         g_hat0, g_hat1, m_hat, p_hat,
                                         smpls, score)

    res_dict = {'coef': dml_irm_obj.coef,
                'coef_manual': res_manual,
                'se': dml_irm_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(res_manual,
                                           y, d,
                                           g_hat0, g_hat1,
                                           m_hat, p_hat,
                                           smpls, score,
                                           se_manual,
                                           bootstrap, n_rep_boot,
                                           dml_procedure)

        np.random.seed(3141)
        dml_irm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_irm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_irm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
# NOTE(review): `dml_plr_lasso` and `dml_data` are defined in an earlier part
# of this script (not visible in this chunk).
dml_plr_lasso.fit()
# bare attribute access — presumably evaluated for display in the rendered
# gallery cell; verify this script is executed via sphinx-gallery
dml_plr_lasso.summary

# %%
# Specify learner and estimate causal parameter: IRM model with random forest as learner
# --------------------------------------------------------------------------------------

# Set machine learning methods for m & g
ml_g = RandomForestRegressor()
ml_m = RandomForestClassifier()
n_folds = 2
n_rep = 10

np.random.seed(3141)
dml_irm_rf = dml.DoubleMLIRM(dml_data, ml_g, ml_m, n_folds, n_rep, 'ATE', 'dml2')

# set some hyperparameters for the learners
# same parameter set is applied to all three nuisance learners of treatment 'tg'
pars = {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 5}
dml_irm_rf.set_ml_nuisance_params('ml_g0', 'tg', pars)
dml_irm_rf.set_ml_nuisance_params('ml_g1', 'tg', pars)
dml_irm_rf.set_ml_nuisance_params('ml_m', 'tg', pars)

# %%
#

dml_irm_rf.fit()
dml_irm_rf.summary

# %%
def dml_irm_fixture(generate_data_irm, learner, score, dml_procedure, trimming_threshold):
    """Fit DoubleMLIRM with trimming and replicate it with the manual implementation.

    Returns a dict holding package and manual coefficients, standard errors and
    bootstrap results for comparison.
    """
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 499

    # collect data
    x, y, d = generate_data_irm

    # Set machine learning methods for m & g
    ml_g = clone(learner[1])
    ml_m = clone(learner[0])

    np.random.seed(3141)
    dml_data = dml.DoubleMLData.from_arrays(x, y, d)
    dml_irm = dml.DoubleMLIRM(dml_data,
                              ml_g, ml_m,
                              n_folds,
                              score=score,
                              dml_procedure=dml_procedure,
                              trimming_threshold=trimming_threshold)
    dml_irm.fit()

    # rebuild the same sample split for the manual implementation
    np.random.seed(3141)
    kfold = KFold(n_splits=n_folds, shuffle=True)
    smpls = list(kfold.split(x))

    g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
        y, x, d,
        clone(learner[0]), clone(learner[1]),
        smpls, score,
        trimming_threshold=trimming_threshold)

    if dml_procedure == 'dml1':
        theta_manual, se_manual = irm_dml1(y, x, d,
                                           g_hat0, g_hat1, m_hat, p_hat,
                                           smpls, score)
    else:
        assert dml_procedure == 'dml2'
        theta_manual, se_manual = irm_dml2(y, x, d,
                                           g_hat0, g_hat1, m_hat, p_hat,
                                           smpls, score)

    res_dict = {'coef': dml_irm.coef,
                'coef_manual': theta_manual,
                'se': dml_irm.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for boot_method in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(theta_manual,
                                           y, d,
                                           g_hat0, g_hat1,
                                           m_hat, p_hat,
                                           smpls, score,
                                           se_manual,
                                           boot_method, n_rep_boot,
                                           dml_procedure)

        np.random.seed(3141)
        dml_irm.bootstrap(method=boot_method, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + boot_method] = dml_irm.boot_coef
        res_dict['boot_t_stat' + boot_method] = dml_irm.boot_t_stat
        res_dict['boot_coef' + boot_method + '_manual'] = boot_theta
        res_dict['boot_t_stat' + boot_method + '_manual'] = boot_t_stat

    return res_dict
def dml_irm_no_cross_fit_fixture(generate_data_irm, learner, score, n_folds):
    """Fit DoubleMLIRM without cross-fitting and replicate it manually.

    With ``apply_cross_fitting=False`` only a single (train, test) pair is
    used; for ``n_folds == 1`` train and test are the full sample.
    """
    boot_methods = ['normal']
    n_rep_boot = 499
    dml_procedure = 'dml1'

    # collect data
    x, y, d = generate_data_irm
    n_obs = len(y)

    # Set machine learning methods for m & g
    ml_g = clone(learner[0])
    ml_m = clone(learner[1])

    np.random.seed(3141)
    dml_data = dml.DoubleMLData.from_arrays(x, y, d)
    dml_irm = dml.DoubleMLIRM(dml_data,
                              ml_g, ml_m,
                              n_folds,
                              score=score,
                              dml_procedure=dml_procedure,
                              apply_cross_fitting=False)
    dml_irm.fit()

    np.random.seed(3141)
    if n_folds == 1:
        # train and evaluate on the full sample
        full_sample = np.arange(n_obs)
        smpls = [(full_sample, full_sample)]
    else:
        # keep only the first (train, test) pair of the first repetition
        all_smpls = draw_smpls(n_obs, n_folds)
        smpls = [all_smpls[0][0]]

    res_manual = fit_irm(y, x, d,
                         clone(learner[0]), clone(learner[1]),
                         [smpls], dml_procedure, score)

    res_dict = {'coef': dml_irm.coef,
                'coef_manual': res_manual['theta'],
                'se': dml_irm.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    for boot_method in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(y, d,
                                           res_manual['thetas'], res_manual['ses'],
                                           res_manual['all_g_hat0'], res_manual['all_g_hat1'],
                                           res_manual['all_m_hat'], res_manual['all_p_hat'],
                                           [smpls], score,
                                           boot_method, n_rep_boot,
                                           apply_cross_fitting=False)

        np.random.seed(3141)
        dml_irm.bootstrap(method=boot_method, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + boot_method] = dml_irm.boot_coef
        res_dict['boot_t_stat' + boot_method] = dml_irm.boot_t_stat
        res_dict['boot_coef' + boot_method + '_manual'] = boot_theta
        res_dict['boot_t_stat' + boot_method + '_manual'] = boot_t_stat

    return res_dict
def dml_irm_w_missing_fixture(generate_data_irm_w_missings, learner_xgboost, score, dml_procedure,
                              trimming_threshold):
    """Fit DoubleMLIRM on data with missing covariates and replicate it manually.

    NaNs in x are allowed via ``force_all_x_finite='allow-nan'``; the xgboost
    learners handle them natively. Returns a dict with package and manual
    coefficients, standard errors and bootstrap results.
    """
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 499

    # collect data
    x, y, d = generate_data_irm_w_missings

    # Set machine learning methods for m & g
    ml_g = clone(learner_xgboost[0])
    ml_m = clone(learner_xgboost[1])

    np.random.seed(3141)
    dml_data = dml.DoubleMLData.from_arrays(x, y, d, force_all_x_finite='allow-nan')
    dml_irm = dml.DoubleMLIRM(dml_data,
                              ml_g, ml_m,
                              n_folds,
                              score=score,
                              dml_procedure=dml_procedure,
                              trimming_threshold=trimming_threshold)
    dml_irm.fit()

    np.random.seed(3141)
    all_smpls = draw_smpls(len(y), n_folds)

    res_manual = fit_irm(y, x, d,
                         clone(learner_xgboost[0]), clone(learner_xgboost[1]),
                         all_smpls, dml_procedure, score,
                         trimming_threshold=trimming_threshold)

    res_dict = {'coef': dml_irm.coef,
                'coef_manual': res_manual['theta'],
                'se': dml_irm.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    for boot_method in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(y, d,
                                           res_manual['thetas'], res_manual['ses'],
                                           res_manual['all_g_hat0'], res_manual['all_g_hat1'],
                                           res_manual['all_m_hat'], res_manual['all_p_hat'],
                                           all_smpls, score,
                                           boot_method, n_rep_boot)

        np.random.seed(3141)
        dml_irm.bootstrap(method=boot_method, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + boot_method] = dml_irm.boot_coef
        res_dict['boot_t_stat' + boot_method] = dml_irm.boot_t_stat
        res_dict['boot_coef' + boot_method + '_manual'] = boot_theta
        res_dict['boot_t_stat' + boot_method + '_manual'] = boot_t_stat

    return res_dict