def dml_plr_no_cross_fit_fixture(generate_data1, idx, learner, score, n_folds):
    """Fixture comparing a DoubleMLPLR fit without cross-fitting against a manual re-implementation."""
    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # machine learning methods for the nuisance parts m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)
    dml_plr_obj.fit()

    # manual re-implementation under the same seed / resampling
    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data['d'].values
    if n_folds == 1:
        # single "fold": train and evaluate on the full sample
        smpls = [(np.arange(len(y)), np.arange(len(y)))]
    else:
        resampling = KFold(n_splits=n_folds, shuffle=True)
        # no cross-fitting: only the first train/test split is used
        smpls = list(resampling.split(X))[:1]
    g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner), clone(learner), smpls)

    assert dml_procedure == 'dml1'
    res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual,
                                           bootstrap, n_rep_boot, dml_procedure,
                                           apply_cross_fitting=False)
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_no_cross_fit_tune_fixture(generate_data1, idx, learner, score, tune_on_folds):
    """Fixture with tuned Lasso nuisance learners, DoubleMLPLR without cross-fitting vs. manual computation."""
    par_grid = {'ml_g': {'alpha': np.linspace(0.05, .95, 7)},
                'ml_m': {'alpha': np.linspace(0.05, .95, 7)}}
    n_folds_tune = 3
    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # nuisance learners (note: the `learner` fixture argument is not used here)
    ml_g = Lasso()
    ml_m = Lasso()

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds=2,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)
    # tune hyperparameters, then fit with the tuned parameters
    dml_plr_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune)
    dml_plr_obj.fit()

    # manual re-implementation under the same seed / resampling
    np.random.seed(3141)
    y = obj_dml_data.y
    X = obj_dml_data.x
    d = obj_dml_data.d
    resampling = KFold(n_splits=2, shuffle=True)
    # no cross-fitting: only the first train/test split is used
    smpls = list(resampling.split(X))[:1]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, X, d, clone(ml_m), clone(ml_g), smpls,
                                               n_folds_tune, par_grid['ml_g'], par_grid['ml_m'])
    else:
        # tune on the full sample (empty test set)
        full_sample = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, X, d, clone(ml_m), clone(ml_g), full_sample,
                                               n_folds_tune, par_grid['ml_g'], par_grid['ml_m'])
    g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(ml_m), clone(ml_g), smpls,
                                    g_params, m_params)

    assert dml_procedure == 'dml1'
    res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual,
                                           bootstrap, n_rep_boot, dml_procedure,
                                           apply_cross_fitting=False)
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_multitreat_fixture(generate_data_bivariate, generate_data_toeplitz, idx, learner, score, dml_procedure):
    """Fixture for PLR with multiple treatment variables: package fit vs. per-treatment manual fit."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 483

    # collect data: the first n_datasets indices are bivariate, the rest Toeplitz
    if idx < n_datasets:
        data = generate_data_bivariate[idx]
    else:
        data = generate_data_toeplitz[idx - n_datasets]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()
    d_cols = data.columns[data.columns.str.startswith('d')].tolist()

    # machine learning methods for the nuisance parts m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', d_cols, X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    # manual re-implementation under the same seed / resampling
    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data.loc[:, d_cols].values
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = list(resampling.split(X))

    n_d = d.shape[1]
    coef_manual = np.full(n_d, np.nan)
    se_manual = np.full(n_d, np.nan)
    all_g_hat = []
    all_m_hat = []
    for i_d in range(n_d):
        # treatment column i_d is the active treatment; the others join the controls
        Xd = np.hstack((X, np.delete(d, i_d, axis=1)))
        g_hat, m_hat = fit_nuisance_plr(y, Xd, d[:, i_d], clone(learner), clone(learner), smpls)
        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)
        if dml_procedure == 'dml1':
            coef_manual[i_d], se_manual[i_d] = plr_dml1(y, Xd, d[:, i_d], g_hat, m_hat, smpls, score)
        elif dml_procedure == 'dml2':
            coef_manual[i_d], se_manual[i_d] = plr_dml2(y, Xd, d[:, i_d], g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': coef_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(coef_manual, y, d, all_g_hat, all_m_hat,
                                           smpls, score, se_manual,
                                           bootstrap, n_rep_boot, dml_procedure)
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_rep_no_cross_fit_fixture(generate_data1, idx, learner, score, n_rep):
    """Fixture for repeated sample splitting without cross-fitting: package fit vs. manual median aggregation."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 498
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # machine learning methods for the nuisance parts m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds, n_rep, score, dml_procedure,
                                  apply_cross_fitting=False)
    dml_plr_obj.fit()

    # manual re-implementation under the same seed / resampling
    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data['d'].values

    all_smpls = []
    for _ in range(n_rep):
        resampling = KFold(n_splits=n_folds, shuffle=True)
        # no cross-fitting: keep only the first train/test split of each repetition
        all_smpls.append(list(resampling.split(X))[:1])

    thetas = np.zeros(n_rep)
    ses = np.zeros(n_rep)
    all_g_hat = []
    all_m_hat = []
    for i_rep in range(n_rep):
        smpls = all_smpls[i_rep]
        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner), clone(learner), smpls)
        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)
        thetas[i_rep], ses[i_rep] = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    # aggregate over repetitions via the median
    res_manual = np.median(thetas)
    # NOTE(review): mirrors the original, which sizes the variance formula by the
    # test set of the LAST repetition's split
    n_obs_test = len(all_smpls[-1][0][1])
    se_manual = np.sqrt(np.median(np.power(ses, 2) * n_obs_test +
                                  np.power(thetas - res_manual, 2)) / n_obs_test)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        all_boot_theta = []
        all_boot_t_stat = []
        for i_rep in range(n_rep):
            smpls = all_smpls[i_rep]
            boot_theta, boot_t_stat = boot_plr(thetas[i_rep], y, d,
                                               all_g_hat[i_rep], all_m_hat[i_rep],
                                               smpls, score, ses[i_rep],
                                               bootstrap, n_rep_boot, dml_procedure,
                                               apply_cross_fitting=False)
            all_boot_theta.append(boot_theta)
            all_boot_t_stat.append(boot_t_stat)
        boot_theta = np.hstack(all_boot_theta)
        boot_t_stat = np.hstack(all_boot_t_stat)
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_fixture(generate_data2, idx, learner_g, learner_m, score, dml_procedure, tune_on_folds):
    """Fixture with hyperparameter tuning for separate g/m learners: package fit vs. manual computation."""
    par_grid = {'ml_g': get_par_grid(learner_g),
                'ml_m': get_par_grid(learner_m)}
    n_folds_tune = 4
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    obj_dml_data = generate_data2[idx]

    # machine learning methods for the nuisance parts m & g
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    # tune hyperparameters, then fit with the tuned parameters
    dml_plr_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune)
    dml_plr_obj.fit()

    # manual re-implementation under the same seed / resampling
    np.random.seed(3141)
    y = obj_dml_data.y
    X = obj_dml_data.x
    d = obj_dml_data.d
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = list(resampling.split(X))

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, X, d, clone(learner_m), clone(learner_g), smpls,
                                               n_folds_tune, par_grid['ml_g'], par_grid['ml_m'])
        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner_m), clone(learner_g), smpls,
                                        g_params, m_params)
    else:
        # tune once on the full sample (empty test set); replicate the single
        # parameter set so every fold is fitted with the same tuned values
        full_sample = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, X, d, clone(learner_m), clone(learner_g), full_sample,
                                               n_folds_tune, par_grid['ml_g'], par_grid['ml_m'])
        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner_m), clone(learner_g), smpls,
                                        g_params * n_folds, m_params * n_folds)

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)
    elif dml_procedure == 'dml2':
        res_manual, se_manual = plr_dml2(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual,
                                           bootstrap, n_rep_boot, dml_procedure)
        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict