Beispiel #1
0
def test_irm_exception_with_missings(generate_data_irm_w_missings,
                                     learner_sklearn):
    """Fitting an IRM model on covariates containing NaN must raise a
    ValueError from the sklearn learners, even though the DoubleMLData
    container itself was built with ``force_all_x_finite='allow-nan'``."""
    x, y, d = generate_data_irm_w_missings

    # nuisance learners: g (outcome) and m (propensity)
    ml_g = clone(learner_sklearn[0])
    ml_m = clone(learner_sklearn[1])

    np.random.seed(3141)
    dml_data = dml.DoubleMLData.from_arrays(x, y, d,
                                            force_all_x_finite='allow-nan')
    irm_model = dml.DoubleMLIRM(dml_data, ml_g, ml_m)

    expected_msg = r"Input contains NaN, infinity or a value too large for dtype\('float64'\)."
    with pytest.raises(ValueError, match=expected_msg):
        irm_model.fit()
Beispiel #2
0
def dml_irm_pyvsr_fixture(generate_data_irm, idx, score, dml_procedure):
    """Fit the IRM model in Python and in R on the identical sample split
    and return the coefficient / standard-error estimates of both runs."""
    n_folds = 2

    # collect data and assemble a DataFrame with named columns
    X, y, d = generate_data_irm[idx]
    x_cols = [f'X{j + 1}' for j in np.arange(X.shape[1])]
    data = pd.DataFrame(np.column_stack((X, y, d)),
                        columns=x_cols + ['y', 'd'])

    # nuisance learners: g (regression) and m (classification)
    ml_g = clone(LinearRegression())
    ml_m = clone(LogisticRegression(penalty='none', solver='newton-cg'))

    dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    irm_model = dml.DoubleMLIRM(dml_data, ml_g, ml_m, n_folds,
                                score=score,
                                dml_procedure=dml_procedure)

    np.random.seed(3141)
    irm_model.fit()

    # replicate the estimation in R on the very same train/test folds
    all_train, all_test = export_smpl_split_to_r(irm_model.smpls[0])

    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_IRM(r_dataframe, score, dml_procedure, all_train, all_test)

    return {'coef_py': irm_model.coef,
            'coef_r': res_r[0],
            'se_py': irm_model.se,
            'se_r': res_r[1]}
Beispiel #3
0
def dml_irm_fixture(generate_data_irm, idx, learner_g, learner_m, score,
                    dml_procedure, tune_on_folds):
    """Tune, fit and bootstrap a DoubleMLIRM model, replicate the whole
    pipeline manually (tuning, nuisance fits, DML aggregation, bootstrap)
    and return both sets of results for comparison in the tests."""
    # hyperparameter grids for the two nuisance learners
    par_grid = {
        'ml_g': get_par_grid(learner_g),
        'ml_m': get_par_grid(learner_m)
    }
    n_folds_tune = 4

    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 499

    # collect data
    (X, y, d) = generate_data_irm[idx]

    # Set machine learning methods for m & g
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData.from_arrays(X, y, d)
    dml_irm_obj = dml.DoubleMLIRM(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    # tune hyperparameters
    # (the returned tuning object is not inspected further here)
    res_tuning = dml_irm_obj.tune(par_grid,
                                  tune_on_folds=tune_on_folds,
                                  n_folds_tune=n_folds_tune)

    dml_irm_obj.fit()

    # re-create the sample split with the same seed for the manual run
    np.random.seed(3141)
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]

    if tune_on_folds:
        # tune per fold, then fit the nuisances with per-fold parameters
        g0_params, g1_params, m_params = tune_nuisance_irm(
            y, X, d, clone(learner_m), clone(learner_g), smpls, score,
            n_folds_tune, par_grid['ml_g'], par_grid['ml_m'])

        g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
            y, X, d, clone(learner_m), clone(learner_g), smpls, score,
            g0_params, g1_params, m_params)
    else:
        # tune once on the full sample (one split: all obs train, empty test)
        xx = [(np.arange(len(y)), np.array([]))]
        g0_params, g1_params, m_params = tune_nuisance_irm(
            y, X, d, clone(learner_m), clone(learner_g), xx, score,
            n_folds_tune, par_grid['ml_g'], par_grid['ml_m'])
        if score == 'ATE':
            # replicate the single tuned parameter set across all folds
            g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
                y, X, d, clone(learner_m), clone(learner_g), smpls, score,
                g0_params * n_folds, g1_params * n_folds, m_params * n_folds)
        elif score == 'ATTE':
            # for ATTE no g1 parameters are passed (None instead)
            g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
                y, X, d, clone(learner_m), clone(learner_g), smpls, score,
                g0_params * n_folds, None, m_params * n_folds)

    # aggregate the manual nuisance estimates with the chosen DML algorithm
    if dml_procedure == 'dml1':
        res_manual, se_manual = irm_dml1(y, X, d, g_hat0, g_hat1, m_hat, p_hat,
                                         smpls, score)
    elif dml_procedure == 'dml2':
        res_manual, se_manual = irm_dml2(y, X, d, g_hat0, g_hat1, m_hat, p_hat,
                                         smpls, score)

    res_dict = {
        'coef': dml_irm_obj.coef,
        'coef_manual': res_manual,
        'se': dml_irm_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    # bootstrap both pipelines from identical seeds and store the draws
    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(res_manual, y, d, g_hat0, g_hat1,
                                           m_hat, p_hat, smpls, score,
                                           se_manual, bootstrap, n_rep_boot,
                                           dml_procedure)

        np.random.seed(3141)
        dml_irm_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_irm_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_irm_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Beispiel #4
0
# Fit the previously configured PLR model and display its summary.
# NOTE(review): `dml_plr_lasso` and `dml_data` are defined earlier in the
# notebook/script, outside this excerpt.
dml_plr_lasso.fit()
dml_plr_lasso.summary

# %%
# Specify learner and estimate causal parameter: IRM model with random forest as learner
# --------------------------------------------------------------------------------------

# Set machine learning methods for m & g
ml_g = RandomForestRegressor()
ml_m = RandomForestClassifier()
n_folds = 2
n_rep = 10

np.random.seed(3141)
dml_irm_rf = dml.DoubleMLIRM(dml_data, ml_g, ml_m, n_folds, n_rep, 'ATE',
                             'dml2')

# set some hyperparameters for the learners
pars = {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 5}

# NOTE(review): 'tg' is presumably the treatment variable name in dml_data —
# confirm against the data construction earlier in the script.
dml_irm_rf.set_ml_nuisance_params('ml_g0', 'tg', pars)
dml_irm_rf.set_ml_nuisance_params('ml_g1', 'tg', pars)
dml_irm_rf.set_ml_nuisance_params('ml_m', 'tg', pars)

# %%
#

# Fit the IRM model and display the coefficient / standard-error summary.
dml_irm_rf.fit()
dml_irm_rf.summary

# %%
def dml_irm_fixture(generate_data_irm, learner, score, dml_procedure,
                    trimming_threshold):
    """Fit a trimmed DoubleMLIRM model and a manual replication of the
    same pipeline; return both results plus bootstrap draws."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 499

    # collect data
    x, y, d = generate_data_irm

    # nuisance learners: learner[1] -> g (outcome), learner[0] -> m (propensity)
    ml_g = clone(learner[1])
    ml_m = clone(learner[0])

    np.random.seed(3141)
    dml_data = dml.DoubleMLData.from_arrays(x, y, d)
    irm_model = dml.DoubleMLIRM(dml_data, ml_g, ml_m, n_folds,
                                score=score,
                                dml_procedure=dml_procedure,
                                trimming_threshold=trimming_threshold)

    irm_model.fit()

    # reproduce the package's sample split with the same seed
    np.random.seed(3141)
    folds = KFold(n_splits=n_folds, shuffle=True)
    smpls = list(folds.split(x))

    g_hat0, g_hat1, m_hat, p_hat = fit_nuisance_irm(
        y, x, d, clone(learner[0]), clone(learner[1]), smpls, score,
        trimming_threshold=trimming_threshold)

    if dml_procedure == 'dml1':
        res_manual, se_manual = irm_dml1(y, x, d, g_hat0, g_hat1, m_hat,
                                         p_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = irm_dml2(y, x, d, g_hat0, g_hat1, m_hat,
                                         p_hat, smpls, score)

    res_dict = {'coef': irm_model.coef,
                'coef_manual': res_manual,
                'se': irm_model.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    # bootstrap both pipelines from identical seeds
    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(
            res_manual, y, d, g_hat0, g_hat1, m_hat, p_hat, smpls, score,
            se_manual, bootstrap, n_rep_boot, dml_procedure)

        np.random.seed(3141)
        irm_model.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict[f'boot_coef{bootstrap}'] = irm_model.boot_coef
        res_dict[f'boot_t_stat{bootstrap}'] = irm_model.boot_t_stat
        res_dict[f'boot_coef{bootstrap}_manual'] = boot_theta
        res_dict[f'boot_t_stat{bootstrap}_manual'] = boot_t_stat

    return res_dict
Beispiel #6
0
def dml_irm_no_cross_fit_fixture(generate_data_irm, learner, score, n_folds):
    """Fit DoubleMLIRM without cross-fitting and compare it against a
    manual replication on a single train/test pair."""
    boot_methods = ['normal']
    n_rep_boot = 499
    dml_procedure = 'dml1'

    # collect data
    x, y, d = generate_data_irm

    # nuisance learners for g and m
    ml_g = clone(learner[0])
    ml_m = clone(learner[1])

    np.random.seed(3141)
    dml_data = dml.DoubleMLData.from_arrays(x, y, d)
    irm_model = dml.DoubleMLIRM(dml_data, ml_g, ml_m, n_folds,
                                score=score,
                                dml_procedure=dml_procedure,
                                apply_cross_fitting=False)

    irm_model.fit()

    np.random.seed(3141)
    n_obs = len(y)
    if n_folds == 1:
        # no split at all: train and evaluate on the full sample
        smpls = [(np.arange(n_obs), np.arange(n_obs))]
    else:
        # draw the folds but keep only the first train/test pair
        smpls = [draw_smpls(n_obs, n_folds)[0][0]]

    res_manual = fit_irm(y, x, d, clone(learner[0]), clone(learner[1]),
                         [smpls], dml_procedure, score)

    res_dict = {'coef': irm_model.coef,
                'coef_manual': res_manual['theta'],
                'se': irm_model.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    # bootstrap both pipelines from identical seeds
    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(
            y, d, res_manual['thetas'], res_manual['ses'],
            res_manual['all_g_hat0'], res_manual['all_g_hat1'],
            res_manual['all_m_hat'], res_manual['all_p_hat'],
            [smpls], score, bootstrap, n_rep_boot,
            apply_cross_fitting=False)

        np.random.seed(3141)
        irm_model.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict[f'boot_coef{bootstrap}'] = irm_model.boot_coef
        res_dict[f'boot_t_stat{bootstrap}'] = irm_model.boot_t_stat
        res_dict[f'boot_coef{bootstrap}_manual'] = boot_theta
        res_dict[f'boot_t_stat{bootstrap}_manual'] = boot_t_stat

    return res_dict
Beispiel #7
0
def dml_irm_w_missing_fixture(generate_data_irm_w_missings, learner_xgboost,
                              score, dml_procedure, trimming_threshold):
    """Fit DoubleMLIRM on data with missing covariate values (NaNs are
    allowed via ``force_all_x_finite='allow-nan'``) and compare against a
    manual replication of the pipeline."""
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 499

    # collect data
    x, y, d = generate_data_irm_w_missings

    # nuisance learners for g and m — presumably xgboost because it
    # tolerates NaN inputs (see the missing-data fixture)
    ml_g = clone(learner_xgboost[0])
    ml_m = clone(learner_xgboost[1])

    np.random.seed(3141)
    dml_data = dml.DoubleMLData.from_arrays(
        x, y, d, force_all_x_finite='allow-nan')
    irm_model = dml.DoubleMLIRM(dml_data, ml_g, ml_m, n_folds,
                                score=score,
                                dml_procedure=dml_procedure,
                                trimming_threshold=trimming_threshold)

    irm_model.fit()

    # draw the same sample splits for the manual run
    np.random.seed(3141)
    all_smpls = draw_smpls(len(y), n_folds)

    res_manual = fit_irm(y, x, d,
                         clone(learner_xgboost[0]),
                         clone(learner_xgboost[1]),
                         all_smpls, dml_procedure, score,
                         trimming_threshold=trimming_threshold)

    res_dict = {'coef': irm_model.coef,
                'coef_manual': res_manual['theta'],
                'se': irm_model.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    # bootstrap both pipelines from identical seeds
    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_irm(
            y, d, res_manual['thetas'], res_manual['ses'],
            res_manual['all_g_hat0'], res_manual['all_g_hat1'],
            res_manual['all_m_hat'], res_manual['all_p_hat'],
            all_smpls, score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        irm_model.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict[f'boot_coef{bootstrap}'] = irm_model.boot_coef
        res_dict[f'boot_t_stat{bootstrap}'] = irm_model.boot_t_stat
        res_dict[f'boot_coef{bootstrap}_manual'] = boot_theta
        res_dict[f'boot_t_stat{bootstrap}_manual'] = boot_t_stat

    return res_dict