Example #1
def dml_plr_fixture(generate_data1, learner, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)

    res_manual = fit_plr(y, x, d, clone(learner), clone(learner), all_smpls,
                         dml_procedure, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_plr_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, res_manual['thetas'],
                                           res_manual['ses'],
                                           res_manual['all_g_hat'],
                                           res_manual['all_m_hat'], all_smpls,
                                           score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
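
A fixture like this is typically consumed by small pytest tests that compare the packaged and the manual results. Below is a minimal sketch of that pattern, assuming the function above is registered as a @pytest.fixture; the test names and tolerances are illustrative, not taken from the original suite.

import numpy as np


def test_dml_plr_coef(dml_plr_fixture):
    # packaged estimate vs. manually cross-fitted estimate
    assert np.allclose(dml_plr_fixture['coef'],
                       dml_plr_fixture['coef_manual'],
                       rtol=1e-9, atol=1e-4)


def test_dml_plr_se(dml_plr_fixture):
    # packaged standard error vs. manual standard error
    assert np.allclose(dml_plr_fixture['se'],
                       dml_plr_fixture['se_manual'],
                       rtol=1e-9, atol=1e-4)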
def dml_plr_cluster_with_index(generate_data1, learner, dml_procedure):
    # with one-way clustering and exactly one observation per cluster, the results with and without clustering coincide
    n_folds = 2

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    df = data.reset_index()
    dml_cluster_data = dml.DoubleMLClusterData(df,
                                               y_col='y',
                                               d_cols='d',
                                               x_cols=x_cols,
                                               cluster_cols='index')
    np.random.seed(3141)
    dml_plr_cluster_obj = dml.DoubleMLPLR(dml_cluster_data,
                                          ml_g,
                                          ml_m,
                                          n_folds,
                                          dml_procedure=dml_procedure)
    dml_plr_cluster_obj.fit()

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': dml_plr_cluster_obj.coef,
        'se': dml_plr_obj.se,
        'se_manual': dml_plr_cluster_obj.se
    }

    return res_dict
def dml_plr_reestimate_fixture(generate_data1, learner, score, dml_procedure, n_rep):
    n_folds = 3

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  n_rep,
                                  score,
                                  dml_procedure)
    dml_plr_obj.fit()

    np.random.seed(3141)
    dml_plr_obj2 = dml.DoubleMLPLR(obj_dml_data,
                                   ml_g, ml_m,
                                   n_folds,
                                   n_rep,
                                   score,
                                   dml_procedure)
    dml_plr_obj2.fit()
    dml_plr_obj2._coef[0] = np.nan
    dml_plr_obj2._se[0] = np.nan
    dml_plr_obj2._est_causal_pars_and_se()

    res_dict = {'coef': dml_plr_obj.coef,
                'coef2': dml_plr_obj2.coef,
                'se': dml_plr_obj.se,
                'se2': dml_plr_obj2.se}

    return res_dict
Example #4
def dml_plr_smpls_fixture(generate_data1, learner, score, dml_procedure,
                          n_rep):
    n_folds = 3

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m, n_folds, n_rep,
                                  score, dml_procedure)

    dml_plr_obj.fit()

    smpls = dml_plr_obj.smpls

    dml_plr_obj2 = dml.DoubleMLPLR(obj_dml_data,
                                   ml_g,
                                   ml_m,
                                   score=score,
                                   dml_procedure=dml_procedure,
                                   draw_sample_splitting=False)
    dml_plr_obj2.set_sample_splitting(smpls)
    dml_plr_obj2.fit()

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef2': dml_plr_obj2.coef,
        'se': dml_plr_obj.se,
        'se2': dml_plr_obj2.se
    }

    return res_dict
def dml_plr_binary_classifier_fixture(learner, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = clone(learner)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(bonus_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = bonus_data.y
    x = bonus_data.x
    d = bonus_data.d
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)

    res_manual = fit_plr(y, x, d, clone(ml_g), clone(ml_m),
                         all_smpls, dml_procedure, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual['theta'],
                'se': dml_plr_obj.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'],
                                           res_manual['all_g_hat'], res_manual['all_m_hat'],
                                           all_smpls, score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #6
def dml_plr_pyvsr_fixture(generate_data1, idx, score, dml_procedure):
    n_folds = 2
    n_rep_boot = 483

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    learner = LinearRegression()
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    #np.random.seed(3141)
    dml_plr_obj.fit()

    # fit the DML model in R
    all_train, all_test = export_smpl_split_to_r(dml_plr_obj.smpls[0])

    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_MLPLR(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {
        'coef_py': dml_plr_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_plr_obj.se,
        'se_r': res_r[1]
    }

    return res_dict
def dml_plr_rep_no_cross_fit_fixture(generate_data1, idx, learner, score,
                                     n_rep):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 498
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  n_rep,
                                  score,
                                  dml_procedure,
                                  apply_cross_fitting=False)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data['d'].values
    all_smpls = []
    for i_rep in range(n_rep):
        resampling = KFold(n_splits=n_folds, shuffle=True)
        smpls = [(train, test) for train, test in resampling.split(X)]
        all_smpls.append(smpls)

    # adapt to do no-cross-fitting in each repetition
    all_smpls = [[xx[0]] for xx in all_smpls]

    thetas = np.zeros(n_rep)
    ses = np.zeros(n_rep)
    all_g_hat = list()
    all_m_hat = list()
    for i_rep in range(n_rep):
        smpls = all_smpls[i_rep]

        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner),
                                        clone(learner), smpls)

        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)

        thetas[i_rep], ses[i_rep] = plr_dml1(y, X, d, all_g_hat[i_rep],
                                             all_m_hat[i_rep], smpls, score)

    res_manual = np.median(thetas)
    se_manual = np.sqrt(
        np.median(
            np.power(ses, 2) * len(smpls[0][1]) +
            np.power(thetas - res_manual, 2)) / len(smpls[0][1]))

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        all_boot_theta = list()
        all_boot_t_stat = list()
        for i_rep in range(n_rep):
            smpls = all_smpls[i_rep]
            boot_theta, boot_t_stat = boot_plr(thetas[i_rep],
                                               y,
                                               d,
                                               all_g_hat[i_rep],
                                               all_m_hat[i_rep],
                                               smpls,
                                               score,
                                               ses[i_rep],
                                               bootstrap,
                                               n_rep_boot,
                                               dml_procedure,
                                               apply_cross_fitting=False)
            all_boot_theta.append(boot_theta)
            all_boot_t_stat.append(boot_t_stat)

        boot_theta = np.hstack(all_boot_theta)
        boot_t_stat = np.hstack(all_boot_t_stat)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_fixture(generate_data1, learner, score, dml_procedure, n_rep):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 498

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  n_rep,
                                  score,
                                  dml_procedure)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    n_obs = len(y)
    all_smpls = []
    for i_rep in range(n_rep):
        resampling = KFold(n_splits=n_folds,
                           shuffle=True)
        smpls = [(train, test) for train, test in resampling.split(x)]
        all_smpls.append(smpls)

    thetas = np.zeros(n_rep)
    ses = np.zeros(n_rep)
    all_g_hat = list()
    all_m_hat = list()
    for i_rep in range(n_rep):
        smpls = all_smpls[i_rep]

        g_hat, m_hat = fit_nuisance_plr(y, x, d,
                                        clone(learner), clone(learner), smpls)

        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)

        if dml_procedure == 'dml1':
            thetas[i_rep], ses[i_rep] = plr_dml1(y, x, d,
                                                 all_g_hat[i_rep], all_m_hat[i_rep],
                                                 smpls, score)
        else:
            assert dml_procedure == 'dml2'
            thetas[i_rep], ses[i_rep] = plr_dml2(y, x, d,
                                                 all_g_hat[i_rep], all_m_hat[i_rep],
                                                 smpls, score)

    res_manual = np.median(thetas)
    se_manual = np.sqrt(np.median(np.power(ses, 2)*n_obs + np.power(thetas - res_manual, 2))/n_obs)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods
                }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        all_boot_theta = list()
        all_boot_t_stat = list()
        for i_rep in range(n_rep):
            smpls = all_smpls[i_rep]
            boot_theta, boot_t_stat = boot_plr(thetas[i_rep],
                                               y, d,
                                               all_g_hat[i_rep], all_m_hat[i_rep],
                                               smpls, score,
                                               ses[i_rep],
                                               bootstrap, n_rep_boot,
                                               dml_procedure)
            all_boot_theta.append(boot_theta)
            all_boot_t_stat.append(boot_t_stat)

        boot_theta = np.hstack(all_boot_theta)
        boot_t_stat = np.hstack(all_boot_t_stat)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #9
dml_data = dml.datasets.fetch_bonus()
dml_data.data.head()

# %%
# Specify learner and estimate causal parameter: PLR model with random forest as learner
# --------------------------------------------------------------------------------------

# Set machine learning methods for m & g
ml_g = RandomForestRegressor()
ml_m = RandomForestRegressor()
n_folds = 2
n_rep = 10

np.random.seed(3141)
dml_plr_rf = dml.DoubleMLPLR(dml_data, ml_g, ml_m, n_folds, n_rep,
                             'partialling out', 'dml2')

# set some hyperparameters for the learners
# ('tg' is the treatment variable of the bonus data)
pars = {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 5}

dml_plr_rf.set_ml_nuisance_params('ml_g', 'tg', pars)
dml_plr_rf.set_ml_nuisance_params('ml_m', 'tg', pars)

# %%
#

dml_plr_rf.fit()
dml_plr_rf.summary

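# %%
# Inference after fitting (an illustrative addition, not part of the original
# example): the multiplier bootstrap and joint confidence intervals below use
# the DoubleML ``bootstrap``/``confint`` API; the method and the number of
# bootstrap repetitions are arbitrary choices.

dml_plr_rf.bootstrap(method='normal', n_rep_boot=1000)
dml_plr_rf.confint(joint=True)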
Example #10
def dml_plr_no_cross_fit_tune_fixture(generate_data1, learner, score,
                                      tune_on_folds):
    par_grid = {
        'ml_g': {
            'alpha': np.linspace(0.05, .95, 7)
        },
        'ml_m': {
            'alpha': np.linspace(0.05, .95, 7)
        }
    }
    n_folds_tune = 3

    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = Lasso()

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds=2,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)

    # tune hyperparameters
    _ = dml_plr_obj.tune(par_grid,
                         tune_on_folds=tune_on_folds,
                         n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    np.random.seed(3141)
    y = obj_dml_data.y
    x = obj_dml_data.x
    d = obj_dml_data.d
    n_obs = len(y)

    all_smpls = draw_smpls(n_obs, 2)
    smpls = all_smpls[0]
    smpls = [smpls[0]]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, x, d, clone(ml_g),
                                               clone(ml_m), smpls,
                                               n_folds_tune, par_grid['ml_g'],
                                               par_grid['ml_m'])
    else:
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, x, d, clone(ml_g),
                                               clone(ml_m), xx, n_folds_tune,
                                               par_grid['ml_g'],
                                               par_grid['ml_m'])

    res_manual = fit_plr(y,
                         x,
                         d,
                         clone(ml_m),
                         clone(ml_g), [smpls],
                         dml_procedure,
                         score,
                         g_params=g_params,
                         m_params=m_params)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual['theta'],
        'se': dml_plr_obj.se,
        'se_manual': res_manual['se'],
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y,
                                           d,
                                           res_manual['thetas'],
                                           res_manual['ses'],
                                           res_manual['all_g_hat'],
                                           res_manual['all_m_hat'], [smpls],
                                           score,
                                           bootstrap,
                                           n_rep_boot,
                                           apply_cross_fitting=False)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #11
def dml_plr_fixture(generate_data2, learner_g, learner_m, score, dml_procedure,
                    tune_on_folds):
    par_grid = {
        'ml_g': get_par_grid(learner_g),
        'ml_m': get_par_grid(learner_m)
    }
    n_folds_tune = 4

    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    obj_dml_data = generate_data2

    # Set machine learning methods for m & g
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    # tune hyperparameters
    _ = dml_plr_obj.tune(par_grid,
                         tune_on_folds=tune_on_folds,
                         n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    np.random.seed(3141)
    y = obj_dml_data.y
    x = obj_dml_data.x
    d = obj_dml_data.d
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, x, d, clone(learner_m),
                                               clone(learner_g), smpls,
                                               n_folds_tune, par_grid['ml_g'],
                                               par_grid['ml_m'])

        g_hat, m_hat = fit_nuisance_plr(y, x, d, clone(learner_m),
                                        clone(learner_g), smpls, g_params,
                                        m_params)
    else:
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, x, d, clone(learner_m),
                                               clone(learner_g), xx,
                                               n_folds_tune, par_grid['ml_g'],
                                               par_grid['ml_m'])

        g_hat, m_hat = fit_nuisance_plr(y, x, d, clone(learner_m),
                                        clone(learner_g), smpls,
                                        g_params * n_folds, m_params * n_folds)

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, x, d, g_hat, m_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = plr_dml2(y, x, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual, bootstrap,
                                           n_rep_boot, dml_procedure)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #12
def dml_plr_ols_manual_fixture(generate_data1, score, dml_procedure):
    learner = LinearRegression()
    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 501

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    n = data.shape[0]
    this_smpl = list()
    xx = int(n / 2)
    this_smpl.append((np.arange(xx, n), np.arange(0, xx)))
    this_smpl.append((np.arange(0, xx), np.arange(xx, n)))
    smpls = [this_smpl]
    dml_plr_obj.set_sample_splitting(smpls)

    dml_plr_obj.fit()

    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values

    # add column of ones for intercept
    o = np.ones((n, 1))
    x = np.append(x, o, axis=1)

    smpls = dml_plr_obj.smpls[0]

    g_hat = []
    for (train_index, test_index) in smpls:
        ols_est = scipy.linalg.lstsq(x[train_index], y[train_index])[0]
        g_hat.append(np.dot(x[test_index], ols_est))

    m_hat = []
    for (train_index, test_index) in smpls:
        ols_est = scipy.linalg.lstsq(x[train_index], d[train_index])[0]
        m_hat.append(np.dot(x[test_index], ols_est))

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, x, d, g_hat, m_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = plr_dml2(y, x, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, [res_manual], [se_manual],
                                           [g_hat], [m_hat], [smpls], score,
                                           bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
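
For reference, a minimal sketch of the pooled ('dml2'-style) aggregation behind helpers such as plr_dml2 for the 'partialling out' score, with g_hat approximating E[y|X] and m_hat approximating E[d|X] as in the example above; the function name and exact implementation here are illustrative, not the test suite's code.

import numpy as np


def plr_partialling_out_pooled(y, d, g_hat, m_hat, smpls):
    # assemble out-of-sample residuals in the original observation order
    u_hat = np.full(len(y), np.nan)   # outcome residuals y - g_hat
    v_hat = np.full(len(y), np.nan)   # treatment residuals d - m_hat
    for idx, (_, test_index) in enumerate(smpls):
        u_hat[test_index] = y[test_index] - g_hat[idx]
        v_hat[test_index] = d[test_index] - m_hat[idx]

    # method-of-moments estimate: residual-on-residual regression
    theta_hat = np.sum(v_hat * u_hat) / np.sum(v_hat * v_hat)
    # standard error from the score psi = v_hat * (u_hat - v_hat * theta_hat)
    psi = v_hat * (u_hat - v_hat * theta_hat)
    se_hat = np.sqrt(np.mean(psi ** 2) / np.mean(v_hat * v_hat) ** 2 / len(y))
    return theta_hat, se_hat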
Example #13
def dml_plr_fixture(generate_data1, learner, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    data = generate_data1
    x_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    x = data.loc[:, x_cols].values
    d = data['d'].values
    resampling = KFold(n_splits=n_folds,
                       shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    g_hat, m_hat = fit_nuisance_plr(y, x, d,
                                    clone(learner), clone(learner), smpls)

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, x, d,
                                         g_hat, m_hat,
                                         smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = plr_dml2(y, x, d,
                                         g_hat, m_hat,
                                         smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual,
                                           y, d,
                                           g_hat, m_hat,
                                           smpls, score,
                                           se_manual,
                                           bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
Example #14
def dml_plr_fixture(generate_data2, learner_g, learner_m, score, dml_procedure, tune_on_folds):
    par_grid = {'ml_g': get_par_grid(learner_g),
                'ml_m': get_par_grid(learner_m)}
    n_folds_tune = 4

    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    obj_dml_data = generate_data2

    # Set machine learning methods for m & g
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    # tune hyperparameters
    _ = dml_plr_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    np.random.seed(3141)
    y = obj_dml_data.y
    x = obj_dml_data.x
    d = obj_dml_data.d
    n_obs = len(y)
    all_smpls = draw_smpls(n_obs, n_folds)
    smpls = all_smpls[0]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, x, d,
                                               clone(learner_g), clone(learner_m), smpls, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
    else:
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, x, d,
                                               clone(learner_g), clone(learner_m), xx, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
        g_params = g_params * n_folds
        m_params = m_params * n_folds

    res_manual = fit_plr(y, x, d, clone(learner_g), clone(learner_m),
                         all_smpls, dml_procedure, score,
                         g_params=g_params, m_params=m_params)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual['theta'],
                'se': dml_plr_obj.se,
                'se_manual': res_manual['se'],
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'],
                                           res_manual['all_g_hat'], res_manual['all_m_hat'],
                                           all_smpls, score, bootstrap, n_rep_boot)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_no_cross_fit_tune_fixture(generate_data1, idx, learner, score,
                                      tune_on_folds):
    par_grid = {
        'ml_g': {
            'alpha': np.linspace(0.05, .95, 7)
        },
        'ml_m': {
            'alpha': np.linspace(0.05, .95, 7)
        }
    }
    n_folds_tune = 3

    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = Lasso()

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds=2,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)

    # tune hyperparameters
    res_tuning = dml_plr_obj.tune(par_grid,
                                  tune_on_folds=tune_on_folds,
                                  n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    np.random.seed(3141)
    y = obj_dml_data.y
    X = obj_dml_data.x
    d = obj_dml_data.d

    resampling = KFold(n_splits=2, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]
    smpls = [smpls[0]]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, X, d, clone(ml_m),
                                               clone(ml_g), smpls,
                                               n_folds_tune, par_grid['ml_g'],
                                               par_grid['ml_m'])

        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(ml_m), clone(ml_g),
                                        smpls, g_params, m_params)
    else:
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, X, d, clone(ml_m),
                                               clone(ml_g), xx, n_folds_tune,
                                               par_grid['ml_g'],
                                               par_grid['ml_m'])

        g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(ml_m), clone(ml_g),
                                        smpls, g_params, m_params)

    assert dml_procedure == 'dml1'
    res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual,
                                           y,
                                           d,
                                           g_hat,
                                           m_hat,
                                           smpls,
                                           score,
                                           se_manual,
                                           bootstrap,
                                           n_rep_boot,
                                           dml_procedure,
                                           apply_cross_fitting=False)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_no_cross_fit_fixture(generate_data1, idx, learner, score, n_folds):
    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data['d'].values
    if n_folds == 1:
        smpls = [(np.arange(len(y)), np.arange(len(y)))]
    else:
        resampling = KFold(n_splits=n_folds, shuffle=True)
        smpls = [(train, test) for train, test in resampling.split(X)]
        smpls = [smpls[0]]

    g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(learner), clone(learner),
                                    smpls)

    assert dml_procedure == 'dml1'
    res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual,
                                           y,
                                           d,
                                           g_hat,
                                           m_hat,
                                           smpls,
                                           score,
                                           se_manual,
                                           bootstrap,
                                           n_rep_boot,
                                           dml_procedure,
                                           apply_cross_fitting=False)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_multitreat_fixture(generate_data_bivariate, generate_data_toeplitz, idx, learner, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 483

    # collect data
    if idx < n_datasets:
        data = generate_data_bivariate[idx]
    else:
        data = generate_data_toeplitz[idx-n_datasets]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()
    d_cols = data.columns[data.columns.str.startswith('d')].tolist()
    
    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', d_cols, X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    dml_plr_obj.fit()
    
    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data.loc[:, d_cols].values
    resampling = KFold(n_splits=n_folds,
                       shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]
    
    n_d = d.shape[1]
    
    coef_manual = np.full(n_d, np.nan)
    se_manual = np.full(n_d, np.nan)
    
    all_g_hat = []
    all_m_hat = []
    
    for i_d in range(n_d):
        
        Xd = np.hstack((X, np.delete(d, i_d, axis=1)))
        
        g_hat, m_hat = fit_nuisance_plr(y, Xd, d[:, i_d],
                                        clone(learner), clone(learner), smpls)
        
        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)
        
        if dml_procedure == 'dml1':
            coef_manual[i_d], se_manual[i_d] = plr_dml1(y, Xd, d[:, i_d],
                                                        g_hat, m_hat,
                                                        smpls, score)
        elif dml_procedure == 'dml2':
            coef_manual[i_d], se_manual[i_d] = plr_dml2(y, Xd, d[:, i_d],
                                                        g_hat, m_hat,
                                                        smpls, score)
                   
    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': coef_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}
    
    
    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(coef_manual,
                                           y, d,
                                           all_g_hat, all_m_hat,
                                           smpls, score,
                                           se_manual,
                                           bootstrap, n_rep_boot,
                                           dml_procedure)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat
    
    return res_dict
def dml_plr_fixture(generate_data1, idx, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    alpha = 0.05
    learner = Lasso(alpha=alpha)
    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'])
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    dml_plr_obj.fit()

    np.random.seed(3141)
    learner = Lasso()
    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    dml_plr_obj_ext_set_par = dml.DoubleMLPLR(obj_dml_data,
                                              ml_g,
                                              ml_m,
                                              n_folds,
                                              score=score,
                                              dml_procedure=dml_procedure)
    dml_plr_obj_ext_set_par.set_ml_nuisance_params('ml_g', 'd',
                                                   {'alpha': alpha})
    dml_plr_obj_ext_set_par.set_ml_nuisance_params('ml_m', 'd',
                                                   {'alpha': alpha})
    dml_plr_obj_ext_set_par.fit()

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': dml_plr_obj_ext_set_par.coef,
        'se': dml_plr_obj.se,
        'se_manual': dml_plr_obj_ext_set_par.se,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(314122)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat

        np.random.seed(314122)
        dml_plr_obj_ext_set_par.bootstrap(method=bootstrap,
                                          n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap +
                 '_manual'] = dml_plr_obj_ext_set_par.boot_coef
        res_dict['boot_t_stat' + bootstrap +
                 '_manual'] = dml_plr_obj_ext_set_par.boot_t_stat

    return res_dict
def dml_plr_binary_classifier_fixture(learner, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = clone(learner)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(bonus_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    dml_plr_obj.fit()

    np.random.seed(3141)
    y = bonus_data.y
    x = bonus_data.x
    d = bonus_data.d
    resampling = KFold(n_splits=n_folds, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(x)]

    if is_classifier(ml_m):
        g_hat, m_hat = fit_nuisance_plr_classifier(y, x, d, clone(ml_m),
                                                   clone(ml_g), smpls)
    else:
        g_hat, m_hat = fit_nuisance_plr(y, x, d, clone(ml_m), clone(ml_g),
                                        smpls)

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, x, d, g_hat, m_hat, smpls, score)
    else:
        assert dml_procedure == 'dml2'
        res_manual, se_manual = plr_dml2(y, x, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual, bootstrap,
                                           n_rep_boot, dml_procedure)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict