def dml_plr_no_cross_fit_fixture(generate_data1, idx, learner, score, n_folds):
    """Fit a PLR model without cross-fitting and rebuild the results by hand.

    The model is estimated through ``dml.DoubleMLPLR`` with
    ``apply_cross_fitting=False`` and procedure ``'dml1'``. Afterwards the
    estimate, its standard error and the bootstrap draws are recomputed with
    the helpers ``fit_nuisance_plr``, ``plr_dml1`` and ``boot_plr`` on a
    single train/test split.

    Parameters
    ----------
    generate_data1 :
        Indexable collection of data sets. Each entry must provide
        ``columns``, ``loc`` and the columns ``'y'`` and ``'d'`` (presumably
        pandas DataFrames -- confirm against the data fixture).
    idx :
        Index of the data set to use.
    learner :
        Estimator that is cloned for both nuisance functions.
    score :
        Score handed to ``DoubleMLPLR`` and to the manual helpers.
    n_folds :
        Number of folds. With ``1`` every observation is used for training
        and for testing; otherwise only the first ``KFold`` split is kept.

    Returns
    -------
    dict
        ``'coef'`` / ``'se'`` from the package object, ``'coef_manual'`` /
        ``'se_manual'`` from the manual computation, ``'boot_methods'`` and,
        per bootstrap method, the bootstrap coefficients and t-statistics of
        both variants.
    """
    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # select the data set; covariates are the columns starting with 'X'
    df = generate_data1[idx]
    covariate_mask = df.columns.str.startswith('X')
    x_cols = df.columns[covariate_mask].tolist()

    # one learner copy per nuisance function
    learner_g = clone(learner)
    learner_m = clone(learner)

    # --- estimate via the package; the seed is set before construction ---
    np.random.seed(3141)
    dml_data = dml.DoubleMLData(df, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(dml_data, learner_g, learner_m, n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)
    dml_plr_obj.fit()

    # --- manual estimate; same seed again before the split is drawn ---
    np.random.seed(3141)
    y = df['y'].values
    X = df.loc[:, x_cols].values
    d = df['d'].values
    if n_folds != 1:
        folds = list(KFold(n_splits=n_folds, shuffle=True).split(X))
        # no cross-fitting: keep only the first train/test pair
        smpls = folds[:1]
    else:
        # single fold: train and test on the full sample
        smpls = [(np.arange(len(y)), np.arange(len(y)))]

    g_hat, m_hat = fit_nuisance_plr(y, X, d,
                                    clone(learner), clone(learner), smpls)

    assert dml_procedure == 'dml1'
    theta_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = dict(coef=dml_plr_obj.coef,
                    coef_manual=theta_manual,
                    se=dml_plr_obj.se,
                    se_manual=se_manual,
                    boot_methods=boot_methods)

    for method in boot_methods:
        # identical seed for the manual and the package bootstrap
        np.random.seed(3141)
        manual_coef, manual_t_stat = boot_plr(
            theta_manual, y, d, g_hat, m_hat, smpls, score, se_manual,
            method, n_rep_boot, dml_procedure,
            apply_cross_fitting=False)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=method, n_rep_boot=n_rep_boot)
        res_dict.update({
            'boot_coef' + method: dml_plr_obj.boot_coef,
            'boot_t_stat' + method: dml_plr_obj.boot_t_stat,
            'boot_coef' + method + '_manual': manual_coef,
            'boot_t_stat' + method + '_manual': manual_t_stat,
        })

    return res_dict
def dml_plr_no_cross_fit_tune_fixture(generate_data1, idx, learner, score,
                                      tune_on_folds):
    """Tune and fit a PLR model without cross-fitting; rebuild it by hand.

    A ``dml.DoubleMLPLR`` object with two folds,
    ``apply_cross_fitting=False`` and procedure ``'dml1'`` is tuned over a
    grid of ``alpha`` values and fitted. The same steps are then repeated
    with ``tune_nuisance_plr``, ``fit_nuisance_plr``, ``plr_dml1`` and
    ``boot_plr`` on the first ``KFold`` split.

    Parameters
    ----------
    generate_data1 :
        Indexable collection of data sets. Each entry must provide
        ``columns`` and is passed to ``dml.DoubleMLData`` with outcome
        ``'y'`` and treatment ``'d'`` (presumably pandas DataFrames --
        confirm against the data fixture).
    idx :
        Index of the data set to use.
    learner :
        Not used in this function; both nuisance learners are ``Lasso()``.
    score :
        Score handed to ``DoubleMLPLR`` and to the manual helpers.
    tune_on_folds :
        Passed to ``DoubleMLPLR.tune``. If true, the manual tuning runs on
        the train/test split; otherwise it runs on one sample containing all
        observations with an empty test part.

    Returns
    -------
    dict
        ``'coef'`` / ``'se'`` from the package object, ``'coef_manual'`` /
        ``'se_manual'`` from the manual computation, ``'boot_methods'`` and,
        per bootstrap method, the bootstrap coefficients and t-statistics of
        both variants.
    """
    par_grid = {
        'ml_g': {
            'alpha': np.linspace(0.05, .95, 7)
        },
        'ml_m': {
            'alpha': np.linspace(0.05, .95, 7)
        }
    }
    n_folds_tune = 3

    boot_methods = ['normal']
    n_rep_boot = 502
    dml_procedure = 'dml1'

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = Lasso()
    ml_m = Lasso()

    np.random.seed(3141)
    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds=2,
                                  score=score,
                                  dml_procedure=dml_procedure,
                                  apply_cross_fitting=False)

    # tune hyperparameters; the return value is not needed here
    dml_plr_obj.tune(par_grid,
                     tune_on_folds=tune_on_folds,
                     n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    # manual computation: reset the seed before the split is drawn
    np.random.seed(3141)
    y = obj_dml_data.y
    X = obj_dml_data.x
    d = obj_dml_data.d

    resampling = KFold(n_splits=2, shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]
    # no cross-fitting: keep only the first train/test pair
    smpls = [smpls[0]]

    # Only the sample used for tuning depends on tune_on_folds; the nuisance
    # fit always uses the train/test split.
    if tune_on_folds:
        tune_smpls = smpls
    else:
        tune_smpls = [(np.arange(len(y)), np.array([]))]

    g_params, m_params = tune_nuisance_plr(y, X, d, clone(ml_m), clone(ml_g),
                                           tune_smpls, n_folds_tune,
                                           par_grid['ml_g'],
                                           par_grid['ml_m'])

    g_hat, m_hat = fit_nuisance_plr(y, X, d, clone(ml_m), clone(ml_g),
                                    smpls, g_params, m_params)

    assert dml_procedure == 'dml1'
    res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        # identical seed for the manual and the package bootstrap
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual,
                                           y,
                                           d,
                                           g_hat,
                                           m_hat,
                                           smpls,
                                           score,
                                           se_manual,
                                           bootstrap,
                                           n_rep_boot,
                                           dml_procedure,
                                           apply_cross_fitting=False)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
def dml_plr_rep_no_cross_fit_fixture(generate_data1, idx, learner, score,
                                     n_rep):
    """Fit a repeated PLR model without cross-fitting; rebuild it by hand.

    ``dml.DoubleMLPLR`` is fitted with two folds, ``n_rep`` repetitions,
    procedure ``'dml1'`` and ``apply_cross_fitting=False``. The manual part
    draws one ``KFold`` split per repetition, keeps its first train/test
    pair, estimates every repetition with ``fit_nuisance_plr`` and
    ``plr_dml1`` and aggregates the repetitions with the median.

    Parameters
    ----------
    generate_data1 :
        Indexable collection of data sets. Each entry must provide
        ``columns``, ``loc`` and the columns ``'y'`` and ``'d'`` (presumably
        pandas DataFrames -- confirm against the data fixture).
    idx :
        Index of the data set to use.
    learner :
        Estimator that is cloned for both nuisance functions.
    score :
        Score handed to ``DoubleMLPLR`` and to the manual helpers.
    n_rep :
        Number of sample-splitting repetitions.

    Returns
    -------
    dict
        ``'coef'`` / ``'se'`` from the package object, ``'coef_manual'`` /
        ``'se_manual'`` from the manual aggregation, ``'boot_methods'`` and,
        per bootstrap method, the bootstrap coefficients and t-statistics of
        both variants (manual draws are stacked over repetitions with
        ``np.hstack``).
    """
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 498
    dml_procedure = 'dml1'

    # select the data set; covariates are the columns starting with 'X'
    df = generate_data1[idx]
    covariate_mask = df.columns.str.startswith('X')
    x_cols = df.columns[covariate_mask].tolist()

    # one learner copy per nuisance function
    learner_g = clone(learner)
    learner_m = clone(learner)

    # --- estimate via the package ---
    np.random.seed(3141)
    dml_data = dml.DoubleMLData(df, 'y', ['d'], x_cols)
    dml_plr_obj = dml.DoubleMLPLR(dml_data, learner_g, learner_m,
                                  n_folds, n_rep, score, dml_procedure,
                                  apply_cross_fitting=False)
    dml_plr_obj.fit()

    # --- manual estimate; same seed again before the splits are drawn ---
    np.random.seed(3141)
    y = df['y'].values
    X = df.loc[:, x_cols].values
    d = df['d'].values

    # one shuffled split per repetition
    full_splits = []
    for _ in range(n_rep):
        splitter = KFold(n_splits=n_folds, shuffle=True)
        smpls = list(splitter.split(X))
        full_splits.append(smpls)

    # no cross-fitting: only the first train/test pair of each repetition
    all_smpls = [[rep_split[0]] for rep_split in full_splits]

    thetas = np.zeros(n_rep)
    ses = np.zeros(n_rep)
    all_g_hat = []
    all_m_hat = []
    for i_rep, smpls in enumerate(all_smpls):
        g_hat, m_hat = fit_nuisance_plr(y, X, d,
                                        clone(learner), clone(learner),
                                        smpls)
        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)

        thetas[i_rep], ses[i_rep] = plr_dml1(y, X, d, g_hat, m_hat,
                                             smpls, score)

    # aggregate over repetitions: median estimate and median-based variance;
    # the test-set size is taken from the last repetition's split
    res_manual = np.median(thetas)
    n_test = len(smpls[0][1])
    scaled_var = (np.power(ses, 2) * n_test +
                  np.power(thetas - res_manual, 2))
    se_manual = np.sqrt(np.median(scaled_var) / n_test)

    res_dict = dict(coef=dml_plr_obj.coef,
                    coef_manual=res_manual,
                    se=dml_plr_obj.se,
                    se_manual=se_manual,
                    boot_methods=boot_methods)

    for method in boot_methods:
        # seed once, then bootstrap the repetitions in order
        np.random.seed(3141)
        rep_boot_theta = []
        rep_boot_t_stat = []
        for i_rep in range(n_rep):
            draws_theta, draws_t_stat = boot_plr(
                thetas[i_rep], y, d,
                all_g_hat[i_rep], all_m_hat[i_rep],
                all_smpls[i_rep], score, ses[i_rep],
                method, n_rep_boot, dml_procedure,
                apply_cross_fitting=False)
            rep_boot_theta.append(draws_theta)
            rep_boot_t_stat.append(draws_t_stat)

        manual_coef = np.hstack(rep_boot_theta)
        manual_t_stat = np.hstack(rep_boot_t_stat)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=method, n_rep_boot=n_rep_boot)
        res_dict.update({
            'boot_coef' + method: dml_plr_obj.boot_coef,
            'boot_t_stat' + method: dml_plr_obj.boot_t_stat,
            'boot_coef' + method + '_manual': manual_coef,
            'boot_t_stat' + method + '_manual': manual_t_stat,
        })

    return res_dict
def dml_plr_multitreat_fixture(generate_data_bivariate, generate_data_toeplitz, idx, learner, score, dml_procedure):
    """Fit a PLR model with several treatment columns; rebuild it by hand.

    ``dml.DoubleMLPLR`` is fitted with two folds on all columns whose name
    starts with ``'d'`` as treatments. The manual part estimates one
    treatment at a time with ``fit_nuisance_plr`` and ``plr_dml1`` /
    ``plr_dml2``, appending the remaining treatment columns to the
    covariates.

    Parameters
    ----------
    generate_data_bivariate, generate_data_toeplitz :
        Indexable collections of data sets. ``idx`` values below the
        module-level ``n_datasets`` select from the first collection, larger
        values select entry ``idx - n_datasets`` of the second.
    idx :
        Index of the data set to use.
    learner :
        Estimator that is cloned for both nuisance functions.
    score :
        Score handed to ``DoubleMLPLR`` and to the manual helpers.
    dml_procedure :
        ``'dml1'`` or ``'dml2'``. For any other value the manual
        coefficients and standard errors stay ``nan``.

    Returns
    -------
    dict
        ``'coef'`` / ``'se'`` from the package object, ``'coef_manual'`` /
        ``'se_manual'`` from the manual computation, ``'boot_methods'`` and,
        per bootstrap method, the bootstrap coefficients and t-statistics of
        both variants.
    """
    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 483

    # select the data set from one of the two collections
    data = (generate_data_bivariate[idx] if idx < n_datasets
            else generate_data_toeplitz[idx - n_datasets])
    col_names = data.columns
    X_cols = col_names[col_names.str.startswith('X')].tolist()
    d_cols = col_names[col_names.str.startswith('d')].tolist()

    # one learner copy per nuisance function
    learner_g = clone(learner)
    learner_m = clone(learner)

    # --- estimate via the package ---
    np.random.seed(3141)
    dml_data = dml.DoubleMLData(data, 'y', d_cols, X_cols)
    dml_plr_obj = dml.DoubleMLPLR(dml_data, learner_g, learner_m, n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)
    dml_plr_obj.fit()

    # --- manual estimate; same seed again before the split is drawn ---
    np.random.seed(3141)
    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data.loc[:, d_cols].values
    smpls = list(KFold(n_splits=n_folds, shuffle=True).split(X))

    n_treat = d.shape[1]
    coef_manual = np.full(n_treat, np.nan)
    se_manual = np.full(n_treat, np.nan)
    all_g_hat = []
    all_m_hat = []

    for j in range(n_treat):
        # the other treatment columns act as additional covariates
        other_treatments = np.delete(d, j, axis=1)
        Xd = np.hstack((X, other_treatments))
        d_j = d[:, j]

        g_hat, m_hat = fit_nuisance_plr(y, Xd, d_j,
                                        clone(learner), clone(learner),
                                        smpls)
        all_g_hat.append(g_hat)
        all_m_hat.append(m_hat)

        if dml_procedure in ('dml1', 'dml2'):
            estimate = plr_dml1 if dml_procedure == 'dml1' else plr_dml2
            coef_manual[j], se_manual[j] = estimate(y, Xd, d_j,
                                                    g_hat, m_hat,
                                                    smpls, score)

    res_dict = dict(coef=dml_plr_obj.coef,
                    coef_manual=coef_manual,
                    se=dml_plr_obj.se,
                    se_manual=se_manual,
                    boot_methods=boot_methods)

    for method in boot_methods:
        # identical seed for the manual and the package bootstrap
        np.random.seed(3141)
        manual_coef, manual_t_stat = boot_plr(coef_manual, y, d,
                                              all_g_hat, all_m_hat,
                                              smpls, score, se_manual,
                                              method, n_rep_boot,
                                              dml_procedure)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=method, n_rep_boot=n_rep_boot)
        res_dict.update({
            'boot_coef' + method: dml_plr_obj.boot_coef,
            'boot_t_stat' + method: dml_plr_obj.boot_t_stat,
            'boot_coef' + method + '_manual': manual_coef,
            'boot_t_stat' + method + '_manual': manual_t_stat,
        })

    return res_dict
def dml_plr_fixture(generate_data2, idx, learner_g, learner_m, score, dml_procedure, tune_on_folds):
    """Tune and fit a cross-fitted PLR model; rebuild the results by hand.

    ``dml.DoubleMLPLR`` is created with two folds, tuned over the grids
    returned by ``get_par_grid`` and fitted. The same steps are then
    repeated with ``tune_nuisance_plr``, ``fit_nuisance_plr``,
    ``plr_dml1`` / ``plr_dml2`` and ``boot_plr`` on a ``KFold`` split drawn
    under the same seed.

    Parameters
    ----------
    generate_data2 :
        Indexable collection of data objects. The selected entry is passed
        directly to ``DoubleMLPLR`` and must expose ``y``, ``x`` and ``d``.
    idx :
        Index of the data object to use.
    learner_g, learner_m :
        Estimators for the two nuisance functions; cloned before use.
    score :
        Score handed to ``DoubleMLPLR`` and to the manual helpers.
    dml_procedure :
        ``'dml1'`` or ``'dml2'``; selects ``plr_dml1`` or ``plr_dml2``.
    tune_on_folds :
        Passed to ``DoubleMLPLR.tune``. If true, the manual tuning runs on
        the folds; otherwise it runs once on all observations and the tuned
        parameter lists are repeated ``n_folds`` times.

    Returns
    -------
    dict
        ``'coef'`` / ``'se'`` from the package object, ``'coef_manual'`` /
        ``'se_manual'`` from the manual computation, ``'boot_methods'`` and,
        per bootstrap method, the bootstrap coefficients and t-statistics of
        both variants.
    """
    par_grid = {'ml_g': get_par_grid(learner_g),
                'ml_m': get_par_grid(learner_m)}
    n_folds_tune = 4

    boot_methods = ['normal']
    n_folds = 2
    n_rep_boot = 502

    # collect data
    obj_dml_data = generate_data2[idx]

    # Set machine learning methods for m & g
    ml_g = clone(learner_g)
    ml_m = clone(learner_m)

    np.random.seed(3141)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g, ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    # tune hyperparameters; the return value is not needed here
    dml_plr_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune)

    # fit with tuned parameters
    dml_plr_obj.fit()

    # manual computation: reset the seed before the split is drawn
    np.random.seed(3141)
    y = obj_dml_data.y
    X = obj_dml_data.x
    d = obj_dml_data.d
    resampling = KFold(n_splits=n_folds,
                       shuffle=True)
    smpls = [(train, test) for train, test in resampling.split(X)]

    if tune_on_folds:
        g_params, m_params = tune_nuisance_plr(y, X, d,
                                               clone(learner_m), clone(learner_g), smpls, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
    else:
        # tune once on the full sample (empty test part) ...
        xx = [(np.arange(len(y)), np.array([]))]
        g_params, m_params = tune_nuisance_plr(y, X, d,
                                               clone(learner_m), clone(learner_g), xx, n_folds_tune,
                                               par_grid['ml_g'], par_grid['ml_m'])
        # ... and reuse the tuned parameters for every fold
        g_params = g_params * n_folds
        m_params = m_params * n_folds

    g_hat, m_hat = fit_nuisance_plr(y, X, d,
                                    clone(learner_m), clone(learner_g), smpls,
                                    g_params, m_params)

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, X, d,
                                         g_hat, m_hat,
                                         smpls, score)
    elif dml_procedure == 'dml2':
        res_manual, se_manual = plr_dml2(y, X, d,
                                         g_hat, m_hat,
                                         smpls, score)

    res_dict = {'coef': dml_plr_obj.coef,
                'coef_manual': res_manual,
                'se': dml_plr_obj.se,
                'se_manual': se_manual,
                'boot_methods': boot_methods}

    for bootstrap in boot_methods:
        # identical seed for the manual and the package bootstrap
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual,
                                           y, d,
                                           g_hat, m_hat,
                                           smpls, score,
                                           se_manual,
                                           bootstrap, n_rep_boot,
                                           dml_procedure)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict
# Example #6
# 0
def dml_plr_ols_manual_fixture(generate_data1, idx, score, dml_procedure):
    """Fit a PLR model with OLS learners and compare with least squares.

    ``dml.DoubleMLPLR`` is fitted with ``LinearRegression`` learners on a
    fixed two-fold split (first half / second half of the observations) set
    through ``set_sample_splitting``. The nuisance predictions are then
    recomputed with ``scipy.linalg.lstsq`` on the covariates plus a column
    of ones, and the estimate and bootstrap are rebuilt with ``plr_dml1`` /
    ``plr_dml2`` and ``boot_plr``.

    Parameters
    ----------
    generate_data1 :
        Indexable collection of data sets. Each entry must provide
        ``columns``, ``shape``, ``loc`` and the columns ``'y'`` and ``'d'``
        (presumably pandas DataFrames -- confirm against the data fixture).
    idx :
        Index of the data set to use.
    score :
        Score handed to ``DoubleMLPLR`` and to the manual helpers.
    dml_procedure :
        ``'dml1'`` or ``'dml2'``; selects ``plr_dml1`` or ``plr_dml2``.

    Returns
    -------
    dict
        ``'coef'`` / ``'se'`` from the package object, ``'coef_manual'`` /
        ``'se_manual'`` from the manual computation, ``'boot_methods'`` and,
        per bootstrap method (``'Bayes'``, ``'normal'``, ``'wild'``), the
        bootstrap coefficients and t-statistics of both variants.
    """
    learner = LinearRegression()
    boot_methods = ['Bayes', 'normal', 'wild']
    n_folds = 2
    n_rep_boot = 501

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    # deterministic split: each half of the sample is the test set once
    N = data.shape[0]
    half = N // 2
    first_half = (np.arange(half, N), np.arange(0, half))
    second_half = (np.arange(0, half), np.arange(half, N))
    all_smpls = [[first_half, second_half]]
    dml_plr_obj.set_sample_splitting(all_smpls)

    dml_plr_obj.fit()

    y = data['y'].values
    X = data.loc[:, X_cols].values
    d = data['d'].values

    # add column of ones for intercept
    o = np.ones((N, 1))
    X = np.append(X, o, axis=1)

    # folds of the single repetition as stored on the fitted object
    smpls = dml_plr_obj.smpls[0]

    # least-squares fit on the training part, prediction on the test part
    g_hat = [
        np.dot(X[test_index],
               scipy.linalg.lstsq(X[train_index], y[train_index])[0])
        for train_index, test_index in smpls
    ]
    m_hat = [
        np.dot(X[test_index],
               scipy.linalg.lstsq(X[train_index], d[train_index])[0])
        for train_index, test_index in smpls
    ]

    if dml_procedure == 'dml1':
        res_manual, se_manual = plr_dml1(y, X, d, g_hat, m_hat, smpls, score)
    elif dml_procedure == 'dml2':
        res_manual, se_manual = plr_dml2(y, X, d, g_hat, m_hat, smpls, score)

    res_dict = {
        'coef': dml_plr_obj.coef,
        'coef_manual': res_manual,
        'se': dml_plr_obj.se,
        'se_manual': se_manual,
        'boot_methods': boot_methods
    }

    for bootstrap in boot_methods:
        # identical seed for the manual and the package bootstrap
        np.random.seed(3141)
        boot_theta, boot_t_stat = boot_plr(res_manual, y, d, g_hat, m_hat,
                                           smpls, score, se_manual, bootstrap,
                                           n_rep_boot, dml_procedure)

        np.random.seed(3141)
        dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot)
        res_dict['boot_coef' + bootstrap] = dml_plr_obj.boot_coef
        res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat
        res_dict['boot_coef' + bootstrap + '_manual'] = boot_theta
        res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat

    return res_dict