def monte_carlo(file, which, grid_points=10):
    """Estimate various effect parameters for an increasing presence of
    essential heterogeneity, reflected by an increasingly negative
    correlation between U_1 and V.

    Parameters
    ----------
    file: str
        Path to the grmpy initialization file (".grmpy.yml") that drives
        the simulation.
    which: str
        Selects what is computed per grid point. One of
        "conventional_average_effects", "randomization"/"random",
        "ordinary_least_squares"/"ols", "instrumental_variables"/"iv",
        "grmpy"/"grmpy-par", or "grmpy-semipar"/"grmpy-liv".
    grid_points: int, default 10
        Number of rho values on the interval [0, -0.99] at which the
        parameter is evaluated.

    Returns
    -------
    effects: list
        One entry per rho value: an (ATE, TT) tuple for
        "conventional_average_effects", otherwise a scalar ATE estimate.

    Raises
    ------
    NotImplementedError
        If *which* does not match any supported option.
    """
    # Simulate a new data set with essential heterogeneity present.
    # The original correlation is saved so the init file can be
    # restored after the loop.
    model_dict = read(file)
    original_correlation = model_dict["DIST"]["params"][2]
    model_dict["DIST"]["params"][2] = -0.191
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    effects = []

    # Loop over different correlations between V and U_1
    for rho in np.linspace(0.00, -0.99, grid_points):
        # Readjust the initialization file values to add correlation
        model_spec = read(file)
        X = model_spec["TREATED"]["order"]
        update_correlation_structure(file, model_spec, rho)
        sim_spec = read(file)

        # Simulate a data set and specify exogenous and endogenous variables
        df_mc = create_data(file)
        treated = df_mc["D"] == 1
        Xvar = df_mc[X]
        instr = [i for i in sim_spec["CHOICE"]["order"] if i != "const"]

        # We calculate our parameter of interest
        label = which.lower()
        if label == "conventional_average_effects":
            ATE = np.mean(df_mc["Y1"] - df_mc["Y0"])
            TT = np.mean(df_mc["Y1"].loc[treated] - df_mc["Y0"].loc[treated])
            stat = (ATE, TT)
        elif label in ["random", "randomization"]:
            # Naive difference in mean outcomes between treated and untreated
            stat = np.mean(df_mc[df_mc.D == 1]["Y"]) - np.mean(
                df_mc[df_mc.D == 0]["Y"]
            )
        elif label in ["ordinary_least_squares", "ols"]:
            results = sm.OLS(df_mc["Y"], df_mc[["const", "D"]]).fit()
            # Label-based access is robust to column reordering,
            # unlike the positional results.params[1].
            stat = results.params["D"]
        elif label in ["instrumental_variables", "iv"]:
            iv = IV2SLS(df_mc["Y"], Xvar, df_mc["D"], df_mc[instr]).fit()
            stat = iv.params["D"]
        elif label in ["grmpy", "grmpy-par"]:
            rslt = grmpy.fit(file)
            beta_diff = rslt["TREATED"]["params"] - rslt["UNTREATED"]["params"]
            stat = np.dot(np.mean(Xvar), beta_diff)
        elif label in ["grmpy-semipar", "grmpy-liv"]:
            rslt = grmpy.fit(file, semipar=True)
            y0_fitted = np.dot(rslt["X"], rslt["b0"])
            y1_fitted = np.dot(rslt["X"], rslt["b1"])
            mte_x_ = y1_fitted - y0_fitted
            mte_u = rslt["mte_u"]
            # Full MTE grid via broadcasting: mte_mat[i, j] = mte_x_[i] +
            # mte_u[j]. Replaces the former O(n*m) Python double loop.
            mte_mat = np.add.outer(mte_x_, mte_u)
            ate_tilde_p = np.mean(mte_mat, axis=1)
            stat = ate_tilde_p.mean()
        else:
            raise NotImplementedError(
                "Option '{}' is not supported.".format(which)
            )

        effects.append(stat)

    # Restore original init file
    model_dict = read(file)
    model_dict["DIST"]["params"][2] = original_correlation
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    return effects
def monte_carlo(file, which, grid_points=10):
    """
    This function conducts a Monte Carlo simulation to compare the true and
    estimated treatment parameters for increasing (absolute) correlation
    between U_1 and V (i.e. essential heterogeneity).
    In the example here, the correlation between U_1 and V becomes
    increasingly more negative. As we consider the absolute value of the
    correlation coefficient, values closer to -1 (or in the analogous case
    closer to +1) denote a higher degree of essential heterogeneity.

    The results of the Monte Carlo simulation can be used to evaluate the
    performance of different estimation strategies in the presence of
    essential heterogeneity.

    Depending on the specification of *which*, either the true ATE and TT,
    or an estimate of the ATE are returned.

    Options for *which*:

    Comparison of ATE and TT
    - "conventional_average_effects"

    Different estimation strategies for ATE
    - "randomization" ("random")
    - "ordinary_least_squares" ("ols")
    - "instrumental_variables" ("iv")
    - "grmpy_par" ("grmpy")
    - "grmpy_semipar" ("grmpy-liv")

    Post-estimation:
    To plot the comparison between the true ATE and the respective
    parameter, use the function
    - plot_effects() for *which* = "conventional_average_effects", and
    - plot_estimates() else.

    Parameters
    ----------
    file: yaml
        grmpy initialization file, provides information for the simulation
        process.
    which: string
        String denoting whether conventional average effects shall be
        computed or, alternatively, which estimation approach shall be
        implemented for the ATE.
    grid_points: int, default 10
        Number of different values for rho, the correlation coefficient
        between U_1 and V, on the interval [0, -1), along which the
        parameters shall be evaluated.

    Returns
    -------
    effects: list
        If *which* = "conventional_average_effects", list of length
        *grid_points* x 2 containing the true ATE and TT.
        Else, list of length *grid_points* x 1 containing an estimate
        of the ATE.

    Raises
    ------
    NotImplementedError
        If *which* does not match any supported option.
    """
    # Simulate a new data set with essential heterogeneity present.
    # The original correlation is saved so the init file can be
    # restored after the loop.
    model_dict = read(file)
    original_correlation = model_dict["DIST"]["params"][2]
    model_dict["DIST"]["params"][2] = -0.191
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    effects = []

    # Loop over different correlations between U_1 and V
    for rho in np.linspace(0.00, -0.99, grid_points):
        # Readjust the initialization file values to add correlation
        model_spec = read(file)
        X = model_spec["TREATED"]["order"]
        _update_correlation_structure(file, model_spec, rho)
        sim_spec = read(file)

        # Simulate a data set and specify exogenous and endogenous variables
        df_mc = _create_data(file)
        treated = df_mc["D"] == 1
        Xvar = df_mc[X]
        instr = [i for i in sim_spec["CHOICE"]["order"] if i != "const"]

        # We calculate our parameter of interest
        label = which.lower()
        if label == "conventional_average_effects":
            ATE = np.mean(df_mc["Y1"] - df_mc["Y0"])
            TT = np.mean(df_mc["Y1"].loc[treated] - df_mc["Y0"].loc[treated])
            stat = (ATE, TT)
        elif label in ["randomization", "random"]:
            # Naive difference in mean outcomes between treated and untreated
            stat = np.mean(df_mc[df_mc.D == 1]["Y"]) - np.mean(
                df_mc[df_mc.D == 0]["Y"]
            )
        elif label in ["ordinary_least_squares", "ols"]:
            results = sm.OLS(df_mc["Y"], df_mc[["const", "D"]]).fit()
            # Label-based access is robust to column reordering,
            # unlike the positional results.params[1].
            stat = results.params["D"]
        elif label in ["instrumental_variables", "iv"]:
            iv = IV2SLS(df_mc["Y"], Xvar, df_mc["D"], df_mc[instr]).fit()
            stat = iv.params["D"]
        elif label in ["grmpy", "grmpy-par"]:
            rslt = grmpy.fit(file)
            beta_diff = rslt["TREATED"]["params"] - rslt["UNTREATED"]["params"]
            stat = np.dot(np.mean(Xvar), beta_diff)
        elif label in ["grmpy-semipar", "grmpy-liv"]:
            rslt = grmpy.fit(file, semipar=True)
            y0_fitted = np.dot(rslt["X"], rslt["b0"])
            y1_fitted = np.dot(rslt["X"], rslt["b1"])
            mte_x_ = y1_fitted - y0_fitted
            mte_u = rslt["mte_u"]
            # Full MTE grid via broadcasting: mte_mat[i, j] = mte_x_[i] +
            # mte_u[j]. Replaces the former O(n*m) Python double loop.
            mte_mat = np.add.outer(mte_x_, mte_u)
            ate_tilde_p = np.mean(mte_mat, axis=1)
            stat = ate_tilde_p.mean()
        else:
            raise NotImplementedError(
                "Option '{}' is not supported.".format(which)
            )

        effects.append(stat)

    # Restore original init file
    model_dict = read(file)
    model_dict["DIST"]["params"][2] = original_correlation
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    return effects