def fit(init_file, semipar=False):
    """Estimate the MTE with the parametric or the semiparametric model.

    The parametric normal model is used by default. If *semipar* is truthy,
    the semiparametric method of local instrumental variables (LIV) is used
    instead.

    Parameters
    ----------
    init_file: yaml
        Initialization file containing parameters for the estimation process.
    semipar: bool, default is False
        Selects the estimation routine: ``semipar_fit`` if truthy,
        ``par_fit`` otherwise. It is also forwarded to ``read``.

    Returns
    ------
    rslt: dict
        Whatever the selected estimation routine returns. According to the
        original documentation the dictionary contains
        - quantiles
        - mte
        - mte_x
        - mte_u
        - mte_min
        - mte_max
        - X
        - b1
        - b0
    """
    init_dict = read(init_file, semipar)

    # Consistency checks shared by both estimation routines
    check_presence_estimation_dataset(init_dict)
    check_est_init_dict(init_dict)

    use_liv = bool(semipar)

    # The parametric model needs an extra check, run before the data is loaded
    if not use_liv:
        check_par_init_file(init_dict)

    # Distribute initialization information
    data = read_data(init_dict["ESTIMATION"]["file"])
    init_dict, data = check_append_constant(
        init_file, init_dict, data, semipar=use_liv
    )

    if use_liv:
        return semipar_fit(init_dict, data)
    return par_fit(init_dict, data)
def test12():
    """Check that ``read_data`` handles .pkl, .dta and .txt files.

    Every file is loaded and both its grand total and its column order are
    compared against the same fixed reference values.
    """
    expected_total = -3211.20122
    expected_columns = [
        "Y",
        "D",
        "X1",
        "X2",
        "X3",
        "X5",
        "X4",
        "Y1",
        "Y0",
        "U1",
        "U0",
        "V",
    ]

    for suffix in ("/data.grmpy.pkl", "/data.grmpy.dta", "/data.grmpy.txt"):
        frame = read_data(TEST_RESOURCES_DIR + suffix)
        total = np.sum(frame.sum())

        np.testing.assert_array_almost_equal(total, expected_total, decimal=5)
        np.testing.assert_equal(list(frame), expected_columns)
def par_fit(init_file):
    """Estimate the model coefficients by numerical optimization.

    The initialization file is read, the NumPy random seed is set from its
    ``SIMULATION`` section and the estimation data set is loaded. Unless the
    optimizer's ``maxiter`` option is 0, ``minimize`` is run on
    ``minimizing_interface``; otherwise the starting values are passed
    straight to ``adjust_output``.

    Side effects: a log file is written via ``print_logfile`` and, unless
    ``ESTIMATION.comparison`` is present and equal to 0, ``write_comparison``
    is called.

    Parameters
    ----------
    init_file:
        Initialization file handed to ``check_presence_init`` and ``read``.

    Returns
    -------
    rslt:
        The object returned by ``adjust_output``.
    """
    check_presence_init(init_file)
    dict_ = read(init_file)
    np.random.seed(dict_["SIMULATION"]["seed"])

    # Basic consistency check regarding the user's request
    check_presence_estimation_dataset(dict_)

    # Distribute initialization information
    data = read_data(dict_["ESTIMATION"]["file"])
    num_treated = dict_["AUX"]["num_covars_treated"]
    num_untreated = num_treated + dict_["AUX"]["num_covars_untreated"]
    _, X1, X0, Z1, Z0, Y1, Y0 = process_data(data, dict_)

    # With zero iterations requested the initial values are used directly
    option = (
        "init"
        if dict_["ESTIMATION"]["maxiter"] == 0
        else dict_["ESTIMATION"]["start"]
    )

    # Starting values and optimizer configuration
    x0 = start_values(dict_, data, option)
    opts, method = optimizer_options(dict_)
    dict_["AUX"]["criteria"] = calculate_criteria(dict_, X1, X0, Z1, Z0, Y1, Y0, x0)
    dict_["AUX"]["starting_values"] = backward_transformation(x0)
    rslt_dict = bfgs_dict()

    if opts["maxiter"] == 0:
        # No optimization: report the starting values
        opt_rslt, x_final = None, x0
    else:
        opt_rslt = minimize(
            minimizing_interface,
            x0,
            args=(
                dict_,
                X1,
                X0,
                Z1,
                Z0,
                Y1,
                Y0,
                num_treated,
                num_untreated,
                rslt_dict,
            ),
            method=method,
            options=opts,
        )
        x_final = opt_rslt["x"]

    rslt = adjust_output(opt_rslt, dict_, x_final, X1, X0, Z1, Z0, Y1, Y0, rslt_dict)

    # Print output files; the comparison is skipped only if explicitly set to 0
    print_logfile(dict_, rslt)
    if dict_["ESTIMATION"].get("comparison") != 0:
        write_comparison(data, rslt)

    return rslt
def fit(init_file, semipar=False):
    """Estimate the MTE with the parametric or the semiparametric model.

    The MTE is estimated based on a parametric normal model or,
    alternatively, via the semiparametric method of local instrumental
    variables (LIV).

    Parameters
    ----------
    init_file: yaml
        Initialization file containing parameters for the estimation process.
    semipar: bool, default is False
        If truthy, the semiparametric routine ``semipar_fit`` is used;
        otherwise the parametric routine ``par_fit``. The flag is also
        forwarded to ``read``.

    Returns
    -------
    rslt:
        The result object returned by the selected estimation routine.
    """
    # Load the estimation file
    check_presence_init(init_file)
    dict_ = read(init_file, semipar)

    # Perform some consistency checks given the user's request
    check_presence_estimation_dataset(dict_)
    check_est_init_dict(dict_)

    # Branch on truthiness rather than on identity with ``True``: ``read``
    # above already received the flag as given, and values such as
    # ``numpy.True_`` or ``1`` are truthy without being the ``True``
    # singleton, which previously sent them down the parametric branch.
    use_semipar = bool(semipar)

    # The parametric normal model requires some extra checks, which run
    # before the data is loaded.
    if not use_semipar:
        check_par_init_file(dict_)

    # Distribute initialization information.
    data = read_data(dict_["ESTIMATION"]["file"])
    dict_, data = check_append_constant(init_file, dict_, data, semipar=use_semipar)

    if use_semipar:
        # Semiparametric LIV model
        rslt = semipar_fit(dict_, data)
    else:
        # Parametric normal model
        rslt = par_fit(dict_, data)

    return rslt
def plot_common_support(init_file, nbins, fs=24, output=False):
    """Plot propensity-score histograms for the treated and untreated groups.

    The propensity score is estimated from the choice covariates listed in
    the initialization file and stored in a "ps" column of the loaded data.
    Both histograms are weighted so that the bar heights of each group sum
    to one, which allows assessing the common support.

    Parameters
    ----------
    init_file:
        Initialization file handed to ``read``.
    nbins:
        Passed as ``bins`` to ``plt.hist``.
    fs: default is 24
        Font size of the axis labels.
    output: default is False
        If anything other than ``False``, it is passed to ``plt.savefig``
        and the figure is saved with 300 dpi.
    """
    dict_ = read(init_file)

    # Distribute initialization information
    data = read_data(dict_["ESTIMATION"]["file"])

    # Inputs of the decision equation
    indicator = dict_["ESTIMATION"]["indicator"]
    D = data[indicator].values
    Z = data[dict_["CHOICE"]["order"]]
    logit = dict_["ESTIMATION"]["logit"]

    # Estimate the propensity score and attach it to the data
    data["ps"] = estimate_treatment_propensity(D, Z, logit, show_output=False)

    def _scores(status):
        """Return the propensity scores of all rows with the given status."""
        subset = data[[indicator, "ps"]][data[indicator] == status]
        return subset.values[:, 1].tolist()

    treated = _scores(1)
    untreated = _scores(0)

    # Draw both groups into one histogram (list of lists)
    fig = plt.figure(figsize=(17.5, 10))
    groups = [treated, untreated]
    plt.hist(
        groups,
        bins=nbins,
        weights=[np.ones(len(group)) / len(group) for group in groups],
        density=0,
        alpha=0.55,
        label=["Treated", "Untreated"],
    )

    # Plot formatting
    plt.tick_params(axis="both", labelsize=14)
    plt.legend(loc="upper right", prop={"size": 14})
    plt.xticks(np.arange(0, 1.1, step=0.1))
    plt.grid(axis="y", alpha=0.25)
    plt.xlabel("$P$", fontsize=fs)
    plt.ylabel("$f(P)$", fontsize=fs)

    if output is not False:
        plt.savefig(output, dpi=300)

    fig.show()
def estimate(init_file):
    """Estimate the model coefficients by numerical optimization.

    The initialization file is read and checked, the NumPy random seed is
    set from its ``SIMULATION`` section and the estimation data set is
    loaded. Unless the optimizer's ``maxiter`` option is 0, ``minimize`` is
    run on ``minimizing_interface``; otherwise the starting values are
    passed straight to ``adjust_output``.

    Side effects: a log file is written via ``print_logfile`` and, unless
    ``ESTIMATION.comparison`` is present and equal to 0, ``write_comparison``
    is called.

    Parameters
    ----------
    init_file:
        Initialization file handed to ``check_presence_init`` and ``read``.

    Returns
    -------
    rslt:
        The object returned by ``adjust_output``.
    """
    check_presence_init(init_file)
    dict_ = read(init_file)
    np.random.seed(dict_["SIMULATION"]["seed"])

    # Basic consistency checks regarding the user's request
    check_presence_estimation_dataset(dict_)
    check_initialization_dict(dict_)
    check_init_file(dict_)

    # Distribute initialization information
    data_file = dict_["ESTIMATION"]["file"]

    # With zero iterations requested the initial values are used directly
    option = (
        "init"
        if dict_["ESTIMATION"]["maxiter"] == 0
        else dict_["ESTIMATION"]["start"]
    )

    # Read data frame
    data = read_data(data_file)

    # Starting values and optimizer configuration
    x0 = start_values(dict_, data, option)
    opts, method = optimizer_options(dict_)
    dict_["AUX"]["criteria"] = calculate_criteria(dict_, data, x0)
    dict_["AUX"]["starting_values"] = backward_transformation(x0)
    rslt_dict = bfgs_dict()

    if opts["maxiter"] == 0:
        # No optimization: report the starting values
        opt_rslt, x_final = None, x0
    else:
        opt_rslt = minimize(
            minimizing_interface,
            x0,
            args=(dict_, data, rslt_dict),
            method=method,
            options=opts,
        )
        x_final = opt_rslt["x"]

    rslt = adjust_output(opt_rslt, dict_, x_final, data, rslt_dict)

    # Print output files; the comparison is skipped only if explicitly set to 0
    print_logfile(dict_, rslt)
    if dict_["ESTIMATION"].get("comparison") != 0:
        write_comparison(dict_, data, rslt)

    return rslt
def test13():
    """Check that the data import handles .txt, .dta and .pkl files.

    Each file is read with ``read_data``; the grand total of the frame and
    its column order are compared against fixed reference values. Afterwards
    ``cleanup`` is called.
    """
    pkl = TEST_RESOURCES_DIR + '/data.grmpy.pkl'
    dta = TEST_RESOURCES_DIR + '/data.grmpy.dta'
    txt = TEST_RESOURCES_DIR + '/data.grmpy.txt'

    real_sum = -3211.20122
    real_column_values = [
        'Y', 'D', 'X1', 'X2', 'X3', 'X5', 'X4', 'Y1', 'Y0', 'U1', 'U0', 'V'
    ]

    for data in [pkl, dta, txt]:
        df = read_data(data)
        # Named ``total`` so that the builtin ``sum`` is not shadowed
        total = np.sum(df.sum())
        columns = list(df)

        np.testing.assert_array_almost_equal(total, real_sum, decimal=5)
        np.testing.assert_equal(columns, real_column_values)

    cleanup()
def plot_mte(
    rslt,
    init_file,
    college_years=4,
    font_size=22,
    label_size=16,
    color="blue",
    semipar=False,
    nboot=250,
    save_plot=False,
):
    """Plot the marginal treatment effect with its confidence bands.

    This function calculates the marginal treatment effect for different
    quantiles u_D of the unobservables. Depending on the model
    specification, either the parametric or semiparametric MTE is plotted
    along with the corresponding 90 percent confidence bands.

    Parameters
    ----------
    rslt: dict
        Estimation result. It is updated in place with the keys "con_u"
        and "con_d" holding the confidence bands.
    init_file:
        Initialization file handed to ``read``.
    college_years: default is 4
        Forwarded to the routine that computes the MTE and its confidence
        bands.
    font_size, label_size, color:
        Forwarded to ``plot_curve``.
    semipar: bool, default is False
        If truthy, ``mte_and_cof_int_semipar`` is used; otherwise
        ``mte_and_cof_int_par``.
    nboot: default is 250
        Forwarded to ``mte_and_cof_int_semipar``; unused in the parametric
        case.
    save_plot: default is False
        Forwarded to ``plot_curve``.
    """
    # Read init dict and data
    dict_ = read(init_file, semipar)
    data = read_data(dict_["ESTIMATION"]["file"])
    dict_, data = check_append_constant(init_file, dict_, data, semipar)

    # Branch on truthiness rather than on identity with ``True``: the same
    # flag was already handed to ``read`` and ``check_append_constant``, and
    # truthy values such as ``numpy.True_`` or ``1`` are not the ``True``
    # singleton, which previously selected the parametric routine for them.
    if semipar:
        quantiles, mte, con_u, con_d = mte_and_cof_int_semipar(
            rslt, init_file, college_years, nboot
        )
    else:
        quantiles, mte, con_u, con_d = mte_and_cof_int_par(
            rslt, dict_, data, college_years
        )

    # Add confidence intervals to rslt dictionary
    rslt.update({"con_u": con_u, "con_d": con_d})

    plot_curve(mte, quantiles, con_u, con_d, font_size, label_size, color, save_plot)
def bootstrap(init_file, nboot):
    """Draw bootstrap estimates of the MTE.

    The estimation data is resampled with replacement and the
    semiparametric MTE is re-estimated on every accepted draw. A draw is
    accepted only if the "prop_score" column returned by the propensity
    score estimation is a ``pd.Series``; otherwise a new sample is drawn.

    Parameters
    ----------
    init_file: yaml
        Initialization file containing parameters for the estimation process.
    nboot: int
        Number of bootstrap iterations, i.e. number of times
        the MTE is computed via bootstrap.

    Returns
    -------
    mte_boot: np.ndarray
        Array of shape (gridsize, nboot); column *i* holds the MTE estimate
        of the *i*-th accepted bootstrap draw.
    """
    check_presence_init(init_file)
    dict_ = read(init_file, semipar=True)

    # Process the information specified in the initialization file
    bins, logit, bandwidth, gridsize, startgrid, endgrid = process_primary_inputs(
        dict_
    )
    # The show_output setting of the init file is discarded: output is
    # always suppressed while bootstrapping.
    trim, rbandwidth, reestimate_p, _ = process_secondary_inputs(dict_)
    show_output = False

    # One column per bootstrap draw
    mte_boot = np.zeros([gridsize, nboot])

    # Load the baseline data
    data = read_data(dict_["ESTIMATION"]["file"])

    n_done = 0
    while n_done < nboot:
        boot_data = resample(data, replace=True, n_samples=len(data), random_state=None)

        # Estimate propensity score P(z)
        boot_data = estimate_treatment_propensity(dict_, boot_data, logit, show_output)
        prop_score = boot_data["prop_score"]

        # Reject this draw and resample if no propensity score Series exists
        if not isinstance(prop_score, pd.Series):
            continue

        # Define common support and trim the data (if trim=True)
        X, Y, prop_score = trim_support(
            dict_, boot_data, logit, bins, trim, reestimate_p, show_output=False
        )

        b0, b1_b0 = double_residual_reg(X, Y, prop_score)

        # Observed and unobserved components of the MTE
        mte_x = mte_observed(X, b1_b0)
        mte_u = mte_unobserved_semipar(
            X, Y, b0, b1_b0, prop_score, bandwidth, gridsize, startgrid, endgrid
        )

        # Put the MTE together
        mte_boot[:, n_done] = mte_x.mean(axis=0) + mte_u
        n_done += 1

    return mte_boot
def semipar_fit(init_file):
    """Estimate the MTE components via Local Instrumental Variables (LIV).

    Parameters
    ----------
    init_file:
        Initialization file handed to ``check_presence_init`` and ``read``.

    Returns
    -------
    quantiles, mte_u:
        The two outputs of ``locpoly``; ``mte_u`` is the unobserved
        component of the MTE.
    X:
        The covariates listed under ``TREATED.order``, sorted by "ps".
    b1_b0:
        Second output of ``double_residual_reg``.

    The component of the MTE that depends on X is not added here; X and
    b1_b0 are returned alongside ``mte_u`` instead.
    """
    check_presence_init(init_file)
    dict_ = read(init_file)

    check_presence_estimation_dataset(dict_)
    check_initialization_dict(dict_)

    # Distribute initialization information
    data = read_data(dict_["ESTIMATION"]["file"])

    # Process data and settings for the semiparametric estimation
    est = dict_["ESTIMATION"]
    indicator = est["indicator"]
    D = data[indicator].values
    Z = data[dict_["CHOICE"]["order"]]

    nbins = est["nbins"]
    trim = est["trim_support"]
    reestimate = est["reestimate_p"]
    rbandwidth = est["rbandwidth"]
    bandwidth = est["bandwidth"]
    gridsize = est["gridsize"]
    grid_start = est["ps_range"][0]
    grid_end = est["ps_range"][1]

    logit = est["logit"]
    show_output = est["show_output"]

    # 1. Estimate propensity score P(z)
    ps = estimate_treatment_propensity(D, Z, logit, show_output)

    # 2a. Find common support (only the support itself is used below)
    _, _, common_support = define_common_support(
        ps, indicator, data, nbins, show_output
    )

    # 2b. Trim the data
    if trim is True:
        data, ps = trim_data(ps, common_support, data)

    # 2c. Re-estimate baseline propensity score on the trimmed sample
    # NOTE(review): the frame is sorted by its "ps" column below, while the
    # re-estimated scores only replace the local array - confirm that both
    # stay aligned when reestimate_p is set.
    if reestimate is True:
        D = data[indicator].values
        Z = data[dict_["CHOICE"]["order"]]
        ps = estimate_treatment_propensity(D, Z, logit, show_output)

    # 3. Double residual regression on data sorted by ps
    data = data.sort_values(by="ps", ascending=True)
    ps = np.sort(ps)

    X = data[dict_["TREATED"]["order"]]
    Xp = construct_Xp(X, ps)
    Y = data[[dict_["ESTIMATION"]["dependent"]]]

    b0, b1_b0 = double_residual_reg(ps, X, Xp, Y, rbandwidth, show_output)

    # 4. Compute the unobserved part of Y from the array versions of X, Xp, Y
    Y_tilde = (
        np.array(Y).ravel() - np.dot(np.array(X), b0) - np.dot(np.array(Xp), b1_b0)
    )

    # 5. Estimate mte_u, the unobserved component of the MTE,
    # through a locally quadratic regression
    quantiles, mte_u = locpoly(
        ps, Y_tilde, 1, 2, bandwidth, gridsize, grid_start, grid_end
    )

    return quantiles, mte_u, X, b1_b0
def plot_mte(
    rslt,
    init_file,
    college_years=4,
    font_size=22,
    label_size=16,
    color="blue",
    semipar=False,
    nboot=250,
    save_plot=False,
):
    """Plot the marginal treatment effect with its confidence bands.

    This function calculates the marginal treatment effect for different
    quantiles u_D of the unobservables. Depending on the model
    specification, either the parametric or semiparametric MTE is plotted
    along with the corresponding 90 percent confidence bands.

    As a side effect, *rslt* is updated in place with the keys "con_u" and
    "con_d" holding the confidence bands.

    Parameters
    ----------
    rslt: dict
        Result dictionary returned by grmpy.fit().
    init_file: yaml
        Initialization file containing parameters for the estimation process.
    college_years: int, default is 4
        Average duration of college degree. The MTE plotted will thus
        refer to the returns per one year of college education.
    font_size: int, default is 22
        Font size of the MTE graph.
    label_size: int, default is 16
        Label size of the MTE graph.
    color: str, default is "blue"
        Color of the MTE curve.
    semipar: bool, default is False
        Option to indicate the semiparametric estimation. If semipar is
        falsy, the parametric normal model is assumed and confidence
        intervals are computed analytically. Otherwise, confidence bands
        are bootstrapped.
    nboot: int, default is 250
        Only relevant for semiparametric estimation. Number of bootstrap
        iterations used to compute confidence intervals.
    save_plot: bool or str or PathLike or file-like object, default is False
        If False, the resulting plot is shown but not saved.
        If True, the MTE plot is saved as 'MTE_plot.png'.
        Else, if a str or PathLike or file-like object is specified,
        the plot is saved according to *save_plot*.
        The output format is inferred from the extension ('png', 'pdf',
        'svg', etc.). By default, '.png' is assumed.
    """
    # Read init dict and data
    dict_ = read(init_file, semipar)
    data = read_data(dict_["ESTIMATION"]["file"])
    dict_, data = check_append_constant(init_file, dict_, data, semipar)

    # Branch on truthiness rather than on identity with ``True``: the same
    # flag was already handed to ``read`` and ``check_append_constant``, and
    # truthy values such as ``numpy.True_`` or ``1`` are not the ``True``
    # singleton, which previously selected the parametric routine for them.
    if semipar:
        quantiles, mte, con_u, con_d = mte_and_cof_int_semipar(
            rslt, init_file, college_years, nboot
        )
    else:
        quantiles, mte, con_u, con_d = mte_and_cof_int_par(
            rslt, dict_, data, college_years
        )

    # Add confidence intervals to rslt dictionary
    rslt.update({"con_u": con_u, "con_d": con_d})

    plot_curve(mte, quantiles, con_u, con_d, font_size, label_size, color, save_plot)
def bootstrap(init_file, nbootstraps):
    """Draw bootstrap estimates of the MTE.

    The estimation data is resampled with replacement and the MTE is
    re-estimated on every accepted draw. A draw is accepted only if the
    estimated propensity score is an ``np.ndarray``; otherwise a new sample
    is drawn.

    Parameters
    ----------
    init_file:
        Initialization file handed to ``check_presence_init`` and ``read``.
    nbootstraps:
        Number of accepted bootstrap draws to collect.

    Returns
    -------
    mte_boot: np.ndarray
        Array of shape (gridsize, nbootstraps); column *i* holds the MTE
        estimate of the *i*-th accepted draw.
    """
    check_presence_init(init_file)
    dict_ = read(init_file, semipar=True)

    # Process the information specified in the initialization file
    nbins, logit, bandwidth, gridsize, grid_start, grid_end = process_user_input(
        dict_
    )
    trim, rbandwidth, reestimate_p = process_default_input(dict_)

    # Output is always suppressed while bootstrapping
    show_output = False

    # One column per bootstrap draw
    mte_boot = np.zeros([gridsize, nbootstraps])

    # Load the baseline data
    data = read_data(dict_["ESTIMATION"]["file"])

    n_done = 0
    while n_done < nbootstraps:
        boot_data = resample(data, replace=True, n_samples=len(data), random_state=None)

        # Process the inputs for the decision equation
        indicator, D, Z = process_choice_data(dict_, boot_data)

        # Estimate propensity score P(z)
        ps = estimate_treatment_propensity(D, Z, logit, show_output)

        # Reject this draw and resample if no propensity score array exists
        if not isinstance(ps, np.ndarray):
            continue

        # Define common support and trim the data, if trim=True
        boot_data, ps = trim_support(
            dict_,
            boot_data,
            logit,
            ps,
            indicator,
            nbins,
            trim,
            reestimate_p,
            show_output,
        )

        # Estimate the observed and unobserved component of the MTE
        X, b1_b0, b0, mte_u = mte_components(
            dict_,
            boot_data,
            ps,
            rbandwidth,
            bandwidth,
            gridsize,
            grid_start,
            grid_end,
            show_output,
        )

        # MTE component that depends on X, plus the unobserved component
        mte_boot[:, n_done] = np.dot(X, b1_b0).mean(axis=0) + mte_u
        n_done += 1

    return mte_boot
def bootstrap(init_file, nbootstraps, show_output=False):
    """Draw bootstrap estimates of the MTE via Local Instrumental Variables.

    The estimation data is resampled with replacement and the LIV steps are
    repeated on every accepted draw. A draw is accepted only if the
    estimated propensity score is an ``np.ndarray``; otherwise a new sample
    is drawn.

    Parameters
    ----------
    init_file:
        Initialization file handed to ``check_presence_init`` and ``read``.
    nbootstraps:
        Number of accepted bootstrap draws to collect.
    show_output: default is False
        Forwarded to the estimation helpers.

    Returns
    -------
    mte_boot: np.ndarray
        Array of shape (gridsize, nbootstraps); column *i* holds the MTE
        estimate of the *i*-th accepted draw.
    """
    check_presence_init(init_file)
    dict_ = read(init_file)

    est = dict_["ESTIMATION"]
    nbins = est["nbins"]
    trim = est["trim_support"]
    rbandwidth = est["rbandwidth"]
    bandwidth = est["bandwidth"]
    gridsize = est["gridsize"]
    grid_start = est["ps_range"][0]
    grid_end = est["ps_range"][1]
    logit = est["logit"]

    # Distribute initialization information
    data = read_data(dict_["ESTIMATION"]["file"])

    # One column per bootstrap draw
    mte_boot = np.zeros([gridsize, nbootstraps])

    n_done = 0
    while n_done < nbootstraps:
        boot = resample(data, replace=True, n_samples=len(data), random_state=None)

        # Process data for the semiparametric estimation
        indicator = dict_["ESTIMATION"]["indicator"]
        D = boot[indicator].values
        Z = boot[dict_["CHOICE"]["order"]]

        # 1. Estimate propensity score P(z)
        ps = estimate_treatment_propensity(D, Z, logit, show_output)

        # Reject this draw and resample if no propensity score array exists
        if not isinstance(ps, np.ndarray):
            continue

        # 2a. Find common support (only the support itself is used below)
        _, _, common_support = define_common_support(
            ps, indicator, boot, nbins, show_output
        )

        # 2b. Trim the data
        if trim is True:
            boot, ps = trim_data(ps, common_support, boot)

        # 3. Double residual regression on data sorted by ps
        boot = boot.sort_values(by="ps", ascending=True)
        ps = np.sort(ps)

        X = boot[dict_["TREATED"]["order"]]
        Xp = construct_Xp(X, ps)
        Y = boot[[dict_["ESTIMATION"]["dependent"]]]

        b0, b1_b0 = double_residual_reg(ps, X, Xp, Y, rbandwidth, show_output)

        # 4. Compute the unobserved part of Y from the array versions
        Y_tilde = (
            np.array(Y).ravel()
            - np.dot(np.array(X), b0)
            - np.dot(np.array(Xp), b1_b0)
        )

        # 5. Estimate mte_u, the unobserved component of the MTE,
        # through a locally quadratic regression
        _, mte_u = locpoly(
            ps, Y_tilde, 1, 2, bandwidth, gridsize, grid_start, grid_end
        )

        # 6. MTE component that depends on X, plus the unobserved component
        mte_boot[:, n_done] = np.dot(X, b1_b0).mean(axis=0) + mte_u
        n_done += 1

    return mte_boot