def test8():
    """Check that reading and printing initialization files can be alternated.

    For each of ten randomly generated initialization files the file is read,
    the resulting dictionary is passed to ``print_dict`` and the file is read
    again. Both reads have to return equal dictionaries.
    """
    ini_path = 'test.grmpy.ini'

    for _ in range(10):
        generate_random_dict()

        first_read = read(ini_path)
        print_dict(first_read)
        second_read = read(ini_path)

        np.testing.assert_equal(first_read, second_read)
def test9():
    """Run generation, import, simulation and estimation under state differences.

    The constraints request a non-deterministic model with 1000 agents in which
    the treatment states may use different covariates (``STATE_DIFF``) and
    cost-benefit shifters may occur (``OVERLAP``).
    """
    ini_path = 'test.grmpy.ini'

    for _ in range(5):
        constr = {
            'DETERMINISTIC': False,
            'AGENT': 1000,
            'STATE_DIFF': True,
            'OVERLAP': True,
        }

        generate_random_dict(constr)
        read(ini_path)
        simulate(ini_path)
        estimate(ini_path)

    # NOTE(review): the original layout was lost; cleanup() is assumed to run
    # once after the loop rather than in every iteration - confirm.
    cleanup()
def simulate(init_file):
    """Simulate a user-specified version of the generalized Roy model.

    Parameters
    ----------
    init_file: str
        Path of the initialization file passed to ``read``.

    Returns
    -------
    df:
        The object returned by ``write_output`` for the simulated sample.
    """
    init_dict = read(init_file)

    # Fix the seed so that a given initialization file always yields the same draws.
    np.random.seed(init_dict['SIMULATION']['seed'])

    # Unobservables first, then the covariates of the outcome and cost equations.
    U, V = simulate_unobservables(init_dict)
    X = simulate_covariates(init_dict, 'TREATED')
    Z = simulate_covariates(init_dict, 'COST')

    # Endogenous variables and the resulting output file.
    Y, D, Y_1, Y_0 = simulate_outcomes(init_dict, X, Z, U)
    df = write_output(init_dict, Y, D, X, Z, Y_1, Y_0, U, V)

    # The criterion function is only evaluated for non-deterministic models.
    if init_dict['DETERMINISTIC'] is False:
        start = start_values(init_dict, df, 'init')
        init_dict['AUX']['criteria_value'] = calculate_criteria(
            init_dict, df, start)

    print_info(init_dict, df)

    return df
def test5():
    """Check the simulation when cov(U1, V) and cov(U0, V) are identical.

    With identical covariances ``mte_information`` has to return the same value
    for every quantile, i.e. the set of returned values has exactly one element.
    """
    ini_path = 'test.grmpy.ini'
    # Quantile grid: 0.01, 0.05, 0.10, ..., 0.95, 0.99.
    quantiles = [0.01, *np.arange(0.05, 1, 0.05), 0.99]

    for _ in range(10):
        generate_random_dict()
        init_dict = read(ini_path)

        # Impose identical covariances between the random components of the
        # potential outcomes and the random component determining choice.
        dist = init_dict['DIST']['all']
        dist[2] = dist[4]

        coeffs_untreated = init_dict['UNTREATED']['all']
        coeffs_treated = init_dict['TREATED']['all']
        cov = construct_covariance_matrix(init_dict)

        df = simulate(ini_path)
        covariates = df.filter(regex=r'^X\_', axis=1)

        mte = mte_information(coeffs_treated, coeffs_untreated, cov,
                              quantiles, covariates)

        # A single unique value means the marginal treatment effect is flat.
        np.testing.assert_equal(len(set(mte)), 1)
def test4():
    """Test the random init file generating process and the import process.

    A random initialization dictionary is generated, printed, read in again and
    the entries of the generated and the imported dictionary are compared.
    """
    for _ in range(10):
        gen_dict = generate_random_dict()
        init_file_name = gen_dict['SIMULATION']['source']
        print_dict(gen_dict, init_file_name)
        imp_dict = read(init_file_name + '.grmpy.ini')

        for key_ in ['TREATED', 'UNTREATED', 'COST', 'DIST']:
            np.testing.assert_array_almost_equal(
                gen_dict[key_]['coeff'], imp_dict[key_]['all'], decimal=4)

            # Only the coefficient sections carry type information.
            if key_ not in ['TREATED', 'UNTREATED', 'COST']:
                continue

            for pos, gen_type in enumerate(gen_dict[key_]['types']):
                if isinstance(gen_type, str):
                    if not gen_type == imp_dict[key_]['types'][pos]:
                        raise AssertionError()
                elif isinstance(gen_type, list):
                    # List entries hold a label followed by a numeric value.
                    if not gen_type[0] == imp_dict[key_]['types'][pos][0]:
                        raise AssertionError()
                    np.testing.assert_array_almost_equal(
                        gen_type[1], imp_dict[key_]['types'][pos][1], 4)

        for key_ in ['source', 'agents', 'seed']:
            if not gen_dict['SIMULATION'][key_] == imp_dict['SIMULATION'][key_]:
                raise AssertionError()
def create_data():
    """Create the mock data set used for the reliability exercise.

    The specification is read from ``reliability.grmpy.yml`` and the base data
    from ``aer-simulation-mock.pkl``. Unobservables, potential outcomes, the
    treatment indicator and the observed outcome are added as columns and the
    frame is written back to the same pickle file.

    Returns
    -------
    df: pandas.DataFrame
        The augmented data set.
    """
    init_dict = read("reliability.grmpy.yml")
    df = pd.read_pickle("aer-simulation-mock.pkl")

    estimation = init_dict["ESTIMATION"]
    indicator, dep = estimation["indicator"], estimation["dependent"]
    label_out = init_dict["TREATED"]["order"]
    label_choice = init_dict["CHOICE"]["order"]

    # Fix the seed so that the simulated unobservables are reproducible.
    np.random.seed(init_dict["SIMULATION"]["seed"])

    U = simulate_unobservables(init_dict)
    for column in ("U1", "U0", "V"):
        df[column] = U[column]

    # Potential outcomes in both treatment states.
    df[dep + "1"] = np.dot(df[label_out], init_dict["TREATED"]["params"]) + df["U1"]
    df[dep + "0"] = np.dot(df[label_out], init_dict["UNTREATED"]["params"]) + df["U0"]

    # Treatment is chosen whenever the choice index exceeds V.
    choice_index = np.dot(df[label_choice], init_dict["CHOICE"]["params"]) - df["V"]
    df[indicator] = np.array(choice_index > 0).astype(int)

    # Observed outcome: the potential outcome of the chosen state.
    df[dep] = df[indicator] * df[dep + "1"] + (1 - df[indicator]) * df[dep + "0"]

    df.to_pickle("aer-simulation-mock.pkl")

    return df
def test1():
    """Check the relationships in the simulated data sets.

    The checks run in a deterministic and in a non-deterministic setting: the
    potential outcomes must equal the linear index plus the unobservable, and
    the observed outcome must equal the potential outcome of the chosen state.
    """
    ini_path = 'test.grmpy.ini'
    constr = dict()

    # First the deterministic, then the non-deterministic setting.
    for deterministic in (True, False):
        constr['DETERMINISTIC'] = deterministic

        for _ in range(10):
            generate_random_dict(constr)
            df = simulate(ini_path)
            dict_ = read(ini_path)
            names = dict_['varnames']

            # The order entries are 1-based positions into the variable names.
            x_treated = df[[names[i - 1] for i in dict_['TREATED']['order']]]
            y_treated = pd.DataFrame.sum(
                dict_['TREATED']['all'] * x_treated, axis=1) + df.U1

            x_untreated = df[[names[i - 1] for i in dict_['UNTREATED']['order']]]
            y_untreated = pd.DataFrame.sum(
                dict_['UNTREATED']['all'] * x_untreated, axis=1) + df.U0

            np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
            np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)

            treated, untreated = df.D == 1, df.D == 0
            np.testing.assert_array_equal(df.Y[treated], df.Y1[treated])
            np.testing.assert_array_equal(df.Y[untreated], df.Y0[untreated])
def test5():
    """Check the simulation when cov(U1, V) and cov(U0, V) are identical.

    With identical covariances ``mte_information`` has to return the same value
    for every quantile, i.e. the set of returned values has exactly one element.
    """
    yml_path = "test.grmpy.yml"
    # Quantile grid: 0.01, 0.05, 0.10, ..., 0.95, 0.99.
    quantiles = [0.01, *np.arange(0.05, 1, 0.05), 0.99]

    for _ in range(10):
        generate_random_dict()
        init_dict = read(yml_path)

        # Impose identical covariances between the random components of the
        # potential outcomes and the random component determining choice.
        dist = init_dict["DIST"]["params"]
        dist[2] = dist[4]

        coeffs_untreated = init_dict["UNTREATED"]["params"]
        coeffs_treated = init_dict["TREATED"]["params"]
        cov = construct_covariance_matrix(init_dict)

        df = simulate(yml_path)

        # Union of the covariates of both outcome equations.
        labels = set(init_dict["TREATED"]["order"] + init_dict["UNTREATED"]["order"])
        x = df[list(labels)]

        mte = mte_information(coeffs_treated, coeffs_untreated, cov,
                              quantiles, x, init_dict)

        # A single unique value means the marginal treatment effect is flat.
        np.testing.assert_equal(len(set(mte)), 1)
def check_vault(num_tests=100):
    """Check the regression vault that is distributed as part of the package.

    Parameters
    ----------
    num_tests: int
        Number of vault entries to evaluate. If it exceeds the number of stored
        entries a message is printed and the complete vault is run; otherwise
        ``num_tests`` entries are drawn with ``np.random.choice``.

    Raises
    ------
    AssertionError
        If the recomputed criterion value or the data checksum of an entry
        deviates from the stored value.
    """
    fname = (os.path.dirname(grmpy.__file__) +
             "/test/resources/old_regression_vault.grmpy.json")

    # Context manager so that the file handle is closed right after loading.
    with open(fname) as vault_file:
        tests = json.load(vault_file)

    if num_tests > len(tests):
        print("The specified number of evaluations is larger than the number"
              " of entries in the regression_test vault.\n"
              "Therefore the test runs the complete test battery.")
    else:
        tests = [tests[i] for i in np.random.choice(len(tests), num_tests)]

    for test in tests:
        stat, dict_, criteria = test

        # Write the stored specification to disk and rerun the simulation.
        print_dict(dict_transformation(dict_))
        init_dict = read("test.grmpy.yml")
        df = simulate("test.grmpy.yml")

        _, X1, X0, Z1, Z0, Y1, Y0 = process_data(df, init_dict)
        x0 = start_values(init_dict, df, "init")
        criteria_ = calculate_criteria(init_dict, X1, X0, Z1, Z0, Y1, Y0, x0)

        np.testing.assert_almost_equal(criteria_, criteria)
        np.testing.assert_almost_equal(np.sum(df.sum()), stat)

    # NOTE(review): the original layout was lost; cleanup is assumed to run
    # once after all entries were checked - confirm.
    cleanup("regression")
def simulate(init_file):
    """Simulate a user-specified version of the generalized Roy model.

    Parameters
    ----------
    init_file: str
        Path of the initialization file passed to ``read``.

    Returns
    -------
    df:
        The object returned by ``write_output`` for the simulated sample.
    """
    init_dict = read(init_file)

    # Basic consistency checks regarding the user's request.
    check_initialization_dict(init_dict)

    # Fix the seed so that a given initialization file always yields the same draws.
    np.random.seed(init_dict['SIMULATION']['seed'])

    # Unobservables, observables and endogenous variables, in that order.
    U, V = simulate_unobservables(init_dict)
    X = simulate_covariates(init_dict)
    Y, D, Y_1, Y_0 = simulate_outcomes(init_dict, X, U, V)

    df = write_output(init_dict, Y, D, X, Y_1, Y_0, U, V)

    # The criterion function is only evaluated for non-deterministic models.
    if not init_dict['DETERMINISTIC']:
        start = start_values(init_dict, df, 'init')
        init_dict['AUX']['criteria_value'] = calculate_criteria(
            init_dict, df, start)

    print_info(init_dict, df)

    return df
def create_data():
    """Create the mock data set used for the reliability exercise.

    The specification is read from ``reliability.grmpy.ini`` and the base data
    from ``aer-simulation-mock.pkl``. Unobservables, potential outcomes, the
    treatment indicator and the observed outcome are added as columns and the
    frame is written back to the same pickle file.

    Returns
    -------
    df: pandas.DataFrame
        The augmented data set.
    """
    init_dict = read('reliability.grmpy.ini')
    df = pd.read_pickle('aer-simulation-mock.pkl')

    estimation = init_dict['ESTIMATION']
    indicator, dep = estimation['indicator'], estimation['dependent']

    # The order entries are 1-based positions into the variable names.
    names = init_dict['varnames']
    label_out = [names[j - 1] for j in init_dict['TREATED']['order']]
    label_choice = [names[j - 1] for j in init_dict['CHOICE']['order']]

    # Fix the seed so that the simulated unobservables are reproducible.
    np.random.seed(init_dict['SIMULATION']['seed'])

    U, V = simulate_unobservables(init_dict)
    df['U1'], df['U0'], df['V'] = U[:, 0], U[:, 1], V

    # Potential outcomes in both treatment states.
    df[dep + '1'] = np.dot(df[label_out], init_dict['TREATED']['all']) + df['U1']
    df[dep + '0'] = np.dot(df[label_out], init_dict['UNTREATED']['all']) + df['U0']

    # Treatment is chosen whenever the choice index exceeds V.
    choice_index = np.dot(df[label_choice], init_dict['CHOICE']['all']) - df['V']
    df[indicator] = np.array(choice_index > 0).astype(int)

    # Observed outcome: the potential outcome of the chosen state.
    df[dep] = df[indicator] * df[dep + '1'] + (1 - df[indicator]) * df[dep + '0']

    df.to_pickle('aer-simulation-mock.pkl')

    return df
def test1():
    """Check the relationships in the simulated data sets.

    The checks run in a deterministic and in a non-deterministic setting: the
    potential outcomes must equal the linear index plus the unobservable, and
    the observed outcome must equal the potential outcome of the chosen state.
    """
    constr = dict()

    for case in ["deterministic", "undeterministic"]:
        # Previously both branches set the flag to True, so the
        # non-deterministic setting announced in the docstring never ran.
        constr["DETERMINISTIC"] = case == "deterministic"

        for _ in range(10):
            generate_random_dict(constr)
            df = simulate("test.grmpy.yml")
            dict_ = read("test.grmpy.yml")

            x_treated = df[dict_["TREATED"]["order"]]
            y_treated = (pd.DataFrame.sum(
                dict_["TREATED"]["params"] * x_treated, axis=1) + df.U1)

            x_untreated = df[dict_["UNTREATED"]["order"]]
            y_untreated = (pd.DataFrame.sum(
                dict_["UNTREATED"]["params"] * x_untreated, axis=1) + df.U0)

            np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
            np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)
            np.testing.assert_array_equal(df.Y[df.D == 1], df.Y1[df.D == 1])
            np.testing.assert_array_equal(df.Y[df.D == 0], df.Y0[df.D == 0])
def fit(init_file, semipar=False):
    """Estimate the MTE parametrically or via local instrumental variables.

    Parameters
    ----------
    init_file: yaml
        Initialization file containing parameters for the estimation process.
    semipar: bool
        If truthy, the semiparametric LIV estimator (``semipar_fit``) is used,
        otherwise the parametric normal model (``par_fit``).

    Returns
    -------
    rslt: dict
        Result dictionary containing
        - quantiles
        - mte
        - mte_x
        - mte_u
        - mte_min
        - mte_max
        - X
        - b1
        - b0
    """
    dict_ = read(init_file, semipar)

    # Consistency checks shared by both estimators.
    check_presence_estimation_dataset(dict_)
    check_est_init_dict(dict_)

    if semipar:
        # Semiparametric LIV model.
        data = read_data(dict_["ESTIMATION"]["file"])
        dict_, data = check_append_constant(init_file, dict_, data, semipar=True)
        return semipar_fit(dict_, data)

    # Parametric normal model: needs extra checks before the data is loaded.
    check_par_init_file(dict_)
    data = read_data(dict_["ESTIMATION"]["file"])
    dict_, data = check_append_constant(init_file, dict_, data, semipar=False)
    return par_fit(dict_, data)
def par_fit(init_file):
    """Estimate the coefficients of the simulated data set.

    Parameters
    ----------
    init_file: str
        Path of the initialization file.

    Returns
    -------
    rslt:
        The object returned by ``adjust_output``.
    """
    check_presence_init(init_file)

    dict_ = read(init_file)
    np.random.seed(dict_["SIMULATION"]["seed"])

    # Basic consistency checks regarding the user's request.
    check_presence_estimation_dataset(dict_)

    estimation = dict_["ESTIMATION"]
    data = read_data(estimation["file"])

    # Cumulative covariate counts handed to the minimizing interface.
    num_treated = dict_["AUX"]["num_covars_treated"]
    num_untreated = num_treated + dict_["AUX"]["num_covars_untreated"]

    _, X1, X0, Z1, Z0, Y1, Y0 = process_data(data, dict_)

    # Without iterations the values of the initialization file are used.
    option = "init" if estimation["maxiter"] == 0 else estimation["start"]
    x0 = start_values(dict_, data, option)
    opts, method = optimizer_options(dict_)

    dict_["AUX"]["criteria"] = calculate_criteria(dict_, X1, X0, Z1, Z0, Y1, Y0, x0)
    dict_["AUX"]["starting_values"] = backward_transformation(x0)
    rslt_dict = bfgs_dict()

    if opts["maxiter"] == 0:
        rslt = adjust_output(None, dict_, x0, X1, X0, Z1, Z0, Y1, Y0, rslt_dict)
    else:
        extra_args = (dict_, X1, X0, Z1, Z0, Y1, Y0,
                      num_treated, num_untreated, rslt_dict)
        opt_rslt = minimize(
            minimizing_interface,
            x0,
            args=extra_args,
            method=method,
            options=opts,
        )
        rslt = adjust_output(opt_rslt, dict_, opt_rslt["x"],
                             X1, X0, Z1, Z0, Y1, Y0, rslt_dict)

    print_logfile(dict_, rslt)

    # The comparison file is skipped only if 'comparison' is present and 0.
    skip_comparison = ("comparison" in dict_["ESTIMATION"].keys()
                       and dict_["ESTIMATION"]["comparison"] == 0)
    if not skip_comparison:
        write_comparison(data, rslt)

    return rslt
def get_effect_grmpy(file):
    """Return the ATE of the data set.

    The effect is the dot product of the column means of the outcome covariates
    and the difference between the treated and untreated parameters.

    Parameters
    ----------
    file:
        Not used by this implementation; the specification is always read from
        ``reliability.grmpy.yml`` and the data from ``aer-simulation-mock.pkl``.

    Returns
    -------
    ATE:
        The average treatment effect.
    """
    dict_ = read("reliability.grmpy.yml")
    df = pd.read_pickle("aer-simulation-mock.pkl")

    beta_diff = dict_["TREATED"]["params"] - dict_["UNTREATED"]["params"]
    covars = dict_["TREATED"]["order"]

    # Column means are requested explicitly: np.mean(DataFrame) forwards
    # axis=None, which pandas >= 2.0 reduces over both axes to a single scalar.
    covar_means = df[covars].mean(axis=0)
    ATE = np.dot(covar_means, beta_diff)

    return ATE
def get_effect_grmpy(file):
    """Return the ATE of the data set.

    The effect is the dot product of the column means of the outcome covariates
    and the difference between the treated and untreated parameters.

    Parameters
    ----------
    file: str
        Path of the initialization file. The data set is read from the pickle
        file ``<SIMULATION source>.grmpy.pkl`` named in that file.

    Returns
    -------
    ATE:
        The average treatment effect.
    """
    dict_ = read(file)
    df = pd.read_pickle(dict_["SIMULATION"]["source"] + ".grmpy.pkl")

    beta_diff = dict_["TREATED"]["params"] - dict_["UNTREATED"]["params"]
    covars = dict_["TREATED"]["order"]

    # Column means are requested explicitly: np.mean(DataFrame) forwards
    # axis=None, which pandas >= 2.0 reduces over both axes to a single scalar.
    covar_means = df[covars].mean(axis=0)
    ATE = np.dot(covar_means, beta_diff)

    return ATE
def get_effect_grmpy(file):
    """Return the ATE of the data set.

    The effect is the dot product of the column means of the outcome covariates
    and the difference between the treated and untreated coefficients.

    Parameters
    ----------
    file:
        Not used by this implementation; the specification is always read from
        ``reliability.grmpy.ini`` and the data from ``aer-simulation-mock.pkl``.

    Returns
    -------
    ATE:
        The average treatment effect.
    """
    dict_ = read('reliability.grmpy.ini')
    df = pd.read_pickle('aer-simulation-mock.pkl')

    beta_diff = dict_['TREATED']['all'] - dict_['UNTREATED']['all']

    # The order entries are 1-based positions into the variable names.
    covars = [dict_['varnames'][j - 1] for j in dict_['TREATED']['order']]

    # Column means are requested explicitly: np.mean(DataFrame) forwards
    # axis=None, which pandas >= 2.0 reduces over both axes to a single scalar.
    covar_means = df[covars].mean(axis=0)
    ATE = np.dot(covar_means, beta_diff)

    return ATE
def test9():
    """Run generation, import, simulation and estimation under state differences.

    The constraints request a non-deterministic model with 1000 agents in which
    the treatment states may use different covariates (``STATE_DIFF``) and
    cost-benefit shifters may occur (``OVERLAP``).
    """
    yml_path = "test.grmpy.yml"

    for _ in range(5):
        constr = {
            "DETERMINISTIC": False,
            "AGENT": 1000,
            "STATE_DIFF": True,
            "OVERLAP": True,
        }

        generate_random_dict(constr)
        read(yml_path)
        simulate(yml_path)
        fit(yml_path)

    # NOTE(review): the original layout was lost; cleanup() is assumed to run
    # once after the loop rather than in every iteration - confirm.
    cleanup()
# refactor_results(dict_, file): re-reads the initialization file `file` and
# stores coefficient entries taken from `dict_` under the 'coeff' key of each
# section: for 'DIST' the last six entries of dict_['AUX']['x_internal'], for
# 'TREATED', 'UNTREATED' and 'COST' the list form of dict_[key]['all']. The
# resulting dictionary is handed to print_dict(pseudo, 'test').
# NOTE(review): the line breaks of this definition were lost, so it cannot be
# told from here whether `del pseudo[key]['all']` runs for every key or only in
# the else branch - confirm against the original layout before reformatting.
def refactor_results(dict_, file): pseudo = read(file) for key in ['TREATED', 'UNTREATED', 'COST', 'DIST']: if key == 'DIST': pseudo['DIST']['coeff'] = dict_['AUX']['x_internal'][-6:] else: pseudo[key]['coeff'] = dict_[key]['all'].tolist() del pseudo[key]['all'] print_dict(pseudo, 'test')
def _create_data(file):
    """Create the data set used in the Monte Carlo simulation.

    Unobservables, choice and output are simulated for each individual based on
    the grmpy initialization file. The data is returned as a pandas.DataFrame
    and written back to the pickle file it was loaded from.

    Parameters
    ----------
    file: yaml
        grmpy initialization file.

    Returns
    -------
    df: pandas.DataFrame
        DataFrame
    """
    init_dict = read(file)
    df = pd.read_pickle(init_dict["SIMULATION"]["source"] + ".grmpy.pkl")

    estimation = init_dict["ESTIMATION"]
    indicator, dep = estimation["indicator"], estimation["dependent"]
    label_out = init_dict["TREATED"]["order"]
    label_choice = init_dict["CHOICE"]["order"]

    # Fix the seed so that the simulated unobservables are reproducible.
    np.random.seed(init_dict["SIMULATION"]["seed"])

    U = simulate_unobservables(init_dict)
    for column in ("U1", "U0", "V"):
        df[column] = U[column]

    # Potential outcomes in both treatment states.
    df[dep + "1"] = np.dot(df[label_out], init_dict["TREATED"]["params"]) + df["U1"]
    df[dep + "0"] = np.dot(df[label_out], init_dict["UNTREATED"]["params"]) + df["U0"]

    # Treatment is chosen whenever the choice index exceeds V.
    choice_index = np.dot(df[label_choice], init_dict["CHOICE"]["params"]) - df["V"]
    df[indicator] = np.array(choice_index > 0).astype(int)

    # Observed outcome: the potential outcome of the chosen state.
    df[dep] = df[indicator] * df[dep + "1"] + (1 - df[indicator]) * df[dep + "0"]

    df.to_pickle(init_dict["SIMULATION"]["source"] + ".grmpy.pkl")

    return df
def test4():
    """Test the random init file generating process and the import process.

    A random initialization dictionary is generated, printed, read in again and
    the entries of the generated and the imported dictionary are compared.

    Raises
    ------
    AssertionError
        If any compared entry differs, if an order list contains duplicates or
        does not start with "X1", or if "X1" is not of type "nonbinary".
    """
    for _ in range(10):
        gen_dict = generate_random_dict()
        init_file_name = gen_dict["SIMULATION"]["source"]
        print_dict(gen_dict, init_file_name)
        imp_dict = read(init_file_name + ".grmpy.yml")
        dicts = [gen_dict, imp_dict]

        for section in ["TREATED", "UNTREATED", "CHOICE", "DIST"]:
            np.testing.assert_array_almost_equal(gen_dict[section]["params"],
                                                 imp_dict[section]["params"],
                                                 decimal=4)
            if section in ["TREATED", "UNTREATED", "CHOICE"]:
                # Compare the generated with the imported order. The previous
                # check compared each dictionary with itself and never failed.
                if not gen_dict[section]["order"] == imp_dict[section]["order"]:
                    raise AssertionError()

                for dict_ in dicts:
                    order = dict_[section]["order"]
                    # No covariate may show up twice within a section.
                    if len(order) != len(set(order)):
                        raise AssertionError()
                    if order[0] != "X1":
                        raise AssertionError()

        for variable in gen_dict["VARTYPES"]:
            if variable not in imp_dict["VARTYPES"]:
                raise AssertionError()
            if gen_dict["VARTYPES"][variable] != imp_dict["VARTYPES"][variable]:
                raise AssertionError()

        if gen_dict["VARTYPES"]["X1"] != "nonbinary":
            raise AssertionError()

        for subkey in ["source", "agents", "seed"]:
            if not gen_dict["SIMULATION"][subkey] == imp_dict["SIMULATION"][subkey]:
                raise AssertionError()

        for subkey in [
                "agents",
                "file",
                "optimizer",
                "start",
                "maxiter",
                "dependent",
                "indicator",
                "comparison",
                "output_file",
        ]:
            if not gen_dict["ESTIMATION"][subkey] == imp_dict["ESTIMATION"][subkey]:
                raise AssertionError()
def monte_carlo(file, grid_points):
    """Estimate the ATE for samples with different correlations of U1 and V.

    For every correlation on the grid a data set is created and the ATE is
    computed in three ways: the true effect from the simulated potential
    outcomes, the grmpy estimate and the OLS estimate.

    Parameters
    ----------
    file: str
        Initialization file whose specification is passed to
        ``update_correlation_structure``.
    grid_points: int
        Number of correlation values between 0.00 and 0.99.

    Returns
    -------
    effects: dict
        Lists of effects under the keys 'grmpy', 'ols' and 'true', one entry
        per grid point.
    """
    # One list of effects per estimation strategy.
    effects = {key_: [] for key_ in ['grmpy', 'ols', 'true']}

    # Loop over different correlations between V and U_1.
    for rho in np.linspace(0.00, 0.99, grid_points):

        # Readjust the initialization file values to add correlation.
        model_spec = read(file)
        sim_spec = read('reliability.grmpy.ini')
        # The order entries are 1-based positions into the variable names.
        X = [sim_spec['varnames'][j - 1] for j in sim_spec['TREATED']['order']]
        update_correlation_structure(model_spec, rho)

        # Simulate a data set and select endogenous and exogenous variables.
        df_mc = create_data()
        endog, exog, exog_ols = df_mc['wage'], df_mc[X], df_mc[['state'] + X]

        # True average treatment effect.
        ATE = np.mean(df_mc['wage1'] - df_mc['wage0'])
        effects['true'] += [ATE]

        # Estimate via grmpy. Column means are requested explicitly:
        # np.mean(DataFrame) forwards axis=None, which pandas >= 2.0 reduces
        # over both axes to a single scalar.
        rslt = estimate('reliability.grmpy.ini')
        beta_diff = rslt['TREATED']['all'] - rslt['UNTREATED']['all']
        stat = np.dot(exog.mean(axis=0), beta_diff)
        effects['grmpy'] += [stat]

        # Estimate via OLS; 'state' is the first regressor, so its coefficient
        # is the first entry. Positional access avoids the deprecated integer
        # fallback of Series.__getitem__.
        ols = sm.OLS(endog, exog_ols).fit()
        stat = ols.params.iloc[0]
        effects['ols'] += [stat]

    return effects
def monte_carlo(file, grid_points):
    """Estimate the ATE for samples with different correlations of U1 and V.

    For every correlation on the grid a data set is created and the ATE is
    computed in three ways: the true effect from the simulated potential
    outcomes, the grmpy estimate and the OLS estimate.

    Parameters
    ----------
    file: str
        Initialization file whose specification is passed to
        ``update_correlation_structure``.
    grid_points: int
        Number of correlation values between 0.00 and 0.99.

    Returns
    -------
    effects: dict
        Lists of effects under the keys "grmpy", "ols" and "true", one entry
        per grid point.
    """
    # One list of effects per estimation strategy.
    effects = {key_: [] for key_ in ["grmpy", "ols", "true"]}

    # Loop over different correlations between V and U_1.
    for rho in np.linspace(0.00, 0.99, grid_points):

        # Readjust the initialization file values to add correlation.
        model_spec = read(file)
        sim_spec = read("reliability.grmpy.yml")
        X = sim_spec["TREATED"]["order"]
        update_correlation_structure(model_spec, rho)

        # Simulate a data set and select endogenous and exogenous variables.
        df_mc = create_data()
        endog, exog, exog_ols = df_mc["wage"], df_mc[X], df_mc[["state"] + X]

        # True average treatment effect.
        ATE = np.mean(df_mc["wage1"] - df_mc["wage0"])
        effects["true"] += [ATE]

        # Estimate via grmpy. Column means are requested explicitly:
        # np.mean(DataFrame) forwards axis=None, which pandas >= 2.0 reduces
        # over both axes to a single scalar.
        rslt = fit("reliability.grmpy.yml")
        beta_diff = rslt["TREATED"]["params"] - rslt["UNTREATED"]["params"]
        stat = np.dot(exog.mean(axis=0), beta_diff)
        effects["grmpy"] += [stat]

        # Estimate via OLS; "state" is the first regressor, so its coefficient
        # is the first entry. Positional access avoids the deprecated integer
        # fallback of Series.__getitem__.
        ols = sm.OLS(endog, exog_ols).fit()
        stat = ols.params.iloc[0]
        effects["ols"] += [stat]

    return effects
def simulate_test_data():
    """Simulate the test data and read the matching initialization dictionary.

    Both are based on ``tutorial.grmpy.yml`` in the test resources directory
    and are passed through ``check_append_constant`` with ``semipar=True``.

    Returns
    -------
    dict_, data:
        The two objects returned by ``check_append_constant``.
    """
    tutorial_file = TEST_RESOURCES_DIR + "/tutorial.grmpy.yml"

    simulated = simulate(tutorial_file)
    spec = read(tutorial_file)

    dict_, data = check_append_constant(
        tutorial_file, spec, simulated, semipar=True
    )

    return dict_, data
def plot_est_mte(rslt, file):
    """Calculate and plot the marginal treatment effect over quantiles of V.

    The estimated curve and its confidence bands are drawn next to the original
    results stored in ``data/mte_original.json``.

    Parameters
    ----------
    rslt:
        Estimation results passed to ``calculate_mte`` and ``calculate_cof_int``.
    file: str
        Initialization file; the data set is read from the pickle file named in
        its ESTIMATION section.

    Returns
    -------
    mte: list
        The marginal treatment effect at each quantile, divided by four.
    """
    init_dict = read(file)
    data_frame = pd.read_pickle(init_dict['ESTIMATION']['file'])

    # Define the quantiles and read in the original results. The context
    # manager closes the file handle right after loading.
    quantiles = [0.0001] + np.arange(0.01, 1., 0.01).tolist() + [0.9999]
    with open('data/mte_original.json', 'r') as json_file:
        mte_ = json.load(json_file)
    mte_original = mte_[1]
    mte_original_d = mte_[0]
    mte_original_u = mte_[2]

    # Calculate the MTE and confidence intervals.
    mte = calculate_mte(rslt, init_dict, data_frame, quantiles)
    # NOTE(review): the reason for the rescaling by 4 is not visible here.
    mte = [i / 4 for i in mte]
    mte_up, mte_d = calculate_cof_int(rslt, init_dict, data_frame, mte, quantiles)

    # Plot both curves: grmpy in blue, original in orange.
    ax = plt.figure(figsize=(17.5, 10)).add_subplot(111)

    ax.set_ylabel(r"$B^{MTE}$", fontsize=24)
    ax.set_xlabel("$u_D$", fontsize=24)
    ax.tick_params(axis='both', which='major', labelsize=18)
    ax.plot(quantiles, mte, label='grmpy $B^{MTE}$', color='blue', linewidth=4)
    ax.plot(quantiles, mte_up, color='blue', linestyle=':', linewidth=3)
    ax.plot(quantiles, mte_d, color='blue', linestyle=':', linewidth=3)
    ax.plot(quantiles, mte_original, label='original$B^{MTE}$', color='orange',
            linewidth=4)
    ax.plot(quantiles, mte_original_d, color='orange', linestyle=':', linewidth=3)
    ax.plot(quantiles, mte_original_u, color='orange', linestyle=':', linewidth=3)
    ax.set_ylim([-0.41, 0.51])
    ax.set_xlim([-0.005, 1.005])

    # The legend labels follow the curve colours used above; they were swapped
    # before (blue was labelled "original", orange "grmpy").
    blue_patch = mpatches.Patch(color='blue', label='grmpy $B^{MTE}$')
    orange_patch = mpatches.Patch(color='orange', label='original $B^{MTE}$')
    plt.legend(handles=[blue_patch, orange_patch], prop={'size': 16})
    plt.show()

    return mte
def plot_common_support(init_file, nbins, fs=24, output=False):
    """Plot propensity score histograms of the treated and untreated population.

    Parameters
    ----------
    init_file: str
        Initialization file; the data set, the treatment indicator, the choice
        covariates and the logit flag are taken from it.
    nbins:
        Passed to ``plt.hist`` as ``bins``.
    fs: int
        Font size of the axis labels.
    output:
        If anything other than ``False``, the figure is saved under this name.
    """
    dict_ = read(init_file)
    estimation = dict_["ESTIMATION"]

    data = read_data(estimation["file"])

    # Inputs of the propensity score estimation.
    indicator = estimation["indicator"]
    D = data[indicator].values
    Z = data[dict_["CHOICE"]["order"]]
    logit = estimation["logit"]

    data["ps"] = estimate_treatment_propensity(D, Z, logit, show_output=False)

    # Propensity scores (second column) separately for both groups.
    by_group = data[[indicator, "ps"]]
    treated = by_group[data[indicator] == 1].values[:, 1].tolist()
    untreated = by_group[data[indicator] == 0].values[:, 1].tolist()

    # Weights scale each histogram so that its bars sum to one.
    group_weights = [
        np.ones(len(treated)) / len(treated),
        np.ones(len(untreated)) / len(untreated),
    ]

    fig = plt.figure(figsize=(17.5, 10))
    plt.hist(
        [treated, untreated],
        bins=nbins,
        weights=group_weights,
        density=0,
        alpha=0.55,
        label=["Treated", "Untreated"],
    )

    # Plot formatting.
    plt.tick_params(axis="both", labelsize=14)
    plt.legend(loc="upper right", prop={"size": 14})
    plt.xticks(np.arange(0, 1.1, step=0.1))
    plt.grid(axis="y", alpha=0.25)
    plt.xlabel("$P$", fontsize=fs)
    plt.ylabel("$f(P)$", fontsize=fs)

    if output is not False:
        plt.savefig(output, dpi=300)

    fig.show()
def plot_rslts(rslt, file):
    """Plot the replicated MTE next to the original results and save the figure.

    Parameters
    ----------
    rslt:
        Estimation results passed to ``mte_and_cof_int_par``.
    file: str
        Initialization file; the data set is read from the pickle file named in
        its ESTIMATION section.

    The original curves are read from ``resources/mte_original.json`` and the
    figure is written to ``OUTPUT_DIR``.
    """
    init_dict = read(file)
    data_frame = pd.read_pickle(init_dict["ESTIMATION"]["file"])

    # Read in the original results; the context manager closes the file handle
    # right after loading.
    with open("resources/mte_original.json") as json_file:
        mte_ = json.load(json_file)
    mte_original = mte_[1]
    mte_original_d = mte_[0]
    mte_original_u = mte_[2]

    # Calculate the MTE and confidence intervals.
    quantiles, mte, mte_up, mte_d = mte_and_cof_int_par(rslt, data_frame, 4)

    # Plot both curves: replication in blue, original in orange.
    ax = plt.figure().add_subplot(111)

    ax.set_ylabel(r"$MTE$")
    ax.set_xlabel("$u_D$")
    ax.tick_params(axis="both", which="major", labelsize=18)
    ax.plot(quantiles, mte, label="grmpy MTE", color="blue", linewidth=4)
    ax.plot(quantiles, mte_up, color="blue", linestyle=":", linewidth=3)
    ax.plot(quantiles, mte_d, color="blue", linestyle=":", linewidth=3)
    ax.plot(quantiles, mte_original, label="original${MTE}$", color="orange",
            linewidth=4)
    ax.plot(quantiles, mte_original_d, color="orange", linestyle=":", linewidth=3)
    ax.plot(quantiles, mte_original_u, color="orange", linestyle=":", linewidth=3)

    ax.xaxis.set_ticks(np.arange(0, 1.1, step=0.1))
    ax.yaxis.set_ticks(np.arange(-0.5, 0.5, step=0.1))
    ax.set_ylim([-0.37, 0.47])
    ax.set_xlim([0, 1])
    ax.margins(x=0.003)
    ax.margins(y=0.03)

    # The legend labels follow the curve colours used above; they were swapped
    # before (blue was labelled "original", orange "replicated").
    blue_patch = mpatches.Patch(color="blue", label="replicated $MTE$")
    orange_patch = mpatches.Patch(color="orange", label="original $MTE$")
    plt.legend(handles=[blue_patch, orange_patch], prop={"size": 16})

    plt.savefig(OUTPUT_DIR + "/fig-marginal-benefit-parametric-replication.png",
                dpi=300)
def estimate(init_file):
    """Estimate the coefficients of the simulated data set.

    Parameters
    ----------
    init_file: str
        Path of the initialization file.

    Returns
    -------
    rslt:
        The object returned by ``adjust_output``.
    """
    check_presence_init(init_file)

    dict_ = read(init_file)
    np.random.seed(dict_['SIMULATION']['seed'])

    # Basic consistency checks regarding the user's request.
    check_presence_estimation_dataset(dict_)
    check_initialization_dict(dict_)
    check_init_file(dict_)

    estimation = dict_['ESTIMATION']
    data_file = estimation['file']

    # Without iterations the values of the initialization file are used.
    option = 'init' if estimation['maxiter'] == 0 else estimation['start']

    data = read_data(data_file)

    x0 = start_values(dict_, data, option)
    opts, method = optimizer_options(dict_)
    dict_['AUX']['criteria'] = calculate_criteria(dict_, data, x0)
    dict_['AUX']['starting_values'] = backward_transformation(x0)
    rslt_dict = bfgs_dict()

    if opts['maxiter'] == 0:
        rslt = adjust_output(None, dict_, x0, data, rslt_dict)
    else:
        opt_rslt = minimize(minimizing_interface, x0,
                            args=(dict_, data, rslt_dict),
                            method=method, options=opts)
        rslt = adjust_output(opt_rslt, dict_, opt_rslt['x'], data, rslt_dict)

    print_logfile(dict_, rslt)

    # The comparison file is skipped only if 'comparison' is present and 0.
    skip_comparison = ('comparison' in dict_['ESTIMATION'].keys()
                       and dict_['ESTIMATION']['comparison'] == 0)
    if not skip_comparison:
        write_comparison(dict_, data, rslt)

    return rslt
def parametric_mte(rslt, file):
    """Calculate the marginal treatment effect over quantiles of V.

    Parameters
    ----------
    rslt:
        Estimation results passed to ``calculate_mte`` and ``calculate_cof_int``.
    file: str
        Initialization file; the data set is read from the pickle file named in
        its ESTIMATION section.

    Returns
    -------
    quantiles, mte, mte_up, mte_d:
        The quantile grid, the MTE on that grid and the two confidence bounds
        returned by ``calculate_cof_int``.
    """
    init_dict = read(file)
    data_frame = pd.read_pickle(init_dict["ESTIMATION"]["file"])

    # Grid 0.0001, 0.01, 0.02, ..., 0.99, 0.9999.
    inner_grid = np.arange(0.01, 1.0, 0.01).tolist()
    quantiles = [0.0001] + inner_grid + [0.9999]

    # MTE and confidence intervals on the grid.
    mte = calculate_mte(rslt, data_frame, quantiles)
    bounds = calculate_cof_int(rslt, init_dict, data_frame, mte, quantiles)
    mte_up, mte_d = bounds

    return quantiles, mte, mte_up, mte_d
def test2():
    """Check the model relationships when coefficients are set to zero.

    In each setup the coefficients of the named sections are replaced by zeros
    before the initialization file is printed and the model is simulated.
    """
    ini_path = 'test.grmpy.ini'
    setups = ['ALL', 'TREATED', 'UNTREATED', 'COST', 'TREATED & UNTREATED']

    for _ in range(10):
        for setup in setups:
            constr = constraints(probability=0.0)
            dict_ = generate_random_dict(constr)

            # Sections whose coefficients are set to zero in this setup.
            if setup == 'ALL':
                zeroed = ['TREATED', 'UNTREATED', 'COST']
            elif setup == 'TREATED & UNTREATED':
                zeroed = ['TREATED', 'UNTREATED']
            else:
                zeroed = [setup]
            for key_ in zeroed:
                dict_[key_]['coeff'] = np.array([0.] * len(dict_[key_]['coeff']))

            print_dict(dict_)
            dict_ = read(ini_path)
            df = simulate(ini_path)
            x = df.filter(regex=r'^X\_', axis=1)

            if setup == 'ALL':
                np.testing.assert_array_equal(df.Y1, df.U1)
                np.testing.assert_array_equal(df.Y0, df.U0)
            elif setup == 'TREATED & UNTREATED':
                np.testing.assert_array_equal(df.Y1, df.U1)
                np.testing.assert_array_equal(df.Y0, df.U0)
                np.testing.assert_array_equal(df.Y[df.D == 1], df.U1[df.D == 1])
                np.testing.assert_array_equal(df.Y[df.D == 0], df.U0[df.D == 0])
            elif setup == 'TREATED':
                y_untreated = pd.DataFrame.sum(
                    dict_['UNTREATED']['all'] * x, axis=1) + df.U0
                np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)
                np.testing.assert_array_equal(df.Y1, df.U1)
            elif setup == 'UNTREATED':
                y_treated = pd.DataFrame.sum(
                    dict_['TREATED']['all'] * x, axis=1) + df.U1
                np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
                np.testing.assert_array_equal(df.Y0, df.U0)
            else:
                y_treated = pd.DataFrame.sum(
                    dict_['TREATED']['all'] * x, axis=1) + df.U1
                y_untreated = pd.DataFrame.sum(
                    dict_['UNTREATED']['all'] * x, axis=1) + df.U0
                np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
                np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)

            # NOTE(review): the original layout was lost; the three checks below
            # are assumed to run for every setup rather than only inside the
            # final else branch - confirm.
            np.testing.assert_array_equal(df.Y[df.D == 1], df.Y1[df.D == 1])
            np.testing.assert_array_equal(df.Y[df.D == 0], df.Y0[df.D == 0])
            np.testing.assert_array_almost_equal(df.V, (df.UC - df.U1 + df.U0))