def _update_correlation_structure(file, model_dict, rho):
    """This function takes a valid model specification and updates the
    correlation structure among the unobservables. The information is saved
    to a new init file replacing the input file.

    Parameters
    ----------
    file: yaml
        grmpy initialization file.
    model_dict: dict
        grmpy initialization dictionary, the output of grmpy.read().
    rho: float
        The correlation coefficient between U_1 and V, which takes values
        on the interval [0, -1). Values closer to -1 denote a larger degree
        of essential heterogeneity in the sample.
    """
    # We first extract the baseline information from the model dictionary.
    sd_v = model_dict["DIST"]["params"][-1]
    sd_u1 = model_dict["DIST"]["params"][0]

    # Now we construct the implied covariance, which is relevant for the
    # initialization file.
    cov1v = rho * sd_v * sd_u1
    model_dict["DIST"]["params"][2] = cov1v

    # We print the specification of the covariance to a new init file, which
    # has the same name as the input file and replaces the original one.
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
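A minimal usage sketch for _update_correlation_structure(), assuming a valid grmpy initialization file; the file name "tutorial.grmpy.yml" is hypothetical, while grmpy.read() and grmpy.simulate() are taken from the surrounding snippets.

import grmpy

# Hypothetical init file; replace with an actual grmpy specification.
FILE = "tutorial.grmpy.yml"

model_dict = grmpy.read(FILE)
# Impose a moderate degree of essential heterogeneity and rewrite the file.
_update_correlation_structure(FILE, model_dict, rho=-0.5)
# The simulated sample now reflects the chosen correlation between U_1 and V.
df = grmpy.simulate(FILE)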
def test6():
    """In addition to test5, this test checks whether the comparison file
    provides the expected output when maxiter is set to zero and the
    estimation process uses the initialization file values as start values.
    """
    for _ in range(5):
        constr = dict()
        constr["DETERMINISTIC"], constr["MAXITER"], constr["AGENTS"] = False, 0, 15000
        constr["START"], constr["SAME_SIZE"] = "init", True
        dict_ = generate_random_dict(constr)
        dict_["DIST"]["params"][1], dict_["DIST"]["params"][5] = 0.0, 1.0
        print_dict(dict_)

        simulate("test.grmpy.yml")
        fit("test.grmpy.yml")
        dict_ = read_desc("comparison.grmpy.info")

        for section in ["ALL", "TREATED", "UNTREATED"]:
            np.testing.assert_equal(len(set(dict_[section]["Number"])), 1)
            np.testing.assert_almost_equal(
                dict_[section]["Observed Sample"],
                dict_[section]["Simulated Sample (finish)"],
                0.001,
            )
            np.testing.assert_array_almost_equal(
                dict_[section]["Simulated Sample (finish)"],
                dict_[section]["Simulated Sample (start)"],
                0.001,
            )
def check_vault(num_tests=100):
    """This function checks the complete regression vault that is
    distributed as part of the package.
    """
    fname = (
        os.path.dirname(grmpy.__file__)
        + "/test/resources/old_regression_vault.grmpy.json"
    )
    tests = json.load(open(fname))

    if num_tests > len(tests):
        print(
            "The specified number of evaluations is larger than the number"
            " of entries in the regression test vault.\n"
            "Therefore the test runs the complete test battery."
        )
    else:
        tests = [tests[i] for i in np.random.choice(len(tests), num_tests)]

    for test in tests:
        stat, dict_, criteria = test
        print_dict(dict_transformation(dict_))
        init_dict = read("test.grmpy.yml")
        df = simulate("test.grmpy.yml")
        _, X1, X0, Z1, Z0, Y1, Y0 = process_data(df, init_dict)
        x0 = start_values(init_dict, df, "init")
        criteria_ = calculate_criteria(init_dict, X1, X0, Z1, Z0, Y1, Y0, x0)
        np.testing.assert_almost_equal(criteria_, criteria)
        np.testing.assert_almost_equal(np.sum(df.sum()), stat)

    cleanup("regression")
def test4():
    """This test checks the random init file generating process and the
    import process. It generates a random init file, imports it again and
    compares the entries in both dictionaries.
    """
    for _ in range(10):
        gen_dict = generate_random_dict()
        init_file_name = gen_dict['SIMULATION']['source']
        print_dict(gen_dict, init_file_name)
        imp_dict = read(init_file_name + '.grmpy.ini')

        for key_ in ['TREATED', 'UNTREATED', 'COST', 'DIST']:
            np.testing.assert_array_almost_equal(
                gen_dict[key_]['coeff'], imp_dict[key_]['all'], decimal=4)
            if key_ in ['TREATED', 'UNTREATED', 'COST']:
                for i in range(len(gen_dict[key_]['types'])):
                    if isinstance(gen_dict[key_]['types'][i], str):
                        if not gen_dict[key_]['types'][i] == imp_dict[key_]['types'][i]:
                            raise AssertionError()
                    elif isinstance(gen_dict[key_]['types'][i], list):
                        if not gen_dict[key_]['types'][i][0] == imp_dict[key_]['types'][i][0]:
                            raise AssertionError()
                        np.testing.assert_array_almost_equal(
                            gen_dict[key_]['types'][i][1],
                            imp_dict[key_]['types'][i][1], 4)

        for key_ in ['source', 'agents', 'seed']:
            if not gen_dict['SIMULATION'][key_] == imp_dict['SIMULATION'][key_]:
                raise AssertionError()
def test1():
    """The test runs a loop to check the consistency of the random init file
    generating process and the following simulation.
    """
    for _ in range(10):
        dict_ = generate_random_dict()
        print_dict(dict_)
        simulate('test.grmpy.ini')
def test8():
    """We want to be able to smoothly switch between generating and printing
    random initialization files.
    """
    for _ in range(10):
        generate_random_dict()
        dict_1 = read('test.grmpy.ini')
        print_dict(dict_1)
        dict_2 = read('test.grmpy.ini')
        np.testing.assert_equal(dict_1, dict_2)
def refactor_results(dict_, file):
    """This function transfers estimation results into a valid
    initialization dictionary and prints it to a new init file."""
    pseudo = read(file)
    for key in ['TREATED', 'UNTREATED', 'COST', 'DIST']:
        if key == 'DIST':
            # The last six internal parameters characterize the distribution.
            pseudo['DIST']['coeff'] = dict_['AUX']['x_internal'][-6:]
        else:
            pseudo[key]['coeff'] = dict_[key]['all'].tolist()
            del pseudo[key]['all']
    print_dict(pseudo, 'test')
def test4():
    """This test checks the random init file generating process and the
    import process. It generates a random init file, imports it again and
    compares the entries in both dictionaries.
    """
    for _ in range(10):
        gen_dict = generate_random_dict()
        init_file_name = gen_dict["SIMULATION"]["source"]
        print_dict(gen_dict, init_file_name)
        imp_dict = read(init_file_name + ".grmpy.yml")
        dicts = [gen_dict, imp_dict]

        for section in ["TREATED", "UNTREATED", "CHOICE", "DIST"]:
            np.testing.assert_array_almost_equal(
                gen_dict[section]["params"], imp_dict[section]["params"], decimal=4
            )
            if section in ["TREATED", "UNTREATED", "CHOICE"]:
                # The variable order has to survive the print/read cycle.
                if not gen_dict[section]["order"] == imp_dict[section]["order"]:
                    raise AssertionError()
                for dict_ in dicts:
                    if len(dict_[section]["order"]) != len(set(dict_[section]["order"])):
                        raise AssertionError()
                    if dict_[section]["order"][0] != "X1":
                        raise AssertionError()

        for variable in gen_dict["VARTYPES"].keys():
            if variable not in imp_dict["VARTYPES"].keys():
                raise AssertionError()
            if gen_dict["VARTYPES"][variable] != imp_dict["VARTYPES"][variable]:
                raise AssertionError()
        if gen_dict["VARTYPES"]["X1"] != "nonbinary":
            raise AssertionError()

        for subkey in ["source", "agents", "seed"]:
            if not gen_dict["SIMULATION"][subkey] == imp_dict["SIMULATION"][subkey]:
                raise AssertionError()

        for subkey in [
            "agents",
            "file",
            "optimizer",
            "start",
            "maxiter",
            "dependent",
            "indicator",
            "comparison",
            "output_file",
        ]:
            if not gen_dict["ESTIMATION"][subkey] == imp_dict["ESTIMATION"][subkey]:
                raise AssertionError()
def update_correlation_structure(model_dict, rho):
    """This function takes a valid model specification and updates the
    correlation structure among the unobservables."""
    # We first extract the baseline information from the model dictionary.
    sd_v = model_dict["DIST"]["params"][-1]
    sd_u = model_dict["DIST"]["params"][0]

    # Now we construct the implied covariance, which is relevant for the
    # initialization file.
    cov = rho * sd_v * sd_u
    model_dict["DIST"]["params"][2] = cov

    # We print the modified specification to an initialization file named
    # "reliability".
    print_dict(model_dict, "reliability")
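The covariance construction above implies a 2x2 covariance matrix for (U_1, V) that must stay positive semi-definite, which it does for any |rho| < 1. A short self-contained check with hypothetical standard deviations:

import numpy as np

# Hypothetical standard deviations of U_1 and V.
sd_u, sd_v = 0.5, 1.0

for rho in np.linspace(0.0, -0.99, 5):
    cov = rho * sd_v * sd_u
    # Covariance matrix of (U_1, V) implied by the update above.
    sigma = np.array([[sd_u ** 2, cov], [cov, sd_v ** 2]])
    # All eigenvalues are non-negative as long as |rho| < 1.
    assert np.all(np.linalg.eigvalsh(sigma) >= 0)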
def check_vault():
    """This function checks the complete regression vault that is
    distributed as part of the package.
    """
    fname = os.path.dirname(
        grmpy.__file__) + '/test/resources/regression_vault.grmpy.json'
    tests = json.load(open(fname))

    for test in tests:
        stat, dict_, criteria = test
        print_dict(dict_)
        df = simulate('test.grmpy.ini')
        np.testing.assert_almost_equal(np.sum(df.sum()), stat)

    cleanup('regression')
def test2():
    """The second test checks whether the relationships hold if the
    coefficients are zero in different setups.
    """
    for _ in range(10):
        for i in ['ALL', 'TREATED', 'UNTREATED', 'COST', 'TREATED & UNTREATED']:
            constr = constraints(probability=0.0)
            dict_ = generate_random_dict(constr)

            if i == 'ALL':
                for key_ in ['TREATED', 'UNTREATED', 'COST']:
                    dict_[key_]['coeff'] = np.array([0.] * len(dict_[key_]['coeff']))
            elif i == 'TREATED & UNTREATED':
                for key_ in ['TREATED', 'UNTREATED']:
                    dict_[key_]['coeff'] = np.array([0.] * len(dict_[key_]['coeff']))
            else:
                dict_[i]['coeff'] = np.array([0.] * len(dict_[i]['coeff']))

            print_dict(dict_)

            dict_ = read('test.grmpy.ini')
            df = simulate('test.grmpy.ini')
            x = df.filter(regex=r'^X\_', axis=1)

            if i == 'ALL':
                np.testing.assert_array_equal(df.Y1, df.U1)
                np.testing.assert_array_equal(df.Y0, df.U0)
            elif i == 'TREATED & UNTREATED':
                np.testing.assert_array_equal(df.Y1, df.U1)
                np.testing.assert_array_equal(df.Y0, df.U0)
                np.testing.assert_array_equal(df.Y[df.D == 1], df.U1[df.D == 1])
                np.testing.assert_array_equal(df.Y[df.D == 0], df.U0[df.D == 0])
            elif i == 'TREATED':
                y_untreated = pd.DataFrame.sum(
                    dict_['UNTREATED']['all'] * x, axis=1) + df.U0
                np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)
                np.testing.assert_array_equal(df.Y1, df.U1)
            elif i == 'UNTREATED':
                y_treated = pd.DataFrame.sum(
                    dict_['TREATED']['all'] * x, axis=1) + df.U1
                np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
                np.testing.assert_array_equal(df.Y0, df.U0)
            else:
                y_treated = pd.DataFrame.sum(
                    dict_['TREATED']['all'] * x, axis=1) + df.U1
                y_untreated = pd.DataFrame.sum(
                    dict_['UNTREATED']['all'] * x, axis=1) + df.U0
                np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
                np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)

            np.testing.assert_array_equal(df.Y[df.D == 1], df.Y1[df.D == 1])
            np.testing.assert_array_equal(df.Y[df.D == 0], df.Y0[df.D == 0])
            np.testing.assert_array_almost_equal(df.V, (df.UC - df.U1 + df.U0))
def test1():
    """This function performs a simple test for the equality of the results
    of R's locpoly function and grmpy's locpoly function. For this purpose,
    the mock data set from Carneiro et al. (2011) is used.
    """
    init_dict = read(TEST_RESOURCES_DIR + "/replication_semipar.yml")
    init_dict["ESTIMATION"]["file"] = TEST_RESOURCES_DIR + "/aer-replication-mock.pkl"
    print_dict(init_dict, TEST_RESOURCES_DIR + "/replication_semipar")

    test_rslt = fit(TEST_RESOURCES_DIR + "/replication_semipar.grmpy.yml", semipar=True)
    expected_mte_u = pd.read_pickle(
        TEST_RESOURCES_DIR + "/replication-results-mte_u.pkl"
    )

    np.testing.assert_array_almost_equal(test_rslt["mte_u"], expected_mte_u, 6)
def test2():
    """This test runs a random selection of five regression tests from the
    package's regression test vault.
    """
    fname = os.path.dirname(
        grmpy.__file__) + '/test/resources/regression_vault.grmpy.json'
    tests = json.load(open(fname))

    for i in np.random.choice(range(len(tests)), size=5):
        stat, dict_, criteria = tests[i]
        print_dict(dict_)
        df = simulate('test.grmpy.ini')
        init_dict = read('test.grmpy.ini')
        start = start_values(init_dict, df, 'init')
        criteria_ = calculate_criteria(init_dict, df, start)
        np.testing.assert_array_almost_equal(criteria, criteria_)
        np.testing.assert_almost_equal(np.sum(df.sum()), stat)
def update_correlation_structure(model_dict, rho):
    """This function takes a valid model specification and updates the
    correlation structure among the unobservables."""
    # We first extract the baseline information from the model dictionary.
    sd_v = model_dict['DIST']['all'][-1]
    sd_u = model_dict['DIST']['all'][0]

    # Now we construct the implied covariance, which is relevant for the
    # initialization file.
    cov = rho * sd_v * sd_u
    model_dict['DIST']['all'][2] = cov

    # Before printing, we map the variable order back to variable names.
    for key_ in ['TREATED', 'UNTREATED', 'CHOICE']:
        x = [model_dict['varnames'][j - 1] for j in model_dict[key_]['order']]
        model_dict[key_]['order'] = x

    # We print the modified specification to an initialization file named
    # "reliability".
    print_dict(model_dict, 'reliability')
def test8():
    """This test checks whether a UserError occurs if wrong inputs are
    specified for different functions/methods.
    """
    constr = dict()
    constr["DETERMINISTIC"], constr["AGENTS"] = False, 1000
    generate_random_dict(constr)
    df = simulate("test.grmpy.yml")
    dict_ = read("test.grmpy.yml")
    a = list()
    dict_["ESTIMATION"]["file"] = "data.grmpy.yml"
    print_dict(dict_, "false_data")

    pytest.raises(UserError, fit, "tast.grmpy.yml")
    pytest.raises(UserError, fit, "false_data.grmpy.yml")
    pytest.raises(UserError, simulate, "tast.grmpy.yml")
    pytest.raises(UserError, read, "tast.grmpy.yml")
    pytest.raises(UserError, start_values, a, df, "init")
    pytest.raises(UserError, generate_random_dict, a)
def test8():
    """This test checks whether a UserError occurs if wrong inputs are
    specified for different functions/methods.
    """
    constr = dict()
    constr['DETERMINISTIC'], constr['AGENTS'] = False, 1000
    generate_random_dict(constr)
    df = simulate('test.grmpy.ini')
    a = []
    dict_ = read('test.grmpy.ini')
    dict_['ESTIMATION']['file'] = 'data.grmpy.ini'
    print_dict(dict_, 'false_data')

    pytest.raises(UserError, estimate, 'tast.grmpy.ini')
    pytest.raises(UserError, estimate, 'false_data.grmpy.ini')
    pytest.raises(UserError, simulate, 'tast.grmpy.ini')
    pytest.raises(UserError, read, 'tast.grmpy.ini')
    pytest.raises(UserError, start_values, a, df, 'init')
    pytest.raises(UserError, generate_random_dict, a)
def test2():
    """This test runs a random selection of five regression tests from the
    package's regression test vault.
    """
    fname = TEST_RESOURCES_DIR + '/regression_vault.grmpy.json'
    tests = json.load(open(fname))
    random_choice = np.random.choice(range(len(tests)), 5)
    tests = [tests[i] for i in random_choice]

    for test in tests:
        stat, dict_, criteria = test
        print_dict(dict_)
        df = simulate('test.grmpy.ini')
        init_dict = read('test.grmpy.ini')
        start = start_values(init_dict, df, 'init')
        criteria_ = calculate_criteria(init_dict, df, start)
        np.testing.assert_almost_equal(np.sum(df.sum()), stat)
        np.testing.assert_array_almost_equal(criteria, criteria_)
def test_replication_carneiro():
    """This function checks the equality of the results of R's locpoly
    function and grmpy's locpoly function. The mock data set from
    Carneiro et al. (2011) is used and both the mte_u and the final mte
    are compared.
    """
    init_dict = read(TEST_RESOURCES_DIR + "/replication_semipar.yml")
    init_dict["ESTIMATION"]["file"] = TEST_RESOURCES_DIR + "/aer-replication-mock.pkl"
    print_dict(init_dict, TEST_RESOURCES_DIR + "/replication_semipar")

    test_rslt = fit(TEST_RESOURCES_DIR + "/replication_semipar.grmpy.yml", semipar=True)
    expected_mte_u = pd.read_pickle(
        TEST_RESOURCES_DIR + "/replication-results-mte_u.pkl")
    expected_mte = pd.read_pickle(
        TEST_RESOURCES_DIR + "/replication-results-mte.pkl")

    np.testing.assert_array_almost_equal(test_rslt["mte_u"], expected_mte_u, 6)
    np.testing.assert_array_almost_equal(test_rslt["mte"], expected_mte, 6)
def test2():
    """This test runs a random selection of five regression tests from our
    old regression test vault.
    """
    fname = TEST_RESOURCES_DIR + "/old_regression_vault.grmpy.json"
    tests = json.load(open(fname))
    random_choice = np.random.choice(range(len(tests)), 5)
    tests = [tests[i] for i in random_choice]

    for test in tests:
        stat, dict_, criteria = test
        print_dict(dict_transformation(dict_))
        df = simulate("test.grmpy.yml")
        init_dict = read("test.grmpy.yml")
        start = start_values(init_dict, df, "init")
        _, X1, X0, Z1, Z0, Y1, Y0 = process_data(df, init_dict)
        criteria_ = calculate_criteria(init_dict, X1, X0, Z1, Z0, Y1, Y0, start)
        np.testing.assert_almost_equal(np.sum(df.sum()), stat)
        np.testing.assert_array_almost_equal(criteria, criteria_)
def update_tutorial(file, rho=None):
    """This function enables us to rewrite the grmpy tutorial file so that
    it corresponds to a parameterization with essential heterogeneity."""
    if rho is None:
        rho = []
        rho += [np.random.uniform(0.3, 0.7, 1)]
        rho += [np.random.uniform(-0.1, -0.345, 1)]

    init_dict = read(file)
    init_dict["SIMULATION"]["source"] = "data_eh"
    sd1 = init_dict["DIST"]["params"][0]
    sd0 = init_dict["DIST"]["params"][3]
    sdv = init_dict["DIST"]["params"][-1]
    init_dict["DIST"]["params"][2] = sd1 * sdv * rho[0]
    init_dict["DIST"]["params"][-2] = sd0 * sdv * rho[1]
    print_dict(init_dict, "files/tutorial_eh")
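As a plausibility check, the implied correlations can be recovered from the rewritten parameters via rho_1 = cov_1V / (sd_1 * sd_V) and rho_0 = cov_0V / (sd_0 * sd_V). A minimal sketch, assuming print_dict() appends the ".grmpy.yml" extension (as suggested by the snippets above) and read() round-trips the parameters:

# Hypothetical round-trip check for update_tutorial(); assumes print_dict()
# wrote "files/tutorial_eh.grmpy.yml" and read() restores the parameters.
check_dict = read("files/tutorial_eh.grmpy.yml")
params = check_dict["DIST"]["params"]
sd1, sd0, sdv = params[0], params[3], params[-1]
rho_1 = params[2] / (sd1 * sdv)   # implied Corr(U_1, V)
rho_0 = params[-2] / (sd0 * sdv)  # implied Corr(U_0, V)
# With the default draws, rho_1 lies in [0.3, 0.7] and rho_0 in [-0.345, -0.1].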
def test3():
    """The test checks if the criteria function value of the simulated and
    the 'estimated' sample is equal if both samples include an identical
    number of individuals.
    """
    for _ in range(5):
        constr = constraints(
            probability=0.0, agents=10000, start='init', optimizer='SCIPY-BFGS')
        dict_ = generate_random_dict(constr)
        print_dict(dict_)

        df1 = simulate('test.grmpy.ini')
        rslt = estimate('test.grmpy.ini')
        init_dict = read('test.grmpy.ini')
        df2 = simulate_estimation(init_dict, rslt, df1)
        start = start_values(init_dict, df1, 'init')

        criteria = []
        for data in [df1, df2]:
            criteria += [calculate_criteria(init_dict, data, start)]
        np.testing.assert_allclose(criteria[1], criteria[0], rtol=0.1)
def test6():
    """In addition to test5, this test checks whether the comparison file
    provides the expected output when maxiter is set to zero and the
    estimation process uses the initialization file values as start values.
    """
    for _ in range(5):
        constr = dict()
        constr['DETERMINISTIC'], constr['MAXITER'], constr['AGENTS'] = False, 0, 10000
        constr['START'], constr['SAME_SIZE'] = 'init', True
        dict_ = generate_random_dict(constr)
        dict_['DIST']['all'][1], dict_['DIST']['all'][5] = 0.0, 1.0
        print_dict(dict_)

        simulate('test.grmpy.ini')
        estimate('test.grmpy.ini')
        dict_ = read_desc('comparison.grmpy.txt')

        for key_ in ['All', 'Treated', 'Untreated']:
            np.testing.assert_equal(len(set(dict_[key_]['Number'])), 1)
            np.testing.assert_almost_equal(
                dict_[key_]['Observed Sample'],
                dict_[key_]['Simulated Sample (finish)'], 0.001)
            np.testing.assert_array_almost_equal(
                dict_[key_]['Simulated Sample (finish)'],
                dict_[key_]['Simulated Sample (start)'], 0.001)
def print_model_dict(model_dict, fname='mc_init'):
    """This function prints a model specification."""
    print_dict(model_dict, fname)
import datetime
import subprocess

import statsmodels.tools.sm_exceptions

from grmpy.test.random_init import generate_random_dict
from grmpy.test.random_init import print_dict
import grmpy

# We simply specify a minimum number of minutes for our package to run with
# different requests.
MINUTES = 1

end_time = datetime.datetime.now() + datetime.timedelta(minutes=MINUTES)
counter = 1
while True:
    if datetime.datetime.now() >= end_time:
        break
    print('\n Iteration ', counter)
    dict_ = generate_random_dict()
    print_dict(dict_)
    grmpy.simulate('test.grmpy.ini')
    # This is a temporary fix: the determination of starting values by PROBIT
    # fails in case of a perfect separation, so we skip such samples.
    try:
        grmpy.estimate('test.grmpy.ini')
    except statsmodels.tools.sm_exceptions.PerfectSeparationError:
        print('separation error, skip')
    subprocess.check_call(['git', 'clean', '-d', '-f'])
    counter += 1
def test7():
    """This test ensures that the estimation process returns a UserError if
    one tries to execute an estimation process with initialization file
    values as start values for a deterministic setting, and that invalid
    specification files are rejected.
    """
    fname_falsespec1 = TEST_RESOURCES_DIR + "/test_falsespec1.grmpy.yml"
    fname_falsespec2 = TEST_RESOURCES_DIR + "/test_falsespec2.grmpy.yml"
    fname_noparams = TEST_RESOURCES_DIR + "/test_noparams.grmpy.yml"
    fname_binary = TEST_RESOURCES_DIR + "/test_binary.grmpy.yml"
    fname_vzero = TEST_RESOURCES_DIR + "/test_vzero.grmpy.yml"
    fname_possd = TEST_RESOURCES_DIR + "/test_npsd.grmpy.yml"
    fname_zero = TEST_RESOURCES_DIR + "/test_zero.grmpy.yml"

    for _ in range(5):
        constr = dict()
        constr["AGENTS"], constr["DETERMINISTIC"] = 1000, True
        generate_random_dict(constr)
        dict_ = read("test.grmpy.yml")
        pytest.raises(UserError, check_sim_distribution, dict_)
        pytest.raises(UserError, fit, "test.grmpy.yml")

        generate_random_dict(constr)
        dict_ = read("test.grmpy.yml")
        if len(dict_["CHOICE"]["order"]) == 1:
            dict_["CHOICE"]["params"] = list(dict_["CHOICE"]["params"])
            dict_["CHOICE"]["params"] += [1.000]
            dict_["CHOICE"]["order"] += [2]
        # Introduce a duplicate variable in the choice equation.
        dict_["CHOICE"]["order"][1] = "X1"
        print_dict(dict_)
        pytest.raises(UserError, check_sim_init_dict, dict_)
        pytest.raises(UserError, simulate, "test.grmpy.yml")
        pytest.raises(UserError, fit, "test.grmpy.yml")

        constr["AGENTS"] = 0
        generate_random_dict(constr)
        dict_ = read("test.grmpy.yml")
        pytest.raises(UserError, check_sim_init_dict, dict_)
        pytest.raises(UserError, simulate, "test.grmpy.yml")

        length = np.random.randint(2, 100)
        array = np.random.rand(length, 1)
        substitute = np.random.randint(0, len(array) - 1)
        array[substitute] = np.inf
        pytest.raises(UserError, check_start_values, array)

    dict_ = read(fname_possd)
    pytest.raises(UserError, check_sim_init_dict, dict_)
    pytest.raises(UserError, simulate, fname_possd)

    dict_ = read(fname_zero)
    pytest.raises(UserError, check_sim_distribution, dict_)
    pytest.raises(UserError, fit, fname_zero)

    dict_ = read(fname_vzero)
    pytest.raises(UserError, check_sim_distribution, dict_)
    pytest.raises(UserError, fit, fname_vzero)

    dict_ = read(fname_noparams)
    pytest.raises(UserError, check_sim_distribution, dict_)
    pytest.raises(UserError, fit, fname_noparams)

    dict_ = read(fname_falsespec1)
    pytest.raises(UserError, check_sim_init_dict, dict_)
    pytest.raises(UserError, fit, fname_falsespec1)

    dict_ = read(fname_falsespec2)
    pytest.raises(UserError, check_sim_init_dict, dict_)
    pytest.raises(UserError, fit, fname_falsespec2)

    dict_ = read(fname_binary)
    status, _ = check_special_conf(dict_)
    np.testing.assert_equal(status, True)
    pytest.raises(UserError, check_sim_init_dict, dict_)
    pytest.raises(UserError, fit, fname_binary)
def monte_carlo(file, which, grid_points=10):
    """This function estimates various effect parameters for an increasing
    presence of essential heterogeneity, which is reflected by an increasing
    correlation between U_1 and V.
    """
    # Simulate a new data set with essential heterogeneity present.
    model_dict = read(file)
    original_correlation = model_dict["DIST"]["params"][2]
    model_dict["DIST"]["params"][2] = -0.191
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    effects = []

    # Loop over different correlations between V and U_1.
    for rho in np.linspace(0.00, -0.99, grid_points):
        # Readjust the initialization file values to add correlation.
        model_spec = read(file)
        X = model_spec["TREATED"]["order"]
        update_correlation_structure(file, model_spec, rho)
        sim_spec = read(file)

        # Simulate a data set and specify exogenous and endogenous variables.
        df_mc = create_data(file)
        treated = df_mc["D"] == 1
        Xvar = df_mc[X]
        instr = sim_spec["CHOICE"]["order"]
        instr = [i for i in instr if i != "const"]

        # We calculate our parameter of interest.
        label = which.lower()

        if label == "conventional_average_effects":
            ATE = np.mean(df_mc["Y1"] - df_mc["Y0"])
            TT = np.mean(df_mc["Y1"].loc[treated] - df_mc["Y0"].loc[treated])
            stat = (ATE, TT)

        elif label in ["random", "randomization"]:
            random = np.mean(df_mc[df_mc.D == 1]["Y"]) - np.mean(
                df_mc[df_mc.D == 0]["Y"])
            stat = random

        elif label in ["ordinary_least_squares", "ols"]:
            results = sm.OLS(df_mc["Y"], df_mc[["const", "D"]]).fit()
            stat = results.params[1]

        elif label in ["instrumental_variables", "iv"]:
            iv = IV2SLS(df_mc["Y"], Xvar, df_mc["D"], df_mc[instr]).fit()
            stat = iv.params["D"]

        elif label in ["grmpy", "grmpy-par"]:
            rslt = grmpy.fit(file)
            beta_diff = rslt["TREATED"]["params"] - rslt["UNTREATED"]["params"]
            stat = np.dot(np.mean(Xvar), beta_diff)

        elif label in ["grmpy-semipar", "grmpy-liv"]:
            rslt = grmpy.fit(file, semipar=True)

            y0_fitted = np.dot(rslt["X"], rslt["b0"])
            y1_fitted = np.dot(rslt["X"], rslt["b1"])
            mte_x_ = y1_fitted - y0_fitted
            mte_u = rslt["mte_u"]

            # Average the MTE over the joint distribution of X and U_D.
            mte_mat = np.zeros((len(mte_x_), len(mte_u)))
            for i in range(len(mte_x_)):
                for j in range(len(mte_u)):
                    mte_mat[i, j] = mte_x_[i] + mte_u[j]

            ate_tilde_p = np.mean(mte_mat, axis=1)
            stat = ate_tilde_p.mean()

        else:
            raise NotImplementedError

        effects += [stat]

    # Restore the original init file.
    model_dict = read(file)
    model_dict["DIST"]["params"][2] = original_correlation
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    return effects
def monte_carlo(file, which, grid_points=10):
    """This function conducts a Monte Carlo simulation to compare the true
    and estimated treatment parameters for increasing (absolute) correlation
    between U_1 and V (i.e., essential heterogeneity).

    In the example here, the correlation between U_1 and V becomes
    increasingly more negative. As we consider the absolute value of the
    correlation coefficient, values closer to -1 (or, in the analogous case,
    closer to +1) denote a higher degree of essential heterogeneity.

    The results of the Monte Carlo simulation can be used to evaluate the
    performance of different estimation strategies in the presence of
    essential heterogeneity. Depending on the specification of *which*,
    either the true ATE and TT, or an estimate of the ATE are returned.

    Options for *which*:

    Comparison of ATE and TT
        - "conventional_average_effects"

    Different estimation strategies for the ATE
        - "randomization" ("random")
        - "ordinary_least_squares" ("ols")
        - "instrumental_variables" ("iv")
        - "grmpy-par" ("grmpy")
        - "grmpy-semipar" ("grmpy-liv")

    Post-estimation: To plot the comparison between the true ATE and the
    respective parameter, use the function
        - plot_effects() for *which* = "conventional_average_effects", and
        - plot_estimates() else.

    Parameters
    ----------
    file: yaml
        grmpy initialization file, provides information for the simulation
        process.
    which: string
        String denoting whether conventional average effects shall be
        computed or, alternatively, which estimation approach shall be
        implemented for the ATE.
    grid_points: int, default 10
        Number of different values for rho, the correlation coefficient
        between U_1 and V, on the interval [0, -1), along which the
        parameters shall be evaluated.

    Returns
    -------
    effects: list
        If *which* = "conventional_average_effects", a list of length
        *grid_points* x 2 containing the true ATE and TT. Else, a list of
        length *grid_points* x 1 containing an estimate of the ATE.
    """
    # Simulate a new data set with essential heterogeneity present.
    model_dict = read(file)
    original_correlation = model_dict["DIST"]["params"][2]
    model_dict["DIST"]["params"][2] = -0.191
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    effects = []

    # Loop over different correlations between U_1 and V.
    for rho in np.linspace(0.00, -0.99, grid_points):
        # Readjust the initialization file values to add correlation.
        model_spec = read(file)
        X = model_spec["TREATED"]["order"]
        _update_correlation_structure(file, model_spec, rho)
        sim_spec = read(file)

        # Simulate a data set and specify exogenous and endogenous variables.
        df_mc = _create_data(file)
        treated = df_mc["D"] == 1
        Xvar = df_mc[X]
        instr = sim_spec["CHOICE"]["order"]
        instr = [i for i in instr if i != "const"]

        # We calculate our parameter of interest.
        label = which.lower()

        if label == "conventional_average_effects":
            ATE = np.mean(df_mc["Y1"] - df_mc["Y0"])
            TT = np.mean(df_mc["Y1"].loc[treated] - df_mc["Y0"].loc[treated])
            stat = (ATE, TT)

        elif label in ["randomization", "random"]:
            random = np.mean(df_mc[df_mc.D == 1]["Y"]) - np.mean(
                df_mc[df_mc.D == 0]["Y"]
            )
            stat = random

        elif label in ["ordinary_least_squares", "ols"]:
            results = sm.OLS(df_mc["Y"], df_mc[["const", "D"]]).fit()
            stat = results.params[1]

        elif label in ["instrumental_variables", "iv"]:
            iv = IV2SLS(df_mc["Y"], Xvar, df_mc["D"], df_mc[instr]).fit()
            stat = iv.params["D"]

        elif label in ["grmpy", "grmpy-par"]:
            rslt = grmpy.fit(file)
            beta_diff = rslt["TREATED"]["params"] - rslt["UNTREATED"]["params"]
            stat = np.dot(np.mean(Xvar), beta_diff)

        elif label in ["grmpy-semipar", "grmpy-liv"]:
            rslt = grmpy.fit(file, semipar=True)

            y0_fitted = np.dot(rslt["X"], rslt["b0"])
            y1_fitted = np.dot(rslt["X"], rslt["b1"])
            mte_x_ = y1_fitted - y0_fitted
            mte_u = rslt["mte_u"]

            # Average the MTE over the joint distribution of X and U_D.
            mte_mat = np.zeros((len(mte_x_), len(mte_u)))
            for i in range(len(mte_x_)):
                for j in range(len(mte_u)):
                    mte_mat[i, j] = mte_x_[i] + mte_u[j]

            ate_tilde_p = np.mean(mte_mat, axis=1)
            stat = ate_tilde_p.mean()

        else:
            raise NotImplementedError

        effects += [stat]

    # Restore the original init file.
    model_dict = read(file)
    model_dict["DIST"]["params"][2] = original_correlation
    print_dict(model_dict, file.replace(".grmpy.yml", ""))
    grmpy.simulate(file)

    return effects
def test2():
    """The second test checks whether the relationships hold if the
    coefficients are zero in different setups.
    """
    for _ in range(10):
        for i in ['ALL', 'TREATED', 'UNTREATED', 'CHOICE', 'TREATED & UNTREATED']:
            constr = dict()
            constr['DETERMINISTIC'] = False
            dict_ = generate_random_dict(constr)

            if i == 'ALL':
                for key_ in ['TREATED', 'UNTREATED', 'CHOICE']:
                    dict_[key_]['all'] = np.array([0.] * len(dict_[key_]['all']))
            elif i == 'TREATED & UNTREATED':
                for key_ in ['TREATED', 'UNTREATED']:
                    dict_[key_]['all'] = np.array([0.] * len(dict_[key_]['all']))
            else:
                dict_[i]['all'] = np.array([0.] * len(dict_[i]['all']))

            print_dict(dict_)

            dict_ = read('test.grmpy.ini')
            df = simulate('test.grmpy.ini')
            x_treated = df[[
                dict_['varnames'][j - 1] for j in dict_['TREATED']['order']
            ]]
            x_untreated = df[[
                dict_['varnames'][j - 1] for j in dict_['UNTREATED']['order']
            ]]

            if i == 'ALL':
                np.testing.assert_array_equal(df.Y1, df.U1)
                np.testing.assert_array_equal(df.Y0, df.U0)
            elif i == 'TREATED & UNTREATED':
                np.testing.assert_array_equal(df.Y1, df.U1)
                np.testing.assert_array_equal(df.Y0, df.U0)
                np.testing.assert_array_equal(df.Y[df.D == 1], df.U1[df.D == 1])
                np.testing.assert_array_equal(df.Y[df.D == 0], df.U0[df.D == 0])
            elif i == 'TREATED':
                y_untreated = pd.DataFrame.sum(
                    dict_['UNTREATED']['all'] * x_untreated, axis=1) + df.U0
                np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)
                np.testing.assert_array_equal(df.Y1, df.U1)
            elif i == 'UNTREATED':
                y_treated = pd.DataFrame.sum(
                    dict_['TREATED']['all'] * x_treated, axis=1) + df.U1
                np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
                np.testing.assert_array_equal(df.Y0, df.U0)
            else:
                y_treated = pd.DataFrame.sum(
                    dict_['TREATED']['all'] * x_treated, axis=1) + df.U1
                y_untreated = pd.DataFrame.sum(
                    dict_['UNTREATED']['all'] * x_untreated, axis=1) + df.U0
                np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
                np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)

            np.testing.assert_array_equal(df.Y[df.D == 1], df.Y1[df.D == 1])
            np.testing.assert_array_equal(df.Y[df.D == 0], df.Y0[df.D == 0])
def test2():
    """The second test checks whether the relationships hold if the
    coefficients are zero in different setups.
    """
    for _ in range(10):
        for case in ["ALL", "TREATED", "UNTREATED", "CHOICE", "TREATED & UNTREATED"]:
            constr = dict()
            constr["DETERMINISTIC"] = False
            dict_ = generate_random_dict(constr)

            if case == "ALL":
                for section in ["TREATED", "UNTREATED", "CHOICE"]:
                    dict_[section]["params"] = np.array(
                        [0.0] * len(dict_[section]["params"]))
            elif case == "TREATED & UNTREATED":
                for section in ["TREATED", "UNTREATED"]:
                    dict_[section]["params"] = np.array(
                        [0.0] * len(dict_[section]["params"]))
            else:
                dict_[case]["params"] = np.array([0.0] * len(dict_[case]["params"]))

            print_dict(dict_)

            dict_ = read("test.grmpy.yml")
            df = simulate("test.grmpy.yml")
            x_treated = df[dict_["TREATED"]["order"]]
            x_untreated = df[dict_["UNTREATED"]["order"]]

            if case == "ALL":
                np.testing.assert_array_equal(df.Y1, df.U1)
                np.testing.assert_array_equal(df.Y0, df.U0)
            elif case == "TREATED & UNTREATED":
                np.testing.assert_array_equal(df.Y1, df.U1)
                np.testing.assert_array_equal(df.Y0, df.U0)
                np.testing.assert_array_equal(df.Y[df.D == 1], df.U1[df.D == 1])
                np.testing.assert_array_equal(df.Y[df.D == 0], df.U0[df.D == 0])
            elif case == "TREATED":
                y_untreated = (
                    pd.DataFrame.sum(dict_["UNTREATED"]["params"] * x_untreated, axis=1)
                    + df.U0
                )
                np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)
                np.testing.assert_array_equal(df.Y1, df.U1)
            elif case == "UNTREATED":
                y_treated = (
                    pd.DataFrame.sum(dict_["TREATED"]["params"] * x_treated, axis=1)
                    + df.U1
                )
                np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
                np.testing.assert_array_equal(df.Y0, df.U0)
            else:
                y_treated = (
                    pd.DataFrame.sum(dict_["TREATED"]["params"] * x_treated, axis=1)
                    + df.U1
                )
                y_untreated = (
                    pd.DataFrame.sum(dict_["UNTREATED"]["params"] * x_untreated, axis=1)
                    + df.U0
                )
                np.testing.assert_array_almost_equal(df.Y1, y_treated, decimal=5)
                np.testing.assert_array_almost_equal(df.Y0, y_untreated, decimal=5)

            np.testing.assert_array_equal(df.Y[df.D == 1], df.Y1[df.D == 1])
            np.testing.assert_array_equal(df.Y[df.D == 0], df.Y0[df.D == 0])
def test7():
    """This test ensures that the estimation process returns a UserError if
    one tries to execute an estimation process with initialization file
    values as start values for a deterministic setting, and that invalid
    specification files are rejected.
    """
    fname_diff_categorical = TEST_RESOURCES_DIR + '/test_categorical_diff.grmpy.ini'
    fname_categorical = TEST_RESOURCES_DIR + '/test_categorical.grmpy.ini'
    fname_diff_binary = TEST_RESOURCES_DIR + '/test_binary_diff.grmpy.ini'
    fname_vzero = TEST_RESOURCES_DIR + '/test_vzero.grmpy.ini'
    fname_possd = TEST_RESOURCES_DIR + '/test_npsd.grmpy.ini'
    fname_zero = TEST_RESOURCES_DIR + '/test_zero.grmpy.ini'

    for _ in range(5):
        constr = dict()
        constr['AGENTS'], constr['DETERMINISTIC'] = 1000, True
        generate_random_dict(constr)
        dict_ = read('test.grmpy.ini')
        pytest.raises(UserError, check_init_file, dict_)
        pytest.raises(UserError, estimate, 'test.grmpy.ini')

        generate_random_dict(constr)
        dict_ = read('test.grmpy.ini')
        if len(dict_['CHOICE']['order']) == 1:
            dict_['CHOICE']['all'] = list(dict_['CHOICE']['all'])
            dict_['CHOICE']['all'] += [1.000]
            dict_['CHOICE']['order'] += [2]
            dict_['CHOICE']['types'] += ['nonbinary']
        # Introduce a duplicate variable in the choice equation.
        dict_['CHOICE']['order'][1] = 1
        print_dict(dict_)
        pytest.raises(UserError, check_initialization_dict, dict_)
        pytest.raises(UserError, simulate, 'test.grmpy.ini')
        pytest.raises(UserError, estimate, 'test.grmpy.ini')

        constr['AGENTS'] = 0
        generate_random_dict(constr)
        dict_ = read('test.grmpy.ini')
        pytest.raises(UserError, check_initialization_dict, dict_)
        pytest.raises(UserError, simulate, 'test.grmpy.ini')

        tests = []
        tests += [['TREATED', 'UNTREATED'], ['TREATED', 'CHOICE'],
                  ['UNTREATED', 'CHOICE']]
        tests += [['TREATED', 'UNTREATED', 'CHOICE']]
        for combi in tests:
            constr['STATE_DIFF'], constr['OVERLAP'] = True, True
            generate_random_dict(constr)
            dict_ = read('test.grmpy.ini')
            for j in combi:
                if len(dict_[j]['order']) == 1:
                    dict_[j]['all'] = list(dict_[j]['all'])
                    dict_[j]['all'] += [1.000]
                    dict_[j]['order'] += [2]
                    dict_[j]['types'] += ['nonbinary']
                dict_[j]['order'][1] = len(dict_['AUX']['types']) + 1
                frac = np.random.uniform(0.1, 0.8)
                dict_[j]['types'][1] = ['binary', frac]
            print_dict(dict_)
            pytest.raises(UserError, read, 'test.grmpy.ini')

    dict_ = read(fname_possd)
    pytest.raises(UserError, check_initialization_dict, dict_)
    pytest.raises(UserError, simulate, fname_possd)

    dict_ = read(fname_categorical)
    pytest.raises(UserError, check_initialization_dict, dict_)
    pytest.raises(UserError, simulate, fname_categorical)

    dict_ = read(fname_zero)
    pytest.raises(UserError, check_init_file, dict_)
    pytest.raises(UserError, estimate, fname_zero)

    dict_ = read(fname_vzero)
    pytest.raises(UserError, check_init_file, dict_)
    pytest.raises(UserError, estimate, fname_vzero)

    dict_ = read(fname_diff_binary)
    pytest.raises(UserError, check_initialization_dict, dict_)
    pytest.raises(UserError, estimate, fname_diff_binary)

    dict_ = read(fname_diff_categorical)
    pytest.raises(UserError, check_initialization_dict, dict_)
    pytest.raises(UserError, estimate, fname_diff_categorical)