def test_2():
    """Simulate each listed model for every combination of the two extra arguments.

    The specification returned by ``scaling_model_specification`` is built for three
    periods and for all pairs drawn from ``range(1, 3)`` x ``range(1, 3)``. The test
    only checks that the simulation runs; nothing is asserted on the result.
    """
    num_periods = 3
    models = ["kw_97_basic", "kw_94_one", "kw_97_extended"]
    occupation_grid = range(1, 3)
    type_grid = range(1, 3)

    # Build the full grid up front, in the order the Cartesian product yields it.
    grid = list(product(models, occupation_grid, type_grid))

    for model, add_occ, add_types in grid:
        params, options = scaling_model_specification(
            model, num_periods, add_occ, add_types
        )
        simulate = rp.get_simulate_func(params, options)
        simulate(params)
def test_type_proportions(model, type_proportions):
    """Compare simulated type shares with expected shares, split by schooling.

    Args:
        model: Name of the example model passed to ``rp.get_example_model``.
        type_proportions: Indexable whose element 0 holds the expected shares for
            individuals with at most nine years of schooling and whose element 1
            holds the expected shares for individuals with ten or more years.

    """
    nine_years_or_less = type_proportions[0]
    ten_years_or_more = type_proportions[1]

    params, options = rp.get_example_model(model, with_data=False)

    options["n_periods"] = 1
    # The sample size option is spelled "simulation_agents"; a misspelled key is
    # not picked up and the default sample size would be used instead.
    options["simulation_agents"] = 10_000

    simulate = rp.get_simulate_func(params, options)
    df = simulate(params)

    schooling = df.Experience_School
    cases = (
        (schooling.le(9), nine_years_or_less),
        (schooling.ge(10), ten_years_or_more),
    )
    for mask, expected in cases:
        # Sort by type label so that shares line up with the expected values.
        shares = (
            df.loc[mask, "Type"].value_counts(normalize=True, sort=False).sort_index()
        )
        np.testing.assert_allclose(shares, expected, atol=0.05)
def simulate_life_cycle_df(params, options, sim_seed, sol_seed, col_to_keep):
    """Simulate life cycle dataset, store choices and wages (mean and std).

    Args:
        params (pd.DataFrame): DataFrame containing model parameters.
        options (dict): Dictionary containing model options. The dictionary is
            not modified; the seeds are written to a shallow copy.
        sim_seed (int): Seed for simulation.
        sol_seed (int): Seed for solution.
        col_to_keep (list): Columns of the simulated data from which to compute
            the per-period mean and standard deviation (wages).

    Returns:
        pd.DataFrame: Per-period mean and std of ``col_to_keep`` next to the
        per-period choice shares.

    """
    # Work on a copy so that the caller's options do not silently pick up the
    # seeds of this particular run.
    seeded_options = {
        **options,
        "simulation_seed": int(sim_seed),
        "solution_seed": int(sol_seed),
    }

    with _temporary_working_directory(snippet=f"{sim_seed}_{sol_seed}"):
        simulate = rp.get_simulate_func(params, seeded_options)
        df = simulate(params)

        # Share of each choice within a period, one column per choice.
        choices = df.groupby("Period").Choice.value_counts(normalize=True).unstack()

        # Keep only the mean and std out of the descriptive statistics.
        described = df[col_to_keep].groupby("Period").describe()
        wages = described.loc[:, (slice(None), ["mean", "std"])]

        res = pd.concat([wages, choices], axis=1)

    return res
def test_equality_for_myopic_agents_and_tiny_delta():
    """Test equality of simulated data and likelihood with myopia and tiny delta."""
    # Get simulated data and likelihood for myopic model.
    params, options = generate_random_model(myopic=True)

    simulate = rp.get_simulate_func(params, options)
    df = simulate(params)

    # Use the same public entry point for both likelihoods so that the test does
    # not depend on a separately imported bare name.
    log_like = rp.get_log_like_func(params, options, df)
    likelihood = log_like(params)

    # Get simulated data and likelihood for model with tiny delta.
    params.loc["delta", "value"] = 1e-12
    df_ = simulate(params)

    log_like_ = rp.get_log_like_func(params, options, df_)
    likelihood_ = log_like_(params)

    # The continuation values are different because for delta = 0 the backward
    # induction is completely skipped and all continuation values are set to zero
    # whereas for a tiny delta, the delta ensures that continuation values have no
    # impact.
    columns = df.filter(like="Continu").columns.tolist()

    pd.testing.assert_frame_equal(df.drop(columns=columns), df_.drop(columns=columns))
    np.testing.assert_almost_equal(likelihood, likelihood_, decimal=12)
def test_distribution_of_observables(seed):
    """Test that the distribution of observables matches the simulated distribution.

    Args:
        seed: Seed for NumPy's global random number generator, which determines
            the number of levels of the observable.

    """
    np.random.seed(seed)

    # Now specify a set of observables
    point_constr = {
        "observables": [np.random.randint(2, 6)],
        "simulation_agents": 1000,
        "n_periods": 1,
    }

    params, options = generate_random_model(point_constr=point_constr)

    simulate = rp.get_simulate_func(params, options)
    df = simulate(params)

    # Observed shares of each level. With ``sort=False`` the order of the result
    # is not guaranteed to follow the level labels, so sort by label before the
    # positional comparison below.
    probs = df["Observable_0"].value_counts(normalize=True, sort=False).sort_index()

    # Probabilities stored in the parameters for the first observable.
    is_observable = params.index.get_level_values(0).str.contains(
        "observable_observable_0"
    )
    expected = params.loc[is_observable, "value"].to_numpy()

    # Check proportions
    np.testing.assert_almost_equal(probs.to_numpy(), expected, decimal=1)
def test_distribution_of_observables():
    """Check that simulated observable shares are close to their parameters.

    A random model with one observable (two to five levels) and 1000 simulated
    agents is generated. For every level, the simulated share has to be within
    0.05 of the probability stored in ``params``.
    """
    point_constr = {"observables": [np.random.randint(2, 6)], "simulation_agents": 1000}
    params, options = generate_random_model(point_constr=point_constr)

    df = rp.get_simulate_func(params, options)(params)

    # Relative frequency of each observed level.
    probs = df["Observable_0"].value_counts(normalize=True, sort=False)

    for level in range(point_constr["observables"][0]):
        # A level with a small probability may not show up in the simulated data;
        # its share is treated as zero.
        if level in probs.index:
            probability = probs.loc[level]
        else:
            probability = 0

        label = (f"observable_observable_0_{level}", "probability")
        params_probability = params.loc[label, "value"]

        np.testing.assert_allclose(probability, params_probability, atol=0.05)
def main():
    """Evaluate the criterion function multiple times for a scalability report.

    The criterion function is evaluated ``maxfun``-times. The number of threads used
    is limited by environment variables. **respy** has to be imported after the
    environment variables are set as Numpy, Numba and others load them at import
    time.

    The command line arguments are, in order: the example model name, ``maxfun``
    and the number of threads. Passing ``-1`` as the number of threads leaves the
    environment variables untouched, i.e., imposes no restriction.

    One JSON line with the timing information is appended to
    ``scalability_results.txt`` in the current working directory.

    """
    model = sys.argv[1]
    maxfun = int(sys.argv[2])
    n_threads = int(sys.argv[3])

    # Validate input.
    assert maxfun >= 0, "Maximum number of function evaluations cannot be negative."
    assert n_threads >= 1 or n_threads == -1, (
        "Use -1 to impose no restrictions on maximum number of threads or choose a "
        "number higher than zero."
    )

    # Set number of threads. -1 means "no restriction", so the variables must not
    # be set at all in that case: writing the literal "-1" into them is not a valid
    # thread count.
    if n_threads != -1:
        for variable in (
            "NUMBA_NUM_THREADS",
            "MKL_NUM_THREADS",
            "OMP_NUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ):
            os.environ[variable] = f"{n_threads}"

    # Late import of respy to ensure that environment variables are read by Numpy,
    # etc..
    import respy as rp

    # Get model
    params, options = rp.get_example_model(model, with_data=False)

    # Simulate the data
    simulate = rp.get_simulate_func(params, options)
    df = simulate(params)

    # Get the criterion function and the parameter vector.
    crit_func = rp.get_log_like_func(params, options, df)

    # Run the estimation
    start = dt.datetime.now()

    for _ in range(maxfun):
        crit_func(params)

    end = dt.datetime.now()

    # Aggregate information
    output = {
        "model": model,
        "maxfun": maxfun,
        "n_threads": n_threads,
        "start": str(start),
        "end": str(end),
        "duration": str(end - start),
    }

    # Append the timing record to the results file, one JSON document per line.
    with open("scalability_results.txt", "a+") as file:
        file.write(json.dumps(output))
        file.write("\n")
def test_table_6_exact_solution_row_mean_and_sd(model, subsidy):
    """Replicate the first two rows of Table 6 in Keane and Wolpin (1994).

    The mean effects and the standard deviations of a tuition subsidy on years of
    schooling and of experience in occupation a and occupation b are computed from
    40 samples of 100 individuals using the true parameters and compared with the
    values stored for ``model`` in ``kw_94_table_6.csv``.
    """
    params, options = rp.get_example_model(model, with_data=False)
    options["simulation_agents"] = 4000

    simulate = rp.get_simulate_func(params, options)

    # Simulate once without and once with the tuition subsidy.
    df_wo_ts = simulate(params)
    params.loc[("nonpec_edu", "at_least_twelve_exp_edu"), "value"] += subsidy
    df_w_ts = simulate(params)

    columns = ["Bootstrap_Sample", "Experience_Edu", "Experience_A", "Experience_B"]

    # Split each data set into 40 bootstrap samples along the first index level
    # and average the experiences in period 39 within each sample.
    sample_means = []
    for simulated in (df_wo_ts, df_w_ts):
        simulated["Bootstrap_Sample"] = pd.cut(
            simulated.index.get_level_values(0), bins=40, labels=np.arange(1, 41)
        )
        in_period_39 = simulated.query("Period == 39")[columns]
        sample_means.append(in_period_39.groupby("Bootstrap_Sample").mean())
    mean_exp_wo_ts, mean_exp_w_ts = sample_means

    # Effect of the subsidy per bootstrap sample, then mean and std over samples.
    effect = mean_exp_w_ts.subtract(mean_exp_wo_ts)
    effect = effect.assign(Data=model).reset_index()
    effect = effect.set_index(["Data", "Bootstrap_Sample"])
    effect = effect.stack().unstack([0, 2])
    rp_replication = effect.agg(["mean", "std"])

    # Expected values are taken from Table 6 in the paper.
    kw_94_table_6 = pd.read_csv(
        TEST_RESOURCES_DIR / "kw_94_table_6.csv",
        index_col=0,
        header=[0, 1],
        nrows=2,
    )

    replicated = rp_replication[model]
    published = kw_94_table_6[model]

    # Standard deviations (second row) have to be very close.
    np.testing.assert_allclose(replicated.iloc[1], published.iloc[1], atol=0.05)

    # Means (first row) have to lie within one published standard deviation.
    gap = replicated.iloc[0].to_numpy() - published.iloc[0].to_numpy()
    assert (np.abs(gap) < published.iloc[1]).all()
def model_wrapper_kw_94(params, base_options, tuition_subsidy):
    """Simulate the model under a tuition subsidy and summarise schooling.

    The subsidy is added to the ``("nonpec_edu", "at_least_twelve_exp_edu")``
    parameter on a copy of ``params``, so the passed parameters stay untouched.

    Returns:
        tuple: The mean over ``Identifier`` groups of the maximum of
        ``Experience_Edu`` and the simulated DataFrame.

    """
    # TODO: This needs to done only once during the whole script. However, this
    # requires me to set up a more complicated structure for the parallelism and so
    # we will simply postpone this until we have a more serious application.
    simulate = rp.get_simulate_func(params, base_options)

    subsidy_label = ("nonpec_edu", "at_least_twelve_exp_edu")
    policy_params = params.copy()
    policy_params.loc[subsidy_label, "value"] += tuition_subsidy

    policy_df = simulate(policy_params)

    max_edu_per_agent = policy_df.groupby("Identifier")["Experience_Edu"].max()
    edu = max_edu_per_agent.mean()

    return edu, policy_df
def test_simulated_data(model_or_seed):
    """Run ``check_simulated_data`` on a freshly simulated data set.

    Note that ``check_estimation_data`` is also tested in this function as these
    tests focus on a subset of the data.
    """
    params, options = process_model_or_seed(model_or_seed)

    df = rp.get_simulate_func(params, options)(params)

    # Only the processed parameters are needed for the check.
    optim_paras, _unused_options = process_params_and_options(params, options)

    check_simulated_data(optim_paras, df)
def plot_chatter_numagents_both(seeds, num_agents, calc_moments, replace_nans, kwargs):
    """Plot criterion values over seeds for several numbers of agents.

    For every entry of ``num_agents`` a data set is simulated with that many
    agents and its moments serve as empirical moments. The criterion returned by
    ``rp.get_moment_errors_func`` is then evaluated once per seed with the same
    number of simulated agents, and one line per number of agents is drawn.

    ``kwargs`` is deep-copied and has to provide the keys ``"params"``,
    ``"options"``, ``"calc_moments"``, ``"replace_nans"`` and
    ``"weighting_matrix"``.
    """
    settings = copy.deepcopy(kwargs)
    moment_names = ("Choice Frequencies", "Wage Distribution")

    # One column of criterion values per number of agents.
    results = pd.DataFrame(columns=num_agents)

    for num in num_agents:
        # "Observed" sample: simulate with ``num`` agents and compute its moments.
        options_true = settings["options"].copy()
        options_true["simulation_agents"] = num
        simulate = rp.get_simulate_func(settings["params"], options_true)
        data_true = simulate(settings["params"])
        moments_true = {
            name: replace_nans(calc_moments[name](data_true)) for name in moment_names
        }

        # Simulated sample: same number of agents, varying simulation seed.
        options_chatter = settings["options"].copy()
        options_chatter["simulation_agents"] = num

        criterion_values = []
        for seed in seeds:
            options_chatter["simulation_seed"] = seed
            criterion_msm = rp.get_moment_errors_func(
                params=settings["params"],
                options=options_chatter,
                calc_moments=settings["calc_moments"],
                replace_nans=settings["replace_nans"],
                empirical_moments=moments_true,
                weighting_matrix=settings["weighting_matrix"],
            )
            criterion_values.append(criterion_msm(settings["params"]))

        results[num] = criterion_values

        plt.plot(seeds, results[num], label=num)

    plt.ylim(-1, 200)
    plt.title("Increasing the number of Observed and Simulated agents")
    plt.ylabel("Criterion function")
    plt.xlabel("Seed")
    plt.legend(loc="best")
def test_equality_of_models_with_and_without_observables(seed):
    """Check that moving the constants onto the observable indicators is neutral.

    In the first model the parameters of the observables are set to zero. In the
    second model every observable parameter takes the value of the constant of its
    reward function and the constants are set to zero. Both models have to yield
    the same simulated data.
    """
    np.random.seed(seed)

    # One observable with a random number of levels.
    observables = [np.random.randint(2, 6)]
    params, options = generate_random_model(
        myopic=True, point_constr={"observables": observables}
    )

    # Names of all reward components (non-pecuniary and wage).
    first_level = set(params.index.get_level_values(0))
    index_reward = [name for name in first_level if "nonpec" in name or "wage" in name]

    # Labels of the observable parameters inside the rewards.
    obs_labels = generate_obs_labels(observables, index_reward)

    # Base model: observables have no effect.
    params.loc[obs_labels, "value"] = 0
    simulate = rp.get_simulate_func(params, options)
    df = simulate(params)

    # Alternative model: shift each constant onto the observable parameters ...
    for label in obs_labels:
        params.loc[label, "value"] = params.loc[(label[0], "constant"), "value"]

    # ... and remove the constants afterwards.
    for name in index_reward:
        params.loc[(name, "constant"), "value"] = 0

    df_ = simulate(params)

    pd.testing.assert_frame_equal(df_, df)
def test_invariance_of_model_solution_in_solve_and_simulation(model_or_seed):
    """Check that ``rp.solve`` and the simulation function hold the same solution.

    The state space returned by ``rp.solve`` is compared, attribute by attribute,
    with the one stored under ``simulate.keywords["state_space"]`` after a
    simulation run.
    """
    params, options = process_model_or_seed(model_or_seed)
    options["n_periods"] = 3

    state_space = rp.solve(params, options)

    simulate = rp.get_simulate_func(params, options)
    simulate(params)
    state_space_ = simulate.keywords["state_space"]

    compared_attributes = (
        "states",
        "wages",
        "nonpec",
        "emax_value_functions",
        "base_draws_sol",
    )
    for attribute in compared_attributes:
        np.testing.assert_array_equal(
            getattr(state_space, attribute), getattr(state_space_, attribute)
        )
def evaluate(self, free_params):
    """Evaluate the criterion function for a candidate parametrization.

    The free parameters proposed by the optimizer are written into the model
    parameters via ``update_model_spec``, the model is simulated, and the
    simulated moments are compared with ``self.moments_obs``. The result is the
    quadratic form of the moment differences in ``self.weighing_matrix``, or
    ``HUGE_INT`` if the number of moments and weights do not agree.
    """
    self.update_model_spec(self.params, free_params)

    simulate = rp.get_simulate_func(self.params, self.options)
    array_sim = simulate(self.params)
    self.moments_sim = self.get_moments(array_sim)

    # Collect, in ascending period order, the moments present in both sources.
    stats_obs = []
    stats_sim = []
    for group in self.moments_sim.keys():
        sim_by_period = self.moments_sim[group]
        last_period = int(max(sim_by_period.keys()) + 1)
        for period in range(last_period):
            if (
                period in sim_by_period.keys()
                and period in self.moments_obs[group].keys()
            ):
                stats_obs.extend(self.moments_obs[group][period])
                stats_sim.extend(sim_by_period[period])

    # The quadratic form is only defined if all dimensions line up.
    if len(stats_obs) == len(stats_sim) == len(np.diag(self.weighing_matrix)):
        stats_diff = np.array(stats_obs) - np.array(stats_sim)
        weighted_diff = np.dot(stats_diff, self.weighing_matrix)
        fval = float(np.dot(weighted_diff, stats_diff))
    else:
        fval = HUGE_INT

    self._logging_smm(stats_obs, stats_sim)
    self.num_evals += 1

    return fval
def test_distribution_of_lagged_choices():
    """Compare the simulated initial lagged choices with the example data.

    For period 0, the distribution of ``Lagged_Choice_1`` conditional on
    ``Experience_School`` in data simulated from ``kw_97_extended`` has to be
    within 0.04 of the distribution in the data shipped with the model.
    """
    params, options, actual_df = rp.get_example_model("kw_97_extended")

    options["n_periods"] = 1
    # The sample size option is spelled "simulation_agents"; a misspelled key is
    # not picked up and the default sample size would be used instead.
    options["simulation_agents"] = 10_000

    simulate = rp.get_simulate_func(params, options)
    df = simulate(params)

    actual_df = actual_df.query("Period == 0")
    expected = pd.crosstab(
        actual_df.Lagged_Choice_1, actual_df.Experience_School, normalize="columns"
    )

    df = df.query("Period == 0")
    calculated = pd.crosstab(
        df.Lagged_Choice_1, df.Experience_School, normalize="columns"
    )

    # Allow for 4% differences which are likely for small subsets.
    np.testing.assert_allclose(expected, calculated, atol=0.04)
def test_replication_of_choice_probabilities(model, table):
    """Replicate choice probabilities in Tables 2.1-2.3. in Keane and Wolpin (1994b).

    For each of the three parameterizations a data set is simulated and the choice
    probabilities for each period are compared to the numbers in the paper.

    Args:
        model: Name of the example model passed to ``rp.get_example_model``.
        table: File name of the csv with the expected choice probabilities,
            relative to ``TEST_RESOURCES_DIR``.

    """
    # Get choice probabilities from paper.
    expected = pd.read_csv(TEST_RESOURCES_DIR / table, index_col="period")

    # Simulate data with more individuals to stabilize the choice probabilities.
    # The sample size option is spelled "simulation_agents"; a misspelled key is
    # not picked up and the default sample size would be used instead.
    params, options = rp.get_example_model(model, with_data=False)
    options["simulation_agents"] = 10_000

    simulate = rp.get_simulate_func(params, options)
    df = simulate(params)

    # Share of each choice per period; choices never taken get a share of zero.
    shares = df.groupby("Period").Choice.value_counts(normalize=True)
    result = shares.unstack().fillna(0)

    np.testing.assert_allclose(expected, result, atol=0.1)
def generate_data(model, present_bias=1):
    """Generate and save simulated data sets for a model and present-bias value.

    One hundred data sets are simulated, pairing simulation seeds 0, ..., 99 with
    solution seeds 1000, ..., 1099. Each data set is pickled below
    ``respy_datasets/exp_datasets/<model>/`` if ``present_bias == 1`` and below
    ``respy_datasets/hyp_datasets/<model>/`` otherwise. The target directories have
    to exist already.

    Parameters
    ----------
    model : str
        "kw_94_one", "kw_94_two", "kw_94_three" according to the desired Keane and
        Wolpin (1994) specification.
    present_bias : float
        1 for exponential discounting, < 1 for hyperbolic discounting.

    """
    params, options = rp.get_example_model(model, with_data=False)

    params.loc[("beta", "beta"), ["value", "comment"]] = [
        present_bias,
        "present-bias parameter",
    ]

    # Exponential and hyperbolic data sets are kept in separate directories.
    subdirectory = "exp_datasets" if present_bias == 1 else "hyp_datasets"

    # Integer seeds directly, instead of float grids that are cast back to int.
    simulation_seeds = range(100)
    solution_seeds = range(1000, 1100)

    for simulation, solution in zip(simulation_seeds, solution_seeds):
        options["simulation_seed"] = simulation
        options["solution_seed"] = solution

        # The seeds are part of the options, so the function is rebuilt each time.
        simulate = rp.get_simulate_func(params, options)
        df = simulate(params)

        df.to_pickle(
            f"respy_datasets/{subdirectory}/{model}/"
            f"seed_sim_{simulation}_sol_seed_{solution}.pickle"
        )
def test_equality_for_myopic_agents_and_tiny_delta(seed):
    """Test equality of simulated data and likelihood with myopia and tiny delta.

    Args:
        seed: Seed for NumPy's global random number generator, set before the
            random model is generated.

    """
    np.random.seed(seed)

    # Get simulated data and likelihood for myopic model.
    params, options = generate_random_model(myopic=True)

    simulate = rp.get_simulate_func(params, options)
    df = simulate(params)

    # Use the same public entry point for both criterion functions so that the
    # test does not depend on a separately imported bare name.
    crit_func = rp.get_crit_func(params, options, df)
    likelihood = crit_func(params)

    # Get simulated data and likelihood for model with tiny delta.
    params.loc["delta", "value"] = 1e-12
    df_ = simulate(params)

    crit_func_ = rp.get_crit_func(params, options, df_)
    likelihood_ = crit_func_(params)

    pd.testing.assert_frame_equal(df, df_)
    np.testing.assert_almost_equal(likelihood, likelihood_, decimal=12)
def task_get_history_delta_wage_moments(produces):
    """Run ``respyabc`` for ``delta_delta`` on wage moments and pickle the history.

    Pseudo-observed data are computed with ``compute_model`` for
    ``delta_delta = 0.95`` on the ``kw_94_one`` example model. The prior is
    specified as ``[[0.9, 0.09], "uniform"]``.

    Args:
        produces: Path of the file the pickled history is written to.

    """
    np.random.seed(123)

    # The data shipped with the example model are not needed here.
    params, options, _ = rp.get_example_model("kw_94_one")
    model_to_simulate = rp.get_simulate_func(params, options)

    parameter_true = {"delta_delta": 0.95}
    pseudo_observed_data = compute_model(
        parameter_true,
        model_to_simulate=model_to_simulate,
        parameter_for_simulation=params,
        options_for_simulation=options,
        descriptives="wage_moments",
    )

    population_size = 500
    max_nr_populations = 10
    minimum_epsilon = 0.05

    delta_prior_low = 0.9
    delta_prior_length = 0.09
    parameters_prior = {
        "delta_delta": [[delta_prior_low, delta_prior_length], "uniform"]
    }

    history = respyabc(
        model=compute_model,
        parameters_prior=parameters_prior,
        data=pseudo_observed_data,
        distance_abc=compute_mean_squared_distance,
        descriptives="wage_moments",
        population_size_abc=population_size,
        max_nr_populations_abc=max_nr_populations,
        minimum_epsilon_abc=minimum_epsilon,
    )

    with open(produces, "wb") as out_file:
        pickle.dump(history, out_file)
"""
Simulate the norpy model with the respy package under previously estimated optimal
parameters. Use this as a sanity check of the model correctness.
"""
import yaml
import respy as rp
import numpy as np
import pandas as pd

from ov_respy_config import TEST_RESOURCES_DIR
from adapter.smm_utils import get_moments

# Get the basic model structure
options = yaml.safe_load((TEST_RESOURCES_DIR / "norpy_estimates.yaml").read_text())
params = pd.read_csv(
    TEST_RESOURCES_DIR / "norpy_estimates.csv", index_col=["category", "name"]
)

# Get rid of string values. Rebinding a loop variable does not change the column,
# so convert the whole column and assign it back.
params["value"] = params["value"].astype(float)

# Simulate the data with the specified coefficients
simulate = rp.get_simulate_func(params, options)
df = simulate(params)

moments = pd.DataFrame(dict(get_moments(df)))
index = pd.MultiIndex.from_tuples(index, names=("agents", "draws", "tau")) rslts = pd.DataFrame(index=index, columns=["delta"]) params_base, options_base = rp.get_example_model("robinson", False) delta_true = params_base.loc[("delta", "delta"), "value"] for num_agents in GRID_AGENTS: options = options_base.copy() options["estimation_draws"] = num_draws options["solution_draws"] = num_draws for num_draws in GRID_DRAWS: simulate = rp.get_simulate_func(params_base, options) df = simulate(params_base) for tau in GRID_TAU: options["estimation_tau"] = tau options["simulation_agents"] = num_agents crit_func = rp.get_log_like_func(params_base, options, df) grid = np.concatenate((np.linspace(0.948, 0.952, 40), [delta_true])) fvals = [] for value in grid: params = params_base.copy() params.loc[("delta", "delta"), "value"] = value
def test_table_6_exact_solution_row_mean_and_sd():
    """Replicate the first two rows of Table 6 in Keane and Wolpin (1994).

    The mean effects and the standard deviations of a 500, 1000, and 2000 dollar
    tuition subsidy on years of schooling and of experience in occupation a and
    occupation b are computed from 40 samples of 100 individuals using the true
    parameters and compared with the values stored in ``kw_94_table_6.csv``.
    """
    # Each model is simulated twice: without and with its tuition subsidy.
    scenarios = zip(
        np.repeat(["one", "two", "three"], 2), [0, 500, 0, 1000, 0, 2000]
    )

    data_frames = []
    for model, subsidy in scenarios:
        params, options = rp.get_example_model(f"kw_94_{model}", with_data=False)
        options["simulation_agents"] = 4000
        simulate = rp.get_simulate_func(params, options)
        params.loc[("nonpec_edu", "at_least_twelve_exp_edu"), "value"] += subsidy
        data_frames.append(simulate(params))

    columns = ["Bootstrap_Sample", "Experience_Edu", "Experience_A", "Experience_B"]
    titles = ["kw_94_one", "kw_94_two", "kw_94_three"]

    # Even positions hold the runs without subsidy, odd positions the runs with it.
    pairs = zip(titles, data_frames[0::2], data_frames[1::2])

    bootstrapped_statistics = []
    for title, df_wo_ts, df_w_ts in pairs:
        # Split each data set into 40 bootstrap samples by identifier and average
        # the experiences in period 39 within each sample.
        sample_means = []
        for simulated in (df_wo_ts, df_w_ts):
            simulated["Bootstrap_Sample"] = pd.cut(
                simulated.Identifier, bins=40, labels=np.arange(1, 41)
            )
            in_period_39 = simulated.loc[simulated.Period.eq(39), columns]
            sample_means.append(in_period_39.groupby("Bootstrap_Sample").mean())
        mean_exp_wo_ts, mean_exp_w_ts = sample_means

        # Effect of the subsidy per bootstrap sample.
        effect = mean_exp_w_ts.subtract(mean_exp_wo_ts)
        effect = effect.assign(Data=title).reset_index()
        effect = effect.set_index(["Data", "Bootstrap_Sample"])
        effect = effect.stack().unstack([0, 2])

        bootstrapped_statistics.append(effect)

    per_model_summary = [bs.agg(["mean", "std"]) for bs in bootstrapped_statistics]
    rp_replication = pd.concat(per_model_summary, axis=1)

    # Expected values are taken from csv of table 6.
    kw_94_table_6 = pd.read_csv(
        TEST_RESOURCES_DIR / "kw_94_table_6.csv",
        index_col=0,
        header=[0, 1],
        nrows=2,
    )

    # Standard deviations (second row) have to be very close.
    np.testing.assert_allclose(
        rp_replication.iloc[1], kw_94_table_6.iloc[1], atol=0.05
    )

    # Means (first row) have to lie within one published standard deviation.
    gap = rp_replication.iloc[0].to_numpy() - kw_94_table_6.iloc[0].to_numpy()
    assert (np.abs(gap) < kw_94_table_6.iloc[1]).all()
def test_1():
    """Regression test: the sum over all simulated values must match a stored number."""
    params, options = scaling_model_specification("kw_94_two", 10, 3)

    simulate = rp.get_simulate_func(params, options)
    df = simulate(params)

    # Sum over rows first, then over the resulting column totals.
    grand_total = df.sum().sum()
    np.testing.assert_almost_equal(grand_total, 14022027476.102118)
def test_n_step_ahead_simulation_with_data(model):
    """Smoke test: the ``"n_step_ahead_with_data"`` simulation runs for ``model``.

    The example model is simulated for eleven periods using the data shipped with
    it. Nothing is asserted on the result.
    """
    params, options, df = rp.get_example_model(model)
    options["n_periods"] = 11

    method = "n_step_ahead_with_data"
    simulate = rp.get_simulate_func(params, options, method, df)

    simulate(params)
def test_one_step_ahead_simulation(model):
    """Smoke test: the ``"one_step_ahead"`` simulation runs for ``model``.

    The example model is simulated for eleven periods using the data shipped with
    it. Nothing is asserted on the result.
    """
    params, options, df = rp.get_example_model(model)
    options["n_periods"] = 11

    method = "one_step_ahead"
    simulate = rp.get_simulate_func(params, options, method, df)

    simulate(params)
def test_3():
    """Smoke test: simulate the specification built from a random request."""
    request = get_random_request()
    params, options = scaling_model_specification(*request)

    simulate = rp.get_simulate_func(params, options)
    simulate(params)
def test_one_step_ahead_simulation():
    """Smoke test: the ``"one_step_ahead"`` simulation runs for ``kw_97_basic``.

    The example model is simulated for eleven periods using the data shipped with
    it. Nothing is asserted on the result.
    """
    params, options, df = rp.get_example_model("kw_97_basic")
    options["n_periods"] = 11

    method = "one_step_ahead"
    simulate = rp.get_simulate_func(params, options, method, df)

    simulate(params)