def get_example_model(model, with_data=True):
    """Load parameters and options of an example model, plus data on request.

    Parameters
    ----------
    model : str
        Name of the example model. It must be contained in ``EXAMPLE_MODELS``,
        e.g. one of ``{"robinson_crusoe_basic", "robinson_crusoe_extended",
        "kw_94_one", "kw_94_two", "kw_94_three", "kw_97_basic",
        "kw_97_extended", "kw_2000"}``.
    with_data : bool
        Whether the accompanying data set should be returned as well. For some
        models a data set is built with ``create_kw_97``, for others a data set
        is simulated. If no data is available, a ``UserWarning`` is issued and
        only parameters and options are returned.

    Returns
    -------
    tuple
        ``(params, options)`` or, if data was requested and is available,
        ``(params, options, df)``.

    """
    assert model in EXAMPLE_MODELS, f"{model} is not in {EXAMPLE_MODELS}."

    options_file = TEST_RESOURCES_DIR / f"{model}.yaml"
    options = yaml.safe_load(options_file.read_text())

    params_file = TEST_RESOURCES_DIR / f"{model}.csv"
    params = pd.read_csv(params_file, index_col=["category", "name"])

    if not with_data:
        return params, options

    if "kw_97" in model:
        data = create_kw_97(params, options)
    elif "kw_94" in model or "robinson" in model:
        data = get_simulate_func(params, options)(params)
    else:
        # Data was requested, but nothing can be provided for this model.
        warnings.warn(
            f"No data available for model '{model}'.", category=UserWarning
        )
        return params, options

    return params, options, data
def get_example_model(model, with_data=True):
    """Return parameters, options and, optionally, data of an example model.

    Parameters
    ----------
    model : str
        Name of the example model. Pass an arbitrary string to see all
        available models in the traceback of the failing assertion.
    with_data : bool
        Whether the accompanying data set should be returned. If data is
        requested but not available for the model, a ``UserWarning`` is issued.

    Returns
    -------
    tuple
        ``(params, options)``, followed by the data set if one was requested
        and could be provided.

    """
    assert model in EXAMPLE_MODELS, f"{model} is not in {EXAMPLE_MODELS}."

    raw_options = (TEST_RESOURCES_DIR / f"{model}.yaml").read_text()
    options = yaml.safe_load(raw_options)
    params = pd.read_csv(
        TEST_RESOURCES_DIR / f"{model}.csv", index_col=["category", "name"]
    )

    wants_kw_97_data = "kw_97" in model and with_data
    wants_simulated_data = ("kw_94" in model or "robinson" in model) and with_data

    if wants_kw_97_data:
        extras = (create_kw_97(params, options),)
    elif wants_simulated_data:
        simulate = get_simulate_func(params, options)
        extras = (simulate(params),)
    else:
        extras = ()
        if with_data:
            warnings.warn(
                f"No data available for model '{model}'.", category=UserWarning
            )

    return (params, options, *extras)
def msm_args(worker_id):
    """Build example inputs for testing the method of simulated moments.

    The empirical moments are computed from a data set simulated with the
    ``"kw_94_one"`` example model shortened to three periods.

    Returns
    -------
    tuple
        ``(params, options, calc_moments, _replace_nans, empirical_moments,
        weighting_matrix)``.

    """
    params, options = get_example_model("kw_94_one", with_data=False)
    options["n_periods"] = 3
    # Every pytest worker gets its own cache directory. Otherwise, one worker
    # could clean the directory another worker is still using.
    options["cache_path"] = f".respy-{worker_id}"

    df = get_simulate_func(params, options)(params)

    choice_frequencies = _replace_nans(_calc_choice_freq(df))
    mean_wages = _replace_nans(_calc_wage_mean(df))
    empirical_moments = {"Choices": choice_frequencies, "Mean Wage": mean_wages}

    calc_moments = {"Mean Wage": _calc_wage_mean, "Choices": _calc_choice_freq}
    weighting_matrix = get_diag_weighting_matrix(empirical_moments)

    return (
        params,
        options,
        calc_moments,
        _replace_nans,
        empirical_moments,
        weighting_matrix,
    )
def test_transition_probabilities_for_one_exogenous_process(model_with_one_exog_proc):
    """Check the joint shares of current and previous illness states.

    The normalized cross table of ``Illness`` and its within-individual lag
    must match the expected shares up to an absolute tolerance of 0.01.
    """
    params, options = model_with_one_exog_proc

    df = get_simulate_func(params, options)(params)

    # Lag the illness state within each individual.
    illness_by_individual = df.groupby("Identifier")["Illness"]
    df["Prev_Illness"] = illness_by_individual.shift()

    expected = [[0.81, 0.09], [0.09, 0.01]]
    probs = pd.crosstab(df["Illness"], df["Prev_Illness"], normalize=True)

    assert np.allclose(probs, expected, atol=0.01)
def test_return_output_dict_for_likelihood(model):
    """Check the types of the dictionary entries returned by the log likelihood.

    With ``return_scalar=False`` the function must return a mapping with a float
    under ``"value"``, an array under ``"contributions"`` and a DataFrame under
    ``"comparison_plot_data"``.
    """
    params, options = process_model_or_seed(model)
    options["n_periods"] = 3

    df = get_simulate_func(params, options)(params)

    log_like_func = get_log_like_func(params, options, df, return_scalar=False)
    outputs = log_like_func(params)

    assert isinstance(outputs["value"], float)
    assert isinstance(outputs["contributions"], np.ndarray)
    assert isinstance(outputs["comparison_plot_data"], pd.DataFrame)
def test_return_scalar_for_likelihood(model):
    """Check that ``return_scalar`` switches between a float and an array."""
    params, options = process_model_or_seed(model)

    df = get_simulate_func(params, options)(params)

    scalar_crit = get_crit_func(params, options, df, return_scalar=True)
    assert isinstance(scalar_crit(params), float)

    array_crit = get_crit_func(params, options, df, return_scalar=False)
    assert isinstance(array_crit(params), np.ndarray)
def simulate_truncated_data(params, options, is_missings=True):
    """Simulate a data set which optionally resembles incomplete observed data.

    If ``is_missings`` is true, the simulated data is altered in two ways.
    First, the histories of individuals are truncated. Second, some of the
    wages of working individuals are set to missing.

    Parameters
    ----------
    params : pandas.DataFrame or pandas.Series
        Contains parameters.
    options : dict
        Dictionary containing model options. ``options["simulation_seed"]`` is
        used to seed NumPy's global random number generator.
    is_missings : bool, default True
        Whether histories are truncated and wages are removed.

    Returns
    -------
    data_subset : pandas.DataFrame
        The (truncated) data, restricted to a subset of the columns listed by
        ``generate_column_dtype_dict_for_estimation``.

    """
    optim_paras, _ = process_params_and_options(params, options)

    df = get_simulate_func(params, options)(params)

    np.random.seed(options["simulation_seed"])

    if not is_missings:
        data_subset = df
    else:
        # Mimic attrition by cutting off histories. The drawn cut-off is at
        # least one, so every individual keeps at least the first period and
        # may also keep the complete history.
        period_of_truncation = (  # noqa: F841
            df.reset_index()
            .groupby("Identifier")
            .Period.transform(lambda x: np.random.choice(x.max() + 1) + 1)
            .to_numpy()
        )
        data_subset = df.query("Period < @period_of_truncation").copy()

        # Remove a random share (below one half) of the observed wages.
        is_working = data_subset["Choice"].isin(optim_paras["choices_w_wage"])
        num_drop_wages = int(is_working.sum() * np.random.uniform(high=0.5))

        if num_drop_wages > 0:
            indices = data_subset["Wage"][is_working].index
            index_missing = np.random.choice(indices, num_drop_wages, replace=False)
            data_subset.loc[index_missing, "Wage"] = np.nan

    # Keep observed entities only. The first two keys of the dtype dictionary
    # are skipped - presumably the index levels; verify against the helper.
    col_dtype = generate_column_dtype_dict_for_estimation(optim_paras)
    observed_columns = list(col_dtype)[2:]

    return data_subset[observed_columns]
def test_simulation_with_flexible_choice_sets():
    """Check that a model with restricted choice sets can be simulated."""
    params, options = process_model_or_seed("robinson_crusoe_basic")

    # Add an observable characteristic with two levels to the model.
    for level, probability in (("well", 0.9), ("sick", 0.1)):
        params.loc[(f"observable_health_{level}", "probability"), "value"] = probability

    # Sick people can never work.
    options["negative_choice_set"] = {
        "fishing": ["health == 'sick'"],
        "friday": ["period < 2", "exp_fishing == 0"],
    }

    # Create internal specification objects. Only the processed options are
    # used afterwards.
    _, options = process_params_and_options(params, options)

    df = get_simulate_func(params, options)(params)

    assert isinstance(df, pd.DataFrame)
def test_return_scalar_for_likelihood(model):
    """Check that ``return_scalar`` switches between a float and a dictionary."""
    params, options = process_model_or_seed(model)
    options["n_periods"] = 3

    df = get_simulate_func(params, options)(params)

    scalar_log_like = get_log_like_func(params, options, df, return_scalar=True)
    assert isinstance(scalar_log_like(params), float)

    detailed_log_like = get_log_like_func(params, options, df, return_scalar=False)
    assert isinstance(detailed_log_like(params), dict)
def test_randomness_msm(model_or_seed):
    """Check that the moment errors are zero at the data-generating parameters.

    The empirical moments are computed from data simulated with the same
    parameters, so the criterion must evaluate to exactly zero.
    """
    params, options = process_model_or_seed(model_or_seed)

    df = get_simulate_func(params, options)(params)

    empirical_moments = _replace_nans(_calc_choice_freq(df))
    weighting_matrix = get_diag_weighting_matrix(empirical_moments)

    criterion = get_moment_errors_func(
        params,
        options,
        _calc_choice_freq,
        _replace_nans,
        empirical_moments,
        weighting_matrix,
    )

    assert criterion(params) == 0
def test_invariance_of_model_solution_in_solve_and_criterion_functions(model):
    """Check that solving, simulating and the criterion share one model solution.

    The state spaces stored in the simulation and criterion functions are
    compared with the state space returned by the solve function.
    """
    params, options = process_model_or_seed(model)
    options["n_periods"] = 2 if model == "kw_2000" else 3

    state_space = get_solve_func(params, options)(params)

    simulate = get_simulate_func(params, options)
    df = simulate(params)
    state_space_sim = simulate.keywords["solve"].keywords["state_space"]

    criterion = get_crit_func(params, options, df)
    criterion(params)
    state_space_crit = criterion.keywords["solve"].keywords["state_space"]

    compared_attributes = (
        "wages",
        "nonpecs",
        "expected_value_functions",
        "base_draws_sol",
    )

    for other in (state_space_sim, state_space_crit):
        aligned_core = other.core.reindex_like(state_space.core)
        assert state_space.core.equals(aligned_core)

        for attribute in compared_attributes:
            apply_to_attributes_of_two_state_spaces(
                state_space.get_attribute(attribute),
                other.get_attribute(attribute),
                np.testing.assert_array_equal,
            )
def test_return_comparison_plot_data_for_likelihood(model):
    """Check the return types with and without comparison plot data.

    Without the plot data the criterion returns a float. With it, the criterion
    returns a float together with a DataFrame.
    """
    params, options = process_model_or_seed(model)

    simulated = get_simulate_func(params, options)(params)

    crit_plain = get_crit_func(
        params, options, simulated, return_comparison_plot_data=False
    )
    assert isinstance(crit_plain(params), float)

    crit_with_plot_data = get_crit_func(
        params, options, simulated, return_comparison_plot_data=True
    )
    value, plot_data = crit_with_plot_data(params)
    assert isinstance(value, float)
    assert isinstance(plot_data, pd.DataFrame)
def test_invariance_of_model_solution_in_solve_and_criterion_functions(model):
    """Check that solving, simulating and the likelihood share one model solution.

    The state spaces stored in the simulation and log likelihood functions are
    compared with the state space returned by the solve function.
    """
    params, options = process_model_or_seed(model)

    state_space = get_solve_func(params, options)(params)

    simulate = get_simulate_func(params, options)
    df = simulate(params)
    state_space_sim = simulate.keywords["solve"].keywords["state_space"]

    log_like = get_log_like_func(params, options, df)
    log_like(params)
    state_space_crit = log_like.keywords["solve"].keywords["state_space"]

    compared_attributes = (
        "wages",
        "nonpecs",
        "expected_value_functions",
        "base_draws_sol",
    )

    for other in (state_space_sim, state_space_crit):
        aligned_core = other.core.reindex_like(state_space.core)
        assert state_space.core.equals(aligned_core)

        for attribute in compared_attributes:
            apply_to_attributes_of_two_state_spaces(
                getattr(state_space, attribute),
                getattr(other, attribute),
                np.testing.assert_array_equal,
            )
def inputs():
    """Build example inputs for testing the method of simulated moments.

    The empirical moments are computed from a data set simulated with the
    ``"kw_94_one"`` example model shortened to five periods.

    Returns
    -------
    tuple
        ``(params, options, calc_moments, _replace_nans, empirical_moments,
        weighting_matrix)``.

    """
    params, options = get_example_model("kw_94_one", with_data=False)
    options["n_periods"] = 5

    df = get_simulate_func(params, options)(params)

    choice_frequencies = _replace_nans(_calc_choice_freq(df))
    mean_wages = _replace_nans(_calc_wage_mean(df))
    empirical_moments = {"Choices": choice_frequencies, "Mean Wage": mean_wages}

    calc_moments = {"Mean Wage": _calc_wage_mean, "Choices": _calc_choice_freq}
    weighting_matrix = get_diag_weighting_matrix(empirical_moments)

    return (
        params,
        options,
        calc_moments,
        _replace_nans,
        empirical_moments,
        weighting_matrix,
    )
def get_moment_errors_func(
    params,
    options,
    calc_moments,
    replace_nans,
    empirical_moments,
    weighting_matrix=None,
    n_simulation_periods=None,
    return_scalar=True,
):
    """Build the moment errors function used for MSM estimation.

    Parameters
    ----------
    params : pandas.DataFrame or pandas.Series
        Contains parameters.
    options : dict
        Dictionary containing model options.
    calc_moments : callable or list or dict
        Function(s) used to calculate simulated moments. The structure must
        match the one of the empirical moments, i.e. if ``empirical_moments`` is
        a list of pandas.DataFrames, ``calc_moments`` must be a list of the same
        length containing the functions which correspond to these moments.
    replace_nans : callable or list or dict or None
        Function(s) specifying how to handle missings in the simulated moments.
        Must match the structure of ``empirical_moments``. A single callable is
        applied to all sets of moments. ``None`` is replaced by
        ``_return_input``.
    empirical_moments : pandas.DataFrame or pandas.Series or dict or list
        Contains the empirical moments calculated for the observed data. Moments
        should be saved to pandas.DataFrame or pandas.Series that can either be
        passed to the function directly or as items of a list or dictionary.
        The index of pandas.DataFrames can be a MultiIndex, but columns cannot.
        The input is deep-copied and not modified.
    weighting_matrix : numpy.ndarray, default None
        Square matrix of dimension (NxN) with N denoting the number of empirical
        moments. Used to weight squared moment errors. If ``None``, the result
        of ``get_diag_weighting_matrix(empirical_moments)`` is used.
    n_simulation_periods : int, default None
        Dictates the number of periods in the simulated dataset. This option
        does not affect ``options["n_periods"]`` which controls the number of
        periods for which decision rules are computed.
    return_scalar : bool, default True
        Indicates whether to return the scalar value of the weighted square
        product of the moment error vector or a dictionary that additionally
        contains the vector of (weighted) moment errors, simulated moments that
        follow the structure of empirical moments, and simulated as well as
        empirical moments in a pandas.DataFrame that adheres to a tidy data
        format. The dictionary will contain the following key and value pairs:

        - "value": Scalar value of weighted moment errors (float)
        - "root_contributions": Moment error vectors multiplied with the root
          of the weighting matrix (numpy.ndarray)
        - "simulated_moments": Simulated moments for the given parametrization.
          Will be in the same data format as ``empirical_moments``
          (pandas.Series or pandas.DataFrame or list or dict)
        - "comparison_plot_data": A :class:`pandas.DataFrame` that contains both
          empirical and simulated moments in a tidy data format. The data
          contains the following columns:

          - ``moment_column``: Contains the column names of the moment
            DataFrames or the names of the moment Series.
          - ``moment_index``: Contains the index of the moment
            DataFrames/Series. MultiIndex indices will be joined to one string.
          - ``value``: Contains moment values.
          - ``moment_set``: Indicator for each set of moments, will use keys if
            ``empirical_moments`` are specified in a dict. Moments input as
            lists will be numbered according to position.
          - ``kind``: Indicates whether moments are empirical or simulated.

    Returns
    -------
    moment_errors_func : callable
        Function where all arguments except the parameter vector are set.

    Raises
    ------
    ValueError
        If the replacement functions cannot be matched (1:1 or 1:N) to the sets
        of empirical moments.
    ValueError
        If the number of functions to compute the simulated moments does not
        match the number of sets of empirical moments.

    """
    empirical_moments = copy.deepcopy(empirical_moments)
    # Record the container type before the inputs are harmonized.
    are_empirical_moments_dict = isinstance(empirical_moments, dict)

    if weighting_matrix is None:
        weighting_matrix = get_diag_weighting_matrix(empirical_moments)

    simulate = get_simulate_func(
        params=params, options=options, n_simulation_periods=n_simulation_periods
    )

    empirical_moments = _harmonize_input(empirical_moments)
    calc_moments = _harmonize_input(calc_moments)

    # A single replacement function is duplicated for every set of moments.
    if replace_nans is None:
        replace_nans = _return_input
    if callable(replace_nans):
        replace_nans = dict.fromkeys(empirical_moments, replace_nans)
    replace_nans = _harmonize_input(replace_nans)

    n_moment_sets = len(empirical_moments)
    n_replacements = len(replace_nans)

    if 1 < n_replacements < n_moment_sets:
        raise ValueError(
            "Replacement functions can only be matched 1:1 or 1:n with sets of "
            "empirical moments."
        )
    if n_replacements > n_moment_sets:
        raise ValueError(
            "There are more replacement functions than sets of empirical moments."
        )
    if len(calc_moments) != n_moment_sets:
        raise ValueError(
            "Number of functions to calculate simulated moments must be equal to "
            "the number of sets of empirical moments."
        )

    return functools.partial(
        moment_errors,
        simulate=simulate,
        calc_moments=calc_moments,
        replace_nans=replace_nans,
        empirical_moments=empirical_moments,
        weighting_matrix=weighting_matrix,
        return_scalar=return_scalar,
        are_empirical_moments_dict=are_empirical_moments_dict,
    )
def get_msm_func(
    params,
    options,
    calc_moments,
    replace_nans,
    empirical_moments,
    weighting_matrix,
    n_simulation_periods=None,
    return_scalar=True,
):
    """Build the criterion function for the method of simulated moments.

    Parameters
    ----------
    params : pandas.DataFrame or pandas.Series
        Contains parameters.
    options : dict
        Dictionary containing model options.
    calc_moments : callable or list
        Function(s) used to calculate simulated moments. The structure must
        match the one of the empirical moments, i.e. if ``empirical_moments`` is
        a list of pandas.DataFrames, ``calc_moments`` must be a list of the same
        length containing the functions which correspond to these moments.
    replace_nans : callable or list
        Function(s) specifying how to handle missings in the simulated moments.
        Must match the structure of ``empirical_moments``. Exception: If only
        one replacement function is specified, it will be used on all sets of
        simulated moments.
    empirical_moments : pandas.DataFrame or pandas.Series or dict or list
        Contains the empirical moments calculated for the observed data. Moments
        should be saved to pandas.DataFrame or pandas.Series that can either be
        passed to the function directly or as items of a list or dictionary.
        The index of pandas.DataFrames can be a MultiIndex, but columns cannot.
        The input is deep-copied and not modified.
    weighting_matrix : numpy.ndarray
        Square matrix of dimension (NxN) with N denoting the number of empirical
        moments. Used to weight squared moment errors.
    n_simulation_periods : int, default None
        Dictates the number of periods in the simulated dataset. This option
        does not affect ``options["n_periods"]`` which controls the number of
        periods for which decision rules are computed.
    return_scalar : bool, default True
        Indicates whether to return the moment error vector (False) or the
        weighted square product of the moment error vector (True).

    Returns
    -------
    msm_func : callable
        MSM function where all arguments except the parameter vector are set.

    Raises
    ------
    ValueError
        If the replacement functions cannot be matched (1:1 or 1:N) to the sets
        of empirical moments.
    ValueError
        If the number of functions to compute the simulated moments does not
        match the number of sets of empirical moments.

    """
    empirical_moments = copy.deepcopy(empirical_moments)

    simulate = get_simulate_func(
        params=params, options=options, n_simulation_periods=n_simulation_periods
    )

    empirical_moments = _harmonize_input(empirical_moments)
    calc_moments = _harmonize_input(calc_moments)
    replace_nans = _harmonize_input(replace_nans)

    n_moment_sets = len(empirical_moments)
    n_replacements = len(replace_nans)

    # A single replacement function is duplicated for every set of moments.
    if n_replacements == 1 and n_moment_sets > 1:
        replace_nans = replace_nans * n_moment_sets

    if 1 < n_replacements < n_moment_sets:
        raise ValueError(
            "Replacement functions can only be matched 1:1 or 1:n with sets of "
            "empirical moments."
        )
    if n_replacements > n_moment_sets:
        raise ValueError(
            "There are more replacement functions than sets of empirical moments."
        )
    if len(calc_moments) != n_moment_sets:
        raise ValueError(
            "Number of functions to calculate simulated moments must be equal to "
            "the number of sets of empirical moments."
        )

    return functools.partial(
        msm,
        simulate=simulate,
        calc_moments=calc_moments,
        replace_nans=replace_nans,
        empirical_moments=empirical_moments,
        weighting_matrix=weighting_matrix,
        return_scalar=return_scalar,
    )