def save_pair_plot(fit, param_names=None, info_path=InfoPath(),
                   pair_plot_params=PairPlotParams()):
    """
    Save a pair plot of the parameter distributions of a cmdstanpy fit.

    The plot helps to see correlations between parameters and to spot
    funnel shaped distributions that can result in sampling problems.

    Parameters
    ----------

    fit : cmdstanpy.stanfit.CmdStanMCMC
        Samples from cmdstanpy.
    param_names : list of str
        Names of parameters. Include all if None.
    info_path : InfoPath
        Path information for creating summaries.
    pair_plot_params : PairPlotParams
        Plot settings, handed unchanged to `shared_save_pair_plot`.
    """

    # NOTE(review): presumably records the calling code file on `info_path`;
    # confirm in InfoPath.set_codefile.
    info_path.set_codefile()

    # Narrow the fit's columns down to the requested parameters and
    # pull only those draws out of the fit.
    selected = filter_param_names(fit.column_names, param_names)
    draws = fit.get_drawset(params=selected)

    shared_save_pair_plot(
        draws,
        param_names=selected,
        info_path=info_path,
        pair_plot_params=pair_plot_params)
def save_summary(samples, param_names=None, info_path=InfoPath(),
                 summary_params=SummaryParams()):
    """
    Generate a statistical summary of the samples (mean, std, mode, hpdi)
    and save it to disk.

    Parameters
    ----------

    samples : Panda's dataframe
        Each column contains samples for a parameter.
    param_names : list of str
        Names of parameters to be included in the summary.
        Include all if None.
    info_path : InfoPath
        Path information for creating summaries.
    summary_params : SummaryParams
        Settings handed unchanged to `sample_summary`.

    Returns
    -------
    Whatever `save_summary_to_disk` returns for the summary data frame,
    the text table and `info_path`.
    """

    # NOTE(review): presumably records the calling code file on `info_path`;
    # confirm in InfoPath.set_codefile.
    info_path.set_codefile()

    # Keep only the requested columns before summarising
    selected = filter_param_names(list(samples), param_names)
    df_summary, table = sample_summary(samples[selected],
                                       params=summary_params)

    return save_summary_to_disk(df_summary, table, info_path)
def save_tree_plot(models, extra_values=[], param_names=None,
                   info_path=InfoPath(),
                   summary_params=SummaryParams(),
                   tree_params=TreePlotParams()):
    """
    Save a tree plot that summarises parameter distributions.
    Can compare summaries from multiple models, when multiple samples are
    supplied. One can also supply additional markers to be compared with
    using `extra_values` parameter.

    Parameters
    ----------

    models : list Panda's data frames
        List of data frames for each model, containing sample values for
        multiple parameters (one parameter is one data frame column).
        Supply multiple data frames to see their distribution summaries
        compared on the tree plot.
    extra_values : list of dict
        Additional markers to be shown on tree plot, without error bars:

        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]
    param_names : list of str
        Names of parameters. Include all if None.
    info_path : InfoPath
        Path information for creating summaries.
    summary_params : SummaryParams
        Settings handed unchanged to `sample_summary`.
    tree_params : TreePlotParams
        Settings handed unchanged to `make_comparative_tree_plot`.
    """

    # NOTE(review): presumably records the calling code file on `info_path`;
    # confirm in InfoPath.set_codefile.
    info_path.set_codefile()
    summaries = []

    for samples in models:
        # `param_names` is deliberately rebound: the names kept for one
        # model are the request used for the next one (original behaviour).
        param_names = filter_param_names(list(samples), param_names)

        # Apply the filter. Previously the filtered names were computed but
        # never used, so every column was summarised and the `param_names`
        # argument had no effect on the plot.
        samples = samples[param_names]

        summary, _ = sample_summary(samples, params=summary_params)
        summaries.append(summary)

    # Extra markers are appended after the model summaries
    for values in extra_values:
        summaries.append(summary_from_dict(values))

    make_comparative_tree_plot(summaries, info_path=info_path,
                               tree_params=tree_params)
def make_histograms(samples, summary, param_names=None,
                    params=HistogramParams(),
                    summary_params=SummaryParams()):
    """
    Make histogram pages for the parameters from posterior distribution.

    Parameters
    -----------

    samples : Panda's DataFrame
        Each column contains samples from posterior distribution.
    summary : Panda's DataFrame
        Summary information about each column.
    param_names : list of str
        Names of the parameters for plotting. If None, all will be plotted.
    params : HistogramParams
        Layout settings; `ncols`, `num_plot_rows` and `max_plot_pages`
        are read here, the object is also passed to
        `make_histogram_one_page`.
    summary_params : SummaryParams
        Passed unchanged to `make_histogram_one_page`.

    Returns
    -------
    list
        One `[fig, ax]` pair per page, as returned by
        `make_histogram_one_page`.
    """

    param_names = filter_param_names(samples.columns, param_names)

    # Number of pages: parameters -> grid rows -> pages
    n_rows_total = math.ceil(len(param_names) / params.ncols)
    n_plots = math.ceil(n_rows_total / params.num_plot_rows)

    if n_plots > params.max_plot_pages:
        message = (
            f'Showing only first {params.max_plot_pages} '
            f'pages out of {n_plots} of histogram.'
            'Consider specifying "param_names".')
        print(message)
        n_plots = params.max_plot_pages

    # Always produce at least one page
    n_plots = max(n_plots, 1)

    # Make one figure per page of histograms
    pages = []

    for page in range(n_plots):
        figure, axes = make_histogram_one_page(
            i_start=page * params.num_plot_rows * params.ncols,
            samples=samples,
            summary=summary,
            param_names=param_names,
            params=params,
            summary_params=summary_params)

        pages.append([figure, axes])

    return pages
def save_tree_plot(fits, extra_values=[], param_names=None,
                   info_path=InfoPath(),
                   summary_params=SummaryParams(),
                   tree_params=TreePlotParams()):
    """
    Save a tree plot that summarises parameter distributions of
    cmdstanpy fits. Summaries from several models are compared when
    several fits are supplied, and `extra_values` adds further markers
    to compare with.

    Parameters
    ----------

    fits : list of cmdstanpy.stanfit.CmdStanMCMC
        Contains the samples from cmdstanpy.
    extra_values : list of dict
        Additional markers to be shown on tree plot, without error bars:

        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]
    param_names : list of str
        Names of parameters. Include all if None.
    info_path : InfoPath
        Path information for creating summaries.
    summary_params : SummaryParams
        Settings handed unchanged to `sample_summary`.
    tree_params : TreePlotParams
        Settings handed unchanged to `make_comparative_tree_plot`.
    """

    # NOTE(review): presumably records the calling code file on `info_path`;
    # confirm in InfoPath.set_codefile.
    info_path.set_codefile()

    all_summaries = []

    for fit in fits:
        # `param_names` is rebound on purpose: the names kept for one fit
        # become the request for the next fit.
        param_names = filter_param_names(fit.column_names, param_names)
        draws = fit.get_drawset(params=param_names)
        fit_summary, _ = sample_summary(draws, params=summary_params)
        all_summaries.append(fit_summary)

    # Extra markers come after the summaries of the fits
    for marker in extra_values:
        all_summaries.append(summary_from_dict(marker))

    make_comparative_tree_plot(
        all_summaries,
        info_path=info_path,
        tree_params=tree_params)
def make_pair_plot(samples, param_names=None,
                   pair_plot_params=PairPlotParams()):
    """
    Make a pair plot for the parameters from posterior distribution.

    The upper triangle shows scatter plots, the lower triangle shows
    KDE plots and the diagonal shows histograms.

    Parameters
    -----------

    samples : Panda's DataFrame
        Each column contains samples from posterior distribution.
    param_names : list of str
        Names of the parameters for plotting. If None, all will be plotted.
    pair_plot_params : PairPlotParams
        Plot settings. Read here: `max_params`, `max_samples`,
        `marker_size`, `color`, `edgecolor`, `alpha`, `diag_edge_color`.

    Returns
    -------
    Seaborn's PairGrid
    """

    param_names = filter_param_names(samples.columns, param_names)

    if len(param_names) > pair_plot_params.max_params:
        print((
            f'Showing only first {pair_plot_params.max_params} '
            f'parameters out of {len(param_names)} in pair plot.'
            'Consider limiting the parameter with "param_names".'))

        param_names = param_names[:pair_plot_params.max_params]

    samples = samples[param_names]

    # Show no more than `max_samples` markers by keeping every n-th row.
    # The step is clamped to 1: with zero rows the ceil() is 0, and a
    # zero slice step raises "slice step cannot be zero".
    keep_nth = max(
        1, math.ceil(samples.shape[0] / pair_plot_params.max_samples))
    samples = samples[::keep_nth]

    g = sns.PairGrid(samples)

    g = g.map_upper(sns.scatterplot, s=pair_plot_params.marker_size,
                    color=pair_plot_params.color,
                    edgecolor=pair_plot_params.edgecolor,
                    alpha=pair_plot_params.alpha)

    g = g.map_lower(sns.kdeplot, color=pair_plot_params.color)

    g = g.map_diag(plt.hist, color=pair_plot_params.color,
                   edgecolor=pair_plot_params.diag_edge_color)

    return g
def traceplot(fit, param_names=None, params=TraceplotParams()):
    """
    Make traceplots, diagnostic plots of samples for all parameters
    for all chains.

    Parameters
    ----------

    fit : object with a `column_names` attribute
        Passed on to `make_single_traceplot`.
        NOTE(review): presumably a cmdstanpy fit, as elsewhere - confirm.
    param_names: list of str
        List of parameters to plot. If None, plot all.
    params : TraceplotParams
        Layout settings; `num_traceplot_rows` and `max_traceplot_pages`
        are read here, the object is also passed to
        `make_single_traceplot`.

    Returns
    -------
    list
        One `[fig, ax]` pair per page, as returned by
        `make_single_traceplot`.
    """

    sns.set(style="ticks")

    # Columns to be shown; 'lp__' goes first so that the traceplot of
    # probability is always included
    param_names = filter_param_names(fit.column_names, param_names)
    param_names.insert(0, 'lp__')

    rows_per_page = params.num_traceplot_rows

    # Total number of pages
    n_plots = math.ceil(len(param_names) / rows_per_page)

    if n_plots > params.max_traceplot_pages:
        message = (
            f'Traceplot shows only first {params.max_traceplot_pages} '
            f'pages out of {n_plots}. Consider specifying "param_names".')
        print(message)
        n_plots = params.max_traceplot_pages

    # Always produce at least one page
    n_plots = max(n_plots, 1)

    # Make one figure per page of traceplots
    pages = []

    for page in range(n_plots):
        figure, axes = make_single_traceplot(
            i_start=page * params.num_traceplot_rows,
            fit=fit,
            param_names=param_names,
            params=params)

        pages.append([figure, axes])

    return pages
def make_summary(fit, param_names=None, summary_params=SummaryParams()):
    """
    Returns statistical summary table for parameters:
    mean, std, mode, hpdi, together with N_Eff and R_hat taken from
    the fit's own summary.

    Parameters
    ----------

    fit : cmdstanpy.stanfit.CmdStanMCMC
        Contains the samples from cmdstanpy.
    param_names : list of str
        Names of parameters to be included in the summary.
        Include all if None.
    summary_params : SummaryParams
        Settings handed unchanged to `sample_summary`.

    Returns
    -------
    tuple
        `(df_summary, table, samples)`: the two values returned by
        `sample_summary`, followed by the draws the summary was made from.
    """

    param_names = filter_param_names(fit.column_names, param_names)
    samples = fit.get_drawset(params=param_names)

    # Get N_Eff and R_hat values from the fit's summary
    # --------

    # Rename 'name[1]' style index labels to 'name.1'.
    # NOTE(review): presumably this matches the column naming of the draws;
    # confirm against `fit.get_drawset`.
    # A non-inplace rename leaves the frame returned by `fit.summary()`
    # untouched.
    df_extra = fit.summary().rename(
        index=(lambda name: name.replace('[', '.').replace(']', '')))

    # Take an explicit copy of the two columns: assigning to a column of a
    # column-subset slice triggers pandas' SettingWithCopyWarning.
    df_extra = df_extra[['N_Eff', 'R_hat']].copy()
    df_extra['N_Eff'] = np.round(df_extra['N_Eff']).astype(int)

    # Get the summary
    df_summary, table = sample_summary(df=samples,
                                       extra_values=df_extra,
                                       params=summary_params)

    return df_summary, table, samples
def save_compare_parameters(
        fits,
        labels,
        extra_values=[],
        param_names=None,
        type: CompareParametersType = CompareParametersType.TEXT,
        info_path=InfoPath(),
        summary_params=SummaryParams()):
    """
    Saves a text table that compares parameters of cmdstanpy fits.

    Parameters
    ----------

    fits : list of cmdstanpy.stanfit.CmdStanMCMC
        Contains the samples from cmdstanpy.
    labels : list of str
        Names of the models in `fits` list.
    extra_values : list of dict
        Additional values to be shown in the table:

        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]
    param_names : list of str
        Names of parameters. Include all if None.
    type : CompareParametersType
        Format of values in the text table.
    info_path : InfoPath
        Path information for creating summaries.
    summary_params : SummaryParams
        Settings handed unchanged to `shared_save_compare_parameters`.
    """

    # NOTE(review): presumably records the calling code file on `info_path`;
    # confirm in InfoPath.set_codefile.
    info_path.set_codefile()

    draws_per_fit = []

    for fit in fits:
        # `param_names` is rebound on purpose: the names kept for one fit
        # become the request for the next fit, and the final value is what
        # gets forwarded below.
        param_names = filter_param_names(fit.column_names, param_names)
        draws_per_fit.append(fit.get_drawset(params=param_names))

    shared_save_compare_parameters(
        draws_per_fit,
        labels=labels,
        extra_values=extra_values,
        type=type,
        param_names=param_names,
        info_path=info_path,
        summary_params=summary_params)
def test_filter_param_names__numbered():
    """Requesting 'a' also keeps its numbered columns 'a.1' and 'a.2'."""
    columns = ['a.1', 'a.2', 'a', 'b', 'c']
    requested = ['a', 'b']

    filtered = filter_param_names(columns, requested)

    assert filtered == ['a.1', 'a.2', 'a', 'b']
def test_filter_param_names():
    """Only the requested names are kept, in column order."""
    columns = ['a', 'b', 'c']
    requested = ['a', 'b']

    filtered = filter_param_names(columns, requested)

    assert filtered == ['a', 'b']
def test_filter_param_names__remove_technical_columns():
    """With no names requested, the 'stepsize__' column is dropped."""
    columns = ['a', 'stepsize__', 'c']

    filtered = filter_param_names(columns)

    assert filtered == ['a', 'c']
def compare_parameters(
        models, labels, extra_values=[],
        type: CompareParametersType = CompareParametersType.TEXT,
        param_names=None,
        summary_params=SummaryParams()):
    """
    Create a table that compares parameters of several models.

    Each row is one label (a model or a set of extra values), each column
    is one parameter. The set of columns is determined from the first
    model only.

    Parameters
    ----------

    models : list Panda's data frames
        List of data frames for each model, containing sample values for
        multiple parameters (one parameter is one data frame column).
        Supply multiple data frames to compare parameter distributions.
    labels : list of str
        Names of the rows: first the models in `models` list, then
        one label for each item of `extra_values`.
    extra_values : list of dict
        Additional values to be shown in the table:

        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]
    type : CompareParametersType
        Format of values in the text table.
    param_names : list of str
        Names of parameters. Include all if None.
    summary_params : SummaryParams
        Settings handed unchanged to `sample_summary`.

    Returns
    --------

    df: Panda's data frame
        Table in Panda's format
    txt : str
        Table in text format

    Raises
    ------
    ValueError
        If `models` is empty, or if the number of labels differs from
        the number of models plus the number of extra values.
    """

    if len(models) == 0:
        raise ValueError('Models list is empty')

    if (len(models) + len(extra_values)) != len(labels):
        raise ValueError('Models list length is different from labels')

    # The first model decides which parameters become table columns
    first_columns = list(models[0])
    param_names = filter_param_names(first_columns, param_names)
    df = pd.DataFrame(index=labels, columns=param_names)

    # The original code filtered a second time on the first loop iteration.
    # That call is kept, because `filter_param_names` is not visible here
    # and its idempotence is not assumed.
    param_names_filtered = filter_param_names(first_columns, param_names)

    # One row per model
    for samples, label in zip(models, labels):
        samples = samples[param_names_filtered]
        df_summary, _ = sample_summary(samples, params=summary_params)

        df.loc[label] = [
            format_parameter(df_summary.loc[name], type)
            for name in param_names_filtered
        ]

    # Add extra values
    # ---------------

    # Labels that follow the model labels belong to the extra values
    extra_labels = labels[len(models):]

    for data, label in zip(extra_values, extra_labels):
        # A parameter missing from the dict gets an empty cell
        df.loc[label] = [
            format_value(data[name], type) if name in data else ''
            for name in param_names_filtered
        ]

    table = tabulate(df, headers=param_names_filtered,
                     tablefmt="pipe", stralign="right")

    return df, table